In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Let pandas render up to 100 rows before truncating displayed frames
pd.options.display.max_rows = 100

In [None]:
# Read daily_data.csv from the competition input directory and preview ten rows
DAILY_DATA_CSV = '/kaggle/input/predicta-1-0-predict-the-unpredictable-part-2/daily_data.csv'
df = pd.read_csv(DAILY_DATA_CSV)

print(df.head(10))

In [None]:
# Print pandas' built-in summary of the freshly loaded frame (columns, dtypes, non-null counts)
df.info()

In [None]:
# analyzing the dataset
def df_analyze(dataframe):
    """Build a per-column overview of ``dataframe``.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame whose columns are inspected. It is only read, never modified.

    Returns
    -------
    pd.DataFrame
        One row per column of ``dataframe`` with the fields

        * ``Columns``     - the column label,
        * ``Uniques``     - the array returned by ``Series.unique()``,
        * ``Cardinality`` - the size of that array,
        * ``NaNs``        - the number of null entries in the column.
    """
    names, uniques, cardinalities, nan_counts = [], [], [], []

    for col in dataframe.columns:
        series = dataframe[col]
        # unique() is the expensive call here: compute it once and reuse it
        # for both the 'Uniques' and the 'Cardinality' fields
        values = series.unique()

        names.append(col)
        uniques.append(values)
        cardinalities.append(values.size)
        nan_counts.append(series.isnull().sum())

    # Assemble the summary in a single constructor call; the local name no
    # longer shadows the notebook-level ``df``
    return pd.DataFrame({
        'Columns': names,
        'Uniques': uniques,
        'Cardinality': cardinalities,
        'NaNs': nan_counts,
    })

# Summarize the loaded dataset per column (unique values, cardinality, NaN count)
# and display the summary as the cell output
df_info = df_analyze(df)
df_info

In [None]:
# Parse the sunrise and sunset columns as datetimes and split each one into
# separate hour and minute features
for state in ('sunrise', 'sunset'):
    parsed = pd.to_datetime(df[state])
    df[state] = parsed

    df[f'{state}_hour'] = parsed.dt.hour
    df[f'{state}_min'] = parsed.dt.minute

# The parsed timestamp columns are not kept; only the derived features remain
df = df.drop(columns=['sunrise', 'sunset'])
df.head(10)

In [None]:
# Replace every distinct city_id with its integer category code
city_categories = pd.Categorical(df['city_id'])
df['city_id'] = city_categories.codes

In [None]:
# Replace each day_id label with a 1-based integer code; pd.factorize numbers
# the labels in order of first appearance
day_codes, _day_labels = pd.factorize(df['day_id'])
df['day_id'] = day_codes + 1
df

In [None]:
# Re-run the per-column summary now that the sunrise/sunset features are split
# out and city_id / day_id are integer-encoded
df_info = df_analyze(df)
df_info

In [None]:
# importing libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Rows with a known condition_text form the training data; rows where it is
# missing are the ones whose label has to be predicted
has_label = df['condition_text'].notnull()
train_df = df[has_label].copy()
val_df = df[~has_label].copy()

# features / target for the labeled rows
X_dum = train_df.drop(columns='condition_text')
y_dum = train_df['condition_text']

# features / (empty) target for the unlabeled rows
X_val = val_df.drop(columns='condition_text')
y_val = val_df['condition_text']

# sanity checks: the two subsets partition df ...
assert len(train_df) + len(val_df) == len(df)

# ... the target columns have one entry per labeled / unlabeled row ...
assert len(y_dum) == has_label.sum()
assert len(y_val) == (~has_label).sum()

# ... and dropping the target did not change the row counts
assert len(train_df) == len(X_dum)
assert len(val_df) == len(X_val)

# size and first rows of the labeled subset
print("Training set:")
print(len(train_df))
print(train_df.head(5))

# size and first rows of the unlabeled subset
print("\nValidation set:")
print(len(val_df))
print(val_df.head(5))

In [None]:
# Per-column summary restricted to the labeled rows (train_df)
df_info = df_analyze(train_df)
df_info

In [None]:
# build the classification model; a fixed seed (the same one used for the
# train/test split) makes the fitted forest, the reported accuracy and the
# submitted predictions reproducible across runs
clf = RandomForestClassifier(random_state=42)

In [None]:
# Hold out 20% of the labeled rows as a test split; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(X_dum, y_dum, test_size=0.2, random_state=42)

In [None]:
# Shapes of the training features and training labels (row counts should match)
X_train.shape, y_train.shape

In [None]:
# Shapes of the held-out test features and test labels (row counts should match)
X_test.shape, y_test.shape

In [None]:
# fit the random forest classifier on the training split
clf.fit(X_train, y_train)

In [None]:
# Predict condition_text for the held-out test split; compared against y_test below
y_pred = clf.predict(X_test)

In [None]:
# Put the true test labels next to the model's predictions; the frame keeps
# y_test's index and y_pred is attached positionally
results_df = y_test.to_frame('Actual').assign(Predicted=y_pred)

# Show the first 30 actual/predicted pairs
print(results_df.head(30))

In [None]:
# Predict condition_text for the rows where it was missing (X_val).
# NOTE: this rebinds y_val, which until now held the all-null condition_text
# column of val_df, to the classifier's predictions.
y_val = clf.predict(X_val)
y_val[:10]

In [None]:
# Accuracy of the classifier on the held-out 20% test split (y_test vs. y_pred)
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
# Attach the predicted labels to the unlabeled rows. Both pieces get a fresh
# 0..n-1 index so they line up row by row when joined column-wise.
features_reset = X_val.reset_index(drop=True)
y_val_df = pd.DataFrame({'condition_text': y_val})
labels_reset = y_val_df.reset_index(drop=True)
concatenated_val_df = pd.concat([features_reset, labels_reset], axis=1)

# preview the first rows of the combined frame
print(concatenated_val_df.head())

In [None]:
# Stack the originally labeled rows and the rows with predicted labels, then
# order the result by the integer day_id
final_df = (
    pd.concat([train_df, concatenated_val_df], ignore_index=True)
    .sort_values(by='day_id')
)

# Overwrite day_id with its text form: 'D' followed by the zero-padded code.
# NOTE(review): the codes come from the earlier pd.factorize step, so this only
# reproduces the original ids if they were 'D0001'-style labels appearing in
# ascending order in the file - TODO confirm against the raw data.
padded_codes = final_df['day_id'].astype(str).str.zfill(4)
final_df['day_id'] = 'D' + padded_codes

# report the size of the final dataset
print(final_df.shape)

In [None]:
# Per-column summary of the final frame (labeled rows plus rows with predicted labels)
df_info = df_analyze(final_df)
df_info

In [None]:
# keep only the two columns written to the submission file
submission_key_final = final_df.loc[:, ['day_id', 'condition_text']]
print(submission_key_final.head(20))

# write the submission to the working directory without the index column
submission_path = 'submission_condition_class.csv'
submission_key_final.to_csv(submission_path, index=False)

# render a download link for the saved file as the cell output
from IPython.display import FileLink

FileLink(submission_path)