In [153]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# walk the read-only input directory and print the full path of every file found
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/predicta-1-0-predict-the-unpredictable/submission_key.csv
/kaggle/input/predicta-1-0-predict-the-unpredictable/sample_submission.csv
/kaggle/input/predicta-1-0-predict-the-unpredictable/historical_weather.csv


In [154]:
# importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# show up to 100 rows whenever a DataFrame or Series is displayed
pd.options.display.max_rows = 100

In [None]:
# read the submission key and the historical weather observations from the competition input
submission = pd.read_csv('/kaggle/input/predicta-1-0-predict-the-unpredictable/submission_key.csv')
weather = pd.read_csv('/kaggle/input/predicta-1-0-predict-the-unpredictable/historical_weather.csv')

# preview the first rows of each table
print(weather.head(10))
print(submission.head(5))

In [None]:
# Print the summary pandas produces for the freshly loaded frame
# (column names, dtypes, non-null counts, memory usage).
weather.info()

In [None]:
# parse the date strings once, then replace the 'date' column by its
# day / month / year components (appended as the last three columns)
parsed_dates = pd.to_datetime(weather['date'])

weather = weather.assign(
    day=parsed_dates.dt.day,
    month=parsed_dates.dt.month,
    year=parsed_dates.dt.year,
).drop(columns='date')

weather.head(10)

In [None]:
# analyzing the dataset
def df_analyze(dataframe):
    """Summarise every column of ``dataframe``.

    Returns a new DataFrame with one row per input column and four fields:
    ``Columns`` (the column name), ``Uniques`` (array of distinct values,
    NaN included), ``Cardinality`` (how many distinct values there are) and
    ``NaNs`` (number of null entries).
    """
    names, distinct_values, cardinalities, null_counts = [], [], [], []

    for name in dataframe.columns:
        series = dataframe[name]
        distinct = series.unique()

        names.append(name)
        distinct_values.append(distinct)
        cardinalities.append(distinct.size)
        null_counts.append(series.isnull().sum())

    # filled column by column so the resulting dtypes match plain list assignment
    summary = pd.DataFrame()
    summary['Columns'] = names
    summary['Uniques'] = distinct_values
    summary['Cardinality'] = cardinalities
    summary['NaNs'] = null_counts

    return summary

# Build the per-column summary (name, distinct values, cardinality, null count) for weather.
df_info = df_analyze(weather)

# Optional view: order the summary by null count, largest first.
# df_info.sort_values('NaNs', ascending=False)
df_info

In [None]:
# share of null entries per column, expressed in percent and sorted from worst to best
null_share = weather.isna().sum(axis=0) / len(weather)
missing_percent = (
    (null_share * 100)
    .to_frame('missing percentage')
    .sort_values('missing percentage', ascending=False)
)
missing_percent

In [None]:
# drop snow_depth_mm, the column judged to exceed the 70% missing-value cutoff
weather = weather.drop(columns='snow_depth_mm')
weather.head(10)

In [None]:
# missing-value indicator columns
def add_missing_indicators(df):
    """Add a 0/1 ``<column>_missing`` flag for each column that contains nulls.

    The frame is modified in place and the same object is returned. Only the
    columns present when the function is called are inspected, so the newly
    added indicator columns are never examined themselves.
    """
    for name in list(df.columns):
        null_mask = df[name].isnull()
        if not null_mask.any():
            continue
        df[f'{name}_missing'] = null_mask.astype(int)
    return df

# Append a 0/1 "<column>_missing" flag for every column that still has nulls.
# The helper mutates the frame it is given and returns that same object.
weather = add_missing_indicators(weather)
# Display the augmented frame.
weather

In [None]:
# count the rows recorded for each city: {city_id: number of rows in weather}
weather_df_freqency_map = weather['city_id'].value_counts().to_dict()
weather_df_freqency_map

In [None]:
# frequency encoding: every city ID is replaced by the number of rows it has in weather
weather['city_id'] = weather['city_id'].map(weather_df_freqency_map)
weather.head(10)

In [None]:
# # function to calculate target encoding
# def target_encode(train_series, target, min_samples_leaf=1, smoothing=1):
#     assert len(train_series) == len(target)
#     temp = pd.concat([train_series, target], axis=1)
#     averages = temp.groupby(by=train_series.name)[target.name].agg(["mean", "count"])
#     smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))

#     prior = target.mean()
#     averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
#     averages.drop(["mean", "count"], axis=1, inplace=True)

#     return train_series.map(averages.to_dict()[target.name])

# # apply target encoding
# weather['city_id'] = target_encode(weather['city_id'], weather['avg_temp_c'])
# print(weather.head(10))

In [None]:
# Print the frame summary again, now that the date parts, the missing-value
# indicators and the frequency-encoded city_id are in place.
weather.info()

In [None]:
# handling the missing values

# importing the libraries
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
# columns whose missing values are filled in by the imputer
# NOTE(review): the list includes the prediction target avg_temp_c, so the model
# is later trained on some imputed targets — confirm this is intended.
imputing_columns = ['avg_temp_c', 'min_temp_c', 'max_temp_c', 'precipitation_mm', 'avg_wind_dir_deg', 'avg_wind_speed_kmh']

# iterative imputer, with a fixed random_state so repeated runs give the same fill values
iterate_imp = IterativeImputer(max_iter=10, random_state=0)

# fit on, and transform, the selected columns only (returns a plain ndarray)
imputed_data = iterate_imp.fit_transform(weather[imputing_columns])

# wrap the array in a DataFrame that carries weather's own index: the assignment
# below aligns on index labels, so a default RangeIndex here would silently
# write NaNs if weather were ever filtered or re-indexed beforehand
imputed_df = pd.DataFrame(imputed_data, columns=imputing_columns, index=weather.index)

# replace the original columns with the imputed values
weather[imputing_columns] = imputed_df

weather[:10]

In [None]:
# re-check the percentage of null entries per column, sorted from worst to best
null_share = weather.isna().sum(axis=0) / len(weather)
missing_percent = (
    (null_share * 100)
    .to_frame('missing percentage')
    .sort_values('missing percentage', ascending=False)
)
missing_percent

In [None]:
# box plot of each feature's distribution, laid out on a 4 x 3 grid
import plotly.graph_objects as go
from plotly.subplots import make_subplots

feature_set = ['city_id', 'avg_temp_c', 'min_temp_c', 'max_temp_c', 'precipitation_mm', 'avg_wind_dir_deg', 'avg_wind_speed_kmh', 'day', 'month', 'year']

fig = make_subplots(rows=4, cols=3, subplot_titles=feature_set)

# 1-based (row, col) slots in row-major order; zip() stops after the last feature,
# so the two trailing slots stay empty
positions = [(grid_row, grid_col) for grid_row in range(1, 5) for grid_col in range(1, 4)]

for feature, (grid_row, grid_col) in zip(feature_set, positions):
    fig.add_trace(go.Box(y=weather[feature], name=feature), row=grid_row, col=grid_col)

fig.update_layout(height=900, width=1000, title_text='Box Plots for Distributions')
fig.show()

In [None]:
# Pairwise correlation between all columns of weather, using DataFrame.corr()
# with its default settings; the result is reused by the heatmap and the
# feature ranking in the following cells.
correlation_matrix = weather.corr()

In [None]:
# heatmap of the correlation matrix, each cell annotated with two decimals
import seaborn as sns
import matplotlib.pyplot as plt

heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='Purples', fmt=".2f", ax=heatmap_ax)
heatmap_ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# strength (absolute value) of each column's correlation with the target
correlation_with_target = correlation_matrix['avg_temp_c'].abs()

# rank from strongest to weakest, skip position 0 (normally the target's
# correlation with itself) and keep the next 15 entries
ranked_correlations = correlation_with_target.sort_values(ascending=False)
top_features = ranked_correlations.iloc[1:16]

print(top_features)

In [None]:
## building and training the model

# importing the libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.layers import Dropout, BatchNormalization

In [None]:
# features: everything except the target and the target's own missing-value flag
X = weather.drop(columns=['avg_temp_c', 'avg_temp_c_missing'])
y = weather['avg_temp_c']

# hold out 30% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# standardise with statistics learned from the training rows only
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# fully connected regressor: 64 -> 128 -> 128 -> 128 -> 1,
# with batch normalisation after every hidden layer
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(BatchNormalization())
for hidden_units in (128, 128, 128):
    model.add(Dense(hidden_units, activation='relu'))
    model.add(BatchNormalization())
model.add(Dense(1, activation='linear'))

# optimise mean squared error, report mean absolute error alongside it
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])

# train for 100 epochs, keeping 20% of the training rows aside for validation
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=1)

# score the held-out test rows
loss, mae = model.evaluate(X_test, y_test, verbose=1)
print(f'Test Mean Absolute Error: {mae}')

In [None]:
# validation and training loss per epoch, drawn with Plotly
import plotly.express as px

fig = go.Figure()

# (history key, legend label) pairs, in the order the traces are added
loss_curves = (('val_loss', 'Validation Loss'), ('loss', 'Training Loss'))
for history_key, legend_label in loss_curves:
    epoch_losses = history.history[history_key]
    fig.add_trace(go.Scatter(x=list(range(len(epoch_losses))),
                             y=epoch_losses,
                             mode='lines',
                             name=legend_label))

fig.update_layout(title='Model Loss Over Epochs',
                  xaxis_title='Epochs',
                  yaxis_title='Loss')
fig.show()


In [None]:
# predict on the held-out test rows
y_pred = model.predict(X_test)

# flatten once so the predictions line up with the y_test Series
predicted_values = y_pred.flatten()

# side-by-side table of actual value, prediction and absolute error
comparison_df = pd.DataFrame({
    'Actual': y_test,
    'Predicted': predicted_values,
    'Error': (y_test - predicted_values).abs(),
})

# show the first 50 rows of the comparison
print(comparison_df.head(50))

**Predicting the target on the new dataset**

In [None]:
# importing the new dataset (the rows that need a prediction)
new_df = pd.read_csv('/kaggle/input/predicta-1-0-predict-the-unpredictable/submission_key.csv')

# converting the date column to datetime and splitting it into the same
# day / month / year parts that were built for the training data
new_df['date'] = pd.to_datetime(new_df['date'])

new_df['day'] = new_df['date'].dt.day
new_df['month'] = new_df['date'].dt.month
new_df['year'] = new_df['date'].dt.year

new_df = new_df.drop('date', axis=1)

# encoding the categorical variables
# Reuse the frequency map computed from the TRAINING data. Counting rows in the
# submission file instead would give each city a different number than the one
# the scaler and the model were fitted on, making the city_id feature meaningless.
new_df_freqency_map = weather_df_freqency_map

# frequency encoding
# replacing the city IDs with their training-set frequency; a city that never
# appeared in training has no entry in the map and is encoded as 0 occurrences
new_df.city_id = new_df.city_id.map(new_df_freqency_map).fillna(0)
new_df[:100]

In [None]:
# predicting the target
# list of all features used in the original training, in training column order
original_features = ['city_id', 'min_temp_c', 'max_temp_c',
                     'precipitation_mm', 'avg_wind_dir_deg', 
                     'avg_wind_speed_kmh', 'day', 'month', 
                     'year', 'min_temp_c_missing', 'max_temp_c_missing', 
                     'precipitation_mm_missing', 'avg_wind_dir_deg_missing',
                     'avg_wind_speed_kmh_missing']

# Features available in the new dataset
new_features = ['city_id', 'day', 'month', 'year']

# Add missing features with default values (zeros) to the new dataset
# NOTE(review): the zeros are written before scaling, i.e. in raw units
# (0 degrees, 0 mm, ...) — confirm this placeholder is acceptable.
for feature in original_features:
    if feature not in new_features:
        new_df[feature] = 0

# Keep exactly the training features, in the training order. new_df also holds
# submission_ID and has its columns in a different order, so passing it whole
# would hand the scaler an extra column and misplaced features.
X_new_frame = new_df[original_features]

# transform the new dataset using the previously fitted scaler
X_new = scaler.transform(X_new_frame)

# make predictions
new_predictions = model.predict(X_new)

# display the first 10 predictions
new_predictions[:10]

In [None]:
# read the submission key file (same rows, same order as the frame that was predicted on)
submission_key = pd.read_csv('/kaggle/input/predicta-1-0-predict-the-unpredictable/submission_key.csv')

# add the predicted average temperature; the model has a single output unit,
# so the (n, 1) prediction array is flattened to one value per row first
submission_key['avg_temp_c'] = new_predictions.flatten()

# select only the ID and the predicted avg_temp_c columns
submission_key_final = submission_key[['submission_ID', 'avg_temp_c']]
print(submission_key_final[:20])

# save to a new CSV file
submission_key_final.to_csv('submission_avg_temp_predictions.csv', index=False)

# display the saved file for download
from IPython.display import FileLink

# provide a link to download the file
FileLink('submission_avg_temp_predictions.csv')