# Baseline model

- Value of Product: Improve health of general public

- Prediction: PM2.5 (concentration of fine particulate matter, in micrograms per cubic meter, µg/m³)

- Hypothesis: Temperature and windspeed are the most important features.

- Model: Linear regression with features ['temp', 'wind_spd']

- Metric: RMSE (the metric recommended by Zindi); baseline score: 43.45

In [None]:
# Data manipulation
import pandas as pd
import numpy as np
import missingno as msno

# Modeling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Fixed seed; passed as random_state to train_test_split so the split is reproducible.
rs = 42

In [None]:
# Load the raw training table (presumably daily means per ID, judging by the file name).
df_main = pd.read_csv('data/train_daily_mean.csv')

In [None]:
# Count of missing values in each column of the raw data.
df_main.isna().sum()

In [None]:
# Plot the missing-value pattern of the raw data; the trailing semicolon hides the cell's return value.
msno.matrix(df_main);

In [None]:
# Keep only the rows that have no missing value in any column.
df_clean = df_main.dropna(axis='index')

In [None]:
# Percentage of raw rows removed by the dropna step, rounded to two decimals.
rows_dropped_percent = np.round(100 - len(df_clean) / len(df_main) * 100, 2)
print(f"Dropped {rows_dropped_percent}% of rows.")

In [None]:
# Re-plot the missing-value pattern after dropping incomplete rows.
msno.matrix(df_clean);

In [None]:
# Flag, per ID, whether exactly 5 non-null 'day' entries are left after cleaning.
# The result is a frame with columns 'ID' and a boolean 'day'.
labels = (
    df_clean
    .groupby('ID')['day']
    .count()
    .eq(5)
    .reset_index()
)

type(labels)

In [None]:
# Print column dtypes and non-null counts of both frames.
df_clean.info()
labels.info()

In [None]:
# Attach the per-ID completeness flag. Both frames carry a 'day' column, so
# pandas suffixes them: 'day_x' is the original column, 'day_y' the boolean flag.
df_clean = df_clean.merge(labels, on='ID')
# Keep only the rows whose ID passed the 5-observation check.
df_clean = df_clean[df_clean['day_y']]

# Cumulative share of raw rows removed so far, rounded to two decimals for a readable report.
rows_dropped_percent = np.round(100 - (df_clean.shape[0] / df_main.shape[0]) * 100, 2)
print(f"Dropped {rows_dropped_percent}% of rows.")

In [None]:
# Re-add the target: read only the 'ID' and 'target' columns from the training file.
targets = pd.read_csv('data/Train.csv', usecols=['ID', 'target'])
targets

In [None]:
# Join the target onto every row by ID; IDs missing from either frame are dropped.
df_clean = pd.merge(df_clean, targets, how='inner', on='ID')

## Baseline model

In [None]:
# Pairwise correlations of every column from the third one onward,
# leaving out the boolean completeness flag 'day_y'.
corrs = df_clean.iloc[:, 2:].drop(columns='day_y').corr()
sns.heatmap(corrs, annot=True, cmap='RdBu');

In [None]:
# NOTE: this list also holds the target column; it is split off from the predictors afterwards.
features = ['temp', 'wind_spd', 'target']
# Average each ID's rows into a single observation and discard the ID index.
df_baseline = df_clean.groupby('ID')[features].mean().reset_index(drop=True)
df_baseline.head(1)

In [None]:
# X is the same object as df_baseline; popping 'target' removes it in place,
# so X ends up holding only the predictor columns.
X = df_baseline
y = X.pop('target')

In [None]:
# Hold out 30% of the observations for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=rs,
)

In [None]:
# Ordinary least-squares baseline, fitted on the training split only.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [None]:
# Predict the target for the held-out test split.
y_pred = lr.predict(X_test)

In [None]:
# Root mean squared error on the test split. The square root is taken explicitly
# because the `squared=False` argument of mean_squared_error is deprecated and
# has been removed in recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE for model {lr}: {np.round(rmse, 2)}")

In [None]:
# Scatter of predictions against the truth with a fitted regression line.
fig = sns.regplot(x=y_pred, y=y_test)
# Title lists the predictor columns actually used for fitting; the `features`
# list also contains 'target' and would mislabel it as a feature.
fig.set_title(f"Linear Regression for target with features {list(X.columns)}")
fig.set_ylabel('Truth')
fig.set_xlabel('Predictions');