Imports

In [None]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_curve
from sklearn.metrics import f1_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error


%matplotlib inline

In [None]:
from IPython.core.interactiveshell import InteractiveShell

# Show the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

Loading the training dataset

In [None]:
# Location of the training CSV. Set the TRAINING_DATA_CSV environment variable
# to point at the file on another machine; when it is unset the original
# local path is used, so existing runs behave exactly as before.
TRAINING_DATA_CSV = os.environ.get(
    'TRAINING_DATA_CSV',
    r'C:\Users\gonca\OneDrive\Ambiente de Trabalho\MEI\1sem\DAA\Pratical Assigment\training_data.csv',
)

train_df = pd.read_csv(TRAINING_DATA_CSV)

In [None]:
# First rows of the training frame.
train_df.head()

# Summary statistics as computed by DataFrame.describe().
train_df.describe()

# Per-column summary printed by DataFrame.info().
train_df.info()

In [None]:
# Show the frequency table of every column. display() is called explicitly so
# the tables are rendered independently of the shell-wide
# ast_node_interactivity setting.
for column in train_df.columns:
    display(train_df[column].value_counts())

In [None]:
# Side-by-side view of the two delay-related columns.
train_df.loc[:, ['magnitude_of_delay', 'delay_in_seconds']]

In [None]:
# Pairwise correlation of the numeric columns, drawn as a heatmap.
numeric_corr = train_df.corr(numeric_only=True)
sns.heatmap(numeric_corr)

In [None]:
# Distribution of avg_rain with a kernel density estimate overlaid.
sns.displot(data=train_df['avg_rain'], kde=True)

In [None]:
# Plain histogram of avg_rain.
sns.histplot(data=train_df['avg_rain'])

In [None]:
# Non-null counts of every remaining column per (avg_rain, incidents) pair.
rain_incident_groups = train_df.groupby(['avg_rain', 'incidents'])
rain_incident_groups.count()

### Data preparation

In [None]:
# Feature frame: the training data without the listed columns
# ('incidents' is used as the target in a later cell).
X = train_df.drop(
    columns=[
        'city_name',
        'magnitude_of_delay',
        'affected_roads',
        'luminosity',
        'avg_rain',
        'incidents',
    ]
)

In [None]:
from sklearn import preprocessing

# Three-bin discretizer with a fixed seed.
# NOTE(review): `estimator` is not fitted or applied in any of the visible cells.
estimator = preprocessing.KBinsDiscretizer(n_bins=3, random_state=2021)

# Disabled alternative: three quantile bins of delay_in_seconds via pd.qcut.
#X['delay_in_seconds_binned'] = pd.qcut(X['delay_in_seconds'], q=3)

In [None]:
# Turn the record_date column into hour and weekday features, then discard
# the original column.
record_date = pd.DatetimeIndex(X['record_date'])

X = X.assign(
    record_date_hour=record_date.hour,
    record_date_weekday=record_date.weekday,
).drop(columns=['record_date'])

X

In [None]:
# Target vector: the 'incidents' column of the training frame.
y = train_df.loc[:, 'incidents']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)

------

In [None]:
# Create bins (experimental) -> avg temperature
# All labels are written to the single 'temperature' column and the ranges do
# not overlap: LOW = (0, 10), MEAN = [10, 19], HIGH = (19, 35].
# Rows whose avg_temperature falls outside these ranges keep a missing value.
train_df.loc[train_df['avg_temperature'].between(0, 10, 'neither'), 'temperature'] = 'LOW'
train_df.loc[train_df['avg_temperature'].between(10, 19, 'both'), 'temperature'] = 'MEAN'
train_df.loc[train_df['avg_temperature'].between(19, 35, 'right'), 'temperature'] = 'HIGH'

In [None]:
# Drop (initially) -> avg humidity
# Drop -> avg precipitation
# DataFrame.drop returns a new frame, so the result is assigned back;
# errors='ignore' keeps the cell safe to re-run once the columns are gone.
train_df = train_df.drop(columns=['avg_humidity', 'avg_precipitation'], errors='ignore')

In [None]:
sns.displot(train_df['avg_wind_speed'])

train_df['avg_wind_speed'].value_counts()

# Remove outliers -> avg wind speed
# outliers = train_df["avg_wind_speed"].quantile(8.0)

# Boolean indexing returns a new frame; assign it back so the rows with
# avg_wind_speed >= 8.0 are really removed instead of only being displayed.
train_df = train_df[train_df["avg_wind_speed"] < 8.0]

### Model Fitting

In [None]:
# Decision tree with a fixed seed so repeated runs give the same model.
clf = DecisionTreeClassifier(random_state=2021)

clf.fit(X_train, y_train)

# 5-fold cross-validation on the full feature/target data; cross_val_score
# fits clones of the estimator, so `clf` itself stays fitted on X_train only.
scores = cross_val_score(clf, X, y, cv=5)

# Report the fold scores, which were previously computed but never shown.
scores.mean(), scores.std()

In [None]:
# Predicted labels for the held-out split.
predictions = clf.predict(X=X_test)

In [None]:
# Fix the class order explicitly so the matrix rows/columns and the axis tick
# labels are guaranteed to refer to the same classes.
class_labels = clf.classes_
cm = confusion_matrix(y_test, predictions, labels=class_labels)
# scikit-learn layout: rows are the true classes, columns the predicted ones
# (for a binary problem this reads [[TN, FP], [FN, TP]]).
disp = ConfusionMatrixDisplay(cm, display_labels=class_labels)

disp.plot()

In [None]:
accuracy_score(y_test, predictions)
# share of correctly classified samples

# For single-label targets the micro average of precision and of recall is
# the same number as accuracy, so it adds no information. The macro average
# (unweighted mean of the per-class scores) is reported instead.
precision_score(y_test, predictions, average='macro')
# per class: TP / (TP + FP)

recall_score(y_test, predictions, average='macro')
# per class: TP / (TP + FN)

In [None]:
# Show the predicted labels.
predictions

# Write the predictions (with the default integer index) to submission.csv.
# NOTE(review): `predictions` comes from clf.predict(X_test), i.e. the 20%
# hold-out of the training data — confirm this is what should be submitted.
pd.DataFrame(data=predictions).to_csv(path_or_buf="submission.csv")