In [None]:
# Load libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
#from imblearn.over_sampling import RandomOverSampler # we can use this if we have class imbalances 

In [None]:
# Import data
# Read the CSV at the relative path 'data/clean_df.csv' into `df`;
# every later cell in this notebook works off this frame.
df = pd.read_csv('data/clean_df.csv')

In [None]:
# Display the first rows of the loaded frame as a quick sanity check.
df.head()

In [None]:
# Promote the 'date' column to the row index, then parse the index values
# as datetimes so the frame is indexed by timestamp.
df = df.set_index('date')
df.index = pd.to_datetime(df.index)

In [None]:
# Encode the target: every distinct 'governorate' value is mapped to an
# integer category code, stored in a new 'label' column.
df['label'] = df['governorate'].astype('category').cat.codes

In [None]:
# One-hot encode the categorical feature columns; the original columns are
# replaced by their indicator columns in `df`.
df = pd.get_dummies(
    data=df,
    columns=['sub_category', 'confirmed_time'],
)

In [None]:
# The random forest model was very overfit, so a few features are removed.
# The target ('governorate') and its encoded form ('label') are dropped here
# as well, so they cannot end up in the feature matrix.
X = df.drop(
    labels=['area', 'target', 'governorate', 'district',
            'incident_id', 'time_of_day', 'main_category', 'label'],
    axis='columns',
)

In [None]:
# Target vector: the integer category codes derived from 'governorate'.
y = df['label']

In [None]:
# Grid search over the Random Forest Classifier's hyper-parameters
# (5-fold cross-validation, all CPU cores).
#
# 'auto' is intentionally not listed under max_features: for classifiers it
# was an alias of 'sqrt' (so it only duplicated those fits), and it is
# rejected by scikit-learn >= 1.3.
#
# NOTE(review): the search is fit on the full X / y, i.e. before the
# train/test split made further down, so the held-out rows take part in
# model selection. Consider fitting on the training split only.
param_dict = {
    'n_estimators': [500, 800, 1000],
    'max_depth': [2, 4, 5, 6, 7, 8, 9, 10],
    'max_features': ['sqrt', 'log2'],
    'criterion': ['gini', 'entropy'],
}
grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid=param_dict,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X, y)

In [None]:
# Baseline accuracy: the share of rows in the most frequent class, i.e. the
# score obtained by always predicting the majority governorate.
# (`1 - y.mean()` is only a class share for a 0/1 target; `y` holds integer
# category codes, so their mean is not a class proportion.)
y.value_counts(normalize=True).max()

In [None]:
# Best mean cross-validated score found by the grid search.
grid.best_score_

In [None]:
# Hyper-parameter combination that achieved the best cross-validated score.
grid.best_params_

In [None]:
# Hold out a test set; the fixed random_state makes the split repeatable.
# No test_size/stratify is passed, so the library defaults apply.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Standardise the features. The scaler learns its statistics from the
# training rows only and the same transform is then applied to both splits,
# so nothing from the test rows leaks into the scaling.
ss = StandardScaler().fit(X_train)
X_train_ss = ss.transform(X_train)
X_test_ss = ss.transform(X_test)

In [None]:
# Final forest with fixed hyper-parameters, fit on the scaled training split.
# (presumably the values reported by the grid search above - confirm)
#
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself is
# rejected by scikit-learn >= 1.3. random_state is pinned (same seed as the
# grid-search estimator) so scores and feature importances are reproducible.
rfc = RandomForestClassifier(
    n_estimators=800,
    criterion='entropy',
    max_depth=10,
    max_features='sqrt',
    random_state=42,
)
rfc.fit(X_train_ss, y_train)

In [None]:
# Accuracy on the training split (compare with the test score to gauge overfitting).
rfc.score(X_train_ss, y_train)

In [None]:
# Accuracy on the held-out test split.
rfc.score(X_test_ss, y_test)

In [None]:
# Predicted class codes for the test rows; used by the classification report below.
y_pred = rfc.predict(X_test_ss)

In [None]:
# Row count per governorate name (class balance, human-readable).
df['governorate'].value_counts()

In [None]:
# Row count per encoded label; read alongside the governorate counts above
# to match integer codes to governorate names.
y.value_counts()

In [None]:
# Per-class precision / recall / F1 on the test split.
# print() is needed here because the report is a pre-formatted text table.
print(classification_report(y_test, y_pred))

In [None]:
# Rank the features by the fitted forest's importance scores, highest first.
importances = rfc.feature_importances_
print("Feature ranking:")
# argsort is ascending, so reverse it to get a descending order of positions.
indices = np.argsort(importances)[::-1]
for rank, col_pos in enumerate(indices, start=1):
    # The scaled arrays keep X's column order, so a position maps straight to X.columns.
    print("%d. feature %d (%f)" % (rank, col_pos, importances[col_pos]), X.columns[col_pos])