In [19]:
# Load libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
#from imblearn.over_sampling import RandomOverSampler # we can use this if we have class imbalances 

In [20]:
# Import data
df = pd.read_csv('../data/clean_df.csv')

In [21]:
df.head()

Unnamed: 0,date,incident_id,governorate,district,area,target,main_category,sub_category,min_air_raids,max_air_raids,civilian_casualties,fatalities,woman_fatalities,child_fatalities,injured,woman_injured,child_injured,confirmed_time,time_of_day
0,2015-03-26,1,Sanaa,bani al-harith,Al-Rahabah,Al-Daylami Airbase,military_security_target,military site,1,2.0,0,0,0,0,0,0,0.0,2,morning
1,2015-03-26,2,Sanaa,bani al-harith,Airport,Sana'a International Airport,infrastructure,transport,1,2.0,0,0,0,0,0,0,0.0,2,morning
2,2015-03-26,3,Sanaa,bani al-harith,Al-Sonblah Neighbourhood,Residential Area,civilian,residential area,1,2.0,29,21,3,14,8,0,7.0,2,morning
3,2015-03-26,4,Sanaa,al-sab'ein,Al-Nahdain,Presidential Palace,military_security_target,military site,1,2.0,0,0,0,0,0,0,0.0,3,morning
4,2015-03-26,5,Sanaa,al-thawrah,Al-Nahdhah,Former 1st Armoured Division,military_security_target,military site,1,2.0,0,0,0,0,0,0,0.0,3,morning


In [22]:
df.set_index('date', inplace=True)
df.index = pd.to_datetime(df.index)

In [23]:
# Create new column converting our target ('governorate') into numbers 
df['label'] = pd.Categorical(df['governorate'])
df['label'] = df['label'].cat.codes

In [24]:
# Get dummy columns for our features that are categorical

df = pd.get_dummies(df, columns=['sub_category','confirmed_time'], drop_first=True)

In [25]:
# The Random forrest model was very overfit so will remove a few features 
X = df.drop(columns=['area', 'target', 'governorate','district',
                     'incident_id','time_of_day','main_category','label'])

In [26]:
y = df['label']
class_list = df['label'].sort_values(ascending=True)
weight = 1
class_dict = {label: weight for label in class_list}
class_dict
class_weight = [{0:1,1:1} for label in df['label'].unique()]
class_weight

[{0: 1, 1: 1},
 {0: 1, 1: 1},
 {0: 1, 1: 1},
 {0: 1, 1: 1},
 {0: 1, 1: 1},
 {0: 1, 1: 1},
 {0: 1, 1: 1},
 {0: 1, 1: 1},
 {0: 1, 1: 1},
 {0: 1, 1: 1},
 {0: 1, 1: 1},
 {0: 1, 1: 1},
 {0: 1, 1: 1},
 {0: 1, 1: 1},
 {0: 1, 1: 1},
 {0: 1, 1: 1},
 {0: 1, 1: 1},
 {0: 1, 1: 1},
 {0: 1, 1: 1},
 {0: 1, 1: 1},
 {0: 1, 1: 1},
 {0: 1, 1: 1}]

In [41]:
#grid searching our Random Forest Classifier.
param_dict = {
    'n_estimators': np.random.randint(100,500,3),
    'max_depth' : [2,4,5,6,7,8,9,10],
    'max_features' : ['auto','sqrt','log2'],
    'criterion' : ['gini','entropy'],

}
grid = GridSearchCV(RandomForestClassifier(random_state= 42), param_grid= param_dict, cv = 5, n_jobs= -1, verbose= 1)
grid.fit(X, y)

Fitting 5 folds for each of 144 candidates, totalling 720 fits




KeyboardInterrupt: 

In [28]:
1- y.mean()

-12.692817433844786

In [29]:
grid.best_score_

0.30104514120524795

In [30]:
grid.best_params_

{'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'auto',
 'n_estimators': 294}

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [32]:
ss = StandardScaler()
X_train_ss = ss.fit_transform(X_train)
X_test_ss = ss.transform(X_test)

In [33]:
rfc = RandomForestClassifier(n_estimators = 800, criterion= 'entropy', max_depth= 10, max_features= 'auto')
rfc.fit(X_train_ss, y_train)

RandomForestClassifier(criterion='entropy', max_depth=10, n_estimators=800)

In [34]:
rfc.score(X_train_ss, y_train)

0.3515388720868173

In [35]:
rfc.score(X_test_ss, y_test)

0.32017075773745995

In [36]:
y_pred = rfc.predict(X_test_ss)

In [37]:
df['governorate'].value_counts()

Saada        5199
Sanaa        3986
Taiz         2661
Hajja        2429
Marib        2248
Hudaydah     1811
Jawf         1284
Bayda         576
Amran         451
Lahj          395
Shabwa        320
Aden          293
Ibb           266
Dhalie        200
Dhamar        183
Abyan          85
Mahwit         72
Raymah         12
Hadramawt       8
Maharah         2
lahj            2
Marib           2
Name: governorate, dtype: int64

In [38]:
y.value_counts()

17    5199
18    3986
20    2661
7     2429
14    2248
8     1811
10    1284
3      576
2      451
11     395
19     320
1      293
9      266
4      200
5      183
0       85
13      72
16      12
6        8
21       2
12       2
15       2
Name: label, dtype: int64

In [39]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        24
           1       0.00      0.00      0.00        61
           2       0.00      0.00      0.00       116
           3       0.00      0.00      0.00       158
           4       0.00      0.00      0.00        47
           5       0.00      0.00      0.00        49
           6       0.00      0.00      0.00         3
           7       0.46      0.24      0.31       595
           8       0.57      0.02      0.03       479
           9       0.00      0.00      0.00        66
          10       0.00      0.00      0.00       310
          11       0.00      0.00      0.00        96
          12       0.00      0.00      0.00         1
          13       0.00      0.00      0.00        14
          14       0.36      0.04      0.08       554
          16       0.00      0.00      0.00         2
          17       0.28      0.84      0.42      1266
          18       0.41    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
importances = rfc.feature_importances_
print("Feature ranking:")
indices = np.argsort(importances)[::-1]
for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]), X.columns[indices[f]])

Feature ranking:
1. feature 1 (0.207664) max_air_raids
2. feature 30 (0.138344) sub_category_military site
3. feature 52 (0.106939) sub_category_unknown
4. feature 18 (0.062744) sub_category_forces
5. feature 79 (0.057570) confirmed_time_Unknown
6. feature 2 (0.040675) civilian_casualties
7. feature 43 (0.034294) sub_category_residential area
8. feature 6 (0.033074) injured
9. feature 3 (0.030080) fatalities
10. feature 48 (0.021274) sub_category_transport
11. feature 64 (0.018149) confirmed_time_18
12. feature 32 (0.015904) sub_category_moving target (weapons/fighters)
13. feature 15 (0.015656) sub_category_farms
14. feature 19 (0.011536) sub_category_gov. compounds
15. feature 70 (0.010709) confirmed_time_23
16. feature 71 (0.010616) confirmed_time_24
17. feature 65 (0.010216) confirmed_time_19
18. feature 44 (0.008759) sub_category_school
19. feature 5 (0.008264) child_fatalities
20. feature 62 (0.008156) confirmed_time_16
21. feature 63 (0.007267) confirmed_time_17
22. feature 69 (