 # Supervised Link Prediction with the Armed Conflict Location Event Database
 ## Modelling Notebook

In [1]:
import pandas as pd
import numpy as np
from joblib import dump, load
from sklearn.utils import resample
from itertools import product
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import confusion_matrix, recall_score, f1_score

 # Read in training data
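 Load the feature table produced by the feature engineering phase. It covers every period, so it is split into training and test sets further down.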

In [2]:
# Workflow switch read by the model-training cell: 'train' fits a new forest
# and saves it to 'model.joblib', 'read' loads the previously saved model.
mode = 'read'
# Full feature table; it is partitioned into train and test sets by period below.
train_df = pd.read_parquet('df.parquet.gzip')

 ## Define the time periods of interest

In [3]:
# Monthly period labels "<year>-<month>" (month not zero-padded), ordered
# chronologically from 1997-1 through 2018-12.
periods = [
    str(year) + "-" + str(month)
    for year in range(1997, 2019)
    for month in range(1, 13)
]

 ## Set the test/train split
 1998-1 - 2017-12 as Train <br>
 2018-1 - 2018-12 as Test

In [4]:
# `periods` holds 12 monthly labels per year starting at 1997-1, so offset 12
# is 1998-1 and offset 252 is 2018-1.
training_range, testing_range = periods[12:252], periods[252:]
# Boolean row masks selecting each partition of the full feature table.
train_flag = train_df["period"].isin(training_range)
test_flag = train_df["period"].isin(testing_range)

In [5]:
print('Creating Test/Train Splits')
# Apply the period masks to carve the table into its two partitions.
train, test = train_df.loc[train_flag], train_df.loc[test_flag]

Creating Test/Train Splits


In [6]:
print('Splitting Data')


def _split_features_target(frame):
    """Index ``frame`` by (agent1, agent2, period) and separate features from the label.

    Returns an ``(X, y)`` pair: ``X`` holds every column except ``target`` and
    ``y`` is the ``target`` column; both carry the same three-level index.
    """
    indexed = frame.set_index(["agent1", "agent2", "period"])
    return indexed.drop('target', axis=1), indexed.loc[:, "target"]


X_train, y_train = _split_features_target(train)
X_test, y_test = _split_features_target(test)
print('Complete')

Splitting Data
Complete


 # Model Training
 Here we shall be using a Balanced Random Forest from the imbalanced-learn package.
 A regular random forest draws a random bootstrap sample of the data when constructing each tree, but
 a balanced random forest additionally under-samples the majority class in each bootstrap sample so that every tree is trained on balanced classes.

In [7]:
# Either fit a fresh forest and persist it, or reuse a previously saved one,
# depending on the `mode` switch set when the data was loaded.
if mode == 'train':
    rf = BalancedRandomForestClassifier(n_jobs=5, verbose=10,
                                        n_estimators=150, random_state=123)
    rf.fit(X_train, y_train)
    # Persist the fitted model so later runs can use mode = 'read'.
    dump(rf, 'model.joblib')
elif mode == 'read':
    # NOTE: joblib.load unpickles arbitrary objects; only load model files
    # that you produced yourself or otherwise trust.
    rf = load('model.joblib')
else:
    # Fail here rather than with a confusing NameError on `rf` further down.
    raise ValueError(
        "Unknown mode {!r}; expected 'train' or 'read'".format(mode))

 # Model performance
 This is an extremely imbalanced dataset, so accuracy should not be used as the measure of
 overall performance.
 Accuracy does not tell us enough about false positives and false negatives separately.
 We therefore choose recall as the metric of interest. This is because (for example) a
 UN agent would be more concerned about false negatives than false positives. People could
 die if you get false negatives!

 # Performance on Training data

In [8]:
# Predict class labels for every row of the training split.
y_train_pred = rf.predict(X_train)

[Parallel(n_jobs=5)]: Using backend ThreadingBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   3 tasks      | elapsed:    2.8s
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:    6.1s
[Parallel(n_jobs=5)]: Done  15 tasks      | elapsed:    9.2s
[Parallel(n_jobs=5)]: Done  22 tasks      | elapsed:   15.1s
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:   21.1s
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:   26.0s
[Parallel(n_jobs=5)]: Done  51 tasks      | elapsed:   36.2s
[Parallel(n_jobs=5)]: Done  62 tasks      | elapsed:   42.8s
[Parallel(n_jobs=5)]: Done  75 tasks      | elapsed:   49.5s
[Parallel(n_jobs=5)]: Done  88 tasks      | elapsed:   58.2s
[Parallel(n_jobs=5)]: Done 103 tasks      | elapsed:  1.1min
[Parallel(n_jobs=5)]: Done 118 tasks      | elapsed:  1.3min
[Parallel(n_jobs=5)]: Done 135 tasks      | elapsed:  1.5min
[Parallel(n_jobs=5)]: Done 150 out of 150 | elapsed:  1.6min finished


 #### Confusion Matrix

In [9]:
# Rows are the true classes and columns the predicted classes
# (scikit-learn convention), so the bottom-left cell counts false negatives.
print(confusion_matrix(y_train, y_train_pred))

[[17809097   667796]
 [     942     8885]]


In [10]:
# Report each headline metric for the training split, one per line.
for label, metric in (("Recall Score (TRAIN): ", recall_score),
                      ("F1 (TRAIN): ", f1_score)):
    print(label + str(metric(y_train, y_train_pred)))

Recall Score (TRAIN): 0.9041416505545945
F1 (TRAIN): 0.025884621883503177


 # Performance on Testing data

In [11]:
# Predict class labels for every row of the held-out test split.
y_test_pred = rf.predict(X_test)

[Parallel(n_jobs=5)]: Using backend ThreadingBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   3 tasks      | elapsed:    0.1s
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:    0.3s
[Parallel(n_jobs=5)]: Done  15 tasks      | elapsed:    0.4s
[Parallel(n_jobs=5)]: Done  22 tasks      | elapsed:    0.6s
[Parallel(n_jobs=5)]: Done  31 tasks      | elapsed:    0.8s
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:    1.0s
[Parallel(n_jobs=5)]: Done  51 tasks      | elapsed:    1.3s
[Parallel(n_jobs=5)]: Done  62 tasks      | elapsed:    1.6s
[Parallel(n_jobs=5)]: Done  75 tasks      | elapsed:    1.9s
[Parallel(n_jobs=5)]: Done  88 tasks      | elapsed:    2.2s
[Parallel(n_jobs=5)]: Done 103 tasks      | elapsed:    2.6s
[Parallel(n_jobs=5)]: Done 118 tasks      | elapsed:    3.0s
[Parallel(n_jobs=5)]: Done 135 tasks      | elapsed:    3.4s
[Parallel(n_jobs=5)]: Done 150 out of 150 | elapsed:    3.7s finished


 #### Confusion Matrix

In [12]:
# Rows are the true classes and columns the predicted classes
# (scikit-learn convention), so the bottom-left cell counts false negatives.
print(confusion_matrix(y_test, y_test_pred))

[[834243  89062]
 [    78    953]]


In [13]:
# Report each headline metric for the test split, one per line.
for label, metric in (("Recall Score (TEST): ", recall_score),
                      ("F1 (TEST): ", f1_score)):
    print(label + str(metric(y_test, y_test_pred)))

Recall Score (TEST): 0.9243452958292919
F1 (TEST): 0.02093447268413769


 # Feature Importances

In [14]:
importances = rf.feature_importances_
# Positions of the features ordered from most to least important.
indices = np.argsort(importances)[::-1]
cols = list(X_train.columns)
# Print the feature ranking, numbering from 1.
print("Feature ranking:")
for rank, idx in enumerate(indices, start=1):
    print(str(rank) + '. Feature: ' + cols[idx] + ' ' + str(importances[idx]))

Feature ranking:
1. Feature: pref_attachment_1periods_prev 0.14901047971698075
2. Feature: pref_attachment_2periods_prev 0.10556877419792446
3. Feature: pref_attachment_5periods_prev 0.0948233891897029
4. Feature: pref_attachment_3periods_prev 0.08363538078858798
5. Feature: pref_attachment_7periods_prev 0.06590819953750122
6. Feature: pref_attachment_6periods_prev 0.05290802920080257
7. Feature: pref_attachment_4periods_prev 0.04488638263978326
8. Feature: pref_attachment_8periods_prev 0.0388324366070451
9. Feature: pref_attachment_12periods_prev 0.03707358381494876
10. Feature: pref_attachment_9periods_prev 0.034500763616727645
11. Feature: jaccard_coef_1periods_prev 0.027900475429689025
12. Feature: pref_attachment_11periods_prev 0.026411669369061325
13. Feature: pref_attachment_10periods_prev 0.025786304400292368
14. Feature: resource_alloc_com_2periods_prev 0.023037936758244424
15. Feature: resource_alloc_com_1periods_prev 0.022813145328699116
16. Feature: jaccard_coef_2periods_pr