In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import csv
import numpy as np

  from numpy.core.umath_tests import inner1d


### Data Setup

In [9]:
all_continents = ['afr', 'am', 'asia', 'eur', 'me']
all_data = {}

feature_names = ["warstds", "ager", "agexp", "anoc", "army85", "autch98", "auto4",
"autonomy", "avgnabo", "centpol3", "coldwar", "decade1", "decade2",
"decade3", "decade4", "dem", "dem4", "demch98", "dlang", "drel",
"durable", "ef", "ef2", "ehet", "elfo", "elfo2", "etdo4590",
"expgdp", "exrec", "fedpol3", "fuelexp", "gdpgrowth", "geo1", "geo2",
"geo34", "geo57", "geo69", "geo8", "illiteracy", "incumb", "infant",
"inst", "inst3", "life", "lmtnest", "ln_gdpen", "lpopns", "major", "manuexp", "milper",
"mirps0", "mirps1", "mirps2", "mirps3", "nat_war", "ncontig",
"nmgdp", "nmdp4_alt", "numlang", "nwstate", "oil", "p4mchg",
"parcomp", "parreg", "part", "partfree", "plural", "plurrel",
"pol4", "pol4m", "pol4sq", "polch98", "polcomp", "popdense",
"presi", "pri", "proxregc", "ptime", "reg", "regd4_alt", "relfrac", "seceduc",
"second", "semipol3", "sip2", "sxpnew", "sxpsq", "tnatwar", "trade",
"warhist", "xconst"]

for cont in all_continents:
    for ver in ['tr', 'test']:
        data = []
        names = []
        with open('{0}_{1}.csv'.format(cont, ver), newline='') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='|')
            for i, row in enumerate(reader):
                if i == 0:
                    names = row
                else:
                    data.append(row)
        data = np.array(data)
        feature_indices = [names.index(feat) for feat in feature_names[1:]]
        features = data[:, feature_indices]
        features = features.astype(float)
        labels = data[:, names.index('warstds')]
        labels = [int(float(label)) for label in labels]
        all_data['{0}_{1}'.format(cont, ver)] = [features, labels]
        

### Exp 1: Train on all continents, disaggregated test results on each continent

In [12]:
# train on all data
all_train_features = []
all_train_labels = []
for key in all_data.keys():
    if 'tr' in key:
        all_train_features.append(all_data[key][0])
        all_train_labels.append(all_data[key][1])
all_train_features = np.concatenate(all_train_features, axis=0)
all_train_labels = np.concatenate(all_train_labels, axis=0)

log_reg = LogisticRegression().fit(features, labels)
log_reg_score = log_reg.score(features, labels)
rdm_for = RandomForestClassifier(max_depth=5).fit(features, labels)
rdm_for_score = rdm_for.score(features, labels)
print("Training Scores for Logistic Regression: {0}, Random Forest: {1}\n".format(log_reg_score, rdm_for_score))

for key in all_data.keys():
    if 'test' in key:
        test_features = all_data[key][0]
        test_labels = all_data[key][1]
        log_reg_score = log_reg.score(test_features, test_labels)
        rdm_for_score = rdm_for.score(test_features, test_labels)
        print("Test on {0}, LR: {1}, RF: {2}".format(key.split('_')[0], round(log_reg_score, 5), round(rdm_for_score, 5)))

Training Scores for Logistic Regression: 0.981994459833795, Random Forest: 0.9875346260387812

Test on eur, LR: 0.98999, RF: 0.99312
Test on me, LR: 0.98199, RF: 0.98753
Test on afr, LR: 0.61613, RF: 0.964
Test on am, LR: 0.8339, RF: 0.98109
Test on asia, LR: 0.80673, RF: 0.97438


### Exp 2: Train on all continents except for x, and then test on x

In [16]:
for key in all_data.keys():
    if 'test' in key:
        cont = key.split('_')[0]
        
        train_continents = all_continents.copy()
        train_continents.remove(cont)
        all_train_features = []
        all_train_labels = []
        for t_cont in train_continents:
            all_train_features.append(all_data['{0}_tr'.format(t_cont)][0])
            all_train_labels.append(all_data['{0}_tr'.format(t_cont)][1])
        all_train_features = np.concatenate(all_train_features, axis=0)
        all_train_labels = np.concatenate(all_train_labels, axis=0)

        log_reg = LogisticRegression().fit(features, labels)
        rdm_for = RandomForestClassifier(max_depth=5).fit(features, labels)
        
        test_features = all_data[key][0]
        test_labels = all_data[key][1]
        log_reg_score = log_reg.score(test_features, test_labels)
        rdm_for_score = rdm_for.score(test_features, test_labels)
        
        print("Extrap to {0}, LR: {1}, RF: {2}".format(cont, round(log_reg_score, 5), round(rdm_for_score, 5)))

Extrap to eur, LR: 0.99312, RF: 0.98874
Extrap to me, LR: 0.98199, RF: 0.99169
Extrap to afr, LR: 0.60345, RF: 0.90314
Extrap to am, LR: 0.82647, RF: 0.97434
Extrap to asia, LR: 0.80893, RF: 0.94217
