In [289]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import csv

# Model parameters
reg = 1000
max_depth = 5

# Data Setup

In [282]:
datalist = list()
all_decades = [1, 2, 3, 4, 5]

# Get data for each decade
for decade in all_decades:
    df = pd.read_csv(f"decade{decade}.csv")
    datalist.append({
        "X": df.drop(columns=["warstds"]),
        "y": df["warstds"]
    })

all_data = dict()

# Get train-test splits for each decade
for decade, data in enumerate(datalist):
    X_train, X_test, y_train, y_test = train_test_split(data["X"], data["y"], test_size=0.3, stratify=data["y"])
    all_data[f"{decade+1}_tr"] = {
        "X": X_train,
        "y": y_train
    }
    all_data[f"{decade+1}_test"] = {
        "X": X_train,
        "y": y_train
    }

In [283]:
all_data.keys()

dict_keys(['1_tr', '1_test', '2_tr', '2_test', '3_tr', '3_test', '4_tr', '4_test', '5_tr', '5_test'])

# Baseline: train on all decades, predict on each decade individually

In [284]:
# Train on all decade training sets
log_reg = LogisticRegression(C=reg, class_weight="balanced")
rdm_for = RandomForestClassifier(max_depth=max_depth, class_weight="balanced")
for decade in all_decades:
    train_X = all_data[f"{decade}_tr"]["X"]
    train_y = all_data[f"{decade}_tr"]["y"]
    log_reg.fit(train_X, train_y)
    rdm_for.fit(train_X, train_y)

# Test on each decade test set individually
for decade in all_decades:
    test_X = all_data[f"{decade}_test"]["X"]
    test_y = all_data[f"{decade}_test"]["y"]
    log_reg_score = f1_score(test_y, log_reg.predict(test_X))
    rdm_for_score = f1_score(test_y, rdm_for.predict(test_X))
    print("Decade {2} -- Test Scores for Logistic Regression: {0}, Random Forest: {1}".format(log_reg_score, rdm_for_score, decade))

Decade 1 -- Test Scores for Logistic Regression: 0.0967741935483871, Random Forest: 0.6
Decade 2 -- Test Scores for Logistic Regression: 0.0404040404040404, Random Forest: 0.1111111111111111
Decade 3 -- Test Scores for Logistic Regression: 0.03805496828752642, Random Forest: 0.1111111111111111
Decade 4 -- Test Scores for Logistic Regression: 0.03544303797468354, Random Forest: 0.13333333333333333
Decade 5 -- Test Scores for Logistic Regression: 0.06626506024096385, Random Forest: 0.9500000000000001


# Exp 1: Predicting the past. Train on 1960-2000, test on 1945-1959

In [294]:
# Train on all training set but the first
train_X = pd.concat([
    all_data["2_tr"]["X"],
    all_data["3_tr"]["X"],
    all_data["4_tr"]["X"],
    all_data["5_tr"]["X"]
])

train_y = pd.concat([
    all_data["2_tr"]["y"],
    all_data["3_tr"]["y"],
    all_data["4_tr"]["y"],
    all_data["5_tr"]["y"]
])

# Test on first test set
test_X = all_data["1_test"]["X"]
test_y = all_data["1_test"]["y"]

log_reg = LogisticRegression(C=reg, class_weight="balanced").fit(train_X, train_y)
rdm_for = RandomForestClassifier(max_depth=max_depth, class_weight="balanced").fit(train_X, train_y)
log_reg_score = log_reg.score(test_X, test_y)
rdm_for_score = rdm_for.score(test_X, test_y)
log_reg_f1score = f1_score(test_y, log_reg.predict(test_X))
rdm_for_f1score = f1_score(test_y, rdm_for.predict(test_X))
log_reg_auc = roc_auc_score(test_y, log_reg.predict_proba(test_X)[:, 1])
rdm_for_auc = roc_auc_score(test_y, rdm_for.predict_proba(test_X)[:, 1])

print(f"Decade 1:")
print("Test on 1, LR: {0}, RF: {1}".format(round(log_reg_score, 5), round(rdm_for_score, 5)))
print("F1 for LR: {0}, RF: {1}".format(round(log_reg_f1score, 4), round(rdm_for_f1score, 4)))
print("AUC for LR: {0}, RF: {1}\n".format(round(log_reg_auc, 4), round(rdm_for_auc, 4)))

Decade 1:
Test on 1, LR: 0.88956, RF: 0.97573
F1 for LR: 0.1651, RF: 0.5238
AUC for LR: 0.8303, RF: 0.9931



# Exp 2: Predicting in-between timeframes.
- Train on 1945-1959 and 1970-2000, test on 1960-1969
- Train on 1945-1969 and 1980-2000, test on 1970-1979
- Train on 1945-1979 and 1990-2000, test on 1980-1989

In [291]:
for i in [2, 3, 4]:
    # Exclude current decade from training set
    concatenate_lists = {"X": list(), "y": list()}
    for j in [2, 3, 4]:
        if i != j:
            concatenate_lists["X"].append(all_data[f"{j}_tr"]["X"])
            concatenate_lists["y"].append(all_data[f"{j}_tr"]["y"])
    train_X = pd.concat(concatenate_lists["X"])
    train_y = pd.concat(concatenate_lists["y"])

    test_X = all_data[f"{i}_test"]["X"]
    test_y = all_data[f"{i}_test"]["y"]

    log_reg = LogisticRegression(C=reg, class_weight="balanced").fit(train_X, train_y)
    rdm_for = RandomForestClassifier(max_depth=max_depth, class_weight="balanced").fit(train_X, train_y)
    log_reg_score = log_reg.score(test_X, test_y)
    rdm_for_score = rdm_for.score(test_X, test_y)
    log_reg_f1score = f1_score(test_y, log_reg.predict(test_X))
    rdm_for_f1score = f1_score(test_y, rdm_for.predict(test_X))
    log_reg_auc = roc_auc_score(test_y, log_reg.predict_proba(test_X)[:, 1])
    rdm_for_auc = roc_auc_score(test_y, rdm_for.predict_proba(test_X)[:, 1])

    print(f"Decade {i}:")
    print("Test on {0}, LR: {1}, RF: {2}".format(i, round(log_reg_score, 5), round(rdm_for_score, 5)))
    print("F1 for LR: {0}, RF: {1}".format(round(log_reg_f1score, 4), round(rdm_for_f1score, 4)))
    print("AUC for LR: {0}, RF: {1}\n".format(round(log_reg_auc, 4), round(rdm_for_auc, 4)))

Decade 2:
Test on 2, LR: 0.35962, RF: 0.98805
F1 for LR: 0.0429, RF: 0.6154
AUC for LR: 0.4977, RF: 0.9437

Decade 3:
Test on 3, LR: 0.2886, RF: 0.98688
F1 for LR: 0.0382, RF: 0.48
AUC for LR: 0.6042, RF: 0.9925

Decade 4:
Test on 4, LR: 0.66066, RF: 0.98957
F1 for LR: 0.0272, RF: 0.4762
AUC for LR: 0.5498, RF: 0.9573



# Exp 3: Predicting the future. Test only on future events

In [295]:
# Train on all training set but the last
train_X = pd.concat([
    all_data["1_tr"]["X"],
    all_data["2_tr"]["X"],
    all_data["3_tr"]["X"],
    all_data["4_tr"]["X"]
])

train_y = pd.concat([
    all_data["1_tr"]["y"],
    all_data["2_tr"]["y"],
    all_data["3_tr"]["y"],
    all_data["4_tr"]["y"]
])

# Test on last test set
test_X = all_data["5_test"]["X"]
test_y = all_data["5_test"]["y"]

log_reg = LogisticRegression(C=reg, class_weight="balanced").fit(train_X, train_y)
rdm_for = RandomForestClassifier(max_depth=max_depth, class_weight="balanced").fit(train_X, train_y)
log_reg_score = log_reg.score(test_X, test_y)
rdm_for_score = rdm_for.score(test_X, test_y)
log_reg_f1score = f1_score(test_y, log_reg.predict(test_X))
rdm_for_f1score = f1_score(test_y, rdm_for.predict(test_X))
log_reg_auc = roc_auc_score(test_y, log_reg.predict_proba(test_X)[:, 1])
rdm_for_auc = roc_auc_score(test_y, rdm_for.predict_proba(test_X)[:, 1])

print(f"Decade 5:")
print("Test on 5, LR: {0}, RF: {1}".format(round(log_reg_score, 5), round(rdm_for_score, 5)))
print("F1 for LR: {0}, RF: {1}".format(round(log_reg_f1score, 4), round(rdm_for_f1score, 4)))
print("AUC for LR: {0}, RF: {1}\n".format(round(log_reg_auc, 4), round(rdm_for_auc, 4)))

Decade 5:
Test on 5, LR: 0.73893, RF: 0.97267
F1 for LR: 0.0348, RF: 0.4314
AUC for LR: 0.6312, RF: 0.9589

