In [259]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix
import numpy as np
import pandas as pd
import csv

# Model parameters
reg = 1000
max_depth = 5

# Data Setup

In [260]:
all_data = dict()
all_decades = [1, 2, 3, 4, 5]

for decade in all_decades:
    for ver in ["tr", "test"]:
        filename = f"{decade}_{ver}"
        data = pd.read_csv(f"decade{filename}.csv")
        all_data[filename] = dict()
        all_data[filename]["X"] = data.drop(columns=["warstds"])
        all_data[filename]["y"] = data["warstds"]

# Baseline: train on all decades, predict on each decade individually

In [261]:
# Train on all decades
log_reg = LogisticRegression(C=reg, class_weight="balanced")
rdm_for = RandomForestClassifier(max_depth=max_depth, class_weight="balanced")
for decade in all_decades:
    train_X = all_data[f"{decade}_test"]["X"]
    train_y = all_data[f"{decade}_test"]["y"]
    log_reg.fit(train_X, train_y)
    rdm_for.fit(train_X, train_y)

# Test on each decade individually
for decade in all_decades:
    test_X = all_data[f"{decade}_test"]["X"]
    test_y = all_data[f"{decade}_test"]["y"]
    log_reg_score = f1_score(test_y, log_reg.predict(test_X))
    rdm_for_score = f1_score(test_y, rdm_for.predict(test_X))
    print("Decade {2} -- Test Scores for Logistic Regression: {0}, Random Forest: {1}".format(log_reg_score, rdm_for_score, decade))

Decade 1 -- Test Scores for Logistic Regression: 0.1678321678321678, Random Forest: 0.625
Decade 2 -- Test Scores for Logistic Regression: 0.05757575757575757, Random Forest: 0.4
Decade 3 -- Test Scores for Logistic Regression: 0.0437375745526839, Random Forest: 0.4117647058823529
Decade 4 -- Test Scores for Logistic Regression: 0.0359612724757953, Random Forest: 0.2608695652173913
Decade 5 -- Test Scores for Logistic Regression: 0.0425531914893617, Random Forest: 0.9473684210526316


# Exp 1: Predict the past. Train on 1960-2000, test on 1945-1959

In [262]:
train_X = all_data["1_tr"]["X"]
train_y = all_data["1_tr"]["y"]

test_X = all_data["1_test"]["X"]
test_y = all_data["1_test"]["y"]

log_reg = LogisticRegression(C=reg, class_weight="balanced").fit(train_X, train_y)
log_reg_score = f1_score(test_y, log_reg.predict(test_X))
rdm_for = RandomForestClassifier(max_depth=max_depth, class_weight="balanced").fit(train_X, train_y)
rdm_for_score = f1_score(test_y, rdm_for.predict(test_X))
print("Test Scores for Logistic Regression: {0}, Random Forest: {1}\n".format(log_reg_score, rdm_for_score))

Test Scores for Logistic Regression: 0.15286624203821658, Random Forest: 0.5573770491803278



# Exp 2: Predicting in-between timeframes.
- Train on 1945-1959 and 1970-2000, test on 1960-1969
- Train on 1945-1969 and 1980-2000, test on 1970-1979
- Train on 1945-1979 and 1990-2000, test on 1980-1989

In [263]:
for i in [2, 3, 4]:
    train_X = all_data[f"{i}_tr"]["X"]
    train_y = all_data[f"{i}_tr"]["y"]

    test_X = all_data[f"{i}_test"]["X"]
    test_y = all_data[f"{i}_test"]["y"]

    log_reg = LogisticRegression(C=reg, class_weight="balanced").fit(train_X, train_y)
    log_reg_score = f1_score(test_y, log_reg.predict(test_X))
    rdm_for = RandomForestClassifier(max_depth=max_depth, class_weight="balanced").fit(train_X, train_y)
    rdm_for_score = f1_score(test_y, rdm_for.predict(test_X))
    print("Decade {2} -- Test Scores for Logistic Regression: {0}, Random Forest: {1}".format(log_reg_score, rdm_for_score, i))

Decade 2 -- Test Scores for Logistic Regression: 0.042313117066290554, Random Forest: 0.7272727272727272
Decade 3 -- Test Scores for Logistic Regression: 0.04341534008683068, Random Forest: 0.7272727272727272
Decade 4 -- Test Scores for Logistic Regression: 0.03716216216216216, Random Forest: 0.5454545454545455


# Exp 3: Test only on future events

In [264]:
train_X = all_data["5_tr"]["X"]
train_y = all_data["5_tr"]["y"]

test_X = all_data["5_test"]["X"]
test_y = all_data["5_test"]["y"]

log_reg = LogisticRegression(C=reg, class_weight="balanced").fit(train_X, train_y)
log_reg_score = f1_score(test_y, log_reg.predict(test_X))
rdm_for = RandomForestClassifier(max_depth=max_depth, class_weight="balanced").fit(train_X, train_y)
rdm_for_score = f1_score(test_y, rdm_for.predict(test_X))
print("Test Scores for Logistic Regression: {0}, Random Forest: {1}\n".format(log_reg_score, rdm_for_score))

Test Scores for Logistic Regression: 0.06607929515418502, Random Forest: 0.5245901639344261

