In [22]:
import pandas as pd

data = pd.read_csv(r'/Users/garettwilson/Downloads/ged171.csv')

##simple data cleaning, imputing with median and mode for missing data

In [23]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Assume your DataFrame is named `data`

# Step 1: Impute numeric columns with median
numeric_cols = data.select_dtypes(include='number').columns
for col in numeric_cols:
    if data[col].isnull().any():
        median_value = data[col].median()
        data[col].fillna(median_value, inplace=True)

# Step 2: Impute object/categorical columns with mode
categorical_cols = data.select_dtypes(include='object').columns
for col in categorical_cols:
    if data[col].isnull().any():
        mode_value = data[col].mode().iloc[0]
        data[col].fillna(mode_value, inplace=True)

# Step 3: Label encode categorical columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le  # store in case you need to decode later



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(mode_value, inplace=True)


In [24]:
from sklearn.model_selection import train_test_split

X = data.drop("best", axis=1)
y = data["best"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#drop highly correlated columns

In [25]:
import pandas as pd
import numpy as np

# Calculate absolute correlation matrix
corr_matrix = X.corr().abs()

# Create an upper triangle mask to ignore duplicate pairs
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find features with correlation above a threshold (e.g., 0.9)
to_drop = [column for column in upper.columns if any(upper[column] > 0.8)]


print("Highly correlated columns to drop:")
print(to_drop)
X = X.drop(columns=to_drop)

Highly correlated columns to drop:
['dyad_new_id', 'side_a', 'priogrid_gid', 'country_id', 'date_start', 'date_end', 'high']


In [26]:
#lets also drop deaths_side_a, deaths_side_b, civilian_deaths - these directly impact best
X = X.drop(columns=['deaths_a','deaths_b','deaths_civilians','deaths_unknown','low'])

* ReLU (Rectified Linear Unit): f(x) = max(0, x). It keeps positive values and discards negative ones. ReLU is the most commonly used activation in hidden layers due to its simplicity and effectiveness.
* Sigmoid: f(x) = 1 / (1 + e^(-x)). This function maps any input to the range (0, 1), making it useful in binary classification or probability modeling. However, it can lead to vanishing gradients during training.
* Tanh: Similar to sigmoid but maps input to (-1, 1). It's more balanced but still suffers from vanishing gradient issues in deep networks.

In [31]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers

#scaling data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

mod2 = keras.Sequential([
    #input
    layers.Input(shape=(X_train.shape[1],)),
    #layer 1, 64 neurons, passed through a relu activation function - which makes the model not behave like linear regression
    #regularizers add penalties to the loss function.
    #increased kernel regularizers to 0.01, and combined both l1 and l2 regularization (reminds me of elastic net)
    layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l1_l2(l1=0.1, l2=0.1), name='layer1'),
    #a dropout layer randomly "drops" (i.e., sets to zero) a fraction of the input neurons for each batch. This forces the network to learn more robust features that are not reliant on any one specific neuron.
    layers.Dropout(0.3),

    # Second layer with L2 and dropout
    layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l1_l2(l1=0.1, l2=0.1), name='layer2'),
    layers.Dropout(0.3),
    #testing out a third layer
    layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l1_l2(l1=0.1, l2=0.1), name='layer3'),
    layers.Dropout(0.3),
    #output used linear activation function sinc we are predicting a 
    layers.Dense(1, activation='linear', name='output')
])

#Compile - calcualte loss
mod2.compile(optimizer='adam', loss='mse', metrics=['mae'])

#Train - 100 epochs
mod2.fit(X_train, y_train, epochs=100, batch_size=32, verbose=1, validation_split=0.1)


#metrics , primarily looking at r^2.
mod2.summary()
y_pred_test = mod2.predict(X_test).flatten()
y_pred_train = mod2.predict(X_train).flatten()

mse = mean_squared_error(y_test, y_pred_test)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred_test)
r2_test = r2_score(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)


print("\n--- Test Performance ---")
print(f"R² Score test    : {r2_test:.4f}")
print(f"R² Score train    : {r2_train:.4f}")
print(f"MAE          : {mae:.4f}")
print(f"RMSE         : {rmse:.4f}")

Epoch 1/100
[1m3042/3042[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 454us/step - loss: 1714769.3750 - mae: 19.3044 - val_loss: 10285.9512 - val_mae: 8.1578
Epoch 2/100
[1m3042/3042[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 420us/step - loss: 630082.6250 - mae: 12.7625 - val_loss: 4387.5903 - val_mae: 8.0060
Epoch 3/100
[1m3042/3042[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 516us/step - loss: 233074.0156 - mae: 10.6531 - val_loss: 1155.4342 - val_mae: 8.3064
Epoch 4/100
[1m3042/3042[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 418us/step - loss: 418905.6875 - mae: 12.7417 - val_loss: 1104.5352 - val_mae: 8.7585
Epoch 5/100
[1m3042/3042[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 437us/step - loss: 544042.6250 - mae: 13.4620 - val_loss: 1890.9189 - val_mae: 8.6320
Epoch 6/100
[1m3042/3042[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 440us/step - loss: 843378.6875 - mae: 17.4484 - val_loss: 2193.8831 - val_mae: 7.8476
Ep

[1m845/845[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 187us/step
[1m3380/3380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 164us/step

--- Test Performance ---
R² Score test    : 0.9533
R² Score train    : 0.9856
MAE          : 5.1827
RMSE         : 18.7224
