In [134]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import mapply
mapply.init(n_workers=7, progressbar=True)

import pickle
with open('scaler.pickle', 'rb') as f:
    scaler: StandardScaler = pickle.load(f)

# Read in the data
final = pd.read_csv('../dataset/data/final_clean.csv')

# Remember to the program which ðŸ¦†ing columns are objects
cat_cols = ['date_month', 'date_day', 'date_year', 'congressional_district', 'state', 'incident_characteristics1', 'city_or_county', 'party', 'CLEAN', 'OUTLIER']
final[cat_cols] = final[cat_cols].astype('object')



final['isArrested'] = (final['n_arrested'] > 0)
final['isInjured'] = (final['n_injured'] > 0)
final['isUnharmed'] = (final['n_unharmed'] > 0)

In [135]:
discretized = final.copy()

TO_DISCRETIZE = ['state', 'congressional_district', 'party']

cat_cols = final.select_dtypes(include=['object']).columns
def discretize_data(dataset, variables):
    for variable in variables:
        #get the unique variable's values
        var = sorted(dataset[variable].unique())
        
        #generate a mapping from the variable's values to the number representation  
        mapping = dict(zip(var, range(0, len(var) + 1)))

        #add a new colum with the number representation of the variable
        dataset[variable+'_num'] = dataset[variable].map(mapping).astype(int)
    return dataset

def one_hot(dataset, variables):
    for variable in variables:
        #get the unique variable's values
        vars = sorted(dataset[variable].unique())
        
        for var in vars:
            dataset[variable+'_'+str(var)] = (dataset[variable] == var).astype(int)
    return dataset

# discretized = one_hot(discretized, TO_DISCRETIZE)
discretized = discretized.drop(columns=cat_cols)

# final = discretize_data(final, cat_cols)
# final = final.drop(columns=cat_cols)

In [136]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Scale the data
scaled = scaler.fit_transform(discretized)

# Convert to a dataframe
scaled = pd.DataFrame(scaled, columns=discretized.columns)

In [137]:
f = scaled.copy()

f['isKilled'] = (f['n_killed'] > 0)

value_counts = f['isKilled'].value_counts()
min_value_count = value_counts.min()
f = f.groupby('isKilled').apply(lambda x: x.sample(n=min_value_count))

isKilled = (f['n_killed'] > 0).astype('int64')

del f['isKilled']

keywords = ['killed', 'injured', 'arrested', 'unharmed', 'n_participants', 'DEAD', 'CLEAN', 'OUTLIER', 'incident_']
# del all columns with keywords in it
deleted = []
for keyword in keywords:
    for col in f.columns:
        if keyword in col:
            deleted.append(col)
            del f[col]

print(deleted)

f = f.astype('float64')

# del f['n_killed']
# del f['p_killed']
# del f['month_cd_ratio_killed']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(f.select_dtypes(include=['float64', 'int64']), isKilled, stratify=isKilled)


['n_killed', 'p_killed', 'month_cd_ratio_killed', 'n_injured', 'p_injured', 'month_cd_ratio_injured', 'n_arrested', 'p_arrested', 'month_cd_ratio_arrested', 'n_unharmed', 'p_unharmed', 'month_cd_ratio_unharmed', 'n_participants_child', 'n_participants_teen', 'n_participants_adult', 'n_participants', 'cd_month_SHOT_DEAD']


In [138]:


from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

clf = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=6, min_samples_split=3, min_samples_leaf=4)
# clf = MLPClassifier(hidden_layer_sizes=(100, 100, 100), max_iter=500, alpha=0.0001, solver='adam', verbose=10,  random_state=21,tol=0.000000001)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_test, y_pred))

0.8707908120865605
[[12285  2757]
 [ 1130 13911]]


In [139]:
from sklearn.tree import export_graphviz

import pydotplus

# from IPython.display import Image
dot_data = export_graphviz(clf, out_file=None, feature_names=f.select_dtypes(include=['float64', 'int64']).columns, class_names=['Not_Killed', 'Killed'], filled=True, rounded=True, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)

graph.write_png('graph.png')


True

In [133]:
import torch
import torch.nn as nn

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32).to(DEVICE)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32).to(DEVICE)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).to(DEVICE)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).to(DEVICE)

input_size = X_train_tensor.shape[1]
hidden_size = 10
output_size = 1

loss_function = nn.CrossEntropyLoss()

learning_rate = 0.1

model = nn.Sequential(
    nn.Linear(input_size, hidden_size),
    nn.ReLU(),
    nn.Linear(hidden_size, hidden_size),
    nn.ReLU(),
    nn.Linear(hidden_size, output_size),
    nn.Sigmoid()
).to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

EPOCHS = 5000
losses = []
epochs = []

for epoch in range(EPOCHS):
    optimizer.zero_grad()
    y_pred = model(X_train_tensor)
    y_pred = y_pred.squeeze(1)
    loss = loss_function(y_pred, y_train_tensor)
    losses.append(loss.item())
    if epoch % 100 == 0:
        print(f'Epoch {epoch+1}/{EPOCHS}, Loss: {loss.item():.4f}')
    loss.backward()
    optimizer.step()

from sklearn.metrics import confusion_matrix

# Convert the test data to PyTorch tensors and get the predicted class
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32).to(DEVICE)
y_pred_tensor = model(X_test_tensor).detach().cpu()

y_pred = np.where(y_pred_tensor > 0.5, 1, 0)

y_pred

# _, y_pred = torch.max(y_pred_tensor, 1)

# # Print the confusion matrix
print(confusion_matrix(y_test, y_pred))

# # Print the accuracy
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))


Epoch 1/5000, Loss: 514871.3750
Epoch 101/5000, Loss: 511638.2500
Epoch 201/5000, Loss: 511620.1250
Epoch 301/5000, Loss: 511533.2188
Epoch 401/5000, Loss: 511499.7812
Epoch 501/5000, Loss: 511512.6250
Epoch 601/5000, Loss: 511466.0938
Epoch 701/5000, Loss: 511493.5000
Epoch 801/5000, Loss: 511434.5000
Epoch 901/5000, Loss: 511447.6250
Epoch 1001/5000, Loss: 511470.9688
Epoch 1101/5000, Loss: 511420.8125
Epoch 1201/5000, Loss: 511482.2500
Epoch 1301/5000, Loss: 511404.8750
Epoch 1401/5000, Loss: 511423.0938
Epoch 1501/5000, Loss: 511417.6250
Epoch 1601/5000, Loss: 511391.2500
Epoch 1701/5000, Loss: 511430.5625
Epoch 1801/5000, Loss: 511350.3750
Epoch 1901/5000, Loss: 511319.0625
Epoch 2001/5000, Loss: 511250.5312
Epoch 2101/5000, Loss: 511225.1250
Epoch 2201/5000, Loss: 511192.9375
Epoch 2301/5000, Loss: 511200.9375
Epoch 2401/5000, Loss: 511246.8125
Epoch 2501/5000, Loss: 511200.4062
Epoch 2601/5000, Loss: 511163.5625
Epoch 2701/5000, Loss: 511195.0625
Epoch 2801/5000, Loss: 511161.43

[[ 6257  8784]
 [ 3573 11469]]
0.5892364458331948
