In [56]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
import csv
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

In [57]:
# Read the CSV file from the Resources folder into a Pandas DataFrame

df= pd.read_csv(r'C:\Users\dkrenn\Desktop\GW-Bootcamp-Final-Project\Data Analysis\Resources\cleaned_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,ID,AreaName,VicAge,VicSex,VicRace,StatusCode,Lat,Lon,CrimeType,AgeGroup,AddressType,Arrest
0,0,190326475,Wilshire,999,M,Multiracial,AA,34.0375,-118.3506,Theft,Unknown,Public Space,Yes
1,1,200106753,Central,47,M,Multiracial,IC,34.0444,-118.2628,Theft,40-54,Public Space,No
2,2,200320258,Southwest,19,,,IC,34.021,-118.3002,Theft,Young Adult,Residential,No
3,3,200907217,Van Nuys,19,M,Multiracial,IC,34.1576,-118.4387,Theft,Young Adult,Buisness,No
4,4,220614831,Hollywood,28,M,Hispanic,IC,34.0944,-118.3277,Theft,25-39,Public Space,No


In [58]:
# Remove Unneeded Columns
df= df.drop(columns =["Unnamed: 0", "ID", "StatusCode", "Lat", "Lon", "AgeGroup", "AreaName", "VicAge"])
df.head()

Unnamed: 0,VicSex,VicRace,CrimeType,AddressType,Arrest
0,M,Multiracial,Theft,Public Space,Yes
1,M,Multiracial,Theft,Public Space,No
2,,,Theft,Residential,No
3,M,Multiracial,Theft,Buisness,No
4,M,Hispanic,Theft,Public Space,No


In [60]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = df['Arrest']


In [61]:
# Separate the X variable, the features
x = df.drop(columns='Arrest')
x = pd.get_dummies(x, dtype=int)
x.head()



Unnamed: 0,VicSex_F,VicSex_M,VicRace_,VicRace_AANHPI,VicRace_AIAN,VicRace_Black,VicRace_Hispanic,VicRace_Multiracial,VicRace_White,CrimeType_Assault,...,CrimeType_Property Destruct,CrimeType_Robbery,CrimeType_Sex Crime,CrimeType_Theft,CrimeType_Weapons,AddressType_Buisness,AddressType_Other,AddressType_Public Space,AddressType_Public Transit,AddressType_Residential
0,0,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,1,0,0
1,0,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,1,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,0,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,1,0,0,0,0
4,0,1,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,1,0,0


In [62]:
# Review the y variable Series
y[:5]

0    Yes
1     No
2     No
3     No
4     No
Name: Arrest, dtype: object

In [63]:
# Review the X variable DataFrame
x[:5]

Unnamed: 0,VicSex_F,VicSex_M,VicRace_,VicRace_AANHPI,VicRace_AIAN,VicRace_Black,VicRace_Hispanic,VicRace_Multiracial,VicRace_White,CrimeType_Assault,...,CrimeType_Property Destruct,CrimeType_Robbery,CrimeType_Sex Crime,CrimeType_Theft,CrimeType_Weapons,AddressType_Buisness,AddressType_Other,AddressType_Public Space,AddressType_Public Transit,AddressType_Residential
0,0,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,1,0,0
1,0,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,1,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
3,0,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,1,0,0,0,0
4,0,1,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,1,0,0


In [64]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

In [65]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model using training data
classifier.fit(x_train, y_train)

In [66]:
# Make a prediction using the testing data
predictions = classifier.predict(x_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
651532,No,Yes
840137,No,No
547912,No,No
620196,No,Yes
756650,No,No
...,...,...
606769,No,Yes
376702,No,No
270747,No,No
670992,No,No


In [67]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, predictions)

0.5044730882418917

In [68]:
# Generate a confusion matrix for the model
confuse_matrix = confusion_matrix(y_test, predictions)

In [69]:
# Print the classification report for the model
class_report = classification_report(y_test, predictions)
print(class_report)

              precision    recall  f1-score   support

          No       0.91      1.00      0.95    201092
         Yes       0.53      0.01      0.02     19905

    accuracy                           0.91    220997
   macro avg       0.72      0.50      0.49    220997
weighted avg       0.88      0.91      0.87    220997



In [70]:
# USE RESAMPLED DATA
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# # Assign a random_state parameter of 1 to the model
ros_model = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
x_resampled, y_resampled = ros_model.fit_resample(x_train, y_train)

In [71]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier2 = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model using training data
classifier2.fit(x_resampled, y_resampled)

# Make a prediction using the testing data
predictions2 = classifier2.predict(x_test)
pd.DataFrame({"Prediction": predictions2, "Actual": y_test})

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Prediction,Actual
651532,Yes,Yes
840137,No,No
547912,Yes,No
620196,Yes,Yes
756650,No,No
...,...,...
606769,No,Yes
376702,Yes,No
270747,No,No
670992,Yes,No


In [72]:
balanced_accuracy_score(y_test, predictions2)

0.6705951711892204

In [73]:
# Generate a confusion matrix for the model
confuse_matrix = confusion_matrix(y_test, predictions2)

In [74]:
# Print the classification report for the model
class_report2 = classification_report(y_test, predictions2)
print(class_report2)

              precision    recall  f1-score   support

          No       0.95      0.66      0.78    201092
         Yes       0.17      0.68      0.27     19905

    accuracy                           0.66    220997
   macro avg       0.56      0.67      0.52    220997
weighted avg       0.88      0.66      0.73    220997

