In [42]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
import csv
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

In [43]:
# Read the CSV file from the Resources folder into a Pandas DataFrame

df= pd.read_csv(r'C:\Users\dkrenn\Desktop\GW-Bootcamp-Final-Project\Data Analysis\Resources\cleaned_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,ID,AreaName,VicAge,VicSex,VicRace,StatusCode,Lat,Lon,CrimeType,AgeGroup,AddressType,Arrest
0,0,190326475,Wilshire,999,M,Multiracial,AA,34.0375,-118.3506,Theft,Unknown,Public Space,Yes
1,1,200106753,Central,47,M,Multiracial,IC,34.0444,-118.2628,Theft,40-54,Public Space,No
2,2,200320258,Southwest,19,,,IC,34.021,-118.3002,Theft,Young Adult,Residential,No
3,3,200907217,Van Nuys,19,M,Multiracial,IC,34.1576,-118.4387,Theft,Young Adult,Buisness,No
4,4,220614831,Hollywood,28,M,Hispanic,IC,34.0944,-118.3277,Theft,25-39,Public Space,No


In [44]:
# Remove Unneeded Columns
df= df.drop(columns =["Unnamed: 0", "ID", "StatusCode", "Lat", "Lon", "AgeGroup", "Arrest", "AreaName", "AddressType"])
df.head()

Unnamed: 0,VicAge,VicSex,VicRace,CrimeType
0,999,M,Multiracial,Theft
1,47,M,Multiracial,Theft
2,19,,,Theft
3,19,M,Multiracial,Theft
4,28,M,Hispanic,Theft


In [45]:
# Recode Crime Types
crime =['Theft', 'Assault', 'Sex Crime', 'Other', 'Weapons', 'Robbery',
       'Fraud', 'Property Destruct', 'Kidnapping', 'Driving Offense',
       'Homicide']
violent = ["No", "Yes", "Yes", "No", "Yes","Yes","No", "No", "Yes", "No", "Yes"]
df["Violent"]=df["CrimeType"].replace(crime, violent)
df.head()

Unnamed: 0,VicAge,VicSex,VicRace,CrimeType,Violent
0,999,M,Multiracial,Theft,No
1,47,M,Multiracial,Theft,No
2,19,,,Theft,No
3,19,M,Multiracial,Theft,No
4,28,M,Hispanic,Theft,No


In [46]:
# Remove Missing Age Codes
df=df.loc[df["VicAge"]<999]

# Drop Crime Type
df= df.drop(columns =["CrimeType"])
df.head()

Unnamed: 0,VicAge,VicSex,VicRace,Violent
1,47,M,Multiracial,No
2,19,,,No
3,19,M,Multiracial,No
4,28,M,Hispanic,No
5,41,M,Hispanic,No


In [47]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = df['Violent']


In [48]:
# Separate the X variable, the features
x = df.drop(columns='Violent')
x = pd.get_dummies(x, dtype=int)
x.head()



Unnamed: 0,VicAge,VicSex_F,VicSex_M,VicRace_,VicRace_AANHPI,VicRace_AIAN,VicRace_Black,VicRace_Hispanic,VicRace_Multiracial,VicRace_White
1,47,0,1,0,0,0,0,0,1,0
2,19,0,0,1,0,0,0,0,0,0
3,19,0,1,0,0,0,0,0,1,0
4,28,0,1,0,0,0,0,1,0,0
5,41,0,1,0,0,0,0,1,0,0


In [49]:
# Review the y variable Series
y[:5]

1    No
2    No
3    No
4    No
5    No
Name: Violent, dtype: object

In [50]:
# Review the X variable DataFrame
x[:5]

Unnamed: 0,VicAge,VicSex_F,VicSex_M,VicRace_,VicRace_AANHPI,VicRace_AIAN,VicRace_Black,VicRace_Hispanic,VicRace_Multiracial,VicRace_White
1,47,0,1,0,0,0,0,0,1,0
2,19,0,0,1,0,0,0,0,0,0
3,19,0,1,0,0,0,0,0,1,0
4,28,0,1,0,0,0,0,1,0,0
5,41,0,1,0,0,0,0,1,0,0


In [51]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

In [52]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model using training data
classifier.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [53]:
# Make a prediction using the testing data
predictions = classifier.predict(x_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
332823,No,Yes
525863,Yes,Yes
846642,No,Yes
33785,No,No
845110,No,No
...,...,...
449274,No,Yes
272041,No,No
423714,No,Yes
351815,Yes,No


In [54]:
# Print the balanced_accuracy score of the model
balanced_accuracy_score(y_test, predictions)

0.6026609735669948

In [55]:
# Generate a confusion matrix for the model
confuse_matrix = confusion_matrix(y_test, predictions)

In [56]:
# Print the classification report for the model
class_report = classification_report(y_test, predictions)
print(class_report)

              precision    recall  f1-score   support

          No       0.67      0.71      0.69     97315
         Yes       0.55      0.49      0.52     68321

    accuracy                           0.62    165636
   macro avg       0.61      0.60      0.60    165636
weighted avg       0.62      0.62      0.62    165636



In [57]:
# USE RESAMPLED DATA
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# # Assign a random_state parameter of 1 to the model
ros_model = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
x_resampled, y_resampled = ros_model.fit_resample(x_train, y_train)

In [58]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier2 = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model using training data
classifier2.fit(x_resampled, y_resampled)

# Make a prediction using the testing data
predictions2 = classifier2.predict(x_test)
pd.DataFrame({"Prediction": predictions2, "Actual": y_test})

Unnamed: 0,Prediction,Actual
332823,Yes,Yes
525863,Yes,Yes
846642,Yes,Yes
33785,No,No
845110,Yes,No
...,...,...
449274,No,Yes
272041,No,No
423714,Yes,Yes
351815,Yes,No


In [59]:
balanced_accuracy_score(y_test, predictions2)

0.626078152839417

In [60]:
# Generate a confusion matrix for the model
confuse_matrix = confusion_matrix(y_test, predictions2)

In [61]:
# Print the classification report for the model
class_report2 = classification_report(y_test, predictions2)
print(class_report2)

              precision    recall  f1-score   support

          No       0.73      0.55      0.62     97315
         Yes       0.52      0.71      0.60     68321

    accuracy                           0.61    165636
   macro avg       0.62      0.63      0.61    165636
weighted avg       0.64      0.61      0.61    165636

