In [122]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pyplot
from collections import Counter
import seaborn as sns
import psycopg2
from sqlalchemy import create_engine
from sqlalchemy import inspect

In [123]:
# sklearn processing and classification 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [124]:
# sklearn classification model evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [125]:
# import the data
file_path = pd.read_csv('C:/Users/lizzi/School/Group_5_Final_Project/place_type.csv')


In [126]:
file_path.tail()

Unnamed: 0,ROW_ID,INCIDENT_REPORT_ID,DIVISION_ID,PLACE_TYPE_DESCRIPTION,PLACE_TYPE_CODE,CLEARANCE_STATUS,CLEARANCE_CODE,HIGHEST_NIBRS_CODE,CITY_NEW,VIOLENCE
489979,490718,20220513_2308_489980,12,Residential,3,Open,1,13A,Charlotte,violent
489980,490719,20220513_2308_489981,27,Residential,3,Open,1,290,Charlotte,non-violent
489981,490720,20220513_2308_489982,22,Commercial Place,4,Cleared by Arrest,3,13B,Charlotte,violent
489982,490721,20220513_2308_489983,2,Residential,3,Open,1,290,Charlotte,non-violent
489983,490722,20220513_2308_489984,16,Commercial Place,4,Cleared by Arrest,3,90C,Charlotte,non-violent


In [156]:
file_path.isnull().mean().sort_values()

ROW_ID                    0.0
INCIDENT_REPORT_ID        0.0
DIVISION_ID               0.0
PLACE_TYPE_DESCRIPTION    0.0
PLACE_TYPE_CODE           0.0
CLEARANCE_STATUS          0.0
CLEARANCE_CODE            0.0
HIGHEST_NIBRS_CODE        0.0
CITY_NEW                  0.0
VIOLENCE                  0.0
dtype: float64

In [243]:
# prepare data
# split into input and target features
X = file_path[['CLEARANCE_CODE', 'DIVISION_ID']]
y = file_path.VIOLENCE

In [244]:
X.describe()

Unnamed: 0,CLEARANCE_CODE,DIVISION_ID
count,489984.0,489984.0
mean,1.672653,14.330948
std,0.916322,8.494614
min,1.0,0.0
25%,1.0,7.0
50%,1.0,14.0
75%,2.0,21.0
max,5.0,92.0


In [245]:
# build model
# split into test and training sets
X_train, X_test, y_train, y_test = train_test_split(
X, y, random_state=40, stratify=y)


In [246]:
logisticRegr = LogisticRegression()


In [247]:
logisticRegr.fit(X_train, y_train)


LogisticRegression()

In [248]:
classifier = LogisticRegression(solver='lbfgs',
   max_iter=100,
   random_state=40)

In [249]:
classifier.fit(X_train, y_train)


LogisticRegression(random_state=40)

In [250]:
y_pred = classifier.predict(X_test)


In [251]:
print(accuracy_score(y_test, y_pred))


0.5736513845350052


In [252]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
   intercept_scaling=1, max_iter=100, multi_class='warn', penalty='12',
   random_state=40, solver='lbfgs', warm_start=False)

LogisticRegression(multi_class='warn', penalty='12', random_state=40)

In [253]:
# create predictions with results in Pandas DF
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
485004,violent,non-violent
201140,non-violent,non-violent
342193,non-violent,non-violent
45808,non-violent,non-violent
25184,non-violent,non-violent
...,...,...
346801,non-violent,non-violent
213923,non-violent,violent
376247,non-violent,non-violent
2545,non-violent,non-violent


In [254]:
# Evaluate test performance
accuracy_score(y_test, predictions)


0.5736513845350052

In [255]:
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[54408 11192]
 [41034 15862]]


In [141]:
# Evaluatinf on test set
test_acc = accuracy_score(y_test, y_pred)
print("The Accuracy for Test Set is {}".format(test_acc*100))

The Accuracy for Test Set is 55.18792450365726


In [142]:
# generating classification report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

 non-violent       0.56      0.77      0.65     65600
     violent       0.53      0.29      0.38     56896

    accuracy                           0.55    122496
   macro avg       0.55      0.53      0.51    122496
weighted avg       0.55      0.55      0.52    122496



In [143]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

non-violent       0.56      0.77      0.29      0.65      0.48      0.24     65600
    violent       0.53      0.29      0.77      0.38      0.48      0.22     56896

avg / total       0.55      0.55      0.52      0.52      0.48      0.23    122496



In [144]:
# Path to postgres/aws database
database_path = "postgres:policedata5@g5-final-project.cz9sdl8p2rcm.us-east-2.rds.amazonaws.com:5432/g5-final-project"

In [145]:
# Create an engine that can talk to the database
engine = create_engine("postgresql://postgres:policedata5@g5-final-project.cz9sdl8p2rcm.us-east-2.rds.amazonaws.com:5432/Group_5_Final_Project")



In [146]:
# Get the name of the tables in database (test connection). 
inspector = inspect(engine)
inspector.get_table_names()

conn = engine.connect()

In [147]:
# Test pull to confirm table data can be read into DataFrames
count_data_df = pd.read_sql("SELECT * FROM count_data", conn)
count_data_df.head()

Unnamed: 0,incident_report_id,division_id,place_type_description,clearance_status,highest_nibrs_code,city_new
0,20220513-2308-01,2,Retail,Open,899,Charlotte
1,20220513-2024-00,22,Retail,Open,23C,Charlotte
2,20220513-2022-03,28,Public/Non-Residential,Open,899,Charlotte
3,20220513-2012-00,1,Residential,Exceptionally Cleared,899,Charlotte
4,20220513-1958-01,22,Residential,Open,801,Charlotte
