# Libraries

In [1]:
import pandas as pd
import os

#logistic regression
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


#Export model
import pickle

# Read data

In [2]:
# fetch all the data from the raw_data folder
current_dir = os.getcwd()
current_dir

file_path = os.path.join(current_dir, '..','raw_data', 'data.csv')
df = pd.read_csv(file_path, nrows=10000)

# Getting the data

In [3]:
columns_keep = [
#'division_number',
#'date_reported',
'date_occurred',
#'area',
#'area_name',
#'reporting_district',
#'part',
#'crime_code',
'crime_description',
#'modus_operandi',
'victim_age',
#'victim_sex',
#'victim_descent',
#'premise_code',
#'premise_description',
#'weapon_code',
#'weapon_description',
#'status',
#'status_description',
#'crime_code_1',
#'crime_code_2',
#'crime_code_3',
#'crime_code_4',
#'location',
#'cross_street',
'latitude',
'longitude',
]

In [4]:
df = df[columns_keep]
df.head(3)

Unnamed: 0,date_occurred,crime_description,victim_age,latitude,longitude
0,2020-01-08 22:30:00,BATTERY - SIMPLE ASSAULT,36,34.0141,-118.2978
1,2020-01-01 03:30:00,BATTERY - SIMPLE ASSAULT,25,34.0459,-118.2545
2,2020-02-13 12:00:00,SEX OFFENDER REGISTRANT OUT OF COMPLIANCE,0,34.0448,-118.2474


In [5]:
# Dates
df['date_occurred'] = pd.to_datetime(df['date_occurred'], errors='coerce')
df['year_occurred'] = df['date_occurred'].dt.year
df['month_occurred'] = df['date_occurred'].dt.month
df['hour_occurred'] = df['date_occurred'].dt.hour
df['day_occurred'] = df['date_occurred'].dt.day

df.head(3)

Unnamed: 0,date_occurred,crime_description,victim_age,latitude,longitude,year_occurred,month_occurred,hour_occurred,day_occurred
0,2020-01-08 22:30:00,BATTERY - SIMPLE ASSAULT,36,34.0141,-118.2978,2020,1,22,8
1,2020-01-01 03:30:00,BATTERY - SIMPLE ASSAULT,25,34.0459,-118.2545,2020,1,3,1
2,2020-02-13 12:00:00,SEX OFFENDER REGISTRANT OUT OF COMPLIANCE,0,34.0448,-118.2474,2020,2,12,13


# Logistic regression

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date_occurred      10000 non-null  datetime64[ns]
 1   crime_description  10000 non-null  object        
 2   victim_age         10000 non-null  int64         
 3   latitude           10000 non-null  float64       
 4   longitude          10000 non-null  float64       
 5   year_occurred      10000 non-null  int32         
 6   month_occurred     10000 non-null  int32         
 7   hour_occurred      10000 non-null  int32         
 8   day_occurred       10000 non-null  int32         
dtypes: datetime64[ns](1), float64(2), int32(4), int64(1), object(1)
memory usage: 547.0+ KB


In [7]:
#Create the feature set

X = df[['victim_age','latitude','longitude','day_occurred','month_occurred','year_occurred' ]]
y = df['crime_description']

# Instanciate model
#model = LogisticRegression(max_iter=50)

# 5-Fold Cross validate model
#`cross_validate` doesn't return a fitted model
#cv_results = cross_validate(model, X, y, cv=5)

# Mean of accuracies
#accuracy = cv_results['test_score'].mean()

#accuracy

In [8]:
#Holdout 30% of the dataset as the test set for a final evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.30,
                                                    random_state=0)


# STEP 1 - Instantiate and train the model on the training data
log_model = LogisticRegression(max_iter=10000).fit(X_train, y_train)


# # STEP 2 - Predictions on the test set
# Evaluate the model
log_model.score(X_test, y_test)
#y_pred = log_model.predict(X_test)




# Use the trained model to predict
prediction = log_model.predict([[18,34.0141,-118.2978,2,3,2024]])

prediction

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array(['THEFT FROM MOTOR VEHICLE - GRAND ($950.01 AND OVER)'],
      dtype=object)

# Export model

In [9]:
# Export model as pickle file
with open("ml_model.pkl", "wb") as file:
    pickle.dump(log_model, file)

# Load Pipeline from pickle file
my_model = pickle.load(open("ml_model.pkl","rb"))

my_model.score(X_test, y_test)

0.348

# Import model

In [10]:
# Load pipeline from pickle file
my_model = pickle.load(open("ml_model.pkl", "rb"))

my_model