# Libraries

In [1]:
import os
import pickle  # export model

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

#data cleansing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler


#logistic regression
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline


###################################################################
# Personal library #
import sys
import os

# Path to the folder containing the personalized functions
folder_path = os.path.abspath(os.path.join('..', 'library'))
sys.path.insert(0, folder_path)

# Now you can import your module or functions
import la_functions as la

# Reading + Enriching the data

In [2]:
# Load the raw CSV and enrich it with `data_enriching`, defined in the module
# la_functions.py located in the `library` folder.
DATA_FILE = 'data.csv'
YEAR_OF_INTEREST = 2023  # change this single value to study another year

df = la.data_enriching(DATA_FILE)

# Keep only the incidents that occurred during the selected year
# (`@name` lets DataFrame.query read a Python variable).
df = df.query("year_occurred == @YEAR_OF_INTEREST")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Number of rows left after the year filter applied in the previous cell
len(df)

210051

In [4]:
# Print each column label quoted and followed by a comma, one per line
# (handy for copy-pasting into a Python list of columns).
for column_name in df.columns:
    print(f"'{column_name}',")

'division_number',
'date_reported',
'date_occurred',
'area',
'area_name',
'reporting_district',
'part',
'crime_code',
'crime_description',
'modus_operandi',
'victim_age',
'victim_sex',
'victim_descent',
'premise_code',
'premise_description',
'weapon_code',
'weapon_description',
'status',
'status_description',
'crime_code_1',
'crime_code_2',
'crime_code_3',
'crime_code_4',
'location',
'cross_street',
'latitude',
'longitude',
'counter',
'year_occurred',
'month_occurred',
'hour_occurred',
'geometry',
'index_right',
'OBJECTID',
'name',
'gravity_for_tourist',


# Getting the data

In [5]:
# Columns retained for the rest of the notebook, grouped by theme.
# Columns deliberately excluded: division_number, date_reported, area,
# area_name, reporting_district, part, crime_code, modus_operandi,
# premise_code, premise_description, weapon_code, weapon_description, status,
# status_description, crime_code_1..4, location, cross_street.
columns_keep = [
    # incident
    'date_occurred', 'crime_description',
    # victim
    'victim_age', 'victim_sex', 'victim_descent',
    # coordinates
    'latitude', 'longitude',
    # time parts
    'year_occurred', 'month_occurred', 'hour_occurred',
    # geometry / area columns
    'geometry', 'index_right', 'OBJECTID', 'name',
    'gravity_for_tourist',
]

In [6]:
# Restrict the frame to the columns listed in `columns_keep`,
# then preview the first three rows.
df = df[columns_keep]
df.head(3)

Unnamed: 0,date_occurred,crime_description,victim_age,victim_sex,victim_descent,latitude,longitude,year_occurred,month_occurred,hour_occurred,geometry,index_right,OBJECTID,name,gravity_for_tourist
112274,2023-04-29 16:00:00,VEHICLE - STOLEN,0,,,34.0672,-118.2941,2023,4,16,POINT (-118.29410 34.06720),52.0,53.0,Koreatown,1
225174,2023-07-03 14:00:00,THEFT OF IDENTITY,33,M,W,34.033,-118.4324,2023,7,14,POINT (-118.43240 34.03300),77.0,78.0,Rancho Park,1
409220,2023-10-19 19:40:00,VEHICLE - STOLEN,0,,,34.2166,-118.4403,2023,10,19,POINT (-118.44030 34.21660),71.0,72.0,Panorama City,1


# Data Cleansing

## 1. Basic Cleansing

In [7]:
# Drop rows with illogical or missing victim information, in a single pass:
#   - negative ages (an age of 0 is kept)
#   - missing victim sex
#   - missing victim descent
df = df[
    df['victim_age'].ge(0)
    & df['victim_sex'].notna()
    & df['victim_descent'].notna()
]


In [8]:
# Inspect the distinct codes present in the victim sex and victim descent columns
tuple(df[column].unique() for column in ('victim_sex', 'victim_descent'))

(array(['M', 'X', 'F', 'H', '-'], dtype=object),
 array(['W', 'X', 'H', 'O', 'B', 'F', 'A', 'K', 'C', 'J', 'Z', 'I', 'D',
        'V', 'G', 'P', 'S', 'L', 'U', '-'], dtype=object))

In [9]:
# The previous cell shows that '-' appears in both columns; rows carrying it
# are removed (they're not that many).
dash = '-'
df = df[df['victim_sex'].ne(dash) & df['victim_descent'].ne(dash)]

In [10]:
# Sanity check: '-' should no longer be listed among the distinct codes
tuple(df[column].unique() for column in ('victim_sex', 'victim_descent'))

(array(['M', 'X', 'F', 'H'], dtype=object),
 array(['W', 'X', 'H', 'O', 'B', 'F', 'A', 'K', 'C', 'J', 'Z', 'I', 'D',
        'V', 'G', 'P', 'S', 'L', 'U'], dtype=object))

In [11]:
# Number of rows remaining after the cleansing steps above
len(df)

181259

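# NOTE: the date parsing below is already performed inside `la.data_enriching`, so it does not need to be re-run here (`day_occurred` is the only column below that is not already present in `df`).
# Parse the datetime string into a datetime object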

df['date_occurred'] = pd.to_datetime(df['date_occurred'], errors='coerce')

#create new date and time columns
df['year_occurred'] = df['date_occurred'].dt.year
df['month_occurred'] = df['date_occurred'].dt.month
df['hour_occurred'] = df['date_occurred'].dt.hour
df['day_occurred'] = df['date_occurred'].dt.day

df.head(3)

In [12]:
# Create new column 'time_of_day' based on time
def categorize_time(hour):
    """Return the part of the day that an hour value belongs to.

    Parameters
    ----------
    hour : number
        Hour of the day.

    Returns
    -------
    str
        'morning' for 6 <= hour < 12, 'afternoon' for 12 <= hour < 18,
        'evening' for 18 <= hour < 24, and 'night' for every other value
        (0 to 5, but also anything outside the 0-23 range or NaN).
    """
    # (inclusive lower bound, exclusive upper bound, label)
    day_parts = (
        (6, 12, 'morning'),
        (12, 18, 'afternoon'),
        (18, 24, 'evening'),
    )
    for lower, upper, label in day_parts:
        if lower <= hour < upper:
            return label
    # Nothing matched: treat the value as night time
    return 'night'

# Label every incident with its part of the day. `assign` returns a new frame
# instead of writing into `df` in place, which avoids a possible
# SettingWithCopyWarning when `df` is the result of earlier row filtering.
df = df.assign(time_of_day=df['hour_occurred'].apply(categorize_time))

## 1.2 one hot encoding victim_sex and victim_descent

In [13]:
# Show every column when previewing frames (the one-hot encoded frame is wide).
pd.set_option('display.max_columns', None)
# Keep a finite row cap: with `None`, accidentally displaying the whole frame
# would try to render every one of its rows.
pd.set_option('display.max_rows', 100)

In [14]:
# One-hot encode the two categorical victim columns.
# `dtype=int` makes the indicator columns 0/1 integers directly, so no
# frame-wide True/False replacement pass over every column is needed
# afterwards, and the result no longer depends on the pandas default dtype.
df = pd.get_dummies(df, columns=['victim_descent', 'victim_sex'], dtype=int)

df.head()

Unnamed: 0,date_occurred,crime_description,victim_age,latitude,longitude,year_occurred,month_occurred,hour_occurred,geometry,index_right,OBJECTID,name,gravity_for_tourist,time_of_day,victim_descent_A,victim_descent_B,victim_descent_C,victim_descent_D,victim_descent_F,victim_descent_G,victim_descent_H,victim_descent_I,victim_descent_J,victim_descent_K,victim_descent_L,victim_descent_O,victim_descent_P,victim_descent_S,victim_descent_U,victim_descent_V,victim_descent_W,victim_descent_X,victim_descent_Z,victim_sex_F,victim_sex_H,victim_sex_M,victim_sex_X
225174,2023-07-03 14:00:00,THEFT OF IDENTITY,33,34.033,-118.4324,2023,7,14,POINT (-118.43240 34.03300),77.0,78.0,Rancho Park,1,afternoon,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
409723,2023-06-17 11:40:00,SHOPLIFTING - PETTY THEFT ($950 & UNDER),0,34.0981,-118.3092,2023,6,11,POINT (-118.30920 34.09810),25.0,26.0,East Hollywood,1,morning,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
409779,2023-02-11 16:30:00,THEFT PLAIN - PETTY ($950 & UNDER),48,34.0396,-118.2726,2023,2,16,POINT (-118.27260 34.03960),23.0,24.0,Downtown,1,afternoon,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
409947,2023-01-23 12:00:00,BURGLARY,60,34.2138,-118.5951,2023,1,12,POINT (-118.59510 34.21380),12.0,13.0,Canoga Park,2,afternoon,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
411222,2023-05-16 15:50:00,SHOPLIFTING - PETTY THEFT ($950 & UNDER),22,34.0539,-118.2712,2023,5,15,POINT (-118.27120 34.05390),106.0,107.0,Westlake,1,afternoon,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


# Logistic regression

In [15]:
# Summary of the columns, their non-null counts and dtypes before modelling
df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 181259 entries, 225174 to 852949
Data columns (total 37 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   date_occurred        181259 non-null  datetime64[ns]
 1   crime_description    181259 non-null  object        
 2   victim_age           181259 non-null  int64         
 3   latitude             181259 non-null  float64       
 4   longitude            181259 non-null  float64       
 5   year_occurred        181259 non-null  int64         
 6   month_occurred       181259 non-null  int64         
 7   hour_occurred        181259 non-null  int64         
 8   geometry             181259 non-null  geometry      
 9   index_right          179493 non-null  float64       
 10  OBJECTID             179493 non-null  float64       
 11  name                 179493 non-null  object        
 12  gravity_for_tourist  181259 non-null  int64         
 13  t

## Cross validate

In [16]:
# Create the feature set: victim age plus the coordinates of the incident
X = df[['victim_age', 'latitude', 'longitude']]
y = df['crime_description']

# Instantiate the model.
# The features are on very different scales (age vs. latitude/longitude),
# which makes the solver converge very slowly on raw values. Standardising
# them inside a pipeline keeps the scaling within each training fold and lets
# the solver converge in far fewer iterations.
model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=10000))

# 5-fold cross-validation; `n_jobs=-1` evaluates the folds in parallel.
# `cross_validate` doesn't return a fitted model
cv_results = cross_validate(model, X, y, cv=5, n_jobs=-1)

# Mean of the per-fold accuracies
accuracy = cv_results['test_score'].mean()

accuracy



KeyboardInterrupt: 

## Learning curves

In [None]:
# Learning curves: score on the training subset and on the validation folds as
# the number of training samples grows. No `scoring` is passed, so the
# estimator's default score is used, which is accuracy for LogisticRegression
# (not r2).
# NOTE(review): these training sizes are tiny compared with len(df); consider
# larger values if the curves have not flattened.
TRAIN_SIZES = [25, 50, 75, 100, 300, 400, 500, 600, 700, 800]

train_sizes, train_scores, test_scores = learning_curve(
    estimator=LogisticRegression(max_iter=1000),
    X=X,
    y=y,
    train_sizes=TRAIN_SIZES,
    cv=20,
)

# Average the scores over the cross-validation folds (axis 1)
train_scores_mean = np.mean(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)

# Plot the learning curves with the explicit figure/axes interface
fig, ax = plt.subplots()
ax.plot(train_sizes, train_scores_mean, label='Training score')
ax.plot(train_sizes, test_scores_mean, label='Test score')
ax.set_ylabel('Accuracy score', fontsize=14)
ax.set_xlabel('Training set size', fontsize=14)
ax.set_title('Learning curves', fontsize=18, y=1.03)
ax.legend();  # trailing ';' hides the Legend repr in the cell output

## Prediction

In [None]:
# Hold out 30% of the dataset as the test set for a final evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=0
)

# STEP 1 - Instantiate and train the model on the training data.
# The features are standardised inside the pipeline (age and coordinates are
# on very different scales), so the fitted object still accepts raw
# [age, latitude, longitude] rows in `predict` / `score`.
log_model = make_pipeline(
    StandardScaler(), LogisticRegression(max_iter=10000)
).fit(X_train, y_train)

# STEP 2 - Evaluate the model on the held-out test set.
# The score is stored and printed explicitly: a bare expression in the middle
# of a cell is evaluated but never displayed.
test_accuracy = log_model.score(X_test, y_test)
print(f"Test accuracy: {test_accuracy:.4f}")

# STEP 3 - Use the trained model to predict for one example.
# The example is wrapped in a DataFrame with the training column names so the
# input matches what the model was fitted on.
sample = pd.DataFrame([[18, 34.0141, -118.2978]], columns=X.columns)
prediction = log_model.predict(sample)

prediction

# Export model

In [None]:
# Export model as pickle file
with open("ml_model.pkl", "wb") as file:
    pickle.dump(log_model, file)

# Load the model back from the pickle file; the context manager closes the
# file handle, which a bare `open(...)` passed to `pickle.load` never does.
with open("ml_model.pkl", "rb") as file:
    my_model = pickle.load(file)

# Round-trip check: score of the reloaded model on the test set
my_model.score(X_test, y_test)

# Import model

In [None]:
# Load the model from the pickle file, closing the file handle once done.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source (here: the file written by this notebook).
with open("ml_model.pkl", "rb") as file:
    my_model = pickle.load(file)

my_model