<a href="https://colab.research.google.com/github/elijahcw-git/Capstone/blob/main/Logistic_Regression_LAPD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The data source for LA crime from 2020: 2024
  * https://data.lacity.org/Public-Safety/Crime-Data-from-2020-to-Present/2nrs-mtv8/about_data

Previous work:
  * Kaggle: https://www.kaggle.com/datasets/hemil26/crime-in-los-angeles/code

  * EDA with R: https://www.kaggle.com/code/teresawu726/eda-prediction-of-los-angeles-crime-by#Crime-prediction

  * LA Crime Analysis: https://www.kaggle.com/code/shayalvaghasiya/los-angeles-crimes-analysis
  

These questions guide our analysis and are central to our project's objectives:
What are the hotspots of crime occurrences in LA, and how can we predict the types of crime?
We propose to investigate factors such as time of day, day of the week, geographical locations, and types of crime in LA.

2-What role do victim demographics play in crime prediction?
We intend to explore the relationships between demographic variables and crime types to identify potential patterns or trends before they occur.

3-Do specific weather conditions in LA influence crime patterns and frequencies?We propose to construct a model to explore the correlation between weather and crime, aiming to guide law enforcement in resource planning based on weather-related crime trends.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

# !pip install category_encoders

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
#Import dataset:

clean_df = pd.read_csv('/content/drive/MyDrive/df_10_29Feb_4.csv')
print(clean_df.shape)

(1817882, 26)


In [86]:
# clean_df['Vict_Descent'].unique()

In [87]:
# from google.colab import drive
# drive.mount('/content/drive')

In [4]:
# Print data shape
print(f"Dataset shape: {clean_df.shape}\n")

missing_values = clean_df.isnull().sum()
percentage_missing = (missing_values / len(clean_df)) * 100
unique_values = clean_df.nunique()  # Count unique values in each column

# Creating a DataFrame to display data type, missing values, percentage of missing values, and number of unique values
summary_df = pd.DataFrame({
    'Data_type': clean_df.dtypes,
    'Missing': missing_values,
    '%_Missing': percentage_missing,
    'Unique_values': unique_values  # Adding the unique values count
})

print(summary_df)

Dataset shape: (1817882, 26)

                     Data_type  Missing  %_Missing  Unique_values
DR_NO                    int64        0        0.0        1614674
Date_Rptd               object        0        0.0           5128
DATE_OCC                object        0        0.0           5111
TIME_OCC                 int64        0        0.0           1439
AREA                     int64        0        0.0             21
AREA_NAME               object        0        0.0             21
Rpt_Dist_No              int64        0        0.0           1267
Part_1-2                 int64        0        0.0              2
Crm_Cd                   int64        0        0.0             10
Crm_Cd_Desc             object        0        0.0             10
Vict_Age               float64        0        0.0             99
Vict_Sex                object        0        0.0              3
Vict_Descent            object        0        0.0             21
LOCATION                object        0       

In [5]:
# Existing data cleaning steps
# clean_df = clean_df[~((clean_df['LAT'] == 0) & (clean_df['LON'] == 0))]
# clean_df = clean_df[(clean_df['Avg_Windspeed'] <= 80) & (clean_df['Vict_Age'] <= 100)]
# clean_df['Vict_Sex'].fillna('Unknown', inplace=True)
# clean_df['Vict_Descent'].fillna('Unknown', inplace=True)
# clean_df['Total_Precipitation'].fillna(clean_df['Total_Precipitation'].median(), inplace=True)

# Function to categorize age into 3 numeric categories
def categorize_age(age):
    if age < 18:
        return 0  # Category for minors
    elif age <= 64:
        return 1  # Category for adults
    else:
        return 2  # Category for seniors

# Column for age categories
clean_df['Vict_Age_Category'] = clean_df['Vict_Age'].apply(categorize_age)

In [None]:
# columns_to_drop = ['Crm_Cd_1', 'Premis_Cd', 'Premis_Desc','Status', 'Status_Desc']
# clean_df.drop(columns=columns_to_drop, inplace=True)
# clean_df['Weapon_Reported'] = clean_df['Weapon_Used_Cd'].notna().astype(int)

In [None]:
# Display the unique value the number count of those unique value.

# for column in ['Part_1-2', 'Weapon_Reported','Vict_Sex','Vict_Age_Category']:
#   print(f"Unique values for {column}:")
#   unique_values = clean_df[column].unique()
#   for value in unique_values:
#     count = clean_df[clean_df[column] == value].shape[0]
#     print(f"  - {value}: {count}")

Unique values for Part_1-2:
  - 2: 734451
  - 1: 1083431
Unique values for Weapon_Reported:
  - 0: 1239030
  - 1: 578852
Unique values for Vict_Sex:
  - M: 775974
  - F: 716259
  - X: 325649
Unique values for Vict_Age_Category:
  - 0: 427288
  - 1: 1289574
  - 2: 101020


In [6]:
# Select the top 10 most frequent Crm_Cd
top_10_crimes = clean_df['Crm_Cd'].value_counts().nlargest(10).index

# Filter the dataset to only include rows with the top 10 Crm_Cd
df_top_10_crimes = clean_df[clean_df['Crm_Cd'].isin(top_10_crimes)]

# Print the new dataset shape
print(df_top_10_crimes.shape)


(1817882, 27)


# Cleaning Data to top 10 crimes

In [7]:
data=df_top_10_crimes

# Convert 'Date_Rptd' and 'DATE_OCC' to datetime
data['Date_Rptd'] = pd.to_datetime(data['Date_Rptd'])
data['DATE_OCC'] = pd.to_datetime(data['DATE_OCC'])

# Extract day of week, month, and year from 'DATE_OCC'
data['Day_of_Week'] = data['DATE_OCC'].dt.dayofweek
data['Month'] = data['DATE_OCC'].dt.month
data['Year'] = data['DATE_OCC'].dt.year

# Drop the original 'Date_Rptd' and 'DATE_OCC' columns
data = data.drop(['Date_Rptd', 'DATE_OCC'], axis=1)

# Convert 'Vict_Sex' and 'Vict_Descent' to dummy variables
categorical_to_convert = ['Vict_Sex', 'Region_Ethnic_Origin']
data = pd.get_dummies(data, columns=categorical_to_convert, drop_first=True)
data['Day_of_Week'] = data['Day_of_Week'].astype('category')
data['Month'] = data['Month'].astype('category')
data['Year'] = data['Year'].astype('category')

# Create dummy variables for these columns
data = pd.get_dummies(data, columns=['Day_of_Week', 'Month', 'Year'], drop_first=True)


# We will not convert 'LOCATION' due to its high cardinality
data.drop(['LOCATION','Vict_Age_Category', 'Vict_Descent', 'DR_NO'], axis=1, inplace=True)

In [8]:
data.columns

Index(['TIME_OCC', 'AREA', 'AREA_NAME', 'Rpt_Dist_No', 'Part_1-2', 'Crm_Cd',
       'Crm_Cd_Desc', 'Vict_Age', 'LAT', 'LON', 'Avg_Temp', 'Avg_Dewpoint',
       'Avg_Humidity', 'Avg_Windspeed', 'Avg_Pressure', 'Total_Precipitation',
       'Crime_Category', 'Crime_Category_Code', 'Weapon_Reported',
       'Vict_Sex_M', 'Vict_Sex_X', 'Region_Ethnic_Origin_Black',
       'Region_Ethnic_Origin_Hispanic/Latin/Mexican',
       'Region_Ethnic_Origin_Other', 'Region_Ethnic_Origin_Unknown',
       'Region_Ethnic_Origin_White', 'Day_of_Week_1', 'Day_of_Week_2',
       'Day_of_Week_3', 'Day_of_Week_4', 'Day_of_Week_5', 'Day_of_Week_6',
       'Month_2', 'Month_3', 'Month_4', 'Month_5', 'Month_6', 'Month_7',
       'Month_8', 'Month_9', 'Month_10', 'Month_11', 'Month_12', 'Year_2011',
       'Year_2012', 'Year_2013', 'Year_2014', 'Year_2015', 'Year_2016',
       'Year_2017', 'Year_2018', 'Year_2019', 'Year_2020', 'Year_2021',
       'Year_2022', 'Year_2023'],
      dtype='object')

In [11]:
# Print data shape
print(f"Dataset shape: {data.shape}\n")

missing_values = data.isnull().sum()
percentage_missing = (missing_values / len(data)) * 100
unique_values = data.nunique()  # Count unique values in each column
non_missing_count = data.count()  # Count non-missing values in each column

# Creating a DataFrame to:
summary_df = pd.DataFrame({
    'Data_type': data.dtypes, # Display data type
    'Count': non_missing_count,  # Count of non-missing values
    'Missing': missing_values, # Missing values
    '%_Missing': percentage_missing, # Percentage of missing values
    'Unique_values': unique_values  # Unique values count
})

print(summary_df)


Dataset shape: (1817882, 49)

                                            Data_type    Count  Missing  \
AREA                                            int64  1817882        0   
Part_1-2                                        int64  1817882        0   
Vict_Age                                      float64  1817882        0   
LAT                                           float64  1817882        0   
LON                                           float64  1817882        0   
Avg_Temp                                      float64  1817882        0   
Avg_Dewpoint                                  float64  1817882        0   
Avg_Humidity                                  float64  1817882        0   
Avg_Windspeed                                 float64  1817882        0   
Avg_Pressure                                  float64  1817882        0   
Total_Precipitation                           float64  1817882        0   
Crime_Category_Code                             int64  1817882        

In [10]:
data = data.drop(columns=['Crm_Cd_Desc', 'Crime_Category', 'Crm_Cd', 'AREA_NAME', 'Rpt_Dist_No'], axis = 0)
data = data.drop(columns=['TIME_OCC'], axis = 0)
data = data.drop(columns=['Weapon_Reported'], axis = 0)

# Algorithms LR

In [12]:
X.columns

NameError: name 'X' is not defined

In [15]:
# Separate the features and the target variable
X = data.drop('Crime_Category_Code', axis=1)  # Features
y = data['Crime_Category_Code']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Logistic Regression model
log_reg = LogisticRegression(max_iter=1000)

# Train the model
log_reg.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = log_reg.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the Logistic Regression model: {accuracy * 100:.2f}%")


Accuracy of the Logistic Regression model: 72.81%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
X.columns

Index(['AREA', 'Part_1-2', 'Vict_Age', 'LAT', 'LON', 'Avg_Temp',
       'Avg_Dewpoint', 'Avg_Humidity', 'Avg_Windspeed', 'Avg_Pressure',
       'Total_Precipitation', 'Weapon_Reported', 'Vict_Sex_M', 'Vict_Sex_X',
       'Region_Ethnic_Origin_Black',
       'Region_Ethnic_Origin_Hispanic/Latin/Mexican',
       'Region_Ethnic_Origin_Other', 'Region_Ethnic_Origin_Unknown',
       'Region_Ethnic_Origin_White', 'Day_of_Week_1', 'Day_of_Week_2',
       'Day_of_Week_3', 'Day_of_Week_4', 'Day_of_Week_5', 'Day_of_Week_6',
       'Month_2', 'Month_3', 'Month_4', 'Month_5', 'Month_6', 'Month_7',
       'Month_8', 'Month_9', 'Month_10', 'Month_11', 'Month_12', 'Year_2011',
       'Year_2012', 'Year_2013', 'Year_2014', 'Year_2015', 'Year_2016',
       'Year_2017', 'Year_2018', 'Year_2019', 'Year_2020', 'Year_2021',
       'Year_2022', 'Year_2023'],
      dtype='object')

In [None]:
data[['Day_of_Week', 'Month','Year']].value_counts()

Day_of_Week  Month  Year
5            1      2011    4681
6            1      2023    3813
4            9      2023    3789
             1      2010    3767
5            12     2023    3634
                            ... 
2            11     2020     238
5            12     2020     231
6            12     2020     219
3            11     2020     218
0            12     2020     209
Length: 1176, dtype: int64

In [75]:
coefficients = log_reg.coef_[0]

# Map coefficients to features
feature_names = X_train.columns
feature_impact = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
}).sort_values(by='Coefficient', ascending=False)

print(feature_impact)

                                        Feature  Coefficient
13                   Region_Ethnic_Origin_Black     1.024716
11                                   Vict_Sex_M     0.899332
14  Region_Ethnic_Origin_Hispanic/Latin/Mexican     0.754184
47                                    Year_2023     0.459328
23                                Day_of_Week_6     0.236231
45                                    Year_2021     0.215123
46                                    Year_2022     0.139429
22                                Day_of_Week_5     0.129451
44                                    Year_2020     0.115287
9                                  Avg_Pressure     0.112252
43                                    Year_2019     0.091799
28                                      Month_6     0.047852
27                                      Month_5     0.036137
25                                      Month_3     0.033646
8                                 Avg_Windspeed     0.024312
26                      