In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.utils import resample
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import pickle
from sklearn.inspection import permutation_importance

plt.rcParams['figure.figsize'] = (15,12)
#pd.set_option('display.max_rows', None)

In [None]:
# Load the main analysis table (presumably the merged weather + crime data,
# judging by the file name) and preview its first rows.
df = pd.read_csv('weather_crime.csv')
df.head()

In [None]:
# Load the raw weather and crime extracts; they are aggregated by date further down.
# NOTE(review): 'weather_api .csv' has a space before the extension — confirm
# that this matches the file name on disk.
weather = pd.read_csv('weather_api .csv')
crime = pd.read_csv('Crimes_-_2021.csv')

In [None]:
# Inspect the column dtypes of the raw crime extract.
crime.dtypes

In [None]:
# Drop the columns the rest of the notebook never reads: the stray
# 'Unnamed: 0' column plus the sunrise/sunset and min/max temperature fields.
unused_columns = ['Unnamed: 0', 'sunrisetime', 'sunsettime', 'temperaturemax', 'temperaturemin']
df = df.drop(columns=unused_columns)

In [None]:
# Tidy up column names: strip stray underscores, drop the merge suffixes
# ('_x'), and shorten the temperature column names.
column_renames = {
    'date__of_occurrence': 'date_of_occurrence',
    '_primary_description': 'primary_description',
    '_secondary_description': 'secondary_description',
    '_location_description': 'location_description',
    'month_x': 'month',
    'day_x': 'day',
    'temperaturehigh': 'high_temp',
    'temperaturelow': 'low_temp',
}
df = df.rename(columns=column_renames)


In [None]:
# Display the frame after the drop/rename steps.
df

In [None]:
# Inspect the column dtypes of the cleaned frame.
df.dtypes


In [None]:
# Bucket each record's high temperature into 20-degree ranges and count the
# records per (temperature range, crime type).
temp_bins = [0, 20, 40, 60, 80, 100, 120]
temp_labels = ["0 to 20", "20 to 40", "40 to 60", "60 to 80", "80 to 100", "100 to 120"]

# pd.cut builds right-closed intervals: (0, 20], (20, 40], ...
# Values at or below 0, or above 120, fall outside every bucket and become NaN.
df['Temp Bin'] = pd.cut(df['high_temp'], temp_bins, labels=temp_labels)

# 'Temp Bin' is categorical. observed=False is passed explicitly so that every
# bucket is present in the result (with a count of 0) even when no row falls
# into it: the bar chart built from this table places one bar group per label
# and fails with a shape mismatch if a bucket is missing. Newer pandas releases
# warn about relying on the default and plan to change it.
# count() tallies the rows whose 'date' is non-null.
grouped_by_temp_bins = (
    df.groupby(['Temp Bin', 'primary_description'], observed=False)['date'].count()
)

# The original cell computed the identical series a second time under this
# trailing-underscore name; it is kept (as a copy) in case another cell reads it.
grouped_by_temp_bins_ = grouped_by_temp_bins.copy()

# Long-form table with readable column names (reset_index already returns a DataFrame).
grouped_by_temp_bins_df = grouped_by_temp_bins.reset_index()
grouped_by_temp_bins_df.columns = ['Temp Bin', 'Crime Type', 'Number of Crimes']

# Wide form for plotting: one row per temperature range, one column per crime type.
grouped_by_temp_bins_df2 = grouped_by_temp_bins_df.pivot(
    index='Temp Bin', columns='Crime Type', values='Number of Crimes'
)

# Preview Dataframe
grouped_by_temp_bins_df2

In [None]:
# Grouped bar chart: for every temperature range, one bar per selected crime type.
fig = plt.figure(figsize=(10,5))
ax = fig.add_subplot(111)
xloc = np.arange(6)  # one group position per temperature range
width = 0.1          # width of a single bar

# (offset in bar-widths from the group centre, pivot column, legend text)
plotted_crimes = [
    (-2, 'ASSAULT', 'Assault'),
    (-1, 'BATTERY', 'Battery'),
    (0, 'BURGLARY', 'Burglary'),
    (1, 'CRIMINAL DAMAGE', 'Criminal Damage'),
    (2, 'DECEPTIVE PRACTICE', 'Deceptive Practice'),
]
for slot, column, legend_text in plotted_crimes:
    ax.bar(xloc + slot * width, grouped_by_temp_bins_df2[column], width, label=legend_text)

plt.xticks(xloc, temp_labels)
plt.xlabel('Temperature (Fahrenheit)', fontweight='bold')
plt.ylabel('Total Number of Crimes', fontweight='bold')
plt.title('Number of Crimes Committed in Each Temperature Range', fontweight='bold', fontsize=14)
ax.legend()

# Write the chart to disk before showing it.
plt.savefig('Number of Crimes Committed in Each Temperature Range.png', bbox_inches='tight', dpi=199)
plt.show()

In [None]:
# Number of distinct dates on which each high temperature was recorded.
# df is counted row-by-row as crimes elsewhere in this notebook, so count()
# here would return the number of crime records rather than the number of days
# (and the crimes-per-day ratio computed from it would always be about 1).
# nunique() counts each date once.
# NOTE(review): assumes 'date' is day-granular — confirm.
num_days_in_temp_bins = df.groupby(['high_temp'])['date'].nunique()

# Bar chart of how many days had each high temperature.
plt.figure(figsize=(10,5))
plt.title('Histogram of Temperature Distribution', fontweight = 'bold', fontsize = 14)
# Unit label matches the per-range chart, which labels the same 'high_temp'
# column in Fahrenheit (the original label here said Celsius).
plt.xlabel('Temperature (Fahrenheit)', fontweight = 'bold')
plt.ylabel('Number of Days', fontweight = 'bold')
plt.bar(num_days_in_temp_bins.index, num_days_in_temp_bins)

# Save figure and show it
plt.savefig('Histogram of Temperature Distribution.png', bbox_inches = 'tight', dpi = 199)
plt.show()

In [None]:
# Crime records (rows with a non-null primary_description) at each distinct
# high-temperature value.
num_crimes_in_temp_bins = df.groupby(['high_temp'])['primary_description'].count()

# One table per high temperature: crime count, the num_days_in_temp_bins
# series from the temperature-histogram cell, and their ratio rounded to a
# whole number.
crimes_per_day = (num_crimes_in_temp_bins / num_days_in_temp_bins).round()
num_crimes_in_temp_bins_df = pd.DataFrame({
    'Number of Crimes': num_crimes_in_temp_bins,
    'Number of Days': num_days_in_temp_bins,
    'Average Number of Crimes Committed': crimes_per_day,
})

# Show the first 50 temperature values.
num_crimes_in_temp_bins_df.head(50)

In [None]:
# Rank crime types by number of records, most frequent first.
crime_count = (
    df.groupby('primary_description')
      .size()
      .sort_values(ascending=False)
      .rename('counts')
      .reset_index()
)

crime_count.head(31)

## X / y Split

In [None]:
# Feature matrix and target.
# The target column 'primary_description' must NOT be part of X: the original
# selection included it, which hands the label to the model as a feature
# (label leakage) and makes every reported score meaningless.
# NOTE(review): several of these columns ('block', 'date_of_occurrence',
# 'date', 'location_description') look like text/date fields; they presumably
# need encoding before a RandomForestClassifier can be fitted — confirm dtypes.
feature_columns = ['month', 'day', 'block', 'date_of_occurrence', 'precipintensity', 'date',
                   'arrest', 'location_description']
X = df[feature_columns]

y = df['primary_description']

X.shape, y.shape

In [None]:
# Class distribution of the target (records per crime type).
y.value_counts()

In [None]:
# Hold out 25% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


In [None]:
# Export the cleaned frame for the presentation.
# index=False keeps the row index out of the file; writing it is what produces
# a nameless first column that pandas reads back as 'Unnamed: 0' (the very
# column this notebook has to drop right after loading its input).
df.to_csv('presentation.csv', index=False)

In [None]:
# Number of crime records for each distinct value of crime['Date'].
# NOTE(review): if 'Date' carries a time of day, this groups per timestamp
# rather than per calendar day — confirm the column's format.
total_crimes = crime.groupby(['Date']).size()

# Mean of every numeric weather column per date. numeric_only=True is required
# on pandas 2.x, where averaging a text column raises TypeError instead of
# silently skipping it; on older versions the result is unchanged.
avg_weather = weather.groupby(['date']).mean(numeric_only=True)
print(avg_weather)
print(total_crimes)

In [None]:
# Inspect the column dtypes of the raw weather extract.
weather.dtypes

In [None]:
# Put the per-date crime totals next to the per-date weather averages,
# aligned on the index, and keep only dates present in both.
# NOTE(review): this only lines up if crime['Date'] and weather['date'] use
# the same date representation — confirm.
frames = [total_crimes, avg_weather]
result = pd.concat(frames, axis=1, sort=True)
result = result.dropna()

# NOTE(review): assigning three names requires avg_weather to have exactly two
# columns; also, the first column holds total_crimes (all crime types), so the
# name 'BATTERY' may be misleading — confirm.
result.columns = ['BATTERY','apparentTemperatureMax', 'precipAccumulation']

# The plot kind must be lowercase ('Scatter' is rejected by pandas), and x / y
# must name columns that exist in `result` (the original used 'Temp (C)',
# 'Total Precipitation (mm)' and 'Battery', none of which were ever assigned).
result.plot(kind='scatter', x='apparentTemperatureMax', y='BATTERY')
result.plot(kind='scatter', x='precipAccumulation', y='BATTERY', color='r')
result.plot()

In [None]:
# TODO: keep working on the model — the modelling cells below are still in progress.

In [None]:
# Balance the training data by upsampling every minority class.
#
# The labels in 'primary_description' are crime-type strings ('ASSAULT',
# 'BATTERY', ...), so the original binary filters (== 0 / == 1) selected
# nothing. This version generalises the same idea to many classes:
#   category_1        - rows of the most frequent class (the size reference)
#   category_0        - rows of all other classes
#   category_0_sample - each of those classes resampled, with replacement,
#                       to len(category_1) rows
# The variable names are kept because the next cell concatenates
# category_1 and category_0_sample.
# NOTE(review): with many classes this multiplies the training-set size by
# roughly the number of classes — confirm that is acceptable.

# assign() (rather than concat) guarantees a single label column even if
# X_train already carries a column called 'primary_description'.
combined = X_train.assign(primary_description=y_train)

majority_label = combined['primary_description'].value_counts().idxmax()
is_majority = combined['primary_description'] == majority_label

category_0 = combined[~is_majority]
category_1 = combined[is_majority]

# random_state makes the resampling repeatable across runs.
upsampled_groups = [
    resample(class_rows, replace=True, n_samples=len(category_1), random_state=42)
    for _, class_rows in category_0.groupby('primary_description')
]
# pd.concat rejects an empty list, so fall back to the (empty) minority frame
# when there is only one class.
category_0_sample = pd.concat(upsampled_groups, axis=0) if upsampled_groups else category_0

In [None]:
# Stack the category_1 rows on top of the resampled category_0 rows to form the balanced training set.
data_upsampled = pd.concat([category_1, category_0_sample], axis=0)


In [None]:
# Split the balanced frame back into features and target.
# The target of this notebook is 'primary_description'; the original code
# referenced 'tip_yes', a column that does not exist in this data (KeyError).
X_train = data_upsampled.drop(columns='primary_description')
y_train = data_upsampled['primary_description']

## Train the model

In [None]:
# First model: random forest with limited depth and a minimum split size;
# random_state pins the forest's randomness.
forest_settings = {
    'max_depth': 25,
    'min_samples_split': 10,
    'random_state': 0,
}
clf = RandomForestClassifier(**forest_settings)
clf.fit(X_train, y_train)

# Accuracy on the training data (displayed as the cell output).
clf.score(X_train, y_train)

In [None]:
# Accuracy of the fitted model on the held-out test split.
clf.score(X_test, y_test)


In [None]:
# Per-class precision / recall / F1 on the test split.
test_predictions = clf.predict(X_test)
print(metrics.classification_report(y_test, test_predictions))

In [None]:
# Persist the trained model so it can be reloaded later.
# The with-block closes the file even if pickling fails; the original left
# the handle returned by open() unclosed.
with open('first_model.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [None]:
# Reload the saved model.
# NOTE: pickle.load can execute arbitrary code — only load files you created
# yourself or otherwise trust.
# The with-block closes the file handle, which the original left open.
with open('first_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

## Feature Importance

In [None]:
# Permutation importance of each feature on the test split: how much the
# model's score drops when that feature's values are shuffled.
# random_state fixes the shuffles so the numbers (and the chart built from
# them) are the same on every run.
perm_importance = permutation_importance(clf, X_test, y_test, random_state=0)

In [None]:
sorted_idx = perm_importance.importances_mean.argsort()
plt.barh(X_test.columns[sorted_idx], perm_importance.importances_mean[sorted_idx])
plt.xlabel("Permutation Importance")

In [None]:
plt.savefig('model_1_feature_important.jpg')


In [None]:
# Bar chart of the forest's built-in feature importances, one bar per training column.
builtin_importances = clf.feature_importances_
plt.bar(x=X_train.columns, height=builtin_importances)
plt.xticks(rotation=90)  # vertical labels so the column names stay readable
plt.show()

### Second Model

In [None]:
# Flag rows whose 'tip_percent' is above 15 with 1, all others with 0.
# NOTE(review): `taxi_data` is not defined in any cell visible in this notebook
# (the data here is the crime/weather frame `df`); this cell looks carried over
# from a different analysis — confirm or remove before running.
taxi_data['higher_tip'] = np.where(taxi_data['tip_percent'] > 15, 1, 0)


In [None]:
# Count how many rows fall into each value of the 'higher_tip' flag.
# NOTE(review): depends on `taxi_data`, which no visible cell defines — confirm.
taxi_data['higher_tip'].value_counts()


In [None]:
# looking at the feature importance, it seems that weather features have no use in the model and so I will
# remove them for building next model

sorted_idx = perm_importance.importances_mean.argsort()
plt.barh(X_test.columns[sorted_idx], perm_importance.importances_mean[sorted_idx])
plt.xlabel("Permutation Importance")

In [None]:
plt.savefig('model_2_feature_important.jpg')


In [None]:
# Alternative view: the forest's built-in feature importances
# (not needed in the presentation).
builtin_importances = clf.feature_importances_
plt.bar(x=X_train.columns, height=builtin_importances)
plt.xticks(rotation=90)  # vertical labels so the column names stay readable
plt.show()

### Third Model

In [None]:
# Third model: refit a random forest on a reduced set of four columns and
# print train and test accuracy.
# NOTE(review): of these four columns only 'month' is among the columns
# selected into X at the X/y split; 'hour', 'day_of_week' and 'passenger_count'
# are not created in any visible cell, so this selection presumably raises
# KeyError — confirm the intended feature list.
# NOTE(review): this cell overwrites X_train / X_test in place, so it cannot be
# re-run without re-running the split first.
X_train = X_train[['hour', 'day_of_week', 'month', 'passenger_count']]
X_test = X_test[['hour', 'day_of_week', 'month', 'passenger_count']]


# verbose=20 makes scikit-learn print progress messages while fitting and predicting.
clf = RandomForestClassifier(max_depth=20, 
                             min_samples_leaf= 5,
                             min_samples_split = 5,
                             verbose=20,
                             random_state=0)
clf.fit(X_train, y_train)

# Accuracy on the training data, then on the held-out test split.
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

In [None]:
# Persist the third model. The with-block closes the file even if pickling
# fails; the original left the handle returned by open() unclosed.
with open('third_model.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)


In [None]:
# Per-class precision / recall / F1 of the current model on the test split.
test_predictions = clf.predict(X_test)
print(metrics.classification_report(y_test, test_predictions))


In [None]:
sorted_idx = perm_importance.importances_mean.argsort()
plt.barh(X_test.columns[sorted_idx], perm_importance.importances_mean[sorted_idx])
plt.xlabel("Permutation Importance")

In [None]:
plt.savefig('model_3_feature_important.jpg')
