In [None]:
import calendar
import math
import os
from datetime import datetime
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import scipy.stats as stats
import seaborn as sns

In [None]:
# PULLING AND CLEANING CRIME DATA

In [None]:
# Load the crime records from the local CSV export into a DataFrame.
crime_df = pd.read_csv('Crimes_-_2021.csv')

In [None]:
# Load the weather export, using the first CSV column as the row index.
# low_memory=False asks pandas not to parse the file in chunks, which avoids mixed-dtype columns.
# NOTE(review): the file name contains a space before '.csv' — confirm it matches the file on disk.
weather_df = pd.read_csv('weather_api .csv',low_memory=False,index_col=0)
# Display the freshly loaded frame.
weather_df

In [None]:
# Parse the index labels into datetimes so the weather frame is keyed by timestamps.
weather_df.index = pd.to_datetime(weather_df.index)
weather_df

In [None]:
# Use seaborn's 'darkgrid' theme for the plots that follow.
sns.set_style('darkgrid')
weather_df

In [None]:
# Summary statistics of the weather frame.
weather_df.describe()

In [None]:
# Column dtypes, non-null counts and memory usage of the weather frame.
weather_df.info()

In [None]:
# List the weather column labels as loaded from the CSV.
print("The names of the features :\n", list(weather_df.columns))

In [None]:
# Normalise every weather column label to lower case so the drop list below
# can be written without worrying about the CSV's capitalisation.
cols_1 = [label.lower() for label in weather_df.columns]
weather_df.columns = cols_1

print(weather_df.columns)

In [None]:
# Discard the weather fields that are not used in the analysis.
weather_df = weather_df.drop(
    columns=[
        'temperaturemaxtime',
        'windgust',
        'apparenttemperaturehigh',
        'apparenttemperaturelow',
        'apparenttemperaturemin',
        'apparenttemperaturemax',
        'time',
        'preciptype',
        'apparenttemperaturehightime',
        'humidity',
        'precipprobability',
        'pressure',
        'windspeed',
        'windgusttime',
        'windbearing',
        'cloudcover',
        'uvindex',
        'uvindextime',
        'visibility',
        'ozone',
        'precipaccumulation',
        'precipintensitymax',
        'precipintensitymaxtime',
        'apparenttemperaturemaxtime',
        'apparenttemperaturemintime',
        'temperaturemintime',
        'apparenttemperaturelowtime',
        'apparenttemperaturehightime',
        'temperaturelowtime',
        'temperaturehightime',
    ]
)


In [None]:
# Show the weather frame after the unused columns were dropped.
weather_df

In [None]:
# Switch the remaining weather column labels to upper case.
cols_1 = [label.upper() for label in weather_df.columns]
weather_df.columns = cols_1

print(weather_df.columns)

In [None]:
# Make sure the weather frame is keyed by timestamps.
weather_df.index = pd.to_datetime(weather_df.index)

# The column labels were upper-cased in the previous cell, so addressing the precipitation
# column as 'PrecipitationIn' raises KeyError. Match the label case-insensitively instead;
# when the loaded export has no such column there is simply nothing to convert.
for precip_col in [c for c in weather_df.columns if str(c).lower() == 'precipitationin']:
    # 'T' entries (presumably a trace-amount marker — confirm) become NaN, then the whole
    # column is stored as floats. astype(float) keeps NaN, so no dropna() round trip is needed.
    weather_df[precip_col] = weather_df[precip_col].replace('T', np.nan).astype(float)

weather_df.tail()

In [None]:
# # Store lists into a dataframe
# crime_df = pd.DataFrame({'Date': date_of_crime,
#                          'Day_of_week': day_of_week_crime,
#                          'Type': crime_type_list})

# crime_df.head()

In [None]:
# Summary statistics of the crime frame.
crime_df.describe()


In [None]:
# (rows, columns) of the crime frame.
crime_df.shape

In [None]:
# Column dtypes, non-null counts and memory usage of the crime frame.
crime_df.info()

In [None]:
# Use seaborn's 'darkgrid' theme for the crime plots below.
sns.set_style('darkgrid')


In [None]:
# Preview the first rows of the crime frame.
crime_df.head()

In [None]:
# List the crime column labels as loaded from the CSV.
print("The names of the features :\n", list(crime_df.columns))

In [None]:
# Number of distinct crimes in Chicago.
# The records are loaded from 'Crimes_-_2021.csv', so the message reports 2021 (it previously said 2018).
crimes = crime_df['Primary Type'].unique()
print("The Number of distinct crimes in Chicago in the year 2021:", len(crimes))
print()
print("The Distinct Crimes are :\n", crimes)

In [None]:
# DEALING WITH MISSING VALUES
# Sum the per-column null counts to get the number of missing cells in the whole frame.
print("Number of Missing Values in the whole dataset : ", crime_df.isna().sum().sum())


In [None]:
# Null count for each crime column.
crime_df.isna().sum()

In [None]:
# Count the null entries of every feature (one number per column, in column order).
missing_values = list(crime_df.isna().sum())

col_final = []
# Features without any missing value are all relabelled "Others"; because dict keys are
# unique they collapse into a single zero-count entry in the mapping below.
cols = [
    "Others" if n_missing == 0 else feature
    for feature, n_missing in zip(crime_df.columns, missing_values)
]
d = dict(zip(cols, missing_values))  # dictionary: feature label -> number of missing values

print("Number of Missing Values per feature >>")
# One-row frame built from the dictionary, indexed by a descriptive label.
missing_vals = pd.DataFrame(d, index=["Missing Values"])
missing_vals.head()

In [None]:
# Bar chart of the missing-value count for each feature label in `d`.
x, y = list(d.keys()), list(d.values())
missing_ax = sns.barplot(x=x, y=y, palette="GnBu_d")
# Rotate the feature names so long labels do not overlap.
plt.setp(missing_ax.get_xticklabels(), rotation=90)
missing_ax.set_title("Missing Values in the Dataset", fontdict={'fontsize': 20})
missing_ax.set_ylabel("Count of missing values", fontdict={'fontsize': 15})
plt.show()

In [None]:
# The simplest cleaning technique here is to drop every row with at least one missing value.
# Note that this rebinds crime_df: the unfiltered frame is no longer available after this cell.
crime_df = crime_df.dropna()
crime_df.info()

In [None]:
# MOST COMMON CRIMES IN CHICAGO

In [None]:
# Set the style of the plot first.
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and later releases
# dropped the old names, so use whichever spelling this installation provides.
for style_name in ('seaborn-v0_8', 'seaborn'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

# Filter out the Top 5 crimes
top_5_crimes = crime_df['Primary Type'].value_counts().sort_values(ascending=False).head()

# Count records per crime type, keep the five largest, then order them ascending for the plot.
temp = crime_df.groupby('Primary Type', as_index=False).agg({"ID": "count"})
temp = temp.sort_values(by=['ID'], ascending=False).head()
temp = temp.sort_values(by='ID', ascending=True)
sns.barplot(x='ID', y='Primary Type', data=temp, palette="Blues_d")

# Work on the aesthetic appeal of the plot
plt.title("Top 5 Crimes in Chicago", fontdict = {'fontsize': 30, 'fontname':'Arial', 'color': '#000000'})
plt.xlabel("\nCOUNT OF CRIMES", fontdict = {'fontsize': 15})
plt.ylabel("")
plt.xticks(rotation=90)
plt.show()

In [None]:
# Reshape the top-5 table for the bokeh chart: friendlier column names, a clean 0-4 index
# and a 1-5 position column that is used as each bar's y co-ordinate.
temp.columns = ['Crime', 'Number']
temp.index = list(range(5))
temp['co-ordinates'] = list(range(1, 6))
temp.head()

In [None]:
# Parse the timestamp once, then derive the calendar features with the vectorised `.dt`
# accessor instead of calling a Python lambda for every row.
crime_df['Datetime'] = pd.to_datetime(crime_df['Date'],format="%m/%d/%Y %I:%M:%S %p")
crime_df['Date'] = crime_df['Datetime'].dt.date        # plain datetime.date objects
crime_df['Weekday'] = crime_df['Datetime'].dt.weekday  # Monday=0 ... Sunday=6
crime_df['Hour'] = crime_df['Datetime'].dt.hour
crime_df['Day'] = crime_df['Datetime'].dt.day
# ISO week number; isocalendar() returns a nullable UInt32 column, so cast to a plain integer.
crime_df['Week'] = crime_df['Datetime'].dt.isocalendar().week.astype('int64')
crime_df['Month'] = crime_df['Datetime'].dt.month

crime_df.head()

In [None]:
from collections import Counter

# Number of records per primary crime type, as a plain dict.
dict(Counter(crime_df['Primary Type']))

In [None]:
# Primary-type labels grouped into two categories, used below to build the
# combined "personal" and "property" daily counts.
# NOTE(review): confirm these labels match the values printed by the distinct-crimes cell.
personal_crimes = ['ASSAULT','BATTERY','CRIM SEXUAL ASSAULT','HOMICIDE']
property_crimes = ['ARSON','BURGLARY','MOTOR VEHICLE THEFT','ROBBERY','THEFT']

In [None]:
def _daily_counts(primary_types):
    """Count crime records per calendar date for one primary type or a list of them.

    Returns a Series of record counts whose index has been converted to datetimes.
    """
    wanted = [primary_types] if isinstance(primary_types, str) else primary_types
    selected = crime_df[crime_df['Primary Type'].isin(wanted)]
    per_day = selected.groupby('Date')['ID'].agg(len)
    per_day.index = pd.to_datetime(per_day.index)
    return per_day


# One daily series per individual crime type ...
arson_gb = _daily_counts('ARSON')
assault_gb = _daily_counts('ASSAULT')
battery_gb = _daily_counts('BATTERY')
burglary_gb = _daily_counts('BURGLARY')
homicide_gb = _daily_counts('HOMICIDE')
sexual_assault_gb = _daily_counts('CRIM SEXUAL ASSAULT')
robbery_gb = _daily_counts('ROBBERY')
theft_gb = _daily_counts('THEFT')
vehicle_theft_gb = _daily_counts('MOTOR VEHICLE THEFT')

# ... and one per crime category.
personal_gb = _daily_counts(personal_crimes)
property_gb = _daily_counts(property_crimes)

In [None]:
# ts = pd.DataFrame({'Arson':arson_gb.loc[:'2021-12-31'],
#                    'Assault':assault_gb.loc[:'2021-12-31'],
#                    'Battery':battery_gb.loc[:'2021-12-31'],
#                    'Burglary':burglary_gb.loc[:'2021-12-31'],
#                    'Homicide':homicide_gb.loc[:'2021-12-31'],
#                    'Sexual_assault':sexual_assault_gb.loc[:'2021-12-31'],
#                    'Robbery':robbery_gb.loc[:'2021-12-31'],
#                    'Vehicle_theft':vehicle_theft_gb.loc[:'2021-12-31'],
#                    'Theft':theft_gb.loc[:'2021-12-31'],
#                    'Personal':personal_gb.loc[:'2021-12-31'],
#                    'Property':property_gb.loc[:'2021-12-31'],
#                    'Temperature':weather_df['Mean TemperatureF'].loc[:'2021-12-31'],
#                    'Binned temperature':weather_df['Mean TemperatureF'].loc[:'2021-12-31']//10.*10,
#                    'Humidity':weather_df[' Mean Humidity'].loc[:'2021-12-31'],
#                    'Precipitation':weather_df['PrecipitationIn'].loc[:'2021-12-31']
#                    })
# ts

In [None]:
from bokeh.plotting import figure
from bokeh.io import show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool

temp_cds = ColumnDataSource(temp)

# Keep the original 0-70000 axis, but widen it when the largest bar would not fit.
x_max = max(70000, int(temp['Number'].max() * 1.05))

# `width`/`height` are used instead of `plot_width`/`plot_height`, which Bokeh 3 rejects.
fig1 = figure(width=700, height=400, title="Most occuring Criminal Offences in Chicago",
              x_axis_label="Count of Crimes", y_axis_label="Crime Type", x_range=(0, x_max))

fig1.title.align = "left"
fig1.title.text_color = "black"
fig1.title.text_font_size = "20px"

fig1.hbar(y='co-ordinates', right='Number', source=temp_cds, left=0, color='red', alpha=0.5, height=0.50)

# Label each bar from the data itself (position -> crime name) rather than from a hard-coded
# list, so the labels stay correct whichever crimes make the top five.
fig1.yaxis.major_label_overrides = {
    int(position): str(crime).title()
    for position, crime in zip(temp['co-ordinates'], temp['Crime'])
}

# Adding passive interactivity
tooltips = [
    ('Number of Crimes', '@Number'),
]

#fig1.legend.location
fig1.add_tools(HoverTool(tooltips=tooltips))

output_notebook()
show(fig1)

In [None]:
# Testing out the time and date conversion for one entry

# t = crime_df['Date'][20]
# print(t)
# s1 = t[:11] 
# print(s1)
# s2 = t[11:]
# print(s2)

# print(s2)
# hr = s2[:2]
# mins = s2[3:5]
# sec = s2[6:8]
# time_frame = s2[9:]
# if(time_frame == 'PM'):
#     if (int(hr) != 12):
#         hr = str(int(hr) + 12)
# else:
#     if(int(hr) == 12):
#         hr = '00'

# print(hr, mins, sec)

In [None]:
# Time Conversion Function
def time_convert(date_time):
    """Parse an ``MM/DD/YYYY HH:MM:SS AM|PM`` string into a ``datetime``.

    The format is the same one the notebook passes to ``pd.to_datetime`` for the crime
    ``Date`` column. Surrounding whitespace is ignored and the AM/PM marker is matched
    case-insensitively; the previous hand-rolled slicing treated anything that was not
    exactly ``'PM'`` (for example ``'pm'`` or ``'PM '``) as AM and returned the wrong hour.

    Parameters
    ----------
    date_time : str
        Timestamp text such as ``"11/23/2021 01:05:09 PM"``.

    Returns
    -------
    datetime.datetime
        The parsed, timezone-naive timestamp on a 24-hour clock.

    Raises
    ------
    ValueError
        If the text does not match the expected format.
    """
    return datetime.strptime(date_time.strip(), "%m/%d/%Y %I:%M:%S %p")

In [None]:
# Re-display the weather frame before it is merged with the crime data.
weather_df

In [None]:
# Re-display the crime frame, now including the derived date/time columns.
crime_df

In [None]:
# Upper-case the crime column labels so they follow the same convention as the weather frame.
cols_1 = [label.upper() for label in crime_df.columns]
crime_df.columns = cols_1

print(crime_df.columns)

In [None]:
# Merge weather and crime dataframes on the calendar date.
# crime_df['DATE'] holds plain `date` objects while the weather frame is keyed by its
# datetime index, so a direct merge on 'DATE' finds no usable (or a dtype-mismatched) key.
# Bring both sides to midnight timestamps in a real 'DATE' column first.
crime_keyed = crime_df.assign(DATE=pd.to_datetime(crime_df['DATE']))
if 'DATE' in weather_df.columns:
    weather_keyed = weather_df.assign(DATE=pd.to_datetime(weather_df['DATE']).dt.normalize())
else:
    weather_keyed = weather_df.copy()
    weather_keyed.index = pd.to_datetime(weather_keyed.index).normalize()
    weather_keyed = weather_keyed.rename_axis('DATE').reset_index()

# The left join keeps every crime record; dropna() then removes rows with any missing value,
# which includes crime records that have no matching weather row.
weather_crime_df = pd.merge(crime_keyed, weather_keyed, how='left', on='DATE').dropna()
weather_crime_df.head()

In [None]:
# Histogram by Type of Crime
# `sb` was never imported (seaborn is bound to `sns`) and `crime_type_list` only exists in
# commented-out code, so read the crime types straight from the crime frame. The column is
# looked up by label so the cell works before and after the labels are upper-cased.
crime_type_col = next(c for c in ('Type', 'PRIMARY TYPE', 'Primary Type') if c in crime_df.columns)

sns.set()
plt.figure(figsize=(10,5))
# NOTE(review): the titles and file name below still say Toronto 2014-2017, although the
# crime data loaded in this notebook comes from 'Crimes_-_2021.csv' — confirm and update.
plt.title('Number of Crimes in Toronto by Type (2014-2017)', fontsize = 14, fontweight = 'bold')
plt.xlabel('Major Crime Indicator', fontweight = 'bold')
plt.ylabel('Number of Crimes (2014-2017)', fontweight = 'bold')
plt.hist(crime_df[crime_type_col])

# Save figure and show it
plt.savefig('Number of Crimes in Toronto by Type (2014-2017)')
plt.show()

In [None]:
# Prepare data for plotting number of crimes by year.
# The date and crime-type columns are looked up by label so the cell works both before and
# after the crime columns are upper-cased. The year is derived with pandas rather than
# `.str.startswith`, which only works while the dates are stored as strings.
date_col = next(c for c in ('Date', 'DATE') if c in crime_df.columns)
type_col = next(c for c in ('Type', 'PRIMARY TYPE', 'Primary Type') if c in crime_df.columns)
crime_year = pd.to_datetime(crime_df[date_col]).dt.year

# NOTE(review): the years (and the Toronto titles below) are hard-coded to 2014-2017 while
# the crime file loaded in this notebook is 'Crimes_-_2021.csv' — confirm the intended range.
crimes_2014 = crime_df[crime_year == 2014]
crimes_2015 = crime_df[crime_year == 2015]
crimes_2016 = crime_df[crime_year == 2016]
crimes_2017 = crime_df[crime_year == 2017]
# Records per crime type within each year.
num_crimes_2014 = crimes_2014.groupby(type_col)[date_col].count()
num_crimes_2015 = crimes_2015.groupby(type_col)[date_col].count()
num_crimes_2016 = crimes_2016.groupby(type_col)[date_col].count()
num_crimes_2017 = crimes_2017.groupby(type_col)[date_col].count()
crime_type_by_year_df = pd.DataFrame({'2014': num_crimes_2014,
                                     '2015': num_crimes_2015,
                                     '2016': num_crimes_2016,
                                     '2017': num_crimes_2017})

# Create Line Plot of the Number of Crimes By Year (one line per crime type)
fig = plt.figure(figsize=(10,5))
ax = fig.add_subplot(1,1,1)
plt.plot(crime_type_by_year_df.T, marker = 'o')
plt.title('Number of Crimes in Toronto by Year (2014-2017)', fontsize = 14, fontweight = 'bold')
plt.xlabel('Year', fontweight = 'bold')
plt.ylabel('Number of Crimes (2014-2017)', fontweight = 'bold')
leg = plt.legend(crime_type_by_year_df.index, loc = 'upper right')

# Get the bounding box of the original legend in axes coordinates.
# Bbox.inverse_transformed() was removed from matplotlib; transforming with the inverted
# axes transform is the documented replacement.
bb = leg.get_bbox_to_anchor().transformed(ax.transAxes.inverted())

# Shift the legend to the right so it sits outside the plotting area
xOffset = 0.25
bb.x0 += xOffset
bb.x1 += xOffset
leg.set_bbox_to_anchor(bb, transform = ax.transAxes)

# Save figure and show it
plt.savefig('Number of Crimes in Toronto by Year (2014-2017).png', dpi=199)
plt.show()
crime_type_by_year_df


In [None]:
# PULLING AND CLEANING WEATHER DATA

In [None]:
# Prefer a World Weather Online key supplied through the WWO_API_KEY environment variable.
# NOTE(review): the literal fallback is a credential committed to the notebook — rotate it and
# delete the fallback once the environment variable is configured.
apikey = os.environ.get("WWO_API_KEY", "56b161f544224f85b0e180450181711")

In [None]:
# Prepare lists for storing weather data
date_list = []
temp_list = []
moon_list = []
rain_list = []
years_of_interest = ['2014', '2015', '2016', '2017']
months_of_interest = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']

for year in years_of_interest:
    for month in months_of_interest:

        # Prepare start and end dates for query. The last day comes from the calendar so
        # that Feb 29 is included in leap years (a fixed '28' silently skipped it).
        last_day = calendar.monthrange(int(year), int(month))[1]
        start_date = f"{year}-{month}-01"
        end_date = f"{year}-{month}-{last_day:02d}"

        # Prepare url (24hr average)
        weather_url = f"http://api.worldweatheronline.com/premium/v1/past-weather.ashx?key={apikey}&q=Toronto&format=json&date={start_date}&enddate={end_date}&tp=24"
        # Request and store json data; fail loudly on a hung connection or an HTTP error
        # instead of hitting a confusing KeyError further down.
        response = requests.get(weather_url, timeout=30)
        response.raise_for_status()
        data = response.json()

        for day_record in data['data']['weather']:
            # Date
            date_list.append(day_record['date'])

            # Temperature (first and only hourly slot, since tp=24 requests a daily average)
            temp_list.append(day_record['hourly'][0]['tempC'])

            # Precipitation
            rain_list.append(day_record['hourly'][0]['precipMM'])

            # Moon_phase
            moon_list.append(day_record['astronomy'][0]['moon_phase'])

# Store lists into a dataframe
weather_df = pd.DataFrame({'Date': date_list,
                          'Temp': temp_list,
                          'Precip': rain_list,
                          'Moon Phase': moon_list})
        

In [None]:
# Convert Temp and Precip in dataframe from objects to floats.
# errors='coerce' turns unparseable entries into NaN; the previous errors='ignore' silently
# left the whole column as strings when any value failed to parse (and is deprecated in pandas).
weather_df['Temp'] = pd.to_numeric(weather_df['Temp'], errors='coerce')
weather_df['Precip'] = pd.to_numeric(weather_df['Precip'], errors='coerce')

weather_df.head()

In [None]:
# Merge weather and crime dataframes on the shared 'Date' key. The left join keeps every
# crime record; dropna() then removes rows containing any missing value, which includes
# crime records with no matching weather row.
# NOTE(review): an earlier cell upper-cases crime_df's column labels, so crime_df may no
# longer have a 'Date' column at this point — confirm the join key before relying on this.
weather_crime_df = pd.merge(crime_df,weather_df,how='left', on='Date').dropna()
weather_crime_df.head()

In [None]:
# TEMPERATURE VS CRIME

In [None]:
# Temperature bucket edges, and one "<low> to <high>" label per pair of neighbouring edges
temp_bins = [-100, -10, 0, 10, 20, 30, 100]
temp_labels = [f"{low} to {high}" for low, high in zip(temp_bins, temp_bins[1:])]

# Assign every merged record to its temperature bucket
weather_crime_df['Temp Bin'] = pd.cut(weather_crime_df['Temp'], bins=temp_bins, labels=temp_labels)

# Number of crimes per (temperature bucket, crime type) pair
grouped_by_temp_bins = weather_crime_df.groupby(['Temp Bin', 'Type'])['Date'].count()

grouped_by_temp_bins_ = weather_crime_df.groupby(['Temp Bin', 'Type'])['Date'].count()

# Flatten the grouped counts into a three-column frame with readable column names
grouped_by_temp_bins_df = pd.DataFrame(grouped_by_temp_bins.reset_index())
grouped_by_temp_bins_df.columns = ['Temp Bin', 'Crime Type', 'Number of Crimes']

# One row per temperature bucket, one column per crime type — the shape the bar plot needs
grouped_by_temp_bins_df2 = grouped_by_temp_bins_df.pivot(
    index='Temp Bin',
    columns='Crime Type',
    values='Number of Crimes',
)

# Preview Dataframe
grouped_by_temp_bins_df2

In [None]:
# Grouped bar chart: for every temperature bin, one bar per crime type
fig = plt.figure(figsize=(10,5))
ax = fig.add_subplot(111)
xloc = np.arange(6)  # one group position per temperature bin
width = 0.1  # width of a single bar

# Crime types in left-to-right order; each bar is shifted from the group centre by a whole
# number of bar widths so the five bars sit side by side.
bar_layout = zip(
    (-2, -1, 0, 1, 2),
    ('Assault', 'Break and Enter', 'Robbery', 'Auto Theft', 'Theft Over'),
)
for shift, crime_type in bar_layout:
    ax.bar(xloc + shift * width, grouped_by_temp_bins_df2[crime_type], width, label=crime_type)

ax.set_xticks(xloc)
ax.set_xticklabels(temp_labels)
ax.set_xlabel('Temperature (Celsius)', fontweight='bold')
ax.set_ylabel('Total Number of Crimes', fontweight='bold')
ax.set_title('Number of Crimes Committed in Each Temperature Range', fontweight='bold', fontsize=14)
ax.legend()

# Save figure and show it
plt.savefig('Number of Crimes Committed in Each Temperature Range.png', bbox_inches = 'tight', dpi = 199)
plt.show()