# Import libraries and data

In [None]:
# Import basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Import sklearn libraries
from sklearn.preprocessing import LabelEncoder

# Import warnings
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Fetch data: extract archive.zip into the current working directory.
# unzip's -o flag overwrites existing files without prompting, so the cell can be re-run.
!unzip -o archive.zip

In [None]:
# Load the crime records CSV into a DataFrame
# (presumably this file comes from archive.zip — TODO confirm)
crimes = pd.read_csv('Montreal Crime Data.csv')
# Show the loaded DataFrame as the cell output
crimes

## Dataset Overview
|    category   | Type of crime committed            |
|:-------------:|------------------------------------|
|      date     | Date crime occurred                |
|  postal_code  | Postal Code where event occurred   |
|      city     | City where crime occurred          |
| neighbourhood | Neighbourhood where crime occurred |
|      year     | Year it occurred                   |
|     count     | Count                              |
|   longitude   | Longitude                          |
|    latitude   | Latitude                           |

# Data Preprocessing

In [None]:
# Count the missing (null) values in each column
crimes.isnull().sum()

In [None]:
# Count the NaN values in each column
# NOTE(review): pandas documents isnull() as an alias of isna(), so this is expected to
# report the same counts as an isnull() check — confirm whether this cell is still needed
crimes.isna().sum()

## Check the distribution of each feature

### Category

In [None]:
# Check the distribution of 'category'
# The string labels are label-encoded to integer codes so they can be plotted.
categories = crimes['category']
le = LabelEncoder()
categories_le = le.fit_transform(categories)
plt.title('Distribution of Categories')
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept here
# so the plot output does not change — consider histplot/countplot.
sns.distplot(categories_le)
# Print the code -> label legend for every class the encoder actually found,
# instead of assuming a fixed number of classes (a hard-coded count either
# raises on unseen codes or silently omits classes when the data changes).
labels = range(len(le.classes_))
for label in labels:
    print(label, " : ", le.inverse_transform([label]))

In [None]:
# The plot shows that 'Offenses resulting in death' has a much lower density
# than every other category, so the rows of that category are removed.
drop_category_rows = crimes.loc[crimes['category'] == 'Offenses resulting in death'].index
crimes.drop(index=drop_category_rows, inplace=True)
crimes

In [None]:
# Re-check the distribution of 'category' on the current contents of `crimes`
# A fresh encoder is fitted so the codes reflect the categories present now.
categories = crimes['category']
le = LabelEncoder()
categories_le = le.fit_transform(categories)
plt.title('Distribution of Categories')
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept here
# so the plot output does not change — consider histplot/countplot.
sns.distplot(categories_le)
# Print the code -> label legend for every class the encoder actually found,
# instead of assuming a fixed number of classes.
labels = range(len(le.classes_))
for label in labels:
    print(label, " : ", le.inverse_transform([label]))

### City

In [None]:
# Check the distribution of 'city'
# The string labels are label-encoded to integer codes so they can be plotted.
cities = crimes['city']
le = LabelEncoder()
cities_le = le.fit_transform(cities)
plt.title('Distribution of Cities')
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept here
# so the plot output does not change — consider histplot/countplot.
sns.distplot(cities_le)
# Print the code -> label legend for every city the encoder actually found,
# instead of assuming a fixed number of cities.
labels = range(len(le.classes_))
for label in labels:
    print(label, " : ", le.inverse_transform([label]))

In [None]:
# Count the distinct values of 'city' and report them as a share of all rows
len_city_unique = len(crimes['city'].unique())
print('The number of unique values: ', len_city_unique)
print(
    'The proportion of unique values: ',
    round(len_city_unique / crimes.shape[0] * 100, 3),
    '%',
)

In [None]:
# Keep only the rows whose city is 'MONTREAL':
# every row with any other city value is removed from the dataset.
drop_city_rows = crimes.loc[crimes['city'] != 'MONTREAL'].index
crimes.drop(index=drop_city_rows, inplace=True)
crimes

### Neighbourhood

In [None]:
# Check the distribution of 'neighbourhood'
# The string labels are label-encoded to integer codes so they can be plotted.
neighbour = crimes['neighbourhood']
le = LabelEncoder()
neighbourhood_le = le.fit_transform(neighbour)
plt.title('Distribution of Neighbourhood')
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept here
# so the plot output does not change — consider histplot/countplot.
sns.distplot(neighbourhood_le)
# Print the code -> label legend for every neighbourhood the encoder actually
# found, instead of assuming a fixed number of neighbourhoods.
labels = range(len(le.classes_))
for label in labels:
    print(label, " : ", le.inverse_transform([label]))

## Data Cleansing

In [None]:
# The share of unique 'city' values is close to 0%, so that column is dropped
# together with the other columns that are not needed for the analysis.
crimes.drop(columns=['Unnamed: 0', 'postal_code', 'count', 'city'], inplace=True)
# Renumber the rows from 0 without keeping the old index as a column
crimes.reset_index(inplace=True, drop=True)
crimes

In [None]:
# Parse 'date' into datetimes and derive the 'month' and 'day' columns from it
# (only month and day are extracted here; no 'year' column is created in this cell)
crimes['date'] = pd.to_datetime(crimes['date'])
crimes['month'] = crimes['date'].dt.month
crimes['day'] = crimes['date'].dt.day
crimes

In [None]:
# Reorder the columns into a custom order; columns not listed here
# (such as 'date') are left out of the resulting DataFrame.
crimes = crimes.loc[:, ['year', 'month', 'day', 'latitude', 'longitude', 'neighbourhood', 'category']]
crimes

In [None]:
# Order the rows chronologically (year, then month, then day) and renumber them from 0
crimes = (
    crimes
    .sort_values(by=['year', 'month', 'day'], ascending=True)
    .reset_index(drop=True)
)
crimes

# Exploratory Data Analysis

In [None]:
# Work on a copy of 'crimes' so that label encoding does not modify the original DataFrame
crimes_copy = crimes.copy()
crimes_copy

In [None]:
# Label-encode 'neighbourhood' and 'category' in place on the copy.
# Each column gets its own encoder so its codes can be mapped back to labels later.
neighbourhood_copy = LabelEncoder()
category_copy = LabelEncoder()
for column, encoder in {'neighbourhood': neighbourhood_copy, 'category': category_copy}.items():
    crimes_copy[column] = encoder.fit_transform(crimes_copy[column])
crimes_copy

In [None]:
# Compute the pairwise correlation matrix of the columns of the encoded DataFrame
crimes_corr = crimes_copy.corr()
crimes_corr

In [None]:
# Draw the correlation matrix as an annotated heatmap and title the axes it was drawn on
heatmap_axes = sns.heatmap(crimes_corr, annot=True)
heatmap_axes.set_title('Correlation Heatmap of Montreal Crimes')

## Group by year
Summary
1. The highest-ranking neighbourhood and category were Notre-Dame-de-Grâce and Mischief, respectively.
2. June was the highest month of the year, and mid-month (the 15th) was the highest day of the month.

In [None]:
# Group by year and summarise every other column per year.
# Discrete columns (month, day and the label-encoded neighbourhood/category)
# are summarised by their most frequent value: label codes are nominal, so
# their mean is not a meaningful class and cannot identify the most common
# neighbourhood or category. The coordinates are still averaged.
def most_frequent(values):
    """Return the most frequent value of a Series (the smallest one on ties)."""
    return values.mode().iloc[0]


mean_columns = ('latitude', 'longitude')
crimes_year = crimes_copy.groupby(by='year').agg({
    column: 'mean' if column in mean_columns else most_frequent
    for column in crimes_copy.columns
    if column != 'year'
})
crimes_year

In [None]:
# inverse_transform needs integer codes, so round each year's code value
# to a whole number and cast it to int
neighbourhood_year_int = crimes_year['neighbourhood'].round(0).astype(int)
category_year_int = crimes_year['category'].round(0).astype(int)

In [None]:
# Map the integer codes back to their original string labels,
# using the encoders that were fitted on crimes_copy
crimes_year['neighbourhood'] = neighbourhood_copy.inverse_transform(neighbourhood_year_int)
crimes_year['category'] = category_copy.inverse_transform(category_year_int)
crimes_year

## Group by month
Summary
1. The highest-ranking neighbourhood and category were Notre-Dame-de-Grâce and Mischief, respectively.
2. 2017 was the highest year of the five-year period (2017–2021), and mid-month (the 15th) was the highest day of the month.

In [None]:
# Group by month and summarise every other column per month.
# Discrete columns (year, day and the label-encoded neighbourhood/category)
# are summarised by their most frequent value: label codes are nominal, so
# their mean is not a meaningful class and cannot identify the most common
# neighbourhood or category. The coordinates are still averaged.
def most_frequent(values):
    """Return the most frequent value of a Series (the smallest one on ties)."""
    return values.mode().iloc[0]


mean_columns = ('latitude', 'longitude')
crimes_month = crimes_copy.groupby(by='month').agg({
    column: 'mean' if column in mean_columns else most_frequent
    for column in crimes_copy.columns
    if column != 'month'
})
crimes_month

In [None]:
# inverse_transform needs integer codes, so round each month's code value
# to a whole number and cast it to int
neighbourhood_month_int = crimes_month['neighbourhood'].round(0).astype(int)
category_month_int = crimes_month['category'].round(0).astype(int)

In [None]:
# Map the integer codes back to their original string labels,
# using the encoders that were fitted on crimes_copy
crimes_month['neighbourhood'] = neighbourhood_copy.inverse_transform(neighbourhood_month_int)
crimes_month['category'] = category_copy.inverse_transform(category_month_int)
crimes_month

## Group by day
Summary
1. The highest-ranking neighbourhood and category were Notre-Dame-de-Grâce and Mischief, respectively.
2. 2017 was the highest year of the five-year period (2017–2021), and June was the highest month of the year.

In [None]:
# Group by day of the month and summarise every other column per day.
# Discrete columns (year, month and the label-encoded neighbourhood/category)
# are summarised by their most frequent value: label codes are nominal, so
# their mean is not a meaningful class and cannot identify the most common
# neighbourhood or category. The coordinates are still averaged.
def most_frequent(values):
    """Return the most frequent value of a Series (the smallest one on ties)."""
    return values.mode().iloc[0]


mean_columns = ('latitude', 'longitude')
crimes_day = crimes_copy.groupby(by='day').agg({
    column: 'mean' if column in mean_columns else most_frequent
    for column in crimes_copy.columns
    if column != 'day'
})
crimes_day

In [None]:
# inverse_transform needs integer codes, so round each day's code value
# to a whole number and cast it to int
neighbourhood_day_int = crimes_day['neighbourhood'].round(0).astype(int)
category_day_int = crimes_day['category'].round(0).astype(int)

In [None]:
# Map the integer codes back to their original string labels,
# using the encoders that were fitted on crimes_copy
crimes_day['neighbourhood'] = neighbourhood_copy.inverse_transform(neighbourhood_day_int)
crimes_day['category'] = category_copy.inverse_transform(category_day_int)
crimes_day

# Conclusion
1. By grouping the data by year, month, and day, we found that Notre-Dame-de-Grâce was the most dangerous region in Montreal.
2. Also, mischief was more frequent than any other category.
3. June 2017 was the most dangerous period in Montreal.