https://medium.com/@abhikjha/predicting-sensex-48f4afb900b5
https://machinelearningmastery.com/autoregression-models-time-series-forecasting-python/

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns


import matplotlib.pyplot as plt
from pandas.plotting import lag_plot



In [None]:
# Load the raw event records from the Excel workbook.
# A forward slash is used as the separator: "\E" is not a valid string escape
# (Python warns about it), and a backslash is only a path separator on Windows,
# while "/" is accepted on every platform.
df = pd.read_excel("Data/Events_2014_2019.xlsx")

### Some descriptive statistics

In [None]:
# Derive the calendar year and month of each event from its timestamp.
df['ANO'] = df['DATA FATO'].dt.year
df['MES'] = df['DATA FATO'].dt.month

# Shorten the labels with exact-match lookups; any value that is not a key of
# the mapping is left untouched.
df['FATO'] = df['FATO'].replace({'ROUBO DE VEICULO': 'Roubo', 'FURTO DE VEICULO': 'Furto'})
df['LOCAL FATO'] = df['LOCAL FATO'].replace({'PORTO ALEGRE RS': 'PORTO ALEGRE'})

# Count the events per (year, month, event type). The count ends up in the
# 'DATA FATO' column because that is the column being aggregated.
dfTemp = df[['DATA FATO','ANO','MES','FATO']].groupby(['ANO', 'MES','FATO']).agg({'DATA FATO': 'count'})

# Keep (ANO, MES) as the index so it is used as the x axis, and turn FATO back
# into a regular column that can be filtered on.
dfTemp = dfTemp.reset_index().set_index(['ANO','MES'])
display(dfTemp.head())

# One chart per event type; the title is what tells the two figures apart,
# since both plot the same 'DATA FATO' count column.
dfTemp[dfTemp['FATO']=='Furto'].plot(figsize=(10,5), grid=True, title='Furto: events per month')
dfTemp[dfTemp['FATO']=='Roubo'].plot(figsize=(10,5), grid=True, title='Roubo: events per month');

In [None]:
# Keep only the 'Roubo' events, then count how many rows share each distinct
# 'DATA FATO' value. The result has one row per such value, with the columns
# renamed to DATE and OCCURRENCE.
is_roubo = df['FATO'] == 'Roubo'
df_roubo = df[is_roubo]
df_roubo = (
    df_roubo[['DATA FATO', 'FATO']]
    .groupby('DATA FATO')
    .count()
    .reset_index()
    .rename(columns={'DATA FATO': 'DATE', 'FATO': 'OCCURRENCE'})
)

### Datetime feature engineering

In [None]:
# Vectorised datetime accessor for the DATE column: every feature below is
# computed column-wise instead of calling a Python lambda once per row.
dates = df_roubo['DATE'].dt

# Plain calendar components.
df_roubo['year'] = dates.year
df_roubo['month'] = dates.month
df_roubo['day_week'] = dates.dayofweek  # Monday=0 ... Sunday=6
df_roubo['quarter'] = dates.quarter
df_roubo['week'] = dates.isocalendar().week.astype(int)  # ISO week number

# Period-boundary flags, stored as 0/1 integers instead of booleans.
df_roubo['quarter_start'] = dates.is_quarter_start.astype(int)
df_roubo['quarter_end'] = dates.is_quarter_end.astype(int)
df_roubo['month_start'] = dates.is_month_start.astype(int)
df_roubo['month_end'] = dates.is_month_end.astype(int)
df_roubo['year_start'] = dates.is_year_start.astype(int)
df_roubo['year_end'] = dates.is_year_end.astype(int)

# Week of the year; this carries the same value as the 'week' column above.
df_roubo['week_year'] = df_roubo['week']

# NOTE(review): this is the number of days in the month (28-31), not the day
# of the month -- confirm that this is what 'day_month' is meant to hold.
df_roubo['day_month'] = dates.daysinmonth

# Weekday (0) versus weekend (1): day_week 5 and 6 are Saturday and Sunday.
df_roubo['weekend'] = (df_roubo['day_week'] >= 5).astype(int)

# Second half of the year (July-December) is 1, first half is 0.
df_roubo['half_year'] = (df_roubo['month'] >= 7).astype(int)

In [None]:
# Display the first rows of df_roubo as the cell output.
df_roubo.head()

### Occurrences by winter and summer

In [None]:
# Mean / max / min of OCCURRENCE for every (month, half_year) pair.
dfTemp = (
    df_roubo[['month','half_year','OCCURRENCE']]
    .groupby(['month','half_year'])
    .agg({'OCCURRENCE':['mean','max','min']})
    .reset_index()
)
# Flatten the two-level column header by joining both levels with '_'. The
# group keys have an empty second level, hence the trailing underscore in
# 'month_' and 'half_year_'.
dfTemp.columns = dfTemp.columns.map('_'.join)

# Three stacked panels, one per statistic, each with its grid switched on.
f, axes = plt.subplots(3, 1, figsize=(15, 15))
for panel, stat_column in zip(axes, ('OCCURRENCE_mean', 'OCCURRENCE_max', 'OCCURRENCE_min')):
    sns.lineplot(x='month_', y=stat_column, hue='half_year_', data=dfTemp, ax=panel)
    panel.grid()


### Mean, maximum and minimum occurrences by day of week

In [None]:
# Mean / max / min of OCCURRENCE for each day of the week.
dfTemp = (
    df_roubo[['day_week','OCCURRENCE']]
    .groupby(['day_week'])
    .agg({'OCCURRENCE':['mean','max','min']})
    .reset_index()
)
# Collapse the two-level column header into single names joined by '_'; the
# group key therefore becomes 'day_week_'.
dfTemp.columns = dfTemp.columns.map('_'.join)

# One panel per statistic, drawn top to bottom, each with a grid.
f, axes = plt.subplots(3, 1, figsize=(15, 15))
stat_columns = ['OCCURRENCE_mean', 'OCCURRENCE_max', 'OCCURRENCE_min']
for position, stat_column in enumerate(stat_columns):
    sns.lineplot(x='day_week_', y=stat_column, data=dfTemp, ax=axes[position])
    axes[position].grid()

In [None]:
# Distribution of OCCURRENCE per year, with one box per half_year value (0/1).
dfTemp = df_roubo.loc[:, ['year','half_year','OCCURRENCE']]

f, axes = plt.subplots(figsize=(15, 10))
sns.boxplot(
    data=dfTemp,
    x='year',
    y='OCCURRENCE',
    hue='half_year',
    ax=axes,
);


In [None]:
# Distribution of OCCURRENCE per year, with one box per weekend flag (0/1).
dfTemp = df_roubo.loc[:, ['year','weekend','OCCURRENCE']]

f, axes = plt.subplots(figsize=(15, 10))
sns.boxplot(
    data=dfTemp,
    x='year',
    y='OCCURRENCE',
    hue='weekend',
    ax=axes,
);

### Autocorrelation Analysis

In [None]:
# Occurrence counts indexed by date.
dfTemp=df_roubo[['DATE','OCCURRENCE']].set_index('DATE')
fig, ax = plt.subplots(figsize=(20, 10))
# lag_plot is documented to take a Series, so the single column is passed
# explicitly instead of the one-column DataFrame. Each point pairs a row with
# the row `lag` positions after it.
# NOTE(review): df_roubo only has rows for dates that had at least one event,
# so a lag of one row is not necessarily one calendar day -- confirm whether
# missing dates should be filled with 0 before this analysis.
lag_plot(dfTemp['OCCURRENCE'], lag=1, ax=ax)
ax.set_title('Lag plot of OCCURRENCE (lag=1)')
plt.show()