In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import numpy as np
from sklearn.model_selection import cross_val_score
from statsmodels.formula.api import ols
import pandas as pd

In [None]:
# Reload variables previously saved with IPython's `%store` magic
# (they must have been stored from an earlier session/notebook, otherwise
# these lines leave the names undefined).
%store -r donnees_2018_hab
%store -r donnees_2018
# NOTE(review): donnees_2018 is restored but not referenced in the cells
# visible below — confirm it is still needed.
# Last expression of the cell: display donnees_2018_hab.
donnees_2018_hab

In [None]:
# Drop identifier columns and the alternative crime measures so that only the
# regressors and the target `Crim_Del_PN_GN_hab` remain.
# `errors="ignore"` makes the cell idempotent: because the result is assigned
# back to the same name, re-running the cell would otherwise raise KeyError
# (the columns are already gone after the first run).
donnees_2018_hab = donnees_2018_hab.drop(
    columns=['REG', 'Libellé', 'Crim_Del_PN_GN', 'Crim_Del_GN_hab', 'Crim_Del_PN_hab'],
    errors="ignore",
)



In [None]:
# donnees_2018_hab=donnees_2018_hab[(donnees_2018_hab.index<'91') & (donnees_2018_hab.index!='75') & (donnees_2018_hab.index!='2A') & (donnees_2018_hab.index!='2B')]

# 1 - Approche économétrique

Dans une première approche économétrique, nous essayons de prédire le nombre de crimes à partir du seul nombre de boucheries, puis en ajoutant des variables de contrôle.

## 1-A Régression linéaire simple

In [None]:
# Simple linear regression (scikit-learn): Crim_Del_PN_GN_hab explained by
# Nb_Boucherie_dep_hab only.
# Code adapted from https://towardsdatascience.com/simulating-replicating-r-regression-plot-in-python-using-sklearn-4ee48a15b67

lm = LinearRegression()

# A fixed seed keeps the 70/30 split — and every coefficient, score and plot
# derived from it — identical across "Restart Kernel -> Run All".
df_train, df_test = train_test_split(donnees_2018_hab,
                                     train_size = 0.7,
                                     test_size = 0.3,
                                     random_state = 42
                                    )

# Double brackets keep X and y as single-column DataFrames.
X_train = df_train[['Nb_Boucherie_dep_hab']]
y_train = df_train[['Crim_Del_PN_GN_hab']]

lm.fit(X_train, y_train)

X_test = df_test[['Nb_Boucherie_dep_hab']]
y_test = df_test[['Crim_Del_PN_GN_hab']]

# Out-of-sample predictions on the held-out 30 %.
predictions = lm.predict(X_test)

# Intercept followed by slope. The value keeps its original (nested-list)
# layout; it is now printed because a bare expression in the middle of a cell
# is never displayed.
beta_hat = [lm.intercept_.tolist()] + lm.coef_.tolist()
print("beta_hat (intercept, slope):", beta_hat)

# Training points; a figure must be readable on its own, hence the labels.
plt.scatter(X_train, y_train)
plt.xlabel('Nb_Boucherie_dep_hab')
plt.ylabel('Crim_Del_PN_GN_hab')
plt.title('Crim_Del_PN_GN_hab vs Nb_Boucherie_dep_hab (train)')
plt.show()



In [None]:
# Same one-regressor model, this time specified through the statsmodels
# formula API and fitted on the training split; the fitted model's summary
# is printed.
simple_ols = ols(formula=" Crim_Del_PN_GN_hab ~  Nb_Boucherie_dep_hab",
                 data=df_train)
reg_with_statsmodels = simple_ols.fit()
print(reg_with_statsmodels.summary())

## 1-B Régression linéaire multiple

In [None]:
# Multiple linear regression (scikit-learn): the butcher-shop regressor plus
# the control variables.
# Code adapted from https://towardsdatascience.com/simulating-replicating-r-regression-plot-in-python-using-sklearn-4ee48a15b67

# Single source of truth for the design-matrix columns, so the train and test
# matrices cannot drift apart.
feature_cols = ['MED18', 'TP6018', 'D118', 'D918', 'RD18', 'T1_2018',
                'Nb_PN_GN_dep_100k_hab', 'Nb_Boucherie_dep_hab']
target_cols = ['Crim_Del_PN_GN_hab']

lm = LinearRegression()

# A fixed seed makes the split, the coefficients and the CV scores reproducible.
df_train, df_test = train_test_split(donnees_2018_hab,
                                     train_size = 0.7,
                                     test_size = 0.3,
                                     random_state = 42
                                    )

X_train = df_train[feature_cols]
y_train = df_train[target_cols]

lm.fit(X_train, y_train)

X_test = df_test[feature_cols]
y_test = df_test[target_cols]

# Out-of-sample predictions, reused by the diagnostic plots further down.
predictions = lm.predict(X_test)

# Bare `lm.coef_` / `lm.intercept_` in the middle of a cell are never
# displayed; print them explicitly, labelled by feature.
print(pd.Series(np.ravel(lm.coef_), index=feature_cols, name='coefficient'))
print("intercept:", lm.intercept_)

# 5-fold cross-validated R² on the training split.
# R² can be negative with small datasets and cv folds:
# https://stackoverflow.com/questions/23036866/scikit-learn-is-returning-coefficient-of-determination-r2-values-less-than-1
scores = cross_val_score(lm, X_train, y_train, scoring='r2', cv=5)
scores

In [None]:


# Residuals-vs-fitted diagnostic on the test split: the observed target is
# plotted against the model's predictions with a LOWESS trend line in red.
# `x`, `y` and `data` are passed by keyword: recent seaborn releases made them
# keyword-only, so the former positional call raises a TypeError there.
sns.residplot(data=df_test,
              x=predictions.reshape(-1),
              y='Crim_Del_PN_GN_hab',
              lowess=True,
              line_kws={'color': 'red', 'lw': 1, 'alpha': 1})
plt.xlabel("Fitted values")
# Without an explicit label the axis would carry the target column's name,
# although what is drawn are residuals.
plt.ylabel("Residuals")
plt.title('Residual plot')

In [None]:
# Test-set residuals: observed target minus the flattened model predictions.
residuals = df_test["Crim_Del_PN_GN_hab"].sub(predictions.reshape(-1))

# Normal Q-Q plot of those residuals, drawn on a square 7x7 figure.
qq_figure = plt.figure(figsize=(7, 7))
stats.probplot(residuals, dist="norm", plot=plt)
plt.title("Normal Q-Q Plot")

In [None]:
# Scale-Location plot: sqrt(|standardized residuals|) against fitted values,
# used to eyeball heteroscedasticity (a non-flat LOWESS line suggests the
# residual spread changes with the fitted value).

# Divide by the sample standard deviation so the quantity plotted really is
# "standardized", as both the variable name and the axis label state.
# (Simple scaling only — not leverage-adjusted studentized residuals.)
standardized_residuals = residuals / residuals.std()
model_norm_residuals_abs_sqrt = np.sqrt(np.abs(standardized_residuals))

plt.figure(figsize=(7,7))
# `x` / `y` are keyword-only in recent seaborn releases; passing them
# positionally raises a TypeError there.
sns.regplot(x=predictions.reshape(-1),
            y=model_norm_residuals_abs_sqrt,
            scatter=True,
            lowess=True,
            line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
plt.ylabel("sqrt(|Standardized residuals|)")
plt.xlabel("Fitted value")
plt.title("Scale-Location")

# Heteroscedasticity
# Wooldridge's Introductory Econometrics suggests taking the natural log for:
# * conditional distributions that are heteroskedastic or skewed;
# * population variables / monetary variables;
# * use log(1+y) if zeros exist.

In [None]:
# Full specification in levels: the butcher-shop regressor plus every control
# variable, estimated with the statsmodels formula API on the training split.
formula_reg = (
    " Crim_Del_PN_GN_hab ~  Nb_Boucherie_dep_hab + MED18 + TP6018 + D118 + D918 + RD18 + T1_2018 +    Nb_PN_GN_dep_100k_hab"
)
levels_ols = ols(formula=formula_reg, data=df_train)
reg_with_statsmodels = levels_ols.fit()
print(reg_with_statsmodels.summary())

In [None]:
# Log-log specification: every variable is replaced by its natural log before
# re-estimating a reduced OLS model.
#
# The logs are written to a separate frame instead of overwriting `df_train`:
# mutating `df_train` in place made this cell non-idempotent (a second run
# computed log(log(x))) and silently changed the data seen by any cell executed
# afterwards.
log_cols = ['Crim_Del_PN_GN_hab', 'RD18', 'D918', 'D118', 'TP6018', 'MED18',
            'Nb_Boucherie_dep_hab', 'Nb_PN_GN_dep_100k_hab', 'T1_2018']

# NOTE(review): np.log returns -inf for 0 and NaN for negatives; if any of
# these columns can be 0, switch to np.log1p — confirm against the data.
df_train_log = df_train.copy()
df_train_log[log_cols] = np.log(df_train[log_cols])

# Reduced formula: D118, D918 and RD18 are left out of this specification.
formula_reg= " Crim_Del_PN_GN_hab ~  Nb_Boucherie_dep_hab + MED18 + TP6018   + T1_2018 +    Nb_PN_GN_dep_100k_hab"

reg_with_statsmodels = ols( formula_reg, data = df_train_log).fit()
print(reg_with_statsmodels.summary())

# 2 - Approche Machine learning

In [None]:
# Linear-kernel Support Vector Regression of Crim_Del_PN_GN_hab on the single
# feature Nb_Boucherie_dep_hab, using the train/test matrices built for the
# multiple regression.
#
# Scaler choice — https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html
# Both StandardScaler and MinMaxScaler are very sensitive to the presence of
# outliers, and MaxAbsScaler suffers from large outliers as well, hence
# RobustScaler.

from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR

# TODO: a `make_pipeline(RobustScaler(), SVR(...))` tuned with GridSearchCV was
# sketched here but never finished; the imports above are kept for it.

new_X_train = X_train['Nb_Boucherie_dep_hab']
new_Y_train = y_train
new_X_test = X_test['Nb_Boucherie_dep_hab']
new_Y_test = y_test

sc_X = RobustScaler()
sc_Y = RobustScaler()
# The scalers expect 2-D input, hence the reshape to a single column.
X_train_scaled = sc_X.fit_transform(new_X_train.values.reshape(-1, 1))
Y_train_scaled = sc_Y.fit_transform(new_Y_train.values.reshape(-1, 1))
# Only *transform* the test set: calling fit_transform here re-fitted the
# scaler on test data, so train and test were scaled with different
# centre/scale parameters and the model was evaluated on inconsistent inputs.
X_test_scaled = sc_X.transform(new_X_test.values.reshape(-1, 1))

svr = SVR(kernel ='linear')
# SVR expects a 1-D target; ravel avoids the column-vector conversion warning.
svr.fit(X_train_scaled, Y_train_scaled.ravel())

# Training points (blue) against the fitted SVR values (red), in scaled units.
plt.scatter(X_train_scaled, Y_train_scaled, color = 'blue')
plt.scatter(X_train_scaled, svr.predict(X_train_scaled), color = 'red')
plt.title('Crim_Del_PN_GN_hab vs  Nb_Boucherie_dep_hab  (SVR)')
plt.xlabel('Nb_Boucherie_dep_hab')
plt.ylabel('Crim_Del_PN_GN_hab')
plt.show()

# Test-set predictions mapped back to the original target scale, followed by
# the observed values for a side-by-side comparison.
svr_test_predictions = sc_Y.inverse_transform(svr.predict(X_test_scaled).reshape(-1,1))
print(svr_test_predictions)
print(new_Y_test)