In [1]:
import pandas as pd
import numpy as np
import warnings
import random
from plotnine import *
warnings.filterwarnings('ignore')
pd.options.display.float_format = '{:,.2f}'.format

In [2]:
df = pd.read_csv("CrimeData.csv", index_col=0)

In [3]:
# Some simple data cleaning by filling the ?s with the column mean
df = df.replace('?', np.NaN)
df = df.drop(['countyCode', 'communityCode', 'fold'], axis=1)
df.iloc[:, 2:] = df.iloc[:, 2:].astype('float')
df = df.fillna(df.mean())

In [54]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [134]:
train, test = train_test_split(df, train_size=.6, random_state=33)

In [135]:
random.seed(34)
parameterpool = set(df.columns[5:-18])  # create parameter pool
num_parameters = np.arange(5, 101, 5)  # add five predictors each time
lasso_rmse = []
linear_rmse = []
parameters = []
for n in num_parameters:
    parameters = parameters + random.sample(parameterpool, n-len(parameters))
    parameterpool.difference_update(parameters)
    model = Lasso(random_state=3)
    model.fit(train[parameters], train['arsonsPerPop'])
    lasso_rmse.append(np.sqrt(mean_squared_error(
        test['arsonsPerPop'], model.predict(test[parameters]))))
    linmodel = LinearRegression()
    linmodel.fit(train[parameters], train['arsonsPerPop'])
    linear_rmse.append(np.sqrt(mean_squared_error(
        test['arsonsPerPop'], linmodel.predict(test[parameters]))))
zipped = list(zip(num_parameters, lasso_rmse, linear_rmse))
table = pd.DataFrame(
    zipped, columns=['num_parameters', 'lasso_rmse', 'linear_rmse'])
table

Unnamed: 0,num_parameters,lasso_rmse,linear_rmse
0,5,36.53,36.4
1,10,38.45,38.42
2,15,39.45,39.54
3,20,39.59,38.96
4,25,38.07,37.54
5,30,34.99,34.98
6,35,34.31,34.42
7,40,34.14,34.45
8,45,34.04,34.59
9,50,34.02,34.53


In [136]:
print(table.to_latex())

\begin{tabular}{lrrr}
\toprule
{} &  num\_parameters &  lasso\_rmse &  linear\_rmse \\
\midrule
0  &               5 &       36.53 &        36.40 \\
1  &              10 &       38.45 &        38.42 \\
2  &              15 &       39.45 &        39.54 \\
3  &              20 &       39.59 &        38.96 \\
4  &              25 &       38.07 &        37.54 \\
5  &              30 &       34.99 &        34.98 \\
6  &              35 &       34.31 &        34.42 \\
7  &              40 &       34.14 &        34.45 \\
8  &              45 &       34.04 &        34.59 \\
9  &              50 &       34.02 &        34.53 \\
10 &              55 &       36.29 &        37.28 \\
11 &              60 &       36.49 &        37.38 \\
12 &              65 &       37.77 &        39.40 \\
13 &              70 &       37.58 &        39.02 \\
14 &              75 &       37.49 &        38.64 \\
15 &              80 &       36.68 &        38.08 \\
16 &              85 &       36.30 &        37.92 \\
17 

In [137]:
a = table.iloc[:, [0, 1]].copy()
a.columns = ['num_parameters', 'rmse']
a['model'] = 'lasso'
b = table.iloc[:, [0, 2]].copy()
b.columns = ['num_parameters', 'rmse']
b['model'] = 'linear'
combined_table = pd.concat([a, b])

In [138]:
p = ggplot(combined_table, aes('num_parameters',
           'rmse', color='model')) + geom_line()
ggsave(p, '1.png')

In [139]:
# Repeating but with interaction terms
from sklearn.preprocessing import PolynomialFeatures

In [140]:
random.seed(34)
parameterpool = set(df.columns[5:-18])
num_parameters = np.arange(2, 30, 1)
lasso_rmse = []
linear_rmse = []
parameters = []
for n in num_parameters:
    parameters = parameters + random.sample(parameterpool, n-len(parameters))
    parameterpool.difference_update(parameters)
    interaction = PolynomialFeatures(
        degree=2, include_bias=False, interaction_only=True)
    param_inter = interaction.fit_transform(train[parameters])
    test_param = interaction.fit_transform(test[parameters])
    model = Lasso(random_state=3)
    model.fit(param_inter, train['arsonsPerPop'])
    lasso_rmse.append(np.sqrt(mean_squared_error(
        test['arsonsPerPop'], model.predict(test_param))))
    linmodel = LinearRegression()
    linmodel.fit(param_inter, train['arsonsPerPop'])
    linear_rmse.append(np.sqrt(mean_squared_error(
        test['arsons'], linmodel.predict(test_param))))
zipped = list(zip(num_parameters, lasso_rmse, linear_rmse))
table = pd.DataFrame(
    zipped, columns=['num_parameters', 'lasso_rmse', 'linear_rmse'])
table['total_terms'] = (table['num_parameters']) * \
    (table['num_parameters']-1)/2 + table['num_parameters']
table

Unnamed: 0,num_parameters,lasso_rmse,linear_rmse,total_terms
0,2,37.66,164.04,3.0
1,3,34.68,162.42,6.0
2,4,34.71,162.77,10.0
3,5,34.83,162.63,15.0
4,6,34.82,162.58,21.0
5,7,36.49,152.11,28.0
6,8,36.92,147.75,36.0
7,9,38.25,145.21,45.0
8,10,99.07,137.03,55.0
9,11,113.93,140.42,66.0


In [141]:
print(table.to_latex())

\begin{tabular}{lrrrr}
\toprule
{} &  num\_parameters &  lasso\_rmse &  linear\_rmse &  total\_terms \\
\midrule
0  &               2 &       37.66 &       164.04 &         3.00 \\
1  &               3 &       34.68 &       162.42 &         6.00 \\
2  &               4 &       34.71 &       162.77 &        10.00 \\
3  &               5 &       34.83 &       162.63 &        15.00 \\
4  &               6 &       34.82 &       162.58 &        21.00 \\
5  &               7 &       36.49 &       152.11 &        28.00 \\
6  &               8 &       36.92 &       147.75 &        36.00 \\
7  &               9 &       38.25 &       145.21 &        45.00 \\
8  &              10 &       99.07 &       137.03 &        55.00 \\
9  &              11 &      113.93 &       140.42 &        66.00 \\
10 &              12 &      103.27 &       148.47 &        78.00 \\
11 &              13 &       98.57 &       121.88 &        91.00 \\
12 &              14 &       86.46 &       125.41 &       105.00 \\
13 

In [142]:
a = table.iloc[:, [0, 1]].copy()
a.columns = ['num_parameters', 'rmse']
a['model'] = 'lasso'
b = table.iloc[:, [0, 2]].copy()
b.columns = ['num_parameters', 'rmse']
b['model'] = 'linear'
combined_table = pd.concat([a, b])
combined_table['total_terms'] = (combined_table['num_parameters'])*(
    combined_table['num_parameters']-1)/2 + combined_table['num_parameters']

In [143]:
p = ggplot(combined_table, aes('total_terms',
           'rmse', color='model')) + geom_line()
ggsave(p, '2.png')

In [144]:
test['arsonsPerPop'].max() - test['arsonsPerPop'].min()

319.18