In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model, svm, tree
from sklearn.ensemble import RandomForestRegressor
from  sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler


In [2]:
data = pd.read_csv("communities.csv")

In [3]:
def prepare_data(exclude_columns):
    data = pd.read_csv("communities.csv")
    data = data.drop(exclude_columns, axis=1)

    # One hot encoding for state column.
    data = pd.get_dummies(data, columns=["state"])

    x = data.drop('ViolentCrimesPerPop', axis=1)
    
    y = data['ViolentCrimesPerPop']

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.25, random_state=42)
    return x_train, x_test, y_train, y_test

In [4]:
def scale_data(x_train, x_test, scaler=StandardScaler):

    scaler = scaler()
    scaler.fit(x_train)

    x_train_scaled = scaler.transform(x_train)
    x_train_scaled = pd.DataFrame(x_train_scaled, index=x_train.index, columns=x_train.columns)

    x_test_scaled = scaler.transform(x_test)
    x_test_scaled = pd.DataFrame(x_test_scaled, index=x_test.index, columns=x_test.columns)

    x_train_scaled.fillna(x_train_scaled.mean(), inplace=True)
    x_test_scaled.fillna(x_test_scaled.mean(), inplace=True)

    return x_train_scaled, x_test_scaled



In [5]:
def evaluate_regression(y_true_train, y_pred_train, y_true_test, y_pred_test, model_name=""):   
    return pd.DataFrame.from_records([[metrics.mean_squared_error(y_true_train, y_pred_train, squared=False),
                                       metrics.mean_squared_error(y_true_train, y_pred_train),
                                       metrics.mean_absolute_error(y_true_train, y_pred_train),
                                       metrics.max_error(y_true_train, y_pred_train),
                                       metrics.r2_score(y_true_train, y_pred_train), 
                                       metrics.mean_squared_error(y_true_test, y_pred_test,  squared=False),
                                      metrics.mean_squared_error(y_true_test, y_pred_test),
                                       metrics.mean_absolute_error(y_true_test, y_pred_test),
                                       metrics.max_error(y_true_test, y_pred_test),
                                       metrics.r2_score(y_true_test, y_pred_test)]], 
                                     
                                     index=[model_name], 
                                     columns=['RMSE_train','mean_squared_error_train', 'mean_absolute_error_train', 'max_error_train', "r2_score_train",
                                         'RMSE_test', 'mean_squared_error_test', 'mean_absolute_error_test', 'max_error_test', "r2_score_test"])


In [6]:
x_train, x_test, y_train, y_test = prepare_data(exclude_columns=["county",
                                                                "community",
                                                                "communityname",
                                                                ])
x_train, x_test = scale_data(x_train, x_test, scaler=StandardScaler)

In [7]:
results = pd.DataFrame()
for model in [linear_model.LinearRegression(), 
            linear_model.RidgeCV(),
            linear_model.LassoCV(),
            svm.SVR(kernel="linear"),
            svm.SVR(kernel="rbf"),
            svm.SVR(kernel="poly"),
            tree.DecisionTreeRegressor()]:
    model.fit(x_train, y_train)

    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    results = results.append(evaluate_regression(y_train, y_train_pred, y_test, y_test_pred,
                        model_name=model))


In [8]:
results.sort_values("RMSE_test")

Unnamed: 0,RMSE_train,mean_squared_error_train,mean_absolute_error_train,max_error_train,r2_score_train,RMSE_test,mean_squared_error_test,mean_absolute_error_test,max_error_test,r2_score_test
LassoCV(),0.1279265,0.01636518,0.08905247,0.7660781,0.707599,0.1238333,0.0153347,0.08674278,0.6400473,0.6846289
"RidgeCV(alphas=array([ 0.1, 1. , 10. ]))",0.1199164,0.01437995,0.08486875,0.7277727,0.743069,0.1269664,0.01612047,0.09103543,0.6857531,0.6684688
SVR(kernel='linear'),0.1221549,0.01492181,0.08679873,0.7962276,0.733388,0.1302102,0.01695471,0.09257628,0.6984992,0.6513121
SVR(),0.08488995,0.007206303,0.07032034,0.5556636,0.871243,0.1376818,0.01895628,0.1010344,0.7318555,0.610148
SVR(kernel='poly'),0.08977506,0.008059561,0.07402678,0.6079991,0.855998,0.1480762,0.02192657,0.1066338,0.8748978,0.5490615
DecisionTreeRegressor(),1.80579e-18,3.260876e-36,2.0190089999999998e-19,2.775558e-17,1.0,0.1943547,0.03777375,0.1307615,0.85,0.2231509
LinearRegression(),0.119071,0.0141779,0.08474138,0.7100313,0.746679,202530700.0,4.10187e+16,65376070.0,4314820000.0,-8.435842e+17


In [9]:
print("Reference RMSE", results.sort_values("RMSE_test").iloc[0]["RMSE_test"])

Reference RMSE 0.12383334723075583


# Neural Network

In [10]:
from keras import models
from keras import layers
from keras.layers import Dense
from keras.models import Sequential
from keras.layers import Dense

In [11]:
x_train.shape

(1495, 168)

In [22]:
import plotly.graph_objects as go
def plot_hist(history):
    fig = go.Figure()
    fig.add_trace(go.Scattergl(y=history.history['loss'],
                        name='Train'))
    fig.add_trace(go.Scattergl(y=history.history['val_loss'],
                        name='Valid'))
    fig.update_layout(height=500, width=700,
                    xaxis_title='Epoch',
                    yaxis_title='Loss')
    fig.show()

In [24]:
def baseline_model():
    model = Sequential()
    model.add(Dense(2, input_dim=168,  activation='relu'))
    model.add(Dense(3, activation='relu'))
    model.add(Dense(1))

    model.compile(loss='mean_squared_error', optimizer='adam')
    model.summary()
    
    return model

In [38]:
def deep_model():
    model = Sequential()
    model.add(Dense(12, input_dim=168,  activation='sigmoid'))
    model.add(Dense(12, activation='sigmoid'))
    model.add(Dense(6, activation='sigmoid'))
    model.add(Dense(1))

    model.compile(loss='mean_squared_error', optimizer='adam')
    model.summary()

    return model

In [27]:
model = baseline_model()
history = model.fit(x_train, y_train, epochs=20, validation_split=0.2, verbose=0)
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

results = results.append(evaluate_regression(y_train, y_train_pred, y_test, y_test_pred,
                        model_name="Keras:baseline_model 2:3:1"))
plot_hist(history)

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_16 (Dense)             (None, 2)                 338       
_________________________________________________________________
dense_17 (Dense)             (None, 3)                 9         
_________________________________________________________________
dense_18 (Dense)             (None, 1)                 4         
Total params: 351
Trainable params: 351
Non-trainable params: 0
_________________________________________________________________


In [40]:
model = deep_model()
history = model.fit(x_train, y_train, epochs=20, validation_split=0.2, verbose=0)
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

results = results.append(evaluate_regression(y_train, y_train_pred, y_test, y_test_pred,
                        model_name="Keras:deep_model 12:12:6 all_sigmoid"))
plot_hist(history)

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_35 (Dense)             (None, 12)                2028      
_________________________________________________________________
dense_36 (Dense)             (None, 12)                156       
_________________________________________________________________
dense_37 (Dense)             (None, 6)                 78        
_________________________________________________________________
dense_38 (Dense)             (None, 1)                 7         
Total params: 2,269
Trainable params: 2,269
Non-trainable params: 0
_________________________________________________________________


In [41]:
results.sort_values("RMSE_test")

Unnamed: 0,RMSE_train,mean_squared_error_train,mean_absolute_error_train,max_error_train,r2_score_train,RMSE_test,mean_squared_error_test,mean_absolute_error_test,max_error_test,r2_score_test
LassoCV(),0.1279265,0.01636518,0.08905247,0.7660781,0.707599,0.1238333,0.0153347,0.08674278,0.6400473,0.6846289
Keras:deep_model 12:12:6 all_sigmoid,0.1188157,0.01411718,0.07939493,0.7596041,0.747764,0.1247719,0.01556802,0.08515799,0.7041526,0.6798304
Keras:deep_model 6:12:6 sigmoid,0.119271,0.01422558,0.08332297,0.6741686,0.745827,0.1266157,0.01603153,0.08971219,0.6515012,0.670298
"RidgeCV(alphas=array([ 0.1, 1. , 10. ]))",0.1199164,0.01437995,0.08486875,0.7277727,0.743069,0.1269664,0.01612047,0.09103543,0.6857531,0.6684688
Keras:deep_model 12:12:6 sigmoid,0.1205101,0.01452267,0.08130825,0.7107195,0.740519,0.128913,0.01661855,0.08870923,0.7807262,0.6582254
SVR(kernel='linear'),0.1221549,0.01492181,0.08679873,0.7962276,0.733388,0.1302102,0.01695471,0.09257628,0.6984992,0.6513121
Keras:baseline_model 2:3:1,0.1199998,0.01439994,0.08043026,0.9231929,0.742712,0.132163,0.01746706,0.08993189,0.7291537,0.640775
Keras:deep_model 12:12:6 sigmoid,0.1090428,0.01189034,0.07212627,0.730134,0.787552,0.1374419,0.01889028,0.09019274,0.7862707,0.6115053
SVR(),0.08488995,0.007206303,0.07032034,0.5556636,0.871243,0.1376818,0.01895628,0.1010344,0.7318555,0.610148
Keras:deep_model 12:12:6 sigmoid,0.1447287,0.02094641,0.09886027,0.7336865,0.625745,0.1411657,0.01992776,0.09904748,0.636338,0.5901688


In [54]:
print("Best NN test RMSE", results.loc["Keras:deep_model 12:12:6 all_sigmoid", "RMSE_test"], " \nReference    RMSE 0.12383334723075583")


Best NN test RMSE 0.1247718767093575  
Reference    RMSE 0.12383334723075583
