In [2]:
%matplotlib inline
# Plot everything as SVG
%config InlineBackend.figure_formats=['svg']

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Notebook-wide plot styling: a custom six-colour cycle on seaborn's dark grid
palette = [
    '#386DF9', '#FFDC52', '#FF1614',
    '#62F591', '#AA22FF', '#34495E',
]
sns.set(context='notebook', style='darkgrid', palette=palette, font_scale=1.1)

# We'll use this later to visualize lines
def make_line(m, b, frm=0, to=200):
    """Sample the line ``y = m*x + b`` so that it can be plotted.

    Parameters
    ----------
    m : slope. It is wrapped in a list and matrix-multiplied with the
        x column, so a scalar yields a 1-D ``ys``.
    b : intercept, added to every sampled value.
    frm, to : first and last x value (both endpoints are included).

    Returns
    -------
    (xs, ys) : ``xs`` holds 500 evenly spaced points in ``[frm, to]`` and
        ``ys`` the corresponding line values.
    """
    xs = np.linspace(frm, to, 500)
    # Treat the samples as a (500, 1) column so the slope is applied as a
    # matrix product rather than an element-wise multiply.
    x_column = xs.reshape(-1, 1)
    ys = np.matmul(x_column, [m]) + b
    return xs, ys

In [3]:
# Load the regression dataset; the path is relative to the notebook's directory
crime_csv_path = '../data/crimecoef.csv'
reg_data = pd.read_csv(crime_csv_path)

# Preview the first few rows
reg_data.head()

Unnamed: 0,heart_rate,has_criminal_relative,num_facebook_friends,age,income,crime_coef
0,63.525691,0,420,38.728165,10.309871,10.099342
1,70.875672,0,414,7.16845,15.180091,49.455816
2,71.480779,0,428,36.510977,16.34741,39.200911
3,56.346841,0,296,30.695399,11.170218,39.941107
4,63.238515,0,427,36.192791,10.595675,29.95706


In [7]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; `train_test_split` lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Predictor columns; `crime_coef` is the regression target
feats = ['heart_rate', 'num_facebook_friends', 'has_criminal_relative', 'age', 'income']

# Keep 70% of the rows for training and hold out the rest for testing
# (set random state for the purposes of demonstration)
train, test = train_test_split(reg_data[feats + ['crime_coef']], train_size=0.7, random_state=1000)
X_train = train[feats]
y_train = train['crime_coef']

# Fit a linear regression on the training split
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the testing data
X_test = test[feats]
y_test = test['crime_coef']
y_pred = model.predict(X_test)

# Mean squared error on the held-out rows. The documented argument order is
# (y_true, y_pred); MSE is symmetric, so the value itself is unaffected.
metrics.mean_squared_error(y_test, y_pred)

64.35212618063116

In [13]:
# What values for m and b did we learn?
m = model.coef_       # one learned weight per entry in `feats`
b = model.intercept_

# Use distinct loop names so the coefficient vector `m` is not overwritten
# by the last per-feature weight once the loop finishes.
for feat_name, weight in zip(feats, m):
    print(feat_name, '=>', weight)
print('b', b)

heart_rate => 3.01339476788
num_facebook_friends => -0.400473709592
has_criminal_relative => 120.376591318
age => 0.00240924247578
income => -0.138654160617
b -0.504742894832


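You can see that age and income have little predictive power (their learned weights are close to zero), but the predicted crime coefficient is heavily biased against those who are related to someone with a criminal record: having a criminal relative alone adds roughly 120 to the prediction. Keep in mind that the features are on different scales, so the raw weights are not directly comparable to one another.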