In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
plt.style.use('ggplot')
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.linear_model import Ridge, Lasso
import random

%matplotlib inline

In [2]:
# Seed every global RNG that could be in play so a Restart & Run All is
# reproducible. Seeding only the stdlib generator does not affect NumPy, and
# scikit-learn falls back to NumPy's global generator whenever an estimator or
# splitter is left with random_state=None.
random.seed(42)
np.random.seed(42)

In [3]:
# Load the dataset used by every model below from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load 'df4.p' if it
# was produced by a trusted step; the file's provenance is not shown here.
df = pd.read_pickle('df4.p')

### 3 features plus region dummy variables model

In [92]:
# Design matrix: three cost indicators followed by the region dummy columns
# (column order matters because `.values` drops the labels).
base_features = [
    'Monthly rent for 85 m2 (900 Sqft) furnished accommodation in EXPENSIVE area',
    'Monthly ticket public transport',
    'Combo meal in fast food restaurant (Big Mac Meal or similar)',
]
region_dummies = [
    'Asia', 'Central America', 'Eastern Europe', 'Middle East',
    'North America', 'Oceania', 'South America', 'Western Europe',
]
X = df.loc[:, base_features + region_dummies].values

# Target as an (n, 1) column vector.
y = df.loc[:, 'Price_Index'].values.reshape(-1, 1)

In [93]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# identical across reruns.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

In [94]:
# Ordinary least squares fit on the training split.
lr = LinearRegression()
# Trailing ';' suppresses the estimator repr in the cell output.
lr.fit(X_train, y_train);

In [95]:
def square_error(true, pred):
    """Return the squared difference between a prediction and its true value.

    Works for scalars and for anything supporting `-` and `**`
    (e.g. NumPy arrays, in which case the result is element-wise).
    """
    residual = pred - true
    return residual ** 2

def get_square_errors(label, pred):
    """Return the list of per-sample squared errors, in input order.

    Parameters
    ----------
    label : iterable
        True values, one entry per sample.
    pred : iterable
        Predicted values, one entry per sample.

    Returns
    -------
    list
        `(pred_i - label_i) ** 2` for each sample (same formula as
        `square_error`). Entries may themselves be arrays when the inputs
        are 2-D, since the inputs are iterated row by row.

    Raises
    ------
    ValueError
        If `label` and `pred` do not contain the same number of samples.
        (A bare `zip` would silently drop the surplus samples and produce
        a misleading metric.)
    """
    truths = list(label)
    guesses = list(pred)
    if len(truths) != len(guesses):
        raise ValueError(
            "label and pred must have the same number of samples "
            "(got %d and %d)" % (len(truths), len(guesses))
        )
    # Distinct loop names: the original loop rebound `pred`, shadowing the parameter.
    return [(guess - truth) ** 2 for guess, truth in zip(guesses, truths)]

def MSE(true, pred):
    """Mean squared error: the average of the per-sample squared errors."""
    squared = get_square_errors(true, pred)
    return np.mean(squared)

def SSE(true, pred):
    """Sum of squared errors: the total of the per-sample squared errors."""
    squared = get_square_errors(true, pred)
    return np.sum(squared)

def RMSE(true, pred):
    """Root mean squared error: the square root of `MSE(true, pred)`."""
    mean_sq = MSE(true, pred)
    return np.sqrt(mean_sq)
    
def assess_model(model, feat, label, verbose=0):
    """Score a fitted model on one dataset.

    Calls `model.predict(feat)`, compares the predictions against `label`
    and returns the tuple `(sse, mse, rmse)`. When `verbose` is truthy the
    three metrics are also printed (RMSE first, then MSE, then SSE).
    """
    predictions = model.predict(feat)
    sse = SSE(label, predictions)
    mse = MSE(label, predictions)
    rmse = np.sqrt(mse)

    if verbose:
        report = (
            ("Root Mean Square Error: ", rmse),
            ("Mean Square Error: ", mse),
            ("Sum Square Error: ", sse),
        )
        for caption, value in report:
            print(caption, value)

    return sse, mse, rmse

In [96]:
# Training-split metrics; the trailing ';' hides the returned tuple.
assess_model(lr, X_train, y_train, verbose=1);
print()
# Test-split metrics; as the last expression, the returned (sse, mse, rmse)
# tuple is also shown as the cell output.
assess_model(lr, X_test, y_test, verbose=1)

Root Mean Square Error:  8.33620630571
Mean Square Error:  69.4923355714
Sum Square Error:  17929.0225774

Root Mean Square Error:  8.85971127888
Mean Square Error:  78.4944839451
Sum Square Error:  5102.14145643


(5102.1414564289616, 78.494483945060949, 8.8597112788770342)

### 3 features no dummy variables

In [98]:
# Design matrix restricted to the three cost indicators (no region dummies).
# Note: this rebinds the X and y used by the cells that follow.
base_features = [
    'Monthly rent for 85 m2 (900 Sqft) furnished accommodation in EXPENSIVE area',
    'Monthly ticket public transport',
    'Combo meal in fast food restaurant (Big Mac Meal or similar)',
]
X = df.loc[:, base_features].values

# Target as an (n, 1) column vector.
y = df.loc[:, 'Price_Index'].values.reshape(-1, 1)

In [54]:
# Target column plus the three cost indicators, kept together as a DataFrame.
# NOTE(review): `data` is not referenced by any other cell visible in this
# notebook, and this cell's execution count is out of sequence -- confirm it
# is still needed.
cols_to_keep = [
    'Price_Index',
    'Monthly rent for 85 m2 (900 Sqft) furnished accommodation in EXPENSIVE area',
    'Monthly ticket public transport',
    'Combo meal in fast food restaurant (Big Mac Meal or similar)',
]
data = df.loc[:, cols_to_keep]

In [99]:
# Re-split using the X and y currently in scope; same 80/20 ratio and
# random_state as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

In [100]:
# Refit a fresh linear regression on the current training split
# (this rebinds `lr`, replacing the earlier fitted model).
lr = LinearRegression()
lr.fit(X_train, y_train);

In [101]:
# Training-split metrics; the trailing ';' hides the returned tuple.
assess_model(lr, X_train, y_train, verbose=1);
print()
# Test-split metrics; the returned (sse, mse, rmse) tuple is the cell output.
assess_model(lr, X_test, y_test, verbose=1)

Root Mean Square Error:  9.25430722604
Mean Square Error:  85.642202234
Sum Square Error:  22095.6881764

Root Mean Square Error:  9.99991468275
Mean Square Error:  99.9982936624
Sum Square Error:  6499.88908805


(6499.8890880545778, 99.998293662378117, 9.9999146827549534)

### Polynomial-features (degree 2) model

In [103]:
# Degree-2 polynomial feature expansion followed by ordinary least squares.
poly = make_pipeline(PolynomialFeatures(2), LinearRegression())
poly.fit(X_train, y_train)
# Predict with the pipeline fitted above. This line previously called
# `model.predict(...)`, but no cell defines `model`, so it raised NameError on a
# fresh kernel (or silently used whatever stale object was left in memory).
y_pred = poly.predict(X_test)

In [104]:
# Evaluate the fitted polynomial pipeline `poly`. This cell previously passed
# `model`, a name no cell defines, so it depended on hidden kernel state.
# Training-split metrics; the trailing ';' hides the returned tuple.
assess_model(poly, X_train, y_train, verbose=1);
print()
# Test-split metrics; the returned (sse, mse, rmse) tuple is the cell output.
assess_model(poly, X_test, y_test, verbose=1)

Root Mean Square Error:  8.80403100023
Mean Square Error:  77.5109618529
Sum Square Error:  19997.8281581

Root Mean Square Error:  9.90947509957
Mean Square Error:  98.1976967489
Sum Square Error:  6382.85028868


(6382.8502886798242, 98.197696748920379, 9.909475099566091)

### Will regularization produce a better model?

In [105]:
def ridge_model_cv_test(X, y, alphas = [1.0, 0.10, 0.01], folds = 5):
    """Print k-fold cross-validated MSE for a Ridge model at each alpha.

    Parameters
    ----------
    X : array-like
        Feature matrix to cross-validate on.
    y : array-like
        Target values aligned with `X`.
    alphas : sequence of float
        Regularization strengths to try, in order. Only read, never mutated.
    folds : int
        Number of cross-validation folds (passed as `cv`).

    Returns
    -------
    None
        Results are printed: per-fold MSE and their average for each alpha.
    """
    for alpha in alphas:
        model = Ridge(alpha = alpha)
        # Score on the data that was passed in. The body previously ignored
        # X and y and read the globals X_train / y_train instead. No separate
        # model.fit() is needed: cross_val_score fits a clone per fold.
        scores = cross_val_score(model, X, y, cv = folds, scoring=make_scorer(MSE))
        print('ridge cv results:\nalpha:', alpha, 'MSE:', scores, 'Avg MSE:', np.mean(scores))

In [106]:
def lasso_model_cv_test(X, y, alphas = [1.0, 0.10, 0.01], folds = 5):
    """Print k-fold cross-validated MSE for a Lasso model at each alpha.

    Parameters
    ----------
    X : array-like
        Feature matrix to cross-validate on.
    y : array-like
        Target values aligned with `X`.
    alphas : sequence of float
        Regularization strengths to try, in order. Only read, never mutated.
    folds : int
        Number of cross-validation folds (passed as `cv`).

    Returns
    -------
    None
        Results are printed: per-fold MSE and their average for each alpha.
    """
    for alpha in alphas:
        model = Lasso(alpha = alpha)
        # Score on the data that was passed in. The body previously ignored
        # X and y and read the globals X_train / y_train instead. No separate
        # model.fit() is needed: cross_val_score fits a clone per fold.
        scores = cross_val_score(model, X, y, cv = folds, scoring=make_scorer(MSE))
        print('lasso cv results:\nalpha:', alpha, 'MSE:', scores, 'Avg MSE:', np.mean(scores))

In [107]:
# Cross-validate Ridge on the training split only, using the default alphas
# and fold count; the test split is not touched.
ridge_model_cv_test(X_train, y_train)

ridge cv results:
alpha: 1.0 MSE: [ 116.85302124   94.76329788   71.24935819   81.69623858   75.95223039] Avg MSE: 88.1028292554
ridge cv results:
alpha: 0.1 MSE: [ 116.83960027   94.72404902   71.36482683   81.71014854   75.91365605] Avg MSE: 88.1104561414
ridge cv results:
alpha: 0.01 MSE: [ 116.83827229   94.72013423   71.37644792   81.71160637   75.90982901] Avg MSE: 88.1112579629


In [108]:
# Cross-validate Lasso on the training split only, using the default alphas
# and fold count; the test split is not touched.
lasso_model_cv_test(X_train, y_train)

lasso cv results:
alpha: 1.0 MSE: [ 117.40929632   96.40251562   68.07883656   81.85516742   77.9213576 ] Avg MSE: 88.3334347048
lasso cv results:
alpha: 0.1 MSE: [ 116.87628384   94.86804859   71.0033946    81.65783094   76.06548463] Avg MSE: 88.0942085209
lasso cv results:
alpha: 0.01 MSE: [ 116.84181434   94.73435128   71.33974162   81.70568659   75.92455872] Avg MSE: 88.1092305093
