In [1]:
import pandas as pd
import sklearn

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [4]:
happiness = pd.read_csv('data/happiness.csv')
happiness

Unnamed: 0,country,rank,score,high,low,gdp,family,lifexp,freedom,generosity,corruption,dystopia
0,Norway,1,7.537,7.594445,7.479556,1.616463,1.533524,0.796667,0.635423,0.362012,0.315964,2.277027
1,Denmark,2,7.522,7.581728,7.462272,1.482383,1.551122,0.792566,0.626007,0.355280,0.400770,2.313707
2,Iceland,3,7.504,7.622030,7.385970,1.480633,1.610574,0.833552,0.627163,0.475540,0.153527,2.322715
3,Switzerland,4,7.494,7.561772,7.426227,1.564980,1.516912,0.858131,0.620071,0.290549,0.367007,2.276716
4,Finland,5,7.469,7.527542,7.410458,1.443572,1.540247,0.809158,0.617951,0.245483,0.382612,2.430182
...,...,...,...,...,...,...,...,...,...,...,...,...
150,Rwanda,151,3.471,3.543030,3.398970,0.368746,0.945707,0.326425,0.581844,0.252756,0.455220,0.540061
151,Syria,152,3.462,3.663669,3.260331,0.777153,0.396103,0.500533,0.081539,0.493664,0.151347,1.061574
152,Tanzania,153,3.349,3.461430,3.236570,0.511136,1.041990,0.364509,0.390018,0.354256,0.066035,0.621130
153,Burundi,154,2.905,3.074690,2.735310,0.091623,0.629794,0.151611,0.059901,0.204435,0.084148,1.683024


In [5]:
happiness.columns

Index(['country', 'rank', 'score', 'high', 'low', 'gdp', 'family', 'lifexp',
       'freedom', 'generosity', 'corruption', 'dystopia'],
      dtype='object')

In [7]:
happiness.index[happiness['country'] == 'Spain']

Int64Index([33], dtype='int64')

In [8]:
happiness.index[happiness['country'] == 'Peru']

Int64Index([62], dtype='int64')

In [9]:
happiness.describe()

Unnamed: 0,rank,score,high,low,gdp,family,lifexp,freedom,generosity,corruption,dystopia
count,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0
mean,78.0,5.354019,5.452326,5.255713,0.984718,1.188898,0.551341,0.408786,0.246883,0.12312,1.850238
std,44.888751,1.13123,1.118542,1.14503,0.420793,0.287263,0.237073,0.149997,0.13478,0.101661,0.500028
min,1.0,2.693,2.864884,2.521116,0.0,0.0,0.0,0.0,0.0,0.0,0.377914
25%,39.5,4.5055,4.608172,4.374955,0.663371,1.042635,0.369866,0.303677,0.154106,0.057271,1.591291
50%,78.0,5.279,5.370032,5.193152,1.064578,1.253918,0.606042,0.437454,0.231538,0.089848,1.83291
75%,116.5,6.1015,6.1946,6.006527,1.318027,1.414316,0.723008,0.516561,0.323762,0.153296,2.144654
max,155.0,7.537,7.62203,7.479556,1.870766,1.610574,0.949492,0.658249,0.838075,0.464308,3.117485


In [11]:
# Settings features and targets
x = happiness[['gdp', 'family', 'lifexp', 'freedom', 'corruption','dystopia']]
y = happiness[['score']]

In [12]:
# Split train and test data
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.25)

In [27]:
# Linear Regression Model
modelLinear = LinearRegression().fit(X_train, y_train)
y_predict_linear = modelLinear.predict(X_test)
linearLoss = mean_squared_error(y_test, y_predict_linear)
print('Model Linear Loss: ', linearLoss)
print('Coeficient Linear ', modelLinear.coef_)

Model Linear Loss:  0.012966753631575142
Coeficient Linear  [[0.80528131 1.05253561 1.18650634 1.23571931 1.35275973 0.96750493]]


In [25]:
# Lasso Model
modelLasso = Lasso(alpha = 0.02).fit(X_train, y_train)       # alpha: Penalization
y_predict_lasso = modelLasso.predict(X_test)
lassoLoss = mean_squared_error(y_test, y_predict_lasso)
print('Model Lasso Loss: ', lassoLoss)
print('Coeficient Lasso ', modelLasso.coef_)

Model Lasso Loss:  0.031530774844607395
Coeficient Lasso  [1.2030058  0.87369001 0.64296979 0.91160102 0.         0.88968204]


In [26]:
# Ridge Model
modelRidge = Ridge(alpha = 1).fit(X_train, y_train)       # alpha: Penalization
y_predict_ridge = modelRidge.predict(X_test)
ridgeLoss = mean_squared_error(y_test, y_predict_ridge)
print('Model Ridge Loss: ', ridgeLoss)
print('Coeficient Ridge ', modelRidge.coef_)

Model Ridge Loss:  0.014209009749945042
Coeficient Ridge  [[0.97093657 0.98263592 0.96424706 1.05687228 0.80810456 0.93651442]]


### Explanation:
- Less loss means less error in the prediction
- For Linear Regression Model `family` `lifexp` `freedom` and `corruption` are the most important variables.
- For Lasso Model `gdp` is the most  important variable. Also this model drop `corruption` because makes noise in the results.
- For Ridge Model `freedom` are the most important variable.