In [80]:
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso

In [90]:

# Load the dataset; the relative path means 'happiness.csv' must sit in the
# notebook's current working directory.
df = pd.read_csv('happiness.csv')


In [91]:
# Impute missing values with the mean of each numeric column.
# numeric_only=True keeps the text column ('Country') out of the mean
# computation; without it pandas >= 2.0 raises a TypeError instead of
# silently skipping non-numeric columns.
df = df.fillna(value=df.mean(numeric_only=True))

In [92]:
# Preview the first rows after imputation
df.head()

Unnamed: 0,Country,Score,Life Expectancy,GDP Per Capita,Crime Index,Death Rate,Birth rate,Marriage rate,Divorce rate,Literacy rate,...,Unemployment rate,Fossil fuels,Net migration,Diabetes Prevalance,HIV Prevalance,Rights Strength,Women in Parliament,Car Accidents,Urban Population,Average Rainfall
0,Afghanistan,3.203,65.98,502.115487,76.63,6.423,32.487,6.01954,1.891477,43.01972,...,11.118,69.312725,-314602,9.2,27.52239,10.0,27.868852,15.1,25.754,327.0
1,Albania,4.719,78.96,5352.857411,40.3,7.898,11.78,7.8,1.9,98.14115,...,12.331,61.421801,-69998,9.0,767.292729,8.0,29.508197,13.6,61.229,1485.0
2,Algeria,5.211,77.5,3948.343279,48.33,4.716,24.282,10.1,1.6,81.40784,...,11.704,99.977917,-50002,6.7,50.031294,2.0,25.757576,16.759712,73.189,89.0
3,Argentina,6.086,77.17,10006.14897,62.55,7.609,17.021,6.01954,1.891477,99.00387,...,9.789,87.722407,24000,5.9,3018.488825,2.0,40.15748,14.0,91.991,591.0
4,Armenia,4.559,75.55,4622.733493,22.0,9.857,13.987,6.0,1.0,84.530033,...,16.99,74.561866,-24989,6.1,478.745363,6.0,23.484848,17.1,63.219,562.0


In [93]:
# Remove the 'Country' column before modelling.
# Rebinding df (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it no longer raises KeyError once the
# column has already been dropped.
df = df.drop(columns=['Country'], errors='ignore')

In [94]:
# Split the frame into predictors and target.
# The axis is given via columns= because passing it positionally
# (df.drop("Score", 1)) was deprecated and then removed in pandas 2.0.
X = df.drop(columns="Score")   #Feature Matrix: every column except the target
y = df["Score"]                #Target Variable
df.head()

Unnamed: 0,Score,Life Expectancy,GDP Per Capita,Crime Index,Death Rate,Birth rate,Marriage rate,Divorce rate,Literacy rate,Electricity access,Unemployment rate,Fossil fuels,Net migration,Diabetes Prevalance,HIV Prevalance,Rights Strength,Women in Parliament,Car Accidents,Urban Population,Average Rainfall
0,3.203,65.98,502.115487,76.63,6.423,32.487,6.01954,1.891477,43.01972,98.713203,11.118,69.312725,-314602,9.2,27.52239,10.0,27.868852,15.1,25.754,327.0
1,4.719,78.96,5352.857411,40.3,7.898,11.78,7.8,1.9,98.14115,100.0,12.331,61.421801,-69998,9.0,767.292729,8.0,29.508197,13.6,61.229,1485.0
2,5.211,77.5,3948.343279,48.33,4.716,24.282,10.1,1.6,81.40784,100.0,11.704,99.977917,-50002,6.7,50.031294,2.0,25.757576,16.759712,73.189,89.0
3,6.086,77.17,10006.14897,62.55,7.609,17.021,6.01954,1.891477,99.00387,100.0,9.789,87.722407,24000,5.9,3018.488825,2.0,40.15748,14.0,91.991,591.0
4,4.559,75.55,4622.733493,22.0,9.857,13.987,6.0,1.0,84.530033,100.0,16.99,74.561866,-24989,6.1,478.745363,6.0,23.484848,17.1,63.219,562.0


In [95]:
# Column dtypes and non-null counts, to check that no missing values remain
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147 entries, 0 to 146
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Score                147 non-null    float64
 1   Life Expectancy      147 non-null    float64
 2   GDP Per Capita       147 non-null    float64
 3   Crime Index          147 non-null    float64
 4   Death Rate           147 non-null    float64
 5   Birth rate           147 non-null    float64
 6   Marriage rate        147 non-null    float64
 7   Divorce rate         147 non-null    float64
 8   Literacy rate        147 non-null    float64
 9   Electricity access   147 non-null    float64
 10  Unemployment rate    147 non-null    float64
 11  Fossil fuels         147 non-null    float64
 12  Net migration        147 non-null    int64  
 13  Diabetes Prevalance  147 non-null    float64
 14  HIV Prevalance       147 non-null    float64
 15  Rights Strength      147 non-null    flo

In [96]:
# sm.OLS does not add an intercept on its own, so add a constant column
# of ones to the feature matrix first
X_1 = sm.add_constant(X)
# Ordinary least squares fit of the target on all features plus the intercept
model = sm.OLS(endog=y, exog=X_1).fit()
# Show the p-value of every fitted coefficient
model.pvalues

const                  0.328905
Life Expectancy        0.344720
GDP Per Capita         0.006508
Crime Index            0.263112
Death Rate             0.465388
Birth rate             0.427726
Marriage rate          0.666326
Divorce rate           0.554376
Literacy rate          0.186838
Electricity access     0.072505
Unemployment rate      0.000040
Fossil fuels           0.057380
Net migration          0.016394
Diabetes Prevalance    0.126002
HIV Prevalance         0.870078
Rights Strength        0.349097
Women in Parliament    0.006373
Car Accidents          0.002959
Urban Population       0.000007
Average Rainfall       0.458535
dtype: float64

In [97]:
#Backward Elimination
# Repeatedly fit OLS on the surviving features and drop the one with the
# largest p-value, until no remaining p-value exceeds the threshold.
P_VALUE_THRESHOLD = 0.05
cols = list(X.columns)
while cols:
    # Intercept column is required by sm.OLS
    X_1 = sm.add_constant(X[cols])
    model = sm.OLS(y, X_1).fit()
    # Pick the feature p-values by label so the intercept ('const') is
    # excluded regardless of where add_constant placed it.
    p = model.pvalues[cols]
    pmax = p.max()
    if pmax > P_VALUE_THRESHOLD:
        feature_with_p_max = p.idxmax()
        cols.remove(feature_with_p_max)
    else:
        break
selected_features_BE = cols
print(selected_features_BE)

['GDP Per Capita', 'Electricity access', 'Unemployment rate', 'Fossil fuels', 'Net migration', 'Women in Parliament', 'Car Accidents', 'Urban Population']


In [98]:
model = LinearRegression()
#Initializing RFE model
# n_features_to_select is keyword-only since scikit-learn 1.0; the old
# positional form RFE(model, 7) raises TypeError there.
rfe = RFE(model, n_features_to_select=7)
#Transforming data using RFE: keeps only the columns RFE selected
X_rfe = rfe.fit_transform(X,y)
#Fitting the data to model
model.fit(X_rfe,y)
# support_ is the boolean mask of selected columns; ranking_ is 1 for the
# selected ones
print(rfe.support_)
print(rfe.ranking_)

[ True False False False False  True False False False  True False False
  True False  True False  True  True False]
[ 1 11  9  6  5  1  3  8  7  1  4 13  1 12  1  2  1  1 10]




In [99]:
# Search for the number of RFE-selected features that gives the best
# hold-out score for a linear regression.
#no of features
# NOTE(review): only 1..12 features are tried; widen the range if X has
# more columns and larger subsets should be considered.
nof_list=np.arange(1,13)
# Start below any attainable score so a best candidate is always recorded,
# even when every score is negative (starting at 0 would report "0 features").
high_score=-np.inf
#Variable to store the optimum features
nof=0
score_list =[]
# The split does not depend on the candidate feature count, so do it once
# (random_state=0 makes it reproducible).
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 0)
for n_features in nof_list:
    model = LinearRegression()
    # n_features_to_select is keyword-only since scikit-learn 1.0
    rfe = RFE(model, n_features_to_select=int(n_features))
    # Select features on the training split only, then apply the same
    # selection to the test split
    X_train_rfe = rfe.fit_transform(X_train,y_train)
    X_test_rfe = rfe.transform(X_test)
    model.fit(X_train_rfe,y_train)
    score = model.score(X_test_rfe,y_test)
    score_list.append(score)
    if(score>high_score):
        high_score = score
        nof = int(n_features)
print("Optimum number of features: %d" %nof)
print("Score with %d features: %f" % (nof, high_score))

Optimum number of features: 9
Score with 9 features: 0.669607




In [100]:
cols = list(X.columns)
model = LinearRegression()
#Initializing RFE model
# n_features_to_select is keyword-only since scikit-learn 1.0.
# NOTE(review): 10 is hard-coded here, while the search cell above reported
# an optimum of 9 features -- confirm which value is intended.
rfe = RFE(model, n_features_to_select=10)
#Transforming data using RFE
X_rfe = rfe.fit_transform(X,y)
#Fitting the data to model
model.fit(X_rfe,y)
# Map the boolean selection mask back to column names
temp = pd.Series(rfe.support_,index = cols)
selected_features_rfe = temp[temp].index
print(selected_features_rfe)

Index(['Life Expectancy', 'Marriage rate', 'Divorce rate', 'Unemployment rate',
       'Fossil fuels', 'Diabetes Prevalance', 'Rights Strength',
       'Women in Parliament', 'Car Accidents', 'Urban Population'],
      dtype='object')


