In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import linear_model   
import pandas as pd
import numpy as np

In [2]:
file = '../Data/Test_data_2017.csv'
df = pd.read_csv(file)


# Ordinary Least Square Regression (OLS)

Ordinary least squares regression is a statistical method that produces the one straight line that minimizes the total squared error.
There is statsmodel.api and statsmodel.formula.api. Here we are using the statsmodel.formula.api, due to the fact that it will calculate an intercept coefficient, hence a constant is automatically added to the data and an intercept is fitted, while using the statsmodel.api, we have to assign a constant for y-intersept 

The generic formula is:

$Y_i = Bias_0 + Weight_1 Feature_1 + Weight_2 Feature_2 + \ldots + Weight_p Feature_p$

In [3]:
df.head()

Unnamed: 0,Country_name,year,Life_Ladder,Log_GDP_percapita,Social_support,Healthy _life_expectancy_at_birth,Freedom_to_make_life_choices,Generosity,Perceptions_of_corruption,Positive_affect,Negative_affect
0,Afghanistan,2017,2.662,7.697,0.491,52.8,0.427,-0.121,0.954,0.496,0.371
1,Albania,2017,4.64,9.476,0.638,68.4,0.75,-0.029,0.876,0.669,0.334
2,Algeria,2017,5.249,9.354,0.807,65.7,0.437,-0.167,0.7,0.642,0.289
3,Argentina,2017,6.039,10.067,0.907,68.6,0.832,-0.186,0.841,0.809,0.292
4,Armenia,2017,4.288,9.402,0.698,66.6,0.614,-0.147,0.865,0.625,0.437


In [4]:
df.count()

Country_name                         147
year                                 147
Life_Ladder                          147
Log_GDP_percapita                    147
Social_support                       147
Healthy _life_expectancy_at_birth    147
Freedom_to_make_life_choices         147
Generosity                           147
Perceptions_of_corruption            147
Positive_affect                      147
Negative_affect                      147
dtype: int64

In [5]:
clean_data = df.drop(columns=['Country_name', 'year'])

In [6]:
clean_data.count()

Life_Ladder                          147
Log_GDP_percapita                    147
Social_support                       147
Healthy _life_expectancy_at_birth    147
Freedom_to_make_life_choices         147
Generosity                           147
Perceptions_of_corruption            147
Positive_affect                      147
Negative_affect                      147
dtype: int64

In [7]:
# Renaming column
clean_df=clean_data.rename(columns={'Healthy _life_expectancy_at_birth': 'Life_expectancy'})

In [8]:
clean_df

Unnamed: 0,Life_Ladder,Log_GDP_percapita,Social_support,Life_expectancy,Freedom_to_make_life_choices,Generosity,Perceptions_of_corruption,Positive_affect,Negative_affect
0,2.662,7.697,0.491,52.8,0.427,-0.121,0.954,0.496,0.371
1,4.640,9.476,0.638,68.4,0.750,-0.029,0.876,0.669,0.334
2,5.249,9.354,0.807,65.7,0.437,-0.167,0.700,0.642,0.289
3,6.039,10.067,0.907,68.6,0.832,-0.186,0.841,0.809,0.292
4,4.288,9.402,0.698,66.6,0.614,-0.147,0.865,0.625,0.437
...,...,...,...,...,...,...,...,...,...
142,5.071,9.073,0.896,66.3,0.636,-0.169,0.844,0.726,0.363
143,5.175,8.876,0.827,67.7,0.872,0.018,0.781,0.632,0.210
144,3.254,7.578,0.790,55.9,0.595,-0.147,0.833,0.455,0.295
145,3.933,8.156,0.744,54.8,0.823,0.140,0.740,0.685,0.387


#### Define Features, Target


In [9]:
X = clean_df.drop(columns='Life_Ladder', axis=1)
y= clean_df['Life_Ladder']


In [10]:
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
#75% of data is for training the dataset, & 25% is for testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
#Counter(y_train)

In [None]:
#y_train=y_train.values.reshape(-1,1)


In [11]:
y_train.shape

(110,)

In [12]:
y_train_series=pd.Series(y_train)
type(y_train_series)

pandas.core.series.Series

In [13]:
X_train.shape

(110, 8)

In [14]:
training_data=X_train.assign(Life_Ladder=y_train_series.values)
training_data

Unnamed: 0,Log_GDP_percapita,Social_support,Life_expectancy,Freedom_to_make_life_choices,Generosity,Perceptions_of_corruption,Positive_affect,Negative_affect,Life_Ladder
118,10.509,0.928,70.8,0.921,-0.025,0.829,0.615,0.286,6.167
17,9.969,0.942,66.6,0.689,-0.154,0.911,0.614,0.189,5.097
111,7.588,0.517,60.5,0.908,0.051,0.214,0.762,0.358,3.108
78,11.634,0.905,72.6,0.903,0.044,0.330,0.766,0.184,7.061
69,10.837,0.853,66.5,0.884,-0.005,0.512,0.692,0.307,6.094
...,...,...,...,...,...,...,...,...,...
133,10.238,0.876,66.4,0.644,-0.237,0.671,0.450,0.313,5.607
137,11.115,0.836,66.9,0.962,0.216,0.299,0.795,0.208,7.039
72,10.257,0.895,66.5,0.700,-0.154,0.798,0.623,0.232,5.978
140,9.968,0.914,68.9,0.898,-0.091,0.627,0.836,0.280,6.336


### statsmodels.formula.api:

In [123]:
import statsmodels.formula.api as smf
import statsmodels.api as sm

#create a model
#.fit() uses the model to create a best fit line, hence the linear regression
#mod=smf.ols(formula='Life_Ladder ~ Log_GDP_percapita + Social_support +Life_expectancy +Freedom_to_make_life_choices+Generosity+ Perceptions_of_corruption+Positive_affect + Negative_affect', 
              #data=clean_df).fit()
mod=smf.ols(formula='y_train ~ X_train', data= training_data).fit()


In [124]:
mod.params

Intercept    -1.703568
X_train[0]   -1.703568
X_train[1]    0.318189
X_train[2]    2.367737
X_train[3]    0.034190
X_train[4]    0.931277
X_train[5]    0.732088
X_train[6]   -0.318034
X_train[7]    1.328723
X_train[8]    1.083689
dtype: float64

In [125]:
Z=mod.predict(testing_data)

PatsyError: predict requires that you use a DataFrame when predicting from a model
that was created using the formula api.

The original error message returned by patsy is:
Number of rows mismatch between data argument and X_train (37 versus 110)
    y_train ~ X_train
              ^^^^^^^

In [88]:
X_train=sm.add_constant(X_train)
model=sm.OLS(y_train, X_train).fit()
model.params

const                          -3.407136
Log_GDP_percapita               0.318189
Social_support                  2.367737
Life_expectancy                 0.034190
Freedom_to_make_life_choices    0.931277
Generosity                      0.732088
Perceptions_of_corruption      -0.318034
Positive_affect                 1.328723
Negative_affect                 1.083689
dtype: float64

In [76]:
y_test.shape

(37,)

In [77]:
y_test=y_test.values.reshape(-1,1)

In [60]:
X_test.shape

(37, 8)

In [61]:
y_test_series=pd.Series(y_test)
type(y_test_series)

pandas.core.series.Series

In [78]:
y_test.shape

(37, 1)

In [64]:
testing_data=X_test.assign(Life_Ladder=y_test_series.values)
testing_data.head()

Unnamed: 0,Log_GDP_percapita,Social_support,Life_expectancy,Freedom_to_make_life_choices,Generosity,Perceptions_of_corruption,Positive_affect,Negative_affect,Life_Ladder
99,9.65,0.8,65.303,0.752,-0.059,0.856,0.502,0.299,5.234
93,8.039,0.816,62.4,0.845,0.134,0.77,0.571,0.376,4.737
54,10.925,0.967,73.0,0.939,0.246,0.727,0.895,0.148,7.476
103,10.324,0.912,69.5,0.9,-0.17,0.841,0.833,0.242,6.568
98,8.555,0.733,49.3,0.826,0.124,0.835,0.725,0.236,5.322


In [116]:
X_test.shape

(37, 8)

In [120]:
y_test.shape

(37, 1)

In [122]:
y_1=model.predict(testing_data)
y_1

99     128.503125
93     126.785302
54     144.736636
103    137.335086
98      94.420035
75     111.949891
89     131.661134
53     131.442437
44     126.105137
59     141.013632
5      145.609636
16     131.870573
91     116.129008
14     134.058907
58     115.677441
33     142.375166
73     131.318501
29     143.473459
66     124.248701
35     136.753219
117    135.068780
84     130.209429
31     145.792216
128    112.839727
109    131.100160
19     122.925235
51     136.780008
48     129.785593
120    144.758864
141    131.257546
125    145.715323
132    131.623441
42     113.682389
138    142.773610
28     107.445380
40     142.217142
105    135.655368
dtype: float64

In [117]:
y_1=sm.OLS.predict(X_test)

TypeError: predict() missing 1 required positional argument: 'params'

In [103]:
y_predict=model.get_prediction(sel,X_test)

TypeError: get_prediction() got multiple values for argument 'exog'

In [69]:
y_predict

<statsmodels.regression._prediction.PredictionResults at 0x227a060fdc8>

In [None]:
#--------

In [19]:
print(mod.summary())


                            OLS Regression Results                            
Dep. Variable:                y_train   R-squared:                       0.746
Model:                            OLS   Adj. R-squared:                  0.726
Method:                 Least Squares   F-statistic:                     37.05
Date:                Thu, 18 Nov 2021   Prob (F-statistic):           9.25e-27
Time:                        09:41:11   Log-Likelihood:                -93.039
No. Observations:                 110   AIC:                             204.1
Df Residuals:                     101   BIC:                             228.4
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -3.4071      1.054     -3.231      0.0

##### OLSEquation: 
 Happiness_Ladder = -3.5 +(0.31*Log_GDP_percapita)+(2.46*Social_support)+(0.03*Healthy _life_expectancy_at_birth)+ (0.99*Freedom_to_make_life_choices)+(0.32*Generosity)+(-0.41*Perceptions_of_corruption)+(1.61*Positive_affect)+(0.71*Negative_affect) + e

In [29]:
predictions=mod.get_predict()

AttributeError: 'OLSResults' object has no attribute 'get_predict'

In [20]:
y_pred=mod.predict(X_test)

PatsyError: predict requires that you use a DataFrame when predicting from a model
that was created using the formula api.

The original error message returned by patsy is:
Number of rows mismatch between data argument and X_train (37 versus 110)
    y_train ~ X_train
              ^^^^^^^

In [None]:
print('Statsmodel.formula.api R2:{:.2f}'.format((mod.rsquared)*100)+'%')


#### <ins>Interpretations of the results:
* R-squared value: This is a statistical measure of how well the regression line fits with the real data points. 
    i.e Is the measurement of how much of the independent variable is explained by changes in our dependent variables.
    The higher the value, the better the fit
* Adj, R-squared: This is the corrected R-squared value according to the number of input features. Ideally, it should be close to the R-squareds value
* Intercept: The result of the model, if all variables were 0, i.e it is the 'b' constant added to give the starting value of the line    
* Coefficient: This gives the ‘M’ value for the regression line. It tells how much the Life_Ladder changes with a unit change in the feature columns. A positive value means that the two variables are directly proportional. A negative value means that the two variables are inversely proportional to each other, i.e as Life_Ladder increases, Perception of corruption decreases
* Std error:  This tells us how accurate our coefficient value is. The lower the standard error, the higher the accuracy. A low std error compared to a high coefficient produces a high t statistic, which signifies a high significance for your coefficient   
* t: The t is a measurement of the precision with which the coefficient was measured.
* P >|t| : This is the p-value. It tells us how statistically significant feature values are to the Life_Ladder. A value less
     than $\alpha$=0.05 means that it is quite significant.   
     Looking at 'Generosity, with a p-value of 0.346, means that there is a34.6% chance that the Generosity variable has no
     effect on the Life_Ladder, and the results are produced by chance
* Omnibus: describes the normalcy of the distribution. A 0 would indicate perfect normalcy
* Prob(Omnibus): is a statistical test measuring the probability the residuals are normally distributed.A 1 would indicate  
  perfectly normal distribution
* Skew: measures the symmetry in the data, 0 means perfect symmetry
* Kurtosis: measures the peak in our data, i.e concentration around 0 in a normal curve. Higher values implies fewer outliners
* Durbin-Watson: is a measurement of an even distribution of errors throughout the data. Ideal value is 1-2


### statsmodels api

In [None]:
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
#75% of data is for training the dataset, & 25% is for testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
#Counter(y_train)

In [None]:

## fit a OLS model with intercept
#X_train= sm.add_constant(X_train)
model=sm.OLS(y_train,X_train).fit()



In [None]:
print(model.summary())

In [None]:
print('Statsmodel api using training data, R2:{:.2f}'.format((model.rsquared)*100)+'%')

In [None]:
y_pred = model.predict(X_test)

print("Prediction for test set:",(y_pred.values.reshape(1,-1)))



In [None]:
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head()

#### In the OLS model, we use the training data to fit and predict, which is probably the reason for high R_2
#### In the Linear Regression model, we are using training data to fit and test the data to predict

### Try the model with real input data to predict the Life_Ladder score:

In [None]:
# Format the any numpy array to 3 decimal points
np.set_printoptions(formatter={'float':"{0:0.3f}".format})

# Try the model with feature values where we know the output
user_input=np.array([7.647,0.539,51.6,0.6,0.121,0.707,0.618,0.275])

user_input=user_input.reshape(1,-1)

In [None]:
user_input.shape

In [None]:
#user_predict=mod.predict(user_input)


In [None]:
# checking the model against a Life_Ladder score that is known:
print(f'The real Life_Ladder is 4.758, compared to the predicted Life_Ladder: {user_predict}')

#### Testing data with statsmodels.formula.api

In [None]:
test_data=pd.DataFrame([7.647,0.539,51.6,0.6,0.121,0.707,0.618,0.275])


In [None]:
predictions=mod.predict(test_data)