## Training

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import statsmodels.tools
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Load the raw dataset; the relative path is resolved against the kernel's working directory.
df = pd.read_csv('Life Expectancy Data.csv')

In [4]:
# Preview the first rows to check that the file parsed into the expected columns.
df.head()

Unnamed: 0,Country,Region,Year,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,...,Diphtheria,Incidents_HIV,GDP_per_capita,Population_mln,Thinness_ten_nineteen_years,Thinness_five_nine_years,Schooling,Economy_status_Developed,Economy_status_Developing,Life_expectancy
0,Turkiye,Middle East,2015,11.1,13.0,105.824,1.32,97,65,27.8,...,97,0.08,11006,78.53,4.9,4.8,7.8,0,1,76.5
1,Spain,European Union,2015,2.7,3.3,57.9025,10.35,97,94,26.0,...,97,0.09,25742,46.44,0.6,0.5,9.7,1,0,82.8
2,India,Asia,2007,51.5,67.9,201.0765,1.57,60,35,21.2,...,64,0.13,1076,1183.21,27.1,28.0,5.0,0,1,65.4
3,Guyana,South America,2006,32.8,40.5,222.1965,5.68,93,74,25.3,...,93,0.79,4146,0.75,5.7,5.5,7.9,0,1,67.0
4,Israel,Middle East,2012,3.4,4.3,57.951,2.89,97,89,27.0,...,94,0.08,33995,7.91,1.2,1.1,12.8,1,0,81.7


In [5]:
# Every column except the target is a candidate predictor at this stage.
feature_cols = df.columns.tolist()
feature_cols.remove('Life_expectancy')

In [6]:
# Split the frame into the predictor matrix and the target series.
X = df.loc[:, feature_cols]
y = df.loc[:, 'Life_expectancy']

In [7]:
# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Inspect the training split; the row labels are the original (shuffled) df index values.
X_train.head()

Unnamed: 0,Country,Region,Year,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,Polio,Diphtheria,Incidents_HIV,GDP_per_capita,Population_mln,Thinness_ten_nineteen_years,Thinness_five_nine_years,Schooling,Economy_status_Developed,Economy_status_Developing
2026,Sri Lanka,Asia,2014,7.9,9.3,111.2825,2.45,99,99,22.9,99,99,0.01,3694,20.78,15.2,15.0,10.9,0,1
651,Czechia,European Union,2004,3.7,4.6,114.2985,13.42,98,98,26.6,96,98,0.08,14070,10.2,2.1,2.2,11.6,1,0
2225,"Venezuela, RB",South America,2014,15.4,18.0,143.0785,6.6,78,83,26.6,79,78,0.4,16056,30.04,1.6,1.5,10.0,0,1
2357,Albania,Rest of Europe,2010,11.8,13.3,80.9365,4.88,99,98,26.1,99,99,0.03,3577,2.91,1.4,1.5,9.3,0,1
670,Namibia,Africa,2003,43.3,74.4,495.7265,2.29,83,64,23.2,82,79,9.74,3298,1.88,14.2,14.3,5.8,0,1


In [9]:
# One-hot encode Region into integer 'Reg_*' columns; drop_first leaves one level out
# so it acts as the baseline category.
X_train = pd.get_dummies(
    X_train,
    columns=['Region'],
    prefix='Reg',
    drop_first=True,
    dtype=int,
)

In [11]:
# Confirm that Region has been replaced by the 'Reg_*' indicator columns.
X_train.head()

Unnamed: 0,Country,Year,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,Polio,...,Economy_status_Developed,Economy_status_Developing,Reg_Asia,Reg_Central America and Caribbean,Reg_European Union,Reg_Middle East,Reg_North America,Reg_Oceania,Reg_Rest of Europe,Reg_South America
2026,Sri Lanka,2014,7.9,9.3,111.2825,2.45,99,99,22.9,99,...,0,1,1,0,0,0,0,0,0,0
651,Czechia,2004,3.7,4.6,114.2985,13.42,98,98,26.6,96,...,1,0,0,0,1,0,0,0,0,0
2225,"Venezuela, RB",2014,15.4,18.0,143.0785,6.6,78,83,26.6,79,...,0,1,0,0,0,0,0,0,0,1
2357,Albania,2010,11.8,13.3,80.9365,4.88,99,98,26.1,99,...,0,1,0,0,0,0,0,0,1,0
670,Namibia,2003,43.3,74.4,495.7265,2.29,83,64,23.2,82,...,0,1,0,0,0,0,0,0,0,0


In [13]:
# List the column names; the scaler column list in the next cell is copied from this output.
X_train.columns

Index(['Country', 'Year', 'Infant_deaths', 'Under_five_deaths',
       'Adult_mortality', 'Alcohol_consumption', 'Hepatitis_B', 'Measles',
       'BMI', 'Polio', 'Diphtheria', 'Incidents_HIV', 'GDP_per_capita',
       'Population_mln', 'Thinness_ten_nineteen_years',
       'Thinness_five_nine_years', 'Schooling', 'Economy_status_Developed',
       'Economy_status_Developing', 'Reg_Asia',
       'Reg_Central America and Caribbean', 'Reg_European Union',
       'Reg_Middle East', 'Reg_North America', 'Reg_Oceania',
       'Reg_Rest of Europe', 'Reg_South America'],
      dtype='object')

In [14]:
# Columns handed to the scaler: every X_train column except 'Country' and 'Year'.
num_cols = [
    # health, economic and demographic measures
    'Infant_deaths',
    'Under_five_deaths',
    'Adult_mortality',
    'Alcohol_consumption',
    'Hepatitis_B',
    'Measles',
    'BMI',
    'Polio',
    'Diphtheria',
    'Incidents_HIV',
    'GDP_per_capita',
    'Population_mln',
    'Thinness_ten_nineteen_years',
    'Thinness_five_nine_years',
    'Schooling',
    # economy status flags
    'Economy_status_Developed',
    'Economy_status_Developing',
    # region indicators produced by get_dummies
    'Reg_Asia',
    'Reg_Central America and Caribbean',
    'Reg_European Union',
    'Reg_Middle East',
    'Reg_North America',
    'Reg_Oceania',
    'Reg_Rest of Europe',
    'Reg_South America',
]

In [15]:
# Keep only the columns that go through the scaler.
X_train_minmax = X_train.loc[:, num_cols]
X_train_minmax.head()

Unnamed: 0,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,Polio,Diphtheria,Incidents_HIV,...,Economy_status_Developed,Economy_status_Developing,Reg_Asia,Reg_Central America and Caribbean,Reg_European Union,Reg_Middle East,Reg_North America,Reg_Oceania,Reg_Rest of Europe,Reg_South America
2026,7.9,9.3,111.2825,2.45,99,99,22.9,99,99,0.01,...,0,1,1,0,0,0,0,0,0,0
651,3.7,4.6,114.2985,13.42,98,98,26.6,96,98,0.08,...,1,0,0,0,1,0,0,0,0,0
2225,15.4,18.0,143.0785,6.6,78,83,26.6,79,78,0.4,...,0,1,0,0,0,0,0,0,0,1
2357,11.8,13.3,80.9365,4.88,99,98,26.1,99,99,0.03,...,0,1,0,0,0,0,0,0,1,0
670,43.3,74.4,495.7265,2.29,83,64,23.2,82,79,9.74,...,0,1,0,0,0,0,0,0,0,0


In [12]:
# Scaler instance shared by the training and testing sections; it is fitted on training data only.
minmax = MinMaxScaler()

In [16]:
# Fit the scaler on the training columns; the test split is later transformed with this same fit.
minmax.fit(X_train_minmax)

In [19]:
# Wrap the transformed values in a new DataFrame with the column names restored.
# No index is passed, so the rows are relabelled 0..n-1.
X_train_scaled = pd.DataFrame(
    data=minmax.transform(X_train_minmax),
    columns=num_cols,
)
X_train_scaled.head()

Unnamed: 0,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,Polio,Diphtheria,Incidents_HIV,...,Economy_status_Developed,Economy_status_Developing,Reg_Asia,Reg_Central America and Caribbean,Reg_European Union,Reg_Middle East,Reg_North America,Reg_Oceania,Reg_Rest of Europe,Reg_South America
0,0.04559,0.031447,0.094604,0.137101,1.0,1.0,0.252033,1.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0142,0.010332,0.099213,0.750979,0.988506,0.988764,0.552846,0.967033,0.987952,0.00323,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.101644,0.07053,0.1432,0.369334,0.758621,0.820225,0.552846,0.78022,0.746988,0.017997,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.074738,0.049416,0.048224,0.273083,1.0,0.988764,0.512195,1.0,1.0,0.000923,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.310164,0.323899,0.682175,0.128148,0.816092,0.606742,0.276423,0.813187,0.759036,0.449008,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Add the 'const' column of ones so that the OLS fits below include an intercept.
X_train_scaled = sm.add_constant(X_train_scaled)

In [21]:
# X_train_scaled was rebuilt with a fresh 0..n-1 index, so give y_train the same labels;
# the row order is unchanged.
y_train = y_train.reset_index(drop = True)
y_train.head()

0    76.1
1    75.7
2    72.8
3    76.6
4    50.6
Name: Life_expectancy, dtype: float64

In [22]:
# Sanity check: target and design matrix must share identical index labels before fitting.
all(y_train.index==X_train_scaled.index)

True

### Round one

In [25]:
# List the available columns; round one's feature list is copied from this output.
X_train_scaled.columns

Index(['const', 'Infant_deaths', 'Under_five_deaths', 'Adult_mortality',
       'Alcohol_consumption', 'Hepatitis_B', 'Measles', 'BMI', 'Polio',
       'Diphtheria', 'Incidents_HIV', 'GDP_per_capita', 'Population_mln',
       'Thinness_ten_nineteen_years', 'Thinness_five_nine_years', 'Schooling',
       'Economy_status_Developed', 'Economy_status_Developing', 'Reg_Asia',
       'Reg_Central America and Caribbean', 'Reg_European Union',
       'Reg_Middle East', 'Reg_North America', 'Reg_Oceania',
       'Reg_Rest of Europe', 'Reg_South America'],
      dtype='object')

In [26]:
# Round-one features: all columns of X_train_scaled except 'Economy_status_Developing'.
# NOTE(review): presumably dropped because it mirrors 'Economy_status_Developed' - confirm.
fe_cols = [
    'const',
    'Infant_deaths',
    'Under_five_deaths',
    'Adult_mortality',
    'Alcohol_consumption',
    'Hepatitis_B',
    'Measles',
    'BMI',
    'Polio',
    'Diphtheria',
    'Incidents_HIV',
    'GDP_per_capita',
    'Population_mln',
    'Thinness_ten_nineteen_years',
    'Thinness_five_nine_years',
    'Schooling',
    'Economy_status_Developed',
    'Reg_Asia',
    'Reg_Central America and Caribbean',
    'Reg_European Union',
    'Reg_Middle East',
    'Reg_North America',
    'Reg_Oceania',
    'Reg_Rest of Europe',
    'Reg_South America',
]

In [27]:
# Restrict the design matrix to the round-one columns. This overwrites X_train_scaled,
# so 'Economy_status_Developing' is no longer available to later cells.
X_train_scaled = X_train_scaled[fe_cols]
X_train_scaled.head()

Unnamed: 0,const,Infant_deaths,Under_five_deaths,Adult_mortality,Alcohol_consumption,Hepatitis_B,Measles,BMI,Polio,Diphtheria,...,Schooling,Economy_status_Developed,Reg_Asia,Reg_Central America and Caribbean,Reg_European Union,Reg_Middle East,Reg_North America,Reg_Oceania,Reg_Rest of Europe,Reg_South America
0,1.0,0.04559,0.031447,0.094604,0.137101,1.0,1.0,0.252033,1.0,1.0,...,0.75969,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0142,0.010332,0.099213,0.750979,0.988506,0.988764,0.552846,0.967033,0.987952,...,0.813953,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.101644,0.07053,0.1432,0.369334,0.758621,0.820225,0.552846,0.78022,0.746988,...,0.689922,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.074738,0.049416,0.048224,0.273083,1.0,0.988764,0.512195,1.0,1.0,...,0.635659,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.0,0.310164,0.323899,0.682175,0.128148,0.816092,0.606742,0.276423,0.813187,0.759036,...,0.364341,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# Round one: ordinary least squares on all round-one columns; 'const' supplies the intercept.
lin_reg = sm.OLS(y_train, X_train_scaled)
results = lin_reg.fit()
results.summary()

0,1,2,3
Dep. Variable:,Life_expectancy,R-squared:,0.984
Model:,OLS,Adj. R-squared:,0.984
Method:,Least Squares,F-statistic:,5836.0
Date:,"Tue, 28 May 2024",Prob (F-statistic):,0.0
Time:,13:02:56,Log-Likelihood:,-3659.9
No. Observations:,2291,AIC:,7370.0
Df Residuals:,2266,BIC:,7513.0
Df Model:,24,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,78.1524,0.337,231.972,0.000,77.492,78.813
Infant_deaths,-7.6283,0.856,-8.909,0.000,-9.307,-5.949
Under_five_deaths,-10.6222,0.896,-11.860,0.000,-12.379,-8.866
Adult_mortality,-30.6535,0.409,-74.993,0.000,-31.455,-29.852
Alcohol_consumption,-0.1736,0.206,-0.843,0.400,-0.578,0.230
Hepatitis_B,-0.4925,0.223,-2.213,0.027,-0.929,-0.056
Measles,0.2467,0.156,1.583,0.114,-0.059,0.552
BMI,-1.3819,0.279,-4.957,0.000,-1.929,-0.835
Polio,0.6472,0.535,1.210,0.226,-0.401,1.696

0,1,2,3
Omnibus:,14.424,Durbin-Watson:,2.041
Prob(Omnibus):,0.001,Jarque-Bera (JB):,16.107
Skew:,0.138,Prob(JB):,0.000318
Kurtosis:,3.304,Cond. No.,104.0


In [29]:
# In-sample RMSE: predictions are made on the same rows the model was fitted on.
# The name `rmse` is reassigned by every later round.
y_pred = results.predict(X_train_scaled)
rmse = statsmodels.tools.eval_measures.rmse(y_train, y_pred)
print(rmse)

1.195516563383316


### Round two

In [30]:
def stepwise_selection(X, y, threshold_in = 0.01, threshold_out = 0.05, verbose = True):
    """Select features by bidirectional stepwise OLS regression on p-values.

    Each pass first tries to add the not-yet-included column with the smallest
    p-value (if it is below ``threshold_in``), then refits on the kept columns
    and drops the feature with the largest p-value (if it is above
    ``threshold_out``). The loop stops on the first pass that neither adds nor
    drops a column.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features, one per column. It may already contain the
        intercept column ``'const'``; that column can be picked up by the
        forward step like any other, but is never removed by the backward step.
    y : array-like
        Target values, row-aligned with ``X``.
    threshold_in : float
        A candidate is added only if its p-value is below this value.
    threshold_out : float
        An included feature is dropped if its p-value exceeds this value.
        It should be larger than ``threshold_in``, otherwise a feature could
        be added and dropped again indefinitely.
    verbose : bool
        If true, print every addition and removal.

    Returns
    -------
    list
        Names of the selected columns, in the order they were added.
    """
    # Label that sm.add_constant gives the intercept column of a DataFrame.
    intercept = 'const'
    included = [] # this is going to be the list of features we keep
    while True:
        changed = False
        # forward step
        # Keep X's column order (instead of a set difference) so that ties between
        # equal p-values are always resolved the same way from run to run.
        excluded = [column for column in X.columns if column not in included]
        new_pval = pd.Series(index = excluded, dtype = 'float64')
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min() # NaN when nothing is left to try, so the test below is False
        # we add the feature with the lowest (best) p-value under the threshold to our 'included' list
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval)) # specifying the verbose text


        # backward step: removing features if new features added to the list make them statistically insignificant
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()

        # Use all coefs except the intercept. The intercept is removed by label, not by
        # position: when X already holds a 'const' column, add_constant adds nothing and
        # 'const' sits wherever it was added, so slicing off row 0 would shield the first
        # selected feature and expose the intercept to removal instead.
        pvalues = model.pvalues.drop(intercept, errors = 'ignore')
        worst_pval = pvalues.max() # NaN if pvalues is empty
        # if the p-value exceeds the upper threshold, the feature will be dropped from the 'included' list
        if worst_pval > threshold_out:
            changed = True
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included

In [31]:
# Run stepwise selection on the scaled training frame. X_train_scaled already contains
# the 'const' column, so the intercept appears in the log and in the returned list.
step_result = stepwise_selection(X_train_scaled, y_train)
print(step_result)

Add  Under_five_deaths              with p-value 0.0
Add  const                          with p-value 0.0
Add  Adult_mortality                with p-value 0.0
Add  Economy_status_Developed       with p-value 1.79712e-150
Add  Reg_Central America and Caribbean with p-value 1.8882e-48
Add  GDP_per_capita                 with p-value 1.15359e-38
Add  Reg_South America              with p-value 1.1788e-43
Add  Reg_Oceania                    with p-value 9.21724e-27
Add  Infant_deaths                  with p-value 1.00003e-16
Add  Reg_European Union             with p-value 9.71311e-18
Add  Schooling                      with p-value 6.62755e-11
Add  BMI                            with p-value 3.59072e-07
Add  Incidents_HIV                  with p-value 6.2488e-06
Add  Hepatitis_B                    with p-value 0.000386826
['Under_five_deaths', 'const', 'Adult_mortality', 'Economy_status_Developed', 'Reg_Central America and Caribbean', 'GDP_per_capita', 'Reg_South America', 'Reg_Oceania', 

In [32]:
# Round-two features: the columns reported by the stepwise run above, in the order
# they were added (hard-coded copy of that output).
fe_cols_stepped = [
    'Under_five_deaths',
    'const',
    'Adult_mortality',
    'Economy_status_Developed',
    'Reg_Central America and Caribbean',
    'GDP_per_capita',
    'Reg_South America',
    'Reg_Oceania',
    'Infant_deaths',
    'Reg_European Union',
    'Schooling',
    'BMI',
    'Incidents_HIV',
    'Hepatitis_B',
]

In [33]:
# Design matrix for round two: only the stepwise-selected columns.
X_train_stepped = X_train_scaled[fe_cols_stepped]

In [34]:
# Round two: refit OLS on the stepwise-selected columns (lin_reg is reused; results2 keeps this fit).
lin_reg = sm.OLS(y_train, X_train_stepped)
results2 = lin_reg.fit()
results2.summary()

0,1,2,3
Dep. Variable:,Life_expectancy,R-squared:,0.984
Model:,OLS,Adj. R-squared:,0.984
Method:,Least Squares,F-statistic:,10730.0
Date:,"Tue, 28 May 2024",Prob (F-statistic):,0.0
Time:,13:05:44,Log-Likelihood:,-3670.6
No. Observations:,2291,AIC:,7369.0
Df Residuals:,2277,BIC:,7449.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Under_five_deaths,-11.1985,0.836,-13.398,0.000,-12.838,-9.559
const,78.4816,0.225,348.756,0.000,78.040,78.923
Adult_mortality,-30.8393,0.391,-78.950,0.000,-31.605,-30.073
Economy_status_Developed,2.5488,0.155,16.470,0.000,2.245,2.852
Reg_Central America and Caribbean,1.7480,0.092,18.937,0.000,1.567,1.929
GDP_per_capita,2.0113,0.266,7.559,0.000,1.490,2.533
Reg_South America,1.4822,0.105,14.121,0.000,1.276,1.688
Reg_Oceania,-1.0605,0.118,-8.986,0.000,-1.292,-0.829
Infant_deaths,-7.2292,0.817,-8.853,0.000,-8.831,-5.628

0,1,2,3
Omnibus:,25.313,Durbin-Watson:,2.044
Prob(Omnibus):,0.0,Jarque-Bera (JB):,29.304
Skew:,0.193,Prob(JB):,4.33e-07
Kurtosis:,3.397,Cond. No.,70.5


In [35]:
# In-sample RMSE for round two, computed on the training rows.
y_pred_stepped = results2.predict(X_train_stepped)
rmse = statsmodels.tools.eval_measures.rmse(y_train, y_pred_stepped)
print(rmse)

1.2010930786862224


### Round three

In [36]:
# Round-three features: the round-two set without 'Infant_deaths'.
# NOTE(review): presumably removed for its overlap with 'Under_five_deaths' - confirm.
fe_cols_3 = [
    'Adult_mortality',
    'const',
    'Economy_status_Developed',
    'Reg_Central America and Caribbean',
    'Reg_South America',
    'Under_five_deaths',
    'GDP_per_capita',
    'Reg_Oceania',
    'Reg_European Union',
    'Schooling',
    'BMI',
    'Incidents_HIV',
    'Hepatitis_B',
]

In [37]:
# Design matrix for round three.
X_train_3 = X_train_scaled[fe_cols_3]

In [38]:
# Round three: refit OLS on the round-three columns; results3 keeps this fit.
lin_reg = sm.OLS(y_train, X_train_3)
results3 = lin_reg.fit()
results3.summary()

0,1,2,3
Dep. Variable:,Life_expectancy,R-squared:,0.983
Model:,OLS,Adj. R-squared:,0.983
Method:,Least Squares,F-statistic:,11230.0
Date:,"Tue, 28 May 2024",Prob (F-statistic):,0.0
Time:,13:06:39,Log-Likelihood:,-3709.3
No. Observations:,2291,AIC:,7445.0
Df Residuals:,2278,BIC:,7519.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Adult_mortality,-30.9309,0.397,-77.900,0.000,-31.710,-30.152
const,77.8919,0.219,356.377,0.000,77.463,78.320
Economy_status_Developed,2.7007,0.156,17.269,0.000,2.394,3.007
Reg_Central America and Caribbean,1.7557,0.094,18.705,0.000,1.572,1.940
Reg_South America,1.4841,0.107,13.904,0.000,1.275,1.693
Under_five_deaths,-18.0137,0.331,-54.429,0.000,-18.663,-17.365
GDP_per_capita,2.3139,0.268,8.623,0.000,1.788,2.840
Reg_Oceania,-1.1683,0.119,-9.789,0.000,-1.402,-0.934
Reg_European Union,-0.9646,0.138,-7.000,0.000,-1.235,-0.694

0,1,2,3
Omnibus:,43.78,Durbin-Watson:,2.038
Prob(Omnibus):,0.0,Jarque-Bera (JB):,67.476
Skew:,0.186,Prob(JB):,2.23e-15
Kurtosis:,3.754,Cond. No.,34.0


In [39]:
# In-sample RMSE for round three, computed on the training rows.
y_pred_3 = results3.predict(X_train_3)
rmse = statsmodels.tools.eval_measures.rmse(y_train, y_pred_3)
print(rmse)

1.2215876196075914


### Round four

In [40]:
# Round-four features: the round-three set without 'Hepatitis_B'.
# This is the list the test section uses to select columns for results4.
fe_cols_4 = [
    'Adult_mortality',
    'const',
    'Economy_status_Developed',
    'Reg_Central America and Caribbean',
    'Reg_South America',
    'Under_five_deaths',
    'GDP_per_capita',
    'Reg_Oceania',
    'Reg_European Union',
    'Schooling',
    'BMI',
    'Incidents_HIV',
]

In [41]:
# Design matrix for round four.
X_train_4 = X_train_scaled[fe_cols_4]

In [42]:
# Round four: refit OLS on the round-four columns; results4 is the model evaluated on the test split.
lin_reg = sm.OLS(y_train, X_train_4)
results4 = lin_reg.fit()
results4.summary()

0,1,2,3
Dep. Variable:,Life_expectancy,R-squared:,0.983
Model:,OLS,Adj. R-squared:,0.983
Method:,Least Squares,F-statistic:,12220.0
Date:,"Tue, 28 May 2024",Prob (F-statistic):,0.0
Time:,13:07:46,Log-Likelihood:,-3712.4
No. Observations:,2291,AIC:,7449.0
Df Residuals:,2279,BIC:,7518.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Adult_mortality,-30.9381,0.397,-77.833,0.000,-31.718,-30.159
const,77.4966,0.149,520.630,0.000,77.205,77.788
Economy_status_Developed,2.7298,0.156,17.485,0.000,2.424,3.036
Reg_Central America and Caribbean,1.7512,0.094,18.640,0.000,1.567,1.935
Reg_South America,1.4927,0.107,13.977,0.000,1.283,1.702
Under_five_deaths,-17.7545,0.314,-56.507,0.000,-18.371,-17.138
GDP_per_capita,2.3195,0.269,8.635,0.000,1.793,2.846
Reg_Oceania,-1.1446,0.119,-9.611,0.000,-1.378,-0.911
Reg_European Union,-0.9805,0.138,-7.115,0.000,-1.251,-0.710

0,1,2,3
Omnibus:,36.702,Durbin-Watson:,2.037
Prob(Omnibus):,0.0,Jarque-Bera (JB):,54.35
Skew:,0.168,Prob(JB):,1.58e-12
Kurtosis:,3.676,Cond. No.,28.6


In [43]:
# In-sample RMSE for round four, computed on the training rows.
y_pred_4 = results4.predict(X_train_4)
rmse = statsmodels.tools.eval_measures.rmse(y_train, y_pred_4)
print(rmse)

1.2232193602242891


## Testing

In [44]:
# Encode Region on the test split, then align the result to the training columns.
# Encoding the split on its own with drop_first=True is fragile: if the test rows lack a
# region, or their first region level differs from training, the 'Reg_*' columns would not
# match what the scaler and the model were fitted on. So every level is encoded here and
# the frame is reindexed to X_train's columns (already encoded above): the training
# baseline level is discarded and any region absent from the test rows becomes all zeros.
X_test = (
    pd.get_dummies(X_test, columns = ['Region'], prefix = 'Reg', dtype=int)
      .reindex(columns = X_train.columns, fill_value = 0)
)

In [45]:
# Same column list, in the same order, as the one the scaler was fitted on in the
# training section; it is repeated here so the testing section can be read on its own.
num_cols = [
    # health, economic and demographic measures
    'Infant_deaths',
    'Under_five_deaths',
    'Adult_mortality',
    'Alcohol_consumption',
    'Hepatitis_B',
    'Measles',
    'BMI',
    'Polio',
    'Diphtheria',
    'Incidents_HIV',
    'GDP_per_capita',
    'Population_mln',
    'Thinness_ten_nineteen_years',
    'Thinness_five_nine_years',
    'Schooling',
    # economy status flags
    'Economy_status_Developed',
    'Economy_status_Developing',
    # region indicators produced by get_dummies
    'Reg_Asia',
    'Reg_Central America and Caribbean',
    'Reg_European Union',
    'Reg_Middle East',
    'Reg_North America',
    'Reg_Oceania',
    'Reg_Rest of Europe',
    'Reg_South America',
]

In [46]:
# Select the scaler's columns; this raises KeyError if the test encoding lacks any of them.
X_test_minmax = X_test[num_cols]

In [47]:
# Apply the scaler fitted on the training data (transform only, no refit) so the test split
# is scaled with the training minima and maxima. The new frame gets a fresh 0..n-1 index.
X_test_scaled = pd.DataFrame(minmax.transform(X_test_minmax), columns = num_cols)

In [50]:
# Add the 'const' intercept column, mirroring the training preprocessing.
X_test_scaled = sm.add_constant(X_test_scaled)

In [51]:
# Keep only the round-four columns, in the same order results4 was fitted with.
X_test_scaled = X_test_scaled[fe_cols_4]

In [48]:
# Relabel y_test 0..n-1 so it lines up with X_test_scaled, which was rebuilt with a fresh index.
y_test = y_test.reset_index(drop = True)

In [52]:
# Out-of-sample RMSE: the round-four model is scored on the held-out test rows.
y_test_pred = results4.predict(X_test_scaled)
rmse = statsmodels.tools.eval_measures.rmse(y_test, y_test_pred)
print(rmse)

1.2289241510483786
