In [285]:
# Imports:
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

In [286]:
# Notebook-wide settings read by the cells below.

# Column of the prepared table that the regression explains.
# Known choices: 'Education Index' or 'Income Index'.
explained_variable = 'Income Index'

# Row positions handed to `separte_by_year`, which uses them as slice bounds (`df[start:end]`).
# The original notes label 30 as 1990 and 59 as 2018 -- TODO confirm against the workbook,
# since a slice stops one position before its end bound.
start_year_index = 30
end_year_index = 59

# Workbook loaded by the main cell. Alternatives listed by the original author:
# 'ridge_regression_database_education_index.xls',
# 'ridge_regression_database_income_index.xls' or 'ridge_regression.xls'.
xls_file = 'ridge_regression_database_income_index.xls'

In [287]:
# Classes:
class Dataset_handler:
    """Stateless collection of small pandas helpers used to prepare the dataset.

    None of the methods read or write instance attributes; each one takes a
    DataFrame (or a file name) and returns the result of a single pandas call.
    """

    def __init__(self):
        """Nothing to set up: the handler keeps no state."""

    def read_excel(self, file_name):
        """Load `file_name` with `pd.read_excel` (default options) and return the frame."""
        loaded = pd.read_excel(file_name)
        return loaded

    def transpose_df(self, df):
        """Transpose `df` and promote the first transposed row to column labels.

        The row that supplied the labels is then dropped (by its index label),
        so it does not appear in the returned frame.
        """
        flipped = df.T
        header_row = flipped.iloc[0]
        flipped.columns = header_row
        first_label = flipped.index[0]
        return flipped.drop(first_label)

    def separte_by_year(self, df, start, end):
        """Return the rows selected by the slice `df[start:end]`."""
        window = slice(start, end)
        return df[window]

    def nan_to_mean(self, df, axis):
        """Replace NaNs with the mean of the Series they belong to.

        `axis` is passed straight to `DataFrame.apply`: 0 processes each column,
        1 processes each row.
        """
        def fill_with_own_mean(series):
            return series.fillna(series.mean())

        return df.apply(fill_with_own_mean, axis=axis)

    def drop_column(self, df, column):
        """Return `df` without the column(s) found at position `column` of `df.columns`."""
        doomed = df.columns[column]
        return df.drop(columns=doomed)

    def plotter(self, df):
        """Draw a scatter matrix of `df` and return what `scatter_matrix` returns."""
        axes = pd.plotting.scatter_matrix(df)
        return axes

    def df_to_csv(self, df, file_name):
        """Write `df` as CSV to `file_name`; returns whatever `DataFrame.to_csv` returns."""
        return df.to_csv(file_name)

    
class Regression():
    """Ridge-regression helper: splits a dataset, fits the model and prints a report.

    Attributes (each exists only after the method that sets it has run):
        rdg: the `Ridge` estimator created in `__init__`.
        x, y: explanatory columns and explained column selected by `split_data`.
        x_train, x_test, y_train, y_test: the four parts returned by `train_test_split`.
        reg: the object returned by `rdg.fit(...)` in `print_ridge_fit`.
        eq: the equation text printed by `print_ridge_fit`.
    """

    def __init__(self, alpha):
        """Create the estimator.

        Args:
            alpha: forwarded unchanged to `Ridge(alpha=...)`.
        """
        self.rdg = Ridge(alpha=alpha)

    def split_data(self, df, target=None, test_size=0.3, random_state=0):
        """Split `df` into explanatory/explained parts and into train/test sets.

        The frame that is split is the `df` argument itself, not a notebook
        global, so the method gives the same answer on a fresh kernel and can
        be reused with any frame.

        Args:
            df: DataFrame holding the explanatory columns and the target column.
            target: label of the explained column. When omitted, the
                notebook-level `explained_variable` setting is used.
            test_size: forwarded to `train_test_split` (default 0.3).
            random_state: forwarded to `train_test_split` (default 0).

        Returns:
            The tuple `(x_train, x_test, y_train, y_test)`; the same objects
            are also stored on the instance.

        Raises:
            KeyError: from pandas, if `target` is not a column of `df`.
        """
        if target is None:
            # Looked up at call time so a later change of the notebook setting is honoured.
            target = explained_variable
        self.x = df.drop(columns=[target])
        self.y = df[target]
        parts = train_test_split(self.x, self.y, test_size=test_size, random_state=random_state)
        self.x_train, self.x_test, self.y_train, self.y_test = parts
        return self.x_train, self.x_test, self.y_train, self.y_test

    def print_ridge_fit(self, x_train, x_test, y_train, y_test):
        """Fit the ridge model on the training data and print a textual report.

        Printed, in order: intercept, coefficients, predictions for `x_test`,
        the values of `y_test`, the result of `score(x_test, y_test)` (R^2 for
        scikit-learn regressors) and a one-line equation pairing every item
        yielded by iterating `x_train` with its coefficient.

        Args:
            x_train, y_train: data the estimator is fitted on.
            x_test, y_test: data used for the prediction and score lines.

        Returns:
            The object returned by `Ridge.fit` (also stored as `self.reg`).
        """
        # scikit-learn's `fit` returns the estimator itself, so `self.reg` and
        # `self.rdg` are the same object; only `self.reg` is used below.
        self.reg = self.rdg.fit(x_train, y_train)
        print(f'Intercept: {self.reg.intercept_}\n')
        print(f'Coefficients: {self.reg.coef_}\n')
        print(f'Predicted Y: {self.reg.predict(x_test)}\n')
        print(f'Real Y: {list(y_test)}\n')
        print(f'Scores: {self.reg.score(x_test, y_test)}\n')
        # Iterating a DataFrame yields its column labels, which are zipped with the coefficients.
        terms = ''.join(
            f' + "{variable}" * {coefficient}'
            for variable, coefficient in zip(x_train, self.reg.coef_)
        )
        self.eq = f'Equation: y = {self.reg.intercept_}' + terms
        print(self.eq)
        return self.reg

    def print_xs_and_ys(self, x_train, x_test, y_train, y_test):
        """Print the training and testing inputs and targets between banner lines.

        The four arguments are printed with plain `print`, in the order
        x_train, y_train, x_test, y_test. Nothing is returned.
        """
        banner = '###########################################################################'
        print('\n' + banner)
        print('Datasets used for training:\n')
        print('X dataset:\n')
        print(x_train)
        print('\nY series:\n')
        print(y_train)
        print(banner)
        print('Datasets used for testing:\n')
        print('X dataset:\n')
        print(x_test)
        print('\nY series:\n')
        print(y_test)
        print(banner)


In [288]:
# Helper objects: the ridge model (alpha = 0.5) and the data-preparation handler.
regression = Regression(0.5)
dataset_handler = Dataset_handler()

# Data preparation: load the workbook, then derive the modelling table in one chain.
brazilian_indices = dataset_handler.read_excel(xls_file)
treated_dataset = (
    dataset_handler.transpose_df(brazilian_indices)
    .pipe(dataset_handler.separte_by_year, start_year_index, end_year_index)
    .pipe(dataset_handler.nan_to_mean, 0)  # axis 0: each column's NaNs become that column's mean.
    .dropna(axis=1)  # Discards every column that still holds a NaN after the fill.
)

# Modelling: split the table, fit and report the ridge model, then echo the split data.
x_train, x_test, y_train, y_test = regression.split_data(treated_dataset)
reg = regression.print_ridge_fit(x_train, x_test, y_train, y_test)
regression.print_xs_and_ys(x_train, x_test, y_train, y_test)

Intercept: 0.6479846775989255

Coefficients: [-5.68159099e-11 -5.20897718e-04 -1.28444749e-04  5.82925163e-05
 -7.94126397e-04  1.19105898e-05 -2.02246291e-15 -3.06361974e-05
 -1.99377959e-04]

Predicted Y: [0.69509653 0.74852392 0.76184713 0.71407194 0.71090667 0.7530639
 0.74652908 0.74522986 0.70776281]

Real Y: [0.6940000000000001, 0.7490000000000001, 0.76, 0.713, 0.711, 0.752, 0.746, 0.745, 0.7090000000000001]

Scores: 0.9980464878851263

Equation: y = 0.6479846775989255 + "Share of tariff lines with specific rates, primary products (%)" * -5.681590985040137e-11 + "Unemployment with advanced education (% of total labor force with advanced education)" * -0.0005208977175195226 + "Labor force with intermediate education, female (% of female working-age population with intermediate education)" * -0.00012844474917968852 + "Literacy rate, adult total (% of people ages 15 and above)" * 5.829251633903036e-05 + "Population growth (annual %)" * -0.000794126397369074 + "Benefit incidence of 

  return linalg.solve(A, Xy, sym_pos=True,
