In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from  sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler


In [38]:
data = pd.read_csv("firm.csv")

In [56]:
def prepare_data():
    data = pd.read_csv("firm.csv")
    data = data.replace(" ", np.nan)
    data_len = len(data)
    
    x = data.drop('INDOUTPT', axis=1)
    
    y = data['INDOUTPT']

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.33, random_state=42)
    return x_train, x_test, y_train, y_test

In [72]:
def scale_data(x_train, x_test, scaler=StandardScaler):

    scaler = scaler()
    scaler.fit(x_train)

    x_train_scaled = scaler.transform(x_train)
    x_train_scaled = pd.DataFrame(x_train_scaled, index=x_train.index, columns=x_train.columns)

    x_test_scaled = scaler.transform(x_test)
    x_test_scaled = pd.DataFrame(x_test_scaled, index=x_test.index, columns=x_test.columns)

    x_train_scaled.fillna(x_train_scaled.mean(), inplace=True)
    x_test_scaled.fillna(x_test_scaled.mean(), inplace=True)

    return x_train_scaled, x_test_scaled



In [89]:
def evaluate_regression(y_true_train, y_pred_train, y_true_test, y_pred_test, model_name=""):   
    return pd.DataFrame.from_records([[metrics.mean_squared_error(y_true_train, y_pred_train),
                                       metrics.mean_absolute_error(y_true_train, y_pred_train),
                                       metrics.max_error(y_true_train, y_pred_train),
                                       metrics.r2_score(y_true_train, y_pred_train), 
                                      
                                      metrics.mean_squared_error(y_true_test, y_pred_test),
                                       metrics.mean_absolute_error(y_true_test, y_pred_test),
                                       metrics.max_error(y_true_test, y_pred_test),
                                       metrics.r2_score(y_true_test, y_pred_test)]], 
                                     
                                     index=[model_name], 
                                     columns=['mean_squared_error_train', 'mean_absolute_error_train', 'max_error_train', "r2_score_train",
                                         'mean_squared_error_test', 'mean_absolute_error_test', 'max_error_test', "r2_score_test"])


In [90]:
x_train, x_test, y_train, y_test = prepare_data()
x_train, x_test = scale_data(x_train, x_test, scaler=StandardScaler)

In [91]:
results = pd.DataFrame()
for model in [linear_model.LinearRegression(), linear_model.RidgeCV(), linear_model.LassoCV()]:
    model.fit(x_train, y_train)

    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    results = results.append(evaluate_regression(y_train, y_train_pred, y_test, y_test_pred,
                        model_name=type(model)))


In [92]:
results

Unnamed: 0,mean_squared_error_train,mean_absolute_error_train,max_error_train,r2_score_train,mean_squared_error_test,mean_absolute_error_test,max_error_test,r2_score_test
<class 'sklearn.linear_model._base.LinearRegression'>,0.219139,0.369343,1.728156,0.997732,0.222263,0.376802,1.650439,0.997838
<class 'sklearn.linear_model._ridge.RidgeCV'>,0.219151,0.369283,1.726612,0.997732,0.222506,0.377012,1.649628,0.997835
<class 'sklearn.linear_model._coordinate_descent.LassoCV'>,0.2274,0.380265,1.685174,0.997647,0.230479,0.387552,1.600265,0.997758
