In [5]:
import pandas as pd
import numpy as np
import linmodules as lm
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [6]:
# Drop an "lmodels" attribute left on pd.DataFrame by an earlier run of this
# notebook (presumably so the accessor registered below can be re-created
# cleanly). `del` on a class only succeeds for attributes stored in the class's
# own namespace, so that namespace is what is checked here.
if "lmodels" in vars(pd.DataFrame):
    del pd.DataFrame.lmodels

In [7]:
@pd.api.extensions.register_dataframe_accessor("lmodels")
class LinearModels:
    """
        Linear-regression helpers exposed on a DataFrame as ``df.lmodels``.

        Every model is trained on standardized data (``StandardScaler``), so
        predictions and coefficients are expressed in standardized units; the
        fitted scalers are returned so callers can transform new data or map
        results back to the original units.
    """

    def __init__(self, pandas_obj: pd.DataFrame) -> None:
        # Keep a reference to the DataFrame the accessor was reached through.
        self._df = pandas_obj

    # ------------------------------------------------------------------
    # Private helpers shared by the public methods.
    # ------------------------------------------------------------------
    @staticmethod
    def _as_column(values):
        """Return a 1D array reshaped to a single column; leave 2D input as is."""
        return values.reshape(-1, 1) if values.ndim == 1 else values

    @staticmethod
    def _split_and_scale(X, y) -> tuple:
        """Split X/y into train and test parts and standardize both parts.

        The scalers are fitted on the training part only, so no statistics of
        the held-out rows leak into the data the model is trained on.
        ``train_test_split`` is called without ``random_state``; the split
        therefore differs from call to call.

        Returns: (sc_x, sc_y, X_train, X_test, y_train, y_test), with the four
        arrays already standardized.
        """
        X_train, X_test, y_train, y_test = train_test_split(X, y)
        sc_x = StandardScaler().fit(X_train)
        sc_y = StandardScaler().fit(y_train)
        return (
            sc_x,
            sc_y,
            sc_x.transform(X_train),
            sc_x.transform(X_test),
            sc_y.transform(y_train),
            sc_y.transform(y_test),
        )

    def _fit_linear(self, X, y) -> tuple:
        """Train a ``LinearRegression`` on the standardized training split."""
        sc_x, sc_y, X_train, X_test, y_train, y_test = self._split_and_scale(X, y)

        model = LinearRegression()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        return sc_x, sc_y, model, y_test, y_pred

    # ------------------------------------------------------------------
    # Public API.
    # ------------------------------------------------------------------
    def multilinear(self, xcols, ycol) -> tuple:
        """
            Fit a multiple linear regression on a train/test split of the dataframe.

            Parameters
            ------------
            xcols: names of the predictor columns; a list for several columns,
                   a single column name is accepted as well,
            ycol: name of the column to predict.

            Returns
            ------------
            (sc_x, sc_y, model, y_test, y_pred): the fitted x and y scalers, the
            fitted model, and the standardized true and predicted test values,
            from which a residual plot can be built.
        """
        # A single column name selects a 1D Series; the scaler needs 2D input.
        X = self._as_column(self._df[xcols].values)
        y = self._df[[ycol]].values

        return self._fit_linear(X, y)

    def single_linear(self, xcol, ycol) -> tuple:
        """
            Fit a simple (one predictor) linear regression on a train/test split.

            Parameters
            ------------
            xcol: name of the single predictor column,
            ycol: name of the column to predict.

            Returns
            ------------
            (sc_x, sc_y, model, y_test, y_pred): the fitted x and y scalers, the
            fitted model, and the standardized true and predicted test values,
            from which a residual plot can be built.
        """
        X = self._df[xcol].values.reshape(-1, 1)
        y = self._df[ycol].values.reshape(-1, 1)

        return self._fit_linear(X, y)

    def linear_adjust_plot(self, xcol, ycol, **plot_dict) -> None:
        """
            Plot a linear regression over the standardized x, y data to observe
            graphically how well the model fits.

            Parameters
            ------------
            xcol: name of the 1D column used as predictor,
            ycol: name of the 1D column to predict,
            **plot_dict: optional plot context; recognized keys are "title",
                         "xlabel_name" and "ylabel_name". Each one is applied
                         independently when present.

            The whole dataframe is used for the fit (no train/test split) and
            the figure is shown with ``plt.show()``; nothing is returned.
        """
        X = self._df[xcol].values.reshape(-1, 1)
        Y = self._df[ycol].values.reshape(-1, 1)

        # Normalizing values
        X_std = StandardScaler().fit_transform(X)
        Y_std = StandardScaler().fit_transform(Y)

        # Training model
        slr = LinearRegression()
        slr.fit(X_std, Y_std)

        # Plotting the normalized linear adjust
        # (lin_regplot comes from the project's linmodules; presumably it
        # draws the scatter and the fitted line -- confirm there).
        lm.lin_regplot(X_std, Y_std, slr)

        # Apply each label on its own so that omitting one key (e.g. the
        # title) does not discard the others.
        label_setters = (
            ("title", plt.title),
            ("xlabel_name", plt.xlabel),
            ("ylabel_name", plt.ylabel),
        )
        for key, setter in label_setters:
            if key in plot_dict:
                setter(plot_dict[key])

        plt.tight_layout()
        plt.grid(alpha=0.4)
        plt.show()

    def ols_model(self, x_cols, y_col) -> tuple:
        """
            Fit a linear or multilinear ordinary least squares (OLS) model with
            statsmodels on a train/test split of the dataframe.

            Parameters
            ------------
            x_cols: predictor column name, or a list of names for the
                    multivariate case,
            y_col: name of the column to predict.

            Returns
            ------------
            (sc_x, sc_y, X_test_sm, results_ols): the fitted x and y scalers,
            the standardized test predictors with the intercept column
            prepended, and the fitted statsmodels results object.

            Raises
            ------------
            KeyError: when x_cols or y_col cannot be found in the dataframe.
        """
        # Init variables.
        try:
            X = self._df[x_cols].values
            y = self._df[y_col].values
        except KeyError as err:
            # Fail here with the hint instead of continuing with undefined
            # variables.
            raise KeyError(
                f"Column(s) not found in dataframe: {err}. "
                "Enter your x_cols as a list, in case of being multivariate."
            ) from err

        # X and y are checked independently: with a single predictor both are
        # 1D and both must become column vectors for the scalers.
        X = self._as_column(X)
        y = self._as_column(y)

        # Normalizing variables.
        sc_x, sc_y, X_train, X_test, y_train, y_test = self._split_and_scale(X, y)

        # Testing the model.
        # has_constant="add" forces the intercept column even when a split
        # happens to contain a zero-variance column (e.g. a one-row test
        # split); the default "skip" would return the data without it and the
        # train/test design matrices would no longer have matching widths.
        X_train_sm = sm.add_constant(X_train, has_constant="add")
        X_test_sm = sm.add_constant(X_test, has_constant="add")

        model_ols = sm.OLS(y_train, X_train_sm)
        results_ols = model_ols.fit()

        return sc_x, sc_y, X_test_sm, results_ols