# Linear Regression: Statsmodels vs scikit-learn


## Today's Goals:

- Showcase the differences between the different implementations of ordinary least squares regression

### First: Set Up

In [None]:
# Basic imports
import numpy as np
import pandas as pd
# Data visualizations
import matplotlib.pyplot as plt
import seaborn as sns
# Pre-Processing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Metrics
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

Credit data from https://www.kaggle.com/avikpaul4u/credit-card-balance

Target: `Balance`

In [None]:
# Load the credit data, keeping only the five predictors and the target
credit_columns = ['Income', 'Limit', 'Rating', 'Cards', 'Age', 'Balance']
df = pd.read_csv('data/Credit.csv', usecols=credit_columns)

In [None]:
# Peek at the first rows of the loaded frame
df.head()

In [None]:
# Summary statistics of the loaded columns (describe() covers numeric columns by default)
df.describe()

In [None]:
# Split the frame into the target series (y) and the predictor frame (X)
target_col = 'Balance'
y = df[target_col]
X = df.drop(columns=target_col)

In [None]:
# Hold out 33% of the rows for testing.
# The fixed random_state makes the split reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Standardize the features.
# The scaler is fit on the training split only, then the same fitted
# transformation is applied to both the training and the testing data.
scaler = StandardScaler().fit(X_train)


def _scaled_frame(features):
    """Transform `features` with the fitted scaler, keeping its columns and index."""
    return pd.DataFrame(scaler.transform(features),
                        columns=features.columns,
                        index=features.index)


# transform() hands back plain arrays, so wrap the results back into dataframes
X_train_scaled = _scaled_frame(X_train)
X_test_scaled = _scaled_frame(X_test)

X_train_scaled.head()

## Statsmodels' `ols`

Aka the formula version

In [None]:
# Import
from statsmodels.formula.api import ols

In [None]:
# The formula interface reads predictors and target from one frame, so
# attach each target series to its scaled feature frame column-wise.
# (Straightforward here because the scaled X data was turned back into dataframes.)
train_pieces = [X_train_scaled, y_train]
test_pieces = [X_test_scaled, y_test]

train_df_scaled = pd.concat(train_pieces, axis='columns')
test_df_scaled = pd.concat(test_pieces, axis='columns')

train_df_scaled.head()

In [None]:
# Build the formula: target on the left of '~', all X variables on the right
predictors = ['Income', 'Limit', 'Rating', 'Cards', 'Age']
formula = 'Balance ~ ' + ' + '.join(predictors)
# Equivalent, without spelling the column names out:
# formula = 'Balance ~ ' + ' + '.join(X_train.columns)
formula

In [None]:
# First describe the model (formula + training frame), then estimate it
ols_spec = ols(formula=formula, data=train_df_scaled)
model_ols = ols_spec.fit()

In [None]:
# Display the regression summary of the formula-based fit
model_ols.summary()

## Statsmodels' `OLS`

Aka the X vs y version

In [None]:
# Import
import statsmodels.api as sm

In [None]:
# This version takes X_train_scaled and y_train directly.
# sm.OLS does not include an intercept on its own, so a constant column
# is added to the design matrix before fitting.
X_train_const = sm.add_constant(X_train_scaled)
model_OLS = sm.OLS(endog=y_train, exog=X_train_const).fit()

In [None]:
# Display the regression summary of the array-based (endog/exog) fit
model_OLS.summary()

## And Now - SKLearn!

Aka the no-summary version

In [None]:
# Import
from sklearn.linear_model import LinearRegression

In [None]:
# Create the (unfitted) estimator.
# fit_intercept=True is the default, written out here because it is the
# scikit-learn counterpart of adding a constant by hand.
model_sk = LinearRegression(fit_intercept=True)

In [None]:
# Estimate the model on the scaled training features and the training target
model_sk.fit(X=X_train_scaled, y=y_train)

In [None]:
# R2 on the training data, straight from the fitted estimator
model_sk.score(X=X_train_scaled, y=y_train)

In [None]:
# Alternative route to the same metric: predict on the training data,
# then score those predictions against the true values
train_preds = model_sk.predict(X=X_train_scaled)

r2_score(y_true=y_train, y_pred=train_preds)

In [None]:
# Check the fitted coefficients (displayed without any feature names)
model_sk.coef_

In [None]:
# Pair each coefficient with the name of its column to make them readable
{column: coef for column, coef in zip(X_train.columns, model_sk.coef_)}