<br>

<br>

# Supervised Learning - Codebook

<br>

In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import sklearn.metrics
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

<br>

# Feature Engineering

## Checking Missing Values

In [1]:
def check_null(data_frame):
    """Print the missing-value count of every column that contains nulls.

    Columns without any null entries produce no output. One report (the
    column name, its null count and a separator line) is printed per
    affected column, in column order. Nothing is returned.
    """
    for column_name in data_frame.columns:
        missing_count = data_frame[column_name].isnull().sum()
        if missing_count == 0:
            # Fully populated column: nothing to report.
            continue
        print("|", column_name, "  ---->  ", missing_count, "\n", "_______________________________")

In [2]:
def null_columns(data_frame):
    """Print and return the names of the columns that contain missing values.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    list
        Column labels with at least one null entry, in column order. The
        same list is also printed, as before; returning it lets callers
        reuse the result instead of copying it from the output.
    """
    null_col_list = [
        col for col in data_frame.columns if data_frame[col].isnull().sum() != 0
    ]
    print(null_col_list)
    return null_col_list

## Checking and Dropping Duplicates

In [None]:
df.duplicated().sum()

In [None]:
# Rebind instead of mutating in place, so the step is an explicit assignment.
df = df.drop_duplicates()

## Heatmap

In [None]:
# Correlation is only defined for numeric data. Restricting the frame to
# numeric/bool columns keeps this cell working while df still contains
# categorical columns (DataFrame.corr() raises on those in pandas >= 2.0).
plt.figure(figsize=(10, 6))
sns.heatmap(df.select_dtypes(include=["number", "bool"]).corr(), annot=True)
plt.show()

<br>

## Get Dummies

In [None]:
# get_dummies returns a new frame; keep the result so the encoded columns are
# what the following cells work with, then preview a few rows.
df = pd.get_dummies(df, drop_first=True)
df.head()

# Train - Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Features: every column of df except "sales".
X = df.drop(columns ="sales")
# Target: the "sales" column.
y = df["sales"]

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Report the size of each of the four resulting pieces.
print("Train features shape : ", X_train.shape)
print("Train target shape   : ", y_train.shape)
print("Test features shape  : ", X_test.shape)
print("Test target shape    : ", y_test.shape)

# Linear Regression

In [None]:
model = LinearRegression()

In [None]:
model.fit(X_train, y_train)

In [None]:
model.coef_

In [None]:
model.intercept_

In [None]:
y_pred = model.predict(X_test)

In [None]:
# Side-by-side table of the true test targets, the predictions and their
# difference (actual minus predicted).
my_dict = {"Actual":y_test, "pred": y_pred, 'residuals': y_test-y_pred}
compare = pd.DataFrame(my_dict)
compare

In [None]:
residuals = y_test-y_pred

In [None]:
plt.figure(figsize=(10, 6))
sns.kdeplot(residuals)

In [None]:
plt.figure(figsize=(10, 6))
stats.probplot(residuals, plot=plt);

In [None]:
model.score(X_train, y_train)

In [None]:
# Score on the held-out split; the previous cell already reports the training score.
model.score(X_test, y_test)

In [1]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [5]:
def eval_metric(actual, pred):
    """Print R2, MAE, MSE and RMSE for a set of predictions.

    Parameters
    ----------
    actual : array-like
        Ground-truth target values.
    pred : array-like
        Predicted target values, passed to the metric functions together
        with ``actual``.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    # Call the metric functions imported by name. The bare name ``metrics``
    # is not bound by the visible imports (``import sklearn.metrics`` only
    # binds ``sklearn``), so ``metrics.mean_absolute_error`` raised NameError.
    mae = mean_absolute_error(actual, pred)
    mse = mean_squared_error(actual, pred)
    rmse = np.sqrt(mse)  # reuse mse rather than computing it a second time
    score = r2_score(actual, pred)
    return print("r2_score:", score, "\n", "mae:", mae, "\n", "mse", mse, "\n", "rmse", rmse)

In [None]:
from yellowbrick.regressor import PredictionError
# Instantiate the linear model and visualizer.
# The `normalize` argument was removed from LinearRegression in
# scikit-learn 1.2, so the estimator is created with its defaults, and that
# same estimator is the one handed to the visualizer.
lm = LinearRegression()
visualizer = PredictionError(lm)
visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
visualizer.show();

<br>

### Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler = MinMaxScaler()

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to the test split.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

<br>

## Ridge

In [None]:
ridge_model = Ridge()
ridge_model.fit(X_train_scaled, y_train)
y_pred = ridge_model.predict(X_test_scaled)

In [None]:
eval_metric(y_test,y_pred)

In [None]:
ridge_model.alpha

In [None]:
alpha_space = np.linspace(0.1, 20, 100)
alpha_space

In [None]:
ridgecv = RidgeCV(alphas=alpha_space, cv=10)
ridgecv.fit(X_train_scaled, y_train)

In [None]:
ridgecv.alpha_

In [None]:
from yellowbrick.regressor import ManualAlphaSelection
# Create a list of alphas to cross-validate against
alpha_space = np.linspace(0.01, 10, 100)
# Instantiate the visualizer
visualizer = ManualAlphaSelection(
    Ridge(),
    alphas=alpha_space,
    cv=10
)
visualizer.fit(X_train_scaled, y_train)
visualizer.show();

In [None]:
# Predict with the cross-validated ridge model and report its test metrics,
# mirroring what the Lasso section does for its CV model.
y_pred=ridgecv.predict(X_test_scaled)
eval_metric(y_test,y_pred)

<br>

## Lasso

In [None]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV

In [None]:
lasso_model = Lasso()
lasso_model.fit(X_train_scaled, y_train)
y_pred = lasso_model.predict(X_test_scaled)
eval_metric(y_test,y_pred)

In [None]:
alpha_space = np.linspace(0.001, 10, 100)
lasso_cv_model = LassoCV(alphas = alpha_space, cv = 10).fit(X_train_scaled, y_train)

In [None]:
y_pred=lasso_cv_model.predict(X_test_scaled)
eval_metric(y_test,y_pred)

In [None]:
lasso_cv_model.alpha_

In [None]:
def evaluate_model(Model, lambdas):
    """Fit ``Model(alpha=...)`` once per value in ``lambdas`` and collect MSEs.

    Every estimator is trained on the notebook-level ``X_train_scaled`` /
    ``y_train`` and scored with ``mean_squared_error`` on both that training
    data and on ``X_test_scaled`` / ``y_test``.

    Returns a ``(training_errors, testing_errors)`` pair of lists, each
    holding one MSE per entry of ``lambdas``, in iteration order.
    """
    train_mse_per_alpha = []
    test_mse_per_alpha = []

    for alpha in lambdas:
        estimator = Model(alpha=alpha)
        estimator.fit(X_train_scaled, y_train)

        # Error on the data the estimator was fitted on.
        train_mse_per_alpha.append(
            mean_squared_error(y_train, estimator.predict(X_train_scaled))
        )

        # Error on the held-out split.
        test_mse_per_alpha.append(
            mean_squared_error(y_test, estimator.predict(X_test_scaled))
        )

    return train_mse_per_alpha, test_mse_per_alpha

In [None]:
def plot_errors(lambdas, train_errors, test_errors, title):
    """Plot training and testing MSE against the regularisation strength.

    Parameters
    ----------
    lambdas : sequence of float
        Alpha values, used as the x axis.
    train_errors, test_errors : sequence of float
        One MSE per alpha, as returned by ``evaluate_model``.
    title : str
        Figure title.
    """
    plt.figure(figsize=(10, 6))
    plt.plot(lambdas, train_errors, label="train")
    plt.plot(lambdas, test_errors, label="test")
    plt.xlabel("alpha")
    plt.ylabel("MSE")
    plt.title(title)
    plt.legend()
    plt.show()


# plot_errors is not defined in the visible part of the notebook, so it is
# defined above before it is called.
lambdas = np.arange(0.01, 10, step=0.1)
lasso_train, lasso_test = evaluate_model(Lasso, lambdas)
plot_errors(lambdas, lasso_train, lasso_test, "Lasso")

<br>

## KNN Regression

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
knn = KNeighborsRegressor()

In [None]:
knn.fit(X_train_scaled, y_train)

In [None]:
y_pred = knn.predict(X_test_scaled)

In [None]:
eval_metric(y_test, y_pred)

In [None]:
# RMSE on the test split for every neighbourhood size from 1 to 20.
# NOTE(review): K is compared on the test split here; consider
# cross-validation on the training data for choosing it.
rmse_val = []
for K in range(1, 21):
    # Use a dedicated name so the notebook-level `model` (the fitted linear
    # regression) is not overwritten by this sweep.
    knn_model = KNeighborsRegressor(n_neighbors = K)

    knn_model.fit(X_train_scaled, y_train)
    pred = knn_model.predict(X_test_scaled)
    error = np.sqrt(mean_squared_error(y_test, pred))
    rmse_val.append(error) #store rmse values
    print('RMSE value for k= ' , K , 'is:', error)

In [None]:
knn = KNeighborsRegressor(n_jobs=-1, n_neighbors=1)

In [None]:
knn.fit(X_train_scaled, y_train)

In [None]:
y_pred = knn.predict(X_test_scaled)

In [None]:
eval_metric(y_test, y_pred)