In [None]:
import plotly.express as px
import pandas as pd
import seaborn as sns
import numpy as np
import plotly
import matplotlib.pyplot as plt
import scipy
# Plotly Express defaults applied to every px figure created below:
# seaborn template, 700x500 canvas.
px.defaults.template = 'seaborn'
px.defaults.width = 700
px.defaults.height = 500
# Let pandas render up to 30 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 30)

import warnings
# NOTE(review): action='ignore' with no category silences every warning for the rest of
# the session, deprecation warnings from pandas / scikit-learn included.
warnings.simplefilter(action='ignore')

In [None]:
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import cross_val_score

In [None]:
# Read the raw automobile price table from the working directory and display it.
raw_csv_path = 'Automobile price data _Raw_.csv'
df = pd.read_csv(raw_csv_path)
df

In [None]:
#Cleaning the data and encoding the ordinal columns
# '?' marks a missing value in the raw file. Every step assigns its result back explicitly:
# in-place calls on a selected column (df[col].replace(..., inplace=True)) are chained
# assignments, which do not update df once pandas Copy-on-Write is active.

# normalized-losses: convert to numbers first so the median is computed on numeric data
# (not on numeric strings), then impute the missing entries and store as integers.
losses = pd.to_numeric(df['normalized-losses'].replace('?', np.nan))
df['normalized-losses'] = losses.fillna(losses.median()).astype('int')

# num-of-doors: drop rows where it is missing, then encode the words as integers.
# .copy() makes the filtered frame independent, so the column assignments below do not
# write into a slice of the previous frame (SettingWithCopy).
doors_known = df['num-of-doors'].replace('?', np.nan).notna()
df = df[doors_known].copy()
# pd.to_numeric guarantees a numeric dtype and fails loudly on an unmapped word.
df['num-of-doors'] = pd.to_numeric(df['num-of-doors'].replace(['four', 'two'], [4, 2]))

df['num-of-cylinders'] = pd.to_numeric(
    df['num-of-cylinders'].replace(['four', 'six', 'five', 'eight', 'two', 'twelve', 'three'],
                                   [4, 6, 5, 8, 2, 12, 3]))

# bore: rows with an unknown bore are removed.
df = df[df['bore'] != '?'].copy()
df['bore'] = df['bore'].astype('float')

# stroke is parsed strictly (no '?' handling of its own).
# NOTE(review): presumably its missing rows coincide with the bore rows dropped above - confirm.
df['stroke'] = pd.to_numeric(df['stroke'])

# horsepower: rows with an unknown horsepower are removed.
hp_known = df['horsepower'].replace('?', np.nan).notna()
df = df[hp_known].copy()
df['horsepower'] = pd.to_numeric(df['horsepower'])

df['peak-rpm'] = df['peak-rpm'].astype('float')

# price is the prediction target: rows without a price are removed.
df = df[df['price'] != '?'].copy()
df['price'] = df['price'].astype('float')
df

In [None]:
# Number of distinct non-null values per column, paired with each column's dtype,
# sorted ascending and drawn as a bar chart coloured by dtype.
value_counts = df.nunique()
df_value_counts = pd.DataFrame([value_counts, df.dtypes]).T
df_value_counts.columns = ['unique_values', 'dtype']
df_value_counts = df_value_counts.sort_values(by='unique_values')
fig = px.bar(
    df_value_counts,
    x=df_value_counts.index,
    y='unique_values',
    color='dtype',
)
fig.show(renderer='png')

In [None]:
# Separate the target column (price) from the features, working on a copy so df stays intact.
X = df.copy()
y = X.pop('price')
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# 80/20 hold-out split. A fixed random_state makes the split identical on every
# Restart & Run All; without it, every downstream train/test score changes between runs.
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size = 0.2, random_state=42)

In [None]:
#Data Distribution: histogram of the target y (the price column popped from the features)
y.hist()

In [None]:
from scipy.stats import boxcox
from scipy.stats import skew , kurtosis

def _print_shape_stats(label, values):
    """Print ``label`` followed by the skewness and kurtosis of ``values``.

    ``values`` may be a Series, a 1-D array or an (n, 1) column array; it is flattened
    first so scipy always returns a scalar. Both statistics are rounded to 2 decimals.
    """
    flat = np.ravel(values)
    print(label)
    print('skew: ', round(skew(flat), 2))
    print('kurtosis', round(kurtosis(flat), 2))


# Compare how far the target is from symmetric under several candidate transforms.
_print_shape_stats('no transform', y)
print()

y_ = np.log(y)
_print_shape_stats('log transform', y_)
print()

y_ = np.sqrt(y)
_print_shape_stats('sqrt transform', y_)
print()

from sklearn.preprocessing import PowerTransformer
trans = PowerTransformer(method='box-cox')
# PowerTransformer expects a 2-D column. y[:, None] is multi-dimensional indexing on a
# Series, which pandas 2.x rejects, so reshape the underlying array instead.
y_ = trans.fit_transform(y.to_numpy().reshape(-1, 1))
_print_shape_stats('box-cox transformation', y_)

In [None]:
#carrying box cox transformation forward
# The transformer is fitted on the training target only and then reused, unchanged, on
# the test target. Each histogram opens its own figure; plt.title labels the latest one.
trans = PowerTransformer(method='box-cox')
# Reshape the underlying array to an (n, 1) column: y_train[:, None] is multi-dimensional
# indexing on a Series, which pandas 2.x rejects.
y_train_trans = trans.fit_transform(y_train.to_numpy().reshape(-1, 1))
pd.DataFrame(y_train_trans).hist()
plt.title('train')
y_test_trans = trans.transform(y_test.to_numpy().reshape(-1, 1))
pd.DataFrame(y_test_trans).hist()
plt.title('test')

In [None]:
def evaluate_model(model, X_train, y_train, X_test, y_test, show_residuals=False):
    """Print train/test R^2 for a fitted model and plot predictions against actual values.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_train, y_train : features and target the training-side metrics are computed on.
    X_test, y_test : features and target the test-side metrics are computed on.
    show_residuals : bool, default False
        When True, additionally draw a second figure of residuals (actual - predicted)
        against the actual values for both splits.

    Returns
    -------
    None. The scores are printed and the figures are left on the pyplot state.
    """
    # Predict once per split and reuse the result for the score and for every plot.
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)

    print('Train r2 score: ', r2_score(y_train, pred_train))
    print('Test r2 score: ', r2_score(y_test, pred_test))

    fig, (ax_train, ax_test) = plt.subplots(1, 2, figsize=(12, 6))
    fig.suptitle('Prediction vs Actual')
    sns.scatterplot(x=y_train, y=pred_train, ax=ax_train)
    ax_train.set(title='train set', xlabel='actual', ylabel='predicted')
    sns.scatterplot(x=y_test, y=pred_test, color='red', ax=ax_test)
    ax_test.set(title='test set', xlabel='actual', ylabel='predicted')

    if show_residuals:
        # A flat, evenly spread cloud around zero indicates homoscedastic errors.
        fig, (ax_train, ax_test) = plt.subplots(1, 2, figsize=(12, 6))
        fig.suptitle('Residual Error & homoscedasticity')
        sns.scatterplot(x=y_train, y=(y_train - pred_train), ax=ax_train)
        ax_train.set(title='train set', xlabel='actual', ylabel='residual')
        sns.scatterplot(x=y_test, y=(y_test - pred_test), color='red', ax=ax_test)
        ax_test.set(title='test set', xlabel='actual', ylabel='residual')

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

def _make_dense_onehot_encoder():
    """Return a dense-output OneHotEncoder that ignores categories unseen during fit.

    scikit-learn renamed the ``sparse`` argument to ``sparse_output`` (added in 1.2, old
    name removed in 1.4); try the new spelling first and fall back for older releases.
    """
    try:
        return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        return OneHotEncoder(sparse=False, handle_unknown='ignore')


def _generate_model(regressor):
    """Wrap ``regressor`` in the preprocessing + target-transform pipeline shared by all models.

    Object-dtype columns are one-hot encoded and all other columns are standardised. The
    column lists are resolved from the frame passed to ``fit`` (via make_column_selector)
    instead of from the notebook-level ``X``, so the pipeline works on any frame or column
    subset. The target is box-cox transformed for fitting and mapped back on prediction.
    Step names are part of the interface: grid-search keys such as
    ``model__regressor__alpha`` depend on them.
    """
    from sklearn.compose import make_column_selector

    ct = ColumnTransformer([("onehot", _make_dense_onehot_encoder(), make_column_selector(dtype_include=object)),
                            ("standard_scaler", StandardScaler(), make_column_selector(dtype_exclude=object))])
    model = TransformedTargetRegressor(regressor=regressor, transformer=PowerTransformer(method='box-cox'))
    return Pipeline([('feature_transformation', ct),
                     ('model', model)])


def generate_ridge_model(**kwargs):
    """Return an unfitted pipeline around ``Ridge(**kwargs)``."""
    return _generate_model(Ridge(**kwargs))


def generate_random_forrest_model(**kwargs):
    """Return an unfitted pipeline around ``RandomForestRegressor(**kwargs)``."""
    return _generate_model(RandomForestRegressor(**kwargs))

In [None]:
from sklearn.model_selection import GridSearchCV
# Grid-search the ridge penalty strength with 5-fold CV on the full feature/target set,
# then print the winning parameters and their mean CV score.
alpha_grid = [0.01, 0.1, 0.5, 1, 5, 10, 100, 1000, 10000]
pipe = generate_ridge_model()
parameters = {'model__regressor__alpha': alpha_grid}
gs = GridSearchCV(estimator=pipe, param_grid=parameters, cv=5)
gs.fit(X, y)
for search_result in (gs.best_params_, gs.best_score_):
    print(search_result)

In [None]:
# Ridge with alpha=1: fit on the training split, print the 5-fold CV scores computed on
# the full data, then report R^2 and prediction plots for the train/test split.
pipe = generate_ridge_model(alpha=1)
pipe.fit(X_train, y_train)
ridge_cv_scores = cross_val_score(pipe, X, y, cv=5)
print(ridge_cv_scores)
evaluate_model(pipe, X_train, y_train, X_test, y_test)

In [None]:
# Random-forest grid search, wrapped in a bare string literal so it is not executed.
# NOTE(review): presumably disabled because of its runtime - confirm before re-enabling.
'''
pipe = generate_random_forrest_model()
parameters = {'model__regressor__n_estimators':[10, 50, 100, 200],
              'model__regressor__criterion':['mse', 'mae'],
              'model__regressor__max_depth':[None, 10, 20],
              'model__regressor__min_samples_split' : [2, 4, 8],
              'model__regressor__min_samples_leaf' : [1, 2, 4],
             }
gs = GridSearchCV(pipe, parameters, cv=5)
gs.fit(X, y)
print(gs.best_params_)
print(gs.best_score_)
'''
#Obtained output when the search above was last run (best_params_, then best_score_)
#{'model__regressor__criterion': 'mse', 'model__regressor__max_depth': None, 'model__regressor__min_samples_leaf': 1, 'model__regressor__min_samples_split': 2, 'model__regressor__n_estimators': 10}
#0.6888227121958135

In [None]:
from sklearn.metrics import mean_absolute_error
# Random forest with the hyper-parameters recorded from the grid search.
# criterion is deliberately left at its default: squared error is the default split
# criterion, and the literal name 'mse' is rejected by scikit-learn >= 1.2 (it was renamed
# to 'squared_error'), so omitting it gives the same model on every version.
# random_state pins the otherwise random bootstrap / feature sampling so the scores below
# are reproducible across runs.
pipe = generate_random_forrest_model(n_estimators=10,
                                     max_depth=None,
                                     min_samples_split=2,
                                     min_samples_leaf=1,
                                     random_state=0)
pipe.fit(X_train, y_train)
print(cross_val_score(pipe, X, y, cv=10))
print('Train mae', mean_absolute_error(y_train, pipe.predict(X_train)))
print('Test mae', mean_absolute_error(y_test, pipe.predict(X_test)))
evaluate_model(pipe, X_train, y_train, X_test, y_test)

In [None]:
#Understanding what is going wrong with multiple cross validations
from sklearn.model_selection import KFold
# Rebuild the fifth (index 4) of ten unshuffled folds by hand, refit the current pipeline
# on its training part and inspect how it behaves on the held-out part.
kf = KFold(n_splits=10)
fold_indices = list(kf.split(X))
train_index, test_index = fold_indices[4]

X_train_issue = X.iloc[train_index]
X_test_issue = X.iloc[test_index]
y_train_issue = y.iloc[train_index]
y_test_issue = y.iloc[test_index]
pipe.fit(X_train_issue, y_train_issue)
evaluate_model(pipe, X_train_issue, y_train_issue, X_test_issue, y_test_issue)

In [None]:
#Held-out rows of the problematic fold whose actual price is above 12500
filt = y_test_issue.gt(12500)
X_test_issue.loc[filt]
#All nissan models going wrong

In [None]:
#How many rows each make contributes to the training part of the problematic fold
X_train_issue['make'].value_counts()
#No nissan model available in the training data!

In [None]:
#Stratified split ensuring every make is present in the test data as well as in the train data
# NOTE(review): the two lists below are not referenced again in the visible notebook.
X_stratified_train = []
y_stratified_test = []
def generate_stratified_split(df, target, stratified_on, test_size=0.2, random_state=None):
    """Split ``df`` into train/test parts group by group, so every group lands in both parts.

    Parameters
    ----------
    df : DataFrame holding the features, the target column and the grouping column.
        It is not modified.
    target : name of the column to pop out as the target.
    stratified_on : column (or list of columns) whose distinct values define the groups.
    test_size : forwarded to ``train_test_split`` for every group with more than one row.
    random_state : forwarded to ``train_test_split``; pass an int to make the split
        reproducible. The default ``None`` draws a fresh random split on every call.

    Returns
    -------
    (X_train, X_test, y_train, y_test), each the concatenation of the per-group parts in
    group order.

    Notes
    -----
    A group with a single row cannot be split, so that row is placed in BOTH the train and
    the test part. Test scores are therefore slightly optimistic for such groups.
    """
    X_train_parts, X_test_parts, y_train_parts, y_test_parts = [], [], [], []
    for _, group in df.groupby(stratified_on):
        features = group.copy()
        labels = features.pop(target)
        if len(features) > 1:
            parts = train_test_split(features, labels,
                                     test_size=test_size, random_state=random_state)
        else:
            # Single-row group: duplicate it into both sides (see Notes).
            parts = (features, features, labels, labels)
        for bucket, part in zip((X_train_parts, X_test_parts, y_train_parts, y_test_parts), parts):
            bucket.append(part)
    return (pd.concat(X_train_parts, axis=0),
            pd.concat(X_test_parts, axis=0),
            pd.concat(y_train_parts, axis=0),
            pd.concat(y_test_parts, axis=0))

# Build the make-stratified train/test split of the cleaned data (price is the target).
X_train_s, X_test_s, y_train_s, y_test_s = generate_stratified_split(
    df, target='price', stratified_on='make', test_size=0.2)

In [None]:
# Sanity-check the stratified split: both parts should contain every make.
print('unique make in train set: ', X_train_s.make.nunique())
print('unique make in test set: ', X_test_s.make.nunique())
# Report the shapes of the stratified parts. Printing X_train / X_test here would show
# the earlier random split, which says nothing about the split being checked.
print(X_train_s.shape)
print(X_test_s.shape)

In [None]:
#Final Model
# Ridge (alpha=1) fitted on the make-stratified training rows. The 10-fold CV scores are
# computed on the stratified train and test rows stacked together.
pipe = generate_ridge_model(alpha=1)

pipe.fit(X_train_s, y_train_s)
X_all_s = pd.concat([X_train_s, X_test_s], axis=0)
y_all_s = pd.concat([y_train_s, y_test_s], axis=0)
print(cross_val_score(pipe, X_all_s, y_all_s, cv=10))
train_mae = mean_absolute_error(y_train_s, pipe.predict(X_train_s))
test_mae = mean_absolute_error(y_test_s, pipe.predict(X_test_s))
print('Train mae', train_mae)
print('Test mae', test_mae)
evaluate_model(pipe, X_train_s, y_train_s, X_test_s, y_test_s)

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of every input column for the fitted pipeline, measured on the
# stratified training rows with 10 shuffles per column and a fixed seed.
feature_importance = permutation_importance(
    pipe, X_train_s, y_train_s, n_repeats=10, random_state=0)

In [None]:
# One row per shuffle repeat, one column per input feature; average over the repeats and
# sort ascending so the least important features come first.
importances_by_repeat = pd.DataFrame(feature_importance['importances'].T,
                                     columns=X_train_s.columns)
fi = importances_by_repeat.mean().sort_values()
fi.plot.bar()

In [None]:
# Backward feature elimination: fi is sorted ascending by importance, so fi.index[i:]
# drops the i least important columns. Each subset is scored with 5-fold CV using the
# same ridge (alpha=1) + box-cox target pipeline as the final model.

# Loop-invariant inputs, built once instead of on every iteration.
X_all = pd.concat([X_train_s, X_test_s], axis=0)
y_cv = pd.concat([y_train_s, y_test_s], axis=0)
try:
    # scikit-learn >= 1.2 spelling; the old `sparse` argument was removed in 1.4.
    onehot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    onehot = OneHotEncoder(sparse=False, handle_unknown ='ignore')

for i in range(len(X_train_s.columns) - 1):
    columns_selected = fi.index[i:]
    X_cv = X_all[columns_selected]

    ct = ColumnTransformer([("onehot", onehot, X_cv.select_dtypes('object').columns),
                            ("standard_scaler", StandardScaler(), X_cv.select_dtypes(exclude='object').columns)])
    model = TransformedTargetRegressor(regressor=Ridge(alpha=1),
                                       transformer=PowerTransformer(method='box-cox'))
    # Named subset_pipe so the fitted final model held in `pipe` is not replaced by an
    # unfitted, reduced-feature pipeline.
    subset_pipe = Pipeline([('feature_transformation', ct),
                            ('model', model)])

    cv_score = cross_val_score(subset_pipe, X_cv, y_cv, cv=5).mean()
    print(f'number of features: {len(X_cv.columns)}', f' mean cv r2: {cv_score}')

In [None]:
#import statsmodels.api as sm
#X_sm = sm.add_constant(X_train)
#model = sm.OLS(y_train_trans,X_sm).fit()
#model.summary()
#from statsmodels.stats.outliers_influence import variance_inflation_factor
#vif = pd.DataFrame(np.zeros((1,len(X.columns))),columns=X.columns)
#vif.iloc[0,:] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]