# In [1]:
import pandas as pd
from pandas.api.types import CategoricalDtype
pd.options.display.max_rows = 999
pd.set_option('display.max_columns',100)

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.figure_factory as ff

# Dataset used for the analysis: diamonds data.
# Author's selection criteria: 1500 by 11 (presumably minimum rows by
# columns -- confirm).
# Import the dataset and examine it.
filelocation = "../data/diamonds.csv"

dataset = pd.read_csv(filelocation)

# High-level view of the dataset.
print(f'\n The First Five rows : \n {dataset.head()}')
print(f'\n Shape of the Dataset : \n {dataset.shape}')
# DataFrame.info() writes its report directly to stdout and returns None.
# Interpolating it into an f-string therefore printed the report *before* the
# heading and then a stray "None". Print the heading first and let info()
# emit the report itself.
print('\n Data types in the dataset : \n')
dataset.info()
print(f'\n Statistical Summary of Dataset : \n {dataset.describe()}')

# 'Unnamed: 0' (the name pandas gives to a header-less column, typically a
# row index saved into the CSV) has no significant input to the analysis.
dataset.drop(columns='Unnamed: 0', inplace=True)

# Turn the three categorical grade columns into integer codes. Each list is
# written in the order of its codes, so the first grade becomes 1, the next 2,
# and so on (presumably lowest to highest quality -- confirm against the
# dataset documentation). Grades missing from a list map to NaN.
ordinal_grades = {
    'color': ['J', 'I', 'H', 'G', 'F', 'E', 'D'],
    'cut': ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal'],
    'clarity': ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF'],
}
for column, grades in ordinal_grades.items():
    codes = {grade: rank for rank, grade in enumerate(grades, start=1)}
    dataset[column] = dataset[column].map(codes)

# Separate the predictors from the value being predicted (price).
target = dataset.loc[:, 'price']
features = dataset.drop(columns='price')
# Standardize (zero mean, unit variance) to remove the lopsidedness of some
# features.

# Whole table, target included: used only for the correlation view, where no
# model is evaluated, so fitting on every row is harmless.
scaler = StandardScaler()
dataset_scaled = scaler.fit_transform(dataset)
ds_df = pd.DataFrame(dataset_scaled, columns=dataset.columns)

# Split to train and test for model development BEFORE fitting the feature
# scaler. Fitting the scaler on all rows lets the test rows' mean and variance
# leak into the training data and makes the test scores optimistic.
# train_test_split selects rows by position, so the same train_size and
# random_state pick exactly the rows the previous version picked.
features_train, features_test, target_train, target_test = train_test_split(
    features, target, train_size=0.8, random_state=10
)

# Learn the scaling parameters from the training rows only, then apply that
# same transformation everywhere. `scaler` is deliberately rebound here so it
# ends up holding the feature scaler, as it did before.
scaler = StandardScaler()
scaler.fit(features_train)

fs_df_train = pd.DataFrame(
    scaler.transform(features_train),
    columns=features.columns,
    index=features_train.index,
)
fs_df_test = pd.DataFrame(
    scaler.transform(features_test),
    columns=features.columns,
    index=features_test.index,
)

# Kept so that any later code relying on the full scaled feature table still
# works; it is now expressed in the training-set scale.
features_scaled = scaler.transform(features)
fs_df = pd.DataFrame(features_scaled, columns=features.columns)
# Visualize the pairwise correlations between the features and the target as
# an annotated heatmap; each cell is labelled with the value rounded to two
# decimals.
corrs = ds_df.corr()
heatmap_options = dict(
    z=corrs.to_numpy(),
    x=corrs.columns.tolist(),
    y=corrs.index.tolist(),
    annotation_text=corrs.round(2).to_numpy(),
    showscale=True,
)
figure = ff.create_annotated_heatmap(**heatmap_options)
figure.show()
# Defining the models: first part, with regularization.
# Compares plain least squares against Lasso (L1), Ridge (L2) and ElasticNet
# over a range of penalty strengths, printing the coefficients and the
# test-set R^2 for each alpha.

# The unregularized baseline does not depend on alpha, so it is fitted and
# scored once instead of on every pass through the loop. Its numbers are still
# printed on every pass for side-by-side comparison.
std_model = LinearRegression()
std_model.fit(fs_df_train, target_train)
std_model_test_predict = std_model.predict(fs_df_test)

for alpha_ in [.001, .01, .1, 1, 10, 100]:
    L1_model = Lasso(alpha=alpha_)
    L2_model = Ridge(alpha=alpha_)
    EN_model = ElasticNet(alpha=alpha_)

    # Note: the lower the alpha, the weaker the penalty and the closer the fit
    # gets to the unregularized (more overfit-prone) model; higher alpha
    # shrinks the coefficients harder.
    L1_model.fit(fs_df_train, target_train)
    L2_model.fit(fs_df_train, target_train)
    EN_model.fit(fs_df_train, target_train)

    print(f'alpha = {alpha_} \n ---------------------------------------------------------------------------------------------------------------------\n')
    print('Co-efficients of Models as shown below : ')
    print(f'---Without Regularization : {np.round(std_model.coef_, 2)}')
    print(f'---With L1 Regularization : {np.round(L1_model.coef_, 2)}')
    print(f'---With L2 Regularization : {np.round(L2_model.coef_, 2)}')
    print(f'---With EN Regularization : {np.round(EN_model.coef_, 2)}')

    print('\n ---------------------------------------------------------------------------------------------------------------------\n')

    # Based on the test dataset. (model.score(X, y) on a regressor reports the
    # same R^2, so a separate "accuracy" printout would be redundant.)
    L1_model_test_predict = L1_model.predict(fs_df_test)
    L2_model_test_predict = L2_model.predict(fs_df_test)
    EN_model_test_predict = EN_model.predict(fs_df_test)

    # The '.2%' format spec multiplies by 100 and rounds in one step, avoiding
    # float artifacts such as 90.61000000000001% that round(x, 4) * 100 gives.
    print('R^2 of Models as shown below:')
    print(f'---Without Regularization : {r2_score(y_true=target_test, y_pred=std_model_test_predict):.2%}')
    print(f'With L1 Regularization : {r2_score(y_true=target_test, y_pred=L1_model_test_predict):.2%}')
    print(f'With L2 Regularization : {r2_score(y_true=target_test, y_pred=L2_model_test_predict):.2%}')
    print(f'With EN Regularization : {r2_score(y_true=target_test, y_pred=EN_model_test_predict):.2%}')

# Defining the models: second part.
# Compares the linear baseline against a random forest and a linear-kernel
# support vector regressor on the same train/test split.
std_model = LinearRegression()
RF_model = RandomForestRegressor(criterion='squared_error', max_features='sqrt', random_state=10)
SVR_model = SVR(kernel='linear')

std_model.fit(fs_df_train, target_train)
RF_model.fit(fs_df_train, target_train)
# SVR has no alpha parameter; its regularization strength is set by C (left at
# the library default here), where a larger C means a weaker penalty.
SVR_model.fit(fs_df_train, target_train)

# Based on the test dataset.
std_model_test_predict = std_model.predict(fs_df_test)
RF_model_test_predict = RF_model.predict(fs_df_test)
SVR_model_test_predict = SVR_model.predict(fs_df_test)

# The '.2%' format spec multiplies by 100 and rounds in one step, avoiding
# float artifacts such as 90.61000000000001% that round(x, 4) * 100 gives.
print('R^2 of Models as shown below:')
print(f'---Without Regularization : {r2_score(y_true=target_test, y_pred=std_model_test_predict):.2%}')
print(f'With Random Forest Model : {r2_score(y_true=target_test, y_pred=RF_model_test_predict):.2%}')
print(f'With SVR Model : {r2_score(y_true=target_test, y_pred=SVR_model_test_predict):.2%}')
