In [5]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# Load seaborn's "diamonds" example dataset and wrap it in a pandas DataFrame.
df = pd.DataFrame(sns.load_dataset("diamonds"))

# Report the (rows, columns) shape of the loaded frame.
print(f"The shape of the dataset is {df.shape}")

# `isnull().any().any()` collapses the per-cell null mask to one boolean:
# True as soon as any cell in any column is null.
print(
  "There are missing values in the dataset"
  if df.isnull().any().any()
  else "There are no missing values in the dataset"
)

# Plain-text preview of the first five rows.
print(df.head())

The shape of the dataset is (53940, 10)
There are no missing values in the dataset
   carat      cut color clarity  depth  table  price     x     y     z
0   0.23    Ideal     E     SI2   61.5   55.0    326  3.95  3.98  2.43
1   0.21  Premium     E     SI1   59.8   61.0    326  3.89  3.84  2.31
2   0.23     Good     E     VS1   56.9   65.0    327  4.05  4.07  2.31
3   0.29  Premium     I     VS2   62.4   58.0    334  4.20  4.23  2.63
4   0.31     Good     J     SI2   63.3   58.0    335  4.34  4.35  2.75


In [6]:
# One-hot encode the categorical columns; each category level becomes its own
# indicator column and the original categorical column is dropped.
categorical_features = ["cut", "color", "clarity"]
df = pd.get_dummies(df, columns=categorical_features)

# Reorder the columns so that "price" comes last, keeping the relative order
# of every other column unchanged.
columns = [name for name in df.columns.tolist() if name != "price"]
columns.append("price")
df = df[columns]
df.head()

Unnamed: 0,carat,depth,table,x,y,z,cut_Ideal,cut_Premium,cut_Very Good,cut_Good,...,color_J,clarity_IF,clarity_VVS1,clarity_VVS2,clarity_VS1,clarity_VS2,clarity_SI1,clarity_SI2,clarity_I1,price
0,0.23,61.5,55.0,3.95,3.98,2.43,True,False,False,False,...,False,False,False,False,False,False,False,True,False,326
1,0.21,59.8,61.0,3.89,3.84,2.31,False,True,False,False,...,False,False,False,False,False,False,True,False,False,326
2,0.23,56.9,65.0,4.05,4.07,2.31,False,False,False,True,...,False,False,False,False,True,False,False,False,False,327
3,0.29,62.4,58.0,4.2,4.23,2.63,False,True,False,False,...,False,False,False,False,False,True,False,False,False,334
4,0.31,63.3,58.0,4.34,4.35,2.75,False,False,False,True,...,True,False,False,False,False,False,False,True,False,335


In [7]:
# Feature matrix: every column except the last one.
# Target vector: the last column of the frame.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [8]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Registry of candidate models: name -> (estimator instance, hyperparameter grid).
# The grid maps a hyperparameter name to the list of values to try for it.
models_and_hyperparams = {
  "LogisticRegression": (
    LogisticRegression(solver="liblinear"),
    # {"C": [0.01, 0.05, 0.1, 0.5, 1, 2]},
    {"C": [0.5]}
  )
}

# NOTE(review): `y` is the last column of `df`, which an earlier cell arranges
# to be "price". Fitting a classifier on it treats every distinct price as its
# own class - confirm that classification (rather than regression) is intended.

# outer splitting: training vs test set (80/20)
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.2, random_state=1337,
)

# inner splitting (within the outer training set): training vs validation (80/20)
# training set is used to train the model, validation set is used to select the best hyperparameters
X_train_train, X_validation, y_train_train, y_validation = train_test_split(
  X_train, y_train, test_size=0.2, random_state=1337,
)

# Accuracy per hyperparameter and value: {hyperparameter: {value: accuracy}}.
training_scores = {}
validation_scores = {}

# Best result per hyperparameter: {hyperparameter: (value, accuracy)}.
# `best_training_score` is declared but not populated by this cell.
best_training_score = {}
best_validation_score = {}

model = models_and_hyperparams["LogisticRegression"][0]
hyperparams = models_and_hyperparams["LogisticRegression"][1]

for hp in hyperparams:
  training_scores[hp] = {}
  validation_scores[hp] = {}

  for val in hyperparams[hp]:
    # `set_params` mutates the shared estimator, so a value set while sweeping
    # an earlier hyperparameter stays in effect while later ones are swept.
    model.set_params(**{hp: val})

    model.fit(X_train_train, y_train_train)

    training_score = accuracy_score(y_train_train, model.predict(X_train_train))
    training_scores[hp][val] = training_score

    validation_score = accuracy_score(y_validation, model.predict(X_validation))
    validation_scores[hp][val] = validation_score

    # Track the best value for THIS hyperparameter. The lookup is keyed on
    # `hp`: checking whether the whole dict is empty would raise KeyError on
    # the first value of any hyperparameter after the first one. The strict
    # `<` keeps the earliest value when scores tie.
    current_best = best_validation_score.get(hp)
    if current_best is None or current_best[1] < validation_score:
      best_validation_score[hp] = (val, validation_score)

print("***** Evaluate Performance on Validation Set *****")
print(validation_scores)
print("***** Best Accuracy Score on Validation Set *****")
print(best_validation_score)