In [8]:
#Packages
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import math

In [2]:
#Read Training Data
gem_train = pd.read_csv('/Users/dilynzertuche/Desktop/VisualStudio/Kaggle Competitions/Regression with a Tabular Gemstone Price Dataset/playground-series-s3e8/train.csv')

In [3]:
#EDA
gem_train.describe(include = 'all')

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
count,193573.0,193573.0,193573,193573,193573,193573.0,193573.0,193573.0,193573.0,193573.0,193573.0
unique,,,5,7,8,,,,,,
top,,,Ideal,G,SI1,,,,,,
freq,,,92454,44391,53272,,,,,,
mean,96786.0,0.790688,,,,61.820574,57.227675,5.715312,5.720094,3.534246,3969.155414
std,55879.856166,0.462688,,,,1.081704,1.918844,1.109422,1.102333,0.688922,4034.374138
min,0.0,0.2,,,,52.1,49.0,0.0,0.0,0.0,326.0
25%,48393.0,0.4,,,,61.3,56.0,4.7,4.71,2.9,951.0
50%,96786.0,0.7,,,,61.9,57.0,5.7,5.72,3.53,2401.0
75%,145179.0,1.03,,,,62.4,58.0,6.51,6.51,4.03,5408.0


In [4]:
#Split Data Intro Train/Test Split
X = gem_train.drop('price', axis = 1)
X = pd.get_dummies(X)
y = gem_train['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)

In [5]:
#Create Linear Model and Fit Data
model = LinearRegression()

model.fit(X_train, y_train)
price_predictions = model.predict(X_test)

In [13]:
#Evaluate Linear Regression Model
lr_mae = mean_absolute_error(y_test, price_predictions)
lr_mse = mean_squared_error(y_test, price_predictions)
lr_rmse = math.sqrt(lr_mse)
lr_r2 = r2_score(y_test, price_predictions)

print('Mean Absolute Error: ', lr_mae)
print('Mean Squared Error: ', lr_mse)
print('Root Mean Squared Error: ', lr_rmse)
print('R-Squared: ', lr_r2)

SyntaxError: invalid syntax. Perhaps you forgot a comma? (3562913430.py, line 14)

In [11]:
#Try k-Nearest Neighbor
from sklearn.neighbors import KNeighborsRegressor

In [12]:
#Find Optimal Number of Neighbors
parameters = {'n_neighbors': range(1,50),
              'weights': ['uniform', 'distance']}
gridsearch = GridSearchCV(KNeighborsRegressor(), parameters)
gridsearch.fit(X_train, y_train)

print(gridsearch.best_params_)

{'n_neighbors': 3, 'weights': 'distance'}


In [26]:
#Build KNN Model
knn_model = KNeighborsRegressor(n_neighbors = 3, weights = 'distance', algorithm = 'brute' , metric = 'minkowski')

knn_model.fit(X_train, y_train)

In [27]:
#Score Model
knn_y_pred = knn_model.predict(X_test)

knn_mae = mean_absolute_error(y_test, knn_y_pred)
knn_mse = mean_squared_error(y_test, knn_y_pred)
knn_rmse = math.sqrt(knn_mse)
knn_r2 = r2_score(y_test, knn_y_pred)

print('Mean Absolute Error: ', knn_mae)
print('Mean Squared Error: ', knn_mse)
print('Root Mean Squared Error: ', knn_rmse)
print('R-Squared: ', knn_r2)

Mean Absolute Error:  2398.5893681330763
Mean Squared Error:  12356680.02463763
Root Mean Squared Error:  3515.2069675394123
R-Squared:  0.24639393647734975


In [28]:
#Try Random Forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [31]:
#Hypertune Model
param_dist = {
    'n_estimators': randint(50,500),
    'max_depth': randint(1,20)
}

rf = RandomForestRegressor()

rand_search = RandomizedSearchCV(rf,
                                 param_distributions = param_dist,
                                 n_iter = 5,
                                 cv = 5,
                                 random_state = 12345)

rand_search.fit(X_train, y_train)
best_rf = rand_search.best_estimator_

print('Best Parameters: ', rand_search.best_params_)

Best Parameters:  {'max_depth': 10, 'n_estimators': 432}


In [40]:
#Fit Random Forest Model
rf_y_pred = best_rf.predict(X_test)

In [41]:
#Evaluate the Model
rf_mae = mean_absolute_error(y_test, rf_y_pred)
rf_mse = mean_squared_error(y_test, rf_y_pred)
rf_rmse = math.sqrt(rf_mse)
rf_r2 = r2_score(y_test, rf_y_pred)

print('Mean Absolute Error: ', rf_mae)
print('Mean Squared Error: ', rf_mse)
print('Root Mean Squared Error: ', rf_rmse)
print('R-Squared: ', rf_r2)

Mean Absolute Error:  346.67796094700594
Mean Squared Error:  423651.33555609355
Root Mean Squared Error:  650.8850401999523
R-Squared:  0.9741624599279123


In [42]:
#List Model Metrics
model_eval = {
    'Model': ['Linear Regression', 'kNN', 'Random Forest'],
    'RMSE': [lr_rmse, knn_rmse, rf_rmse],
    'R-Squared': [lr_r2, knn_r2, rf_r2]
}

model_eval = pd.DataFrame(model_eval)

In [43]:
gem_test = pd.read_csv('/Users/dilynzertuche/Desktop/VisualStudio/Kaggle Competitions/Regression with a Tabular Gemstone Price Dataset/playground-series-s3e8/test.csv')
gem_test_dummy = pd.get_dummies(gem_test)

gemstone_predictions = best_rf.predict(gem_test_dummy)

rf_y_pred_df = pd.DataFrame(gemstone_predictions)
rf_y_pred_df = rf_y_pred_df.rename(columns={0: 'price'})

rf_y_pred_df.to_csv('Gemstone_Price_Pred.csv', index = False)