In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
import preprocessing as ps
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report

warnings.filterwarnings('ignore')
sns.set_theme(style="whitegrid", palette="Set2")

In [2]:
df=pd.read_csv('../datasets/clean_dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,3,2,5,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,4,6,3,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,2,3,4,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,2,3,4,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,3,3,5,62.6,59.0,7.65,7.61,4.77,14453


In [3]:
df=df.drop(columns=['Unnamed: 0'],axis=1)
df.head()


Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,3,2,5,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,4,6,3,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,2,3,4,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,2,3,4,61.6,56.0,4.38,4.41,2.71,666
4,1.7,3,3,5,62.6,59.0,7.65,7.61,4.77,14453


In [4]:
x = df.drop(columns=['price'])  # Features (all columns except 'price')
y = df['price'] 

In [5]:
y=np.log1p(y)

In [6]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [7]:
# Train Model Function (updated for regression)
def train_model(model, model_name, x_train, y_train, x_test, y_test, is_classification=True):
    
    # Fit the model on the training data
    model.fit(x_train, y_train)
    
    # Predictions on the training data
    y_train_pred = model.predict(x_train)
    # Predictions on the testing data
    y_test_pred = model.predict(x_test)
    
    # Collecting results in a dictionary
    model_results = {
        'model': model_name
    }
    
    if is_classification:
        # Classification metrics
        train_accuracy = accuracy_score(y_train, y_train_pred)
        test_accuracy = accuracy_score(y_test, y_test_pred)
        model_results['train_accuracy'] = train_accuracy
        model_results['test_accuracy'] = test_accuracy
        
        # Generate classification report for testing data (optional)
        report = classification_report(y_test, y_test_pred, output_dict=True)
        model_results['classification_report'] = report
    else:
        # Regression metrics
        train_mse = mean_squared_error(y_train, y_train_pred)
        test_mse = mean_squared_error(y_test, y_test_pred)
        train_r2 = r2_score(y_train, y_train_pred)
        test_r2 = r2_score(y_test, y_test_pred)
        
        model_results['train_mse'] = train_mse
        model_results['test_mse'] = test_mse
        model_results['train_r2'] = train_r2
        model_results['test_r2'] = test_r2
    
    return model_results

# List of models (for regression or classification tasks)
model_list = dict(
    linear_regression=LinearRegression(),
    random_forest=RandomForestRegressor(n_estimators=50, random_state=42),
    xgboost=xgb.XGBRegressor(max_depth=1),
)

# Assume you have your x_train, x_test, y_train, and y_test already defined
results = []
for key, value in model_list.items():
    # Check if classification or regression model and train accordingly
    is_classification =False  # change as per your use-case for classification vs regression
    model_results = train_model(value, key, x_train, y_train, x_test, y_test, is_classification)
    results.append(model_results)

# Convert the results into a DataFrame
results_df = pd.DataFrame(results)

# Print or return the results DataFrame
results_df

Unnamed: 0,model,train_mse,test_mse,train_r2,test_r2
0,linear_regression,0.032966,0.031924,0.968296,0.969306
1,random_forest,0.001851,0.012358,0.998219,0.988118
2,xgboost,0.01979,0.01928,0.980968,0.981463


In [19]:
model=RandomForestRegressor(n_estimators=50, random_state=42)
model.fit(x_train, y_train)

In [20]:
x.iloc[0,:]

carat       1.52
cut         3.00
color       2.00
clarity     5.00
depth      62.20
table      58.00
x           7.27
y           7.33
z           4.55
Name: 0, dtype: float64

In [21]:
model.predict(x.iloc[0,:].values.reshape(1,-1))

array([9.54263943])

In [22]:
y[0]

9.519294579703503

In [23]:
model.predict(x_test.iloc[0,:].values.reshape(1,-1))

array([8.04215224])

In [24]:
y_test[0]

9.519294579703503

In [25]:
import joblib  # or import pickle

In [26]:
# Save model to a file
joblib.dump(model, 'model.pkl')

['model.pkl']

In [28]:
loaded_model = joblib.load('model.pkl')


In [29]:
predictions = loaded_model.predict(x_test.iloc[0,:].values.reshape(1,-1))
print(predictions)  # Display first 5 predictions

[8.04215224]
