In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, brier_score_loss
from sklearn.metrics import classification_report, f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import GridSearchCV,StratifiedKFold,RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from scipy.stats import randint
from sklearn.feature_selection import SelectKBest, f_classif


In [10]:
df_dev = pd.read_csv('Final_Output_Class_dev_under.csv')
df_test = pd.read_csv('Final_Output_Class_test.csv')
#SPlit the label
label_dev = df_dev['Underpriced']
label_test = df_test['Underpriced']
#Drop the label column
df_dev = df_dev.drop(columns='Underpriced')
df_test = df_test.drop(columns='Underpriced')

In [11]:
x_train,x_dev,y_train,y_dev = train_test_split(df_dev,label_dev,test_size=0.2,random_state=42)

In [12]:
rf = RandomForestClassifier(random_state=42,class_weight=None)
#Use grid search for the optimal hyperparameter
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 15],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [3, 5],
    'max_features': ['sqrt'],
    'bootstrap': [True],
    'criterion': ['gini']
}

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1,
    verbose=2
)

grid_search.fit(x_train,y_train)

best_rf = grid_search.best_estimator_
y_dev_pred = best_rf.predict(x_dev)
best_f1 = f1_score(y_dev, y_dev_pred)

print(f"Best F1 Score on Dev Set: {best_f1}")
print("Best Hyperparameters:")
print(grid_search.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best F1 Score on Dev Set: 0.48514851485148514
Best Hyperparameters:
{'bootstrap': True, 'criterion': 'gini', 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 10, 'n_estimators': 200}


In [13]:
test_pred= best_rf.predict(df_test)
#Compute Training Accuracy
test_accuracy = best_rf.score(df_test,label_test)
#Compute F1 score
test_f1 = f1_score(label_test, test_pred)

print(f'Test Accuracy :{test_accuracy}')
print(f'Test F1 Score :{test_f1}')

Test Accuracy :0.7437185929648241
Test F1 Score :0.41040462427745666


In [108]:
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 0.01, 0.001],
    'kernel': ['rbf', 'poly', 'linear']
}

grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='f1', verbose=2, n_jobs=-1)
grid_search.fit(x_train, y_train)

best_model = grid_search.best_estimator_

y_dev_pred = best_model.predict(x_dev)
f1 = f1_score(y_dev, y_dev_pred)

print("Best SVM Params:", grid_search.best_params_)
print("Best SVM F1 Score on Dev Set:", f1)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best SVM Params: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Best SVM F1 Score on Dev Set: 0.5555555555555556


In [109]:
test_pred= best_model.predict(df_test)
#Compute Training Accuracy
test_accuracy = best_model.score(df_test,label_test)
#Compute F1 score
test_f1 = f1_score(label_test, test_pred)

print(f'Test Accuracy :{test_accuracy}')
print(f'Test F1 Score :{test_f1}')



Test Accuracy :0.6545226130653267
Test F1 Score :0.3764172335600907


In [3]:
df_dev = pd.read_csv('Final_Output_Class_dev_tomek.csv')
df_test = pd.read_csv('Final_Output_Class_test.csv')
#SPlit the label
label_dev = df_dev['Underpriced']
label_test = df_test['Underpriced']
#Drop the label column
df_dev = df_dev.drop(columns='Underpriced')
df_test = df_test.drop(columns='Underpriced')

x_train,x_dev,y_train,y_dev = train_test_split(df_dev,label_dev,test_size=0.2,random_state=42)

In [4]:
rf = RandomForestClassifier(random_state=42,class_weight=None)
#Use grid search for the optimal hyperparameter
param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [None, 10, 15, 20, 30],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 3, 5],
    'max_features': ['sqrt', 'log2', 0.3, 0.5],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
    'class_weight': [None, 'balanced', 'balanced_subsample']
}

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1,
    verbose=2
)

grid_search.fit(x_train,y_train)

best_rf = grid_search.best_estimator_
y_dev_pred = best_rf.predict(x_dev)
best_f1 = f1_score(y_dev, y_dev_pred)

print(f"Best F1 Score on Dev Set: {best_f1}")
print("Best Hyperparameters:")
print(grid_search.best_params_)

Fitting 5 folds for each of 15360 candidates, totalling 76800 fits
Best F1 Score on Dev Set: 0.8794788273615635
Best Hyperparameters:
{'bootstrap': False, 'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}


In [5]:
test_pred = best_rf.predict(df_test)
#Compute Training Accuracy
test_accuracy = best_rf.score(df_test,label_test)
#Compute F1 score
test_f1 = f1_score(label_test, test_pred)

print(f'Test Accuracy :{test_accuracy}')
print(f'Test F1 Score :{test_f1}')

Test Accuracy :0.7865311308767471
Test F1 Score :0.13402061855670103
