In [149]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import OneHotEncoder

from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_absolute_error

In [74]:
df = pd.read_csv('superstore.csv',encoding='windows-1254', index_col='Row ID')

In [138]:
features = ['Product Name', 'Sales', 'Quantity', 'Discount']
X = df[features]
y = df['Profit']
X_train, X_valid, y_train, y_valid = train_test_split(X, y)

In [139]:
# This code converts categorical data into numerical so ML algorithms can use it.

# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

In [147]:
#Decision Tree

hyperparameters = {
    'max_depth': [2],
    'min_samples_leaf': [5],
    "max_leaf_nodes":[None,10,20],
    'splitter': ["best","random"]
}

dt = DecisionTreeRegressor()
clf = GridSearchCV(dt, hyperparameters, cv=10)

model = clf.fit(X_train, y_train)
print('Best max_depth:', model.best_estimator_.get_params()['max_depth'])
print('Best min_samples_leaf:', model.best_estimator_.get_params()['min_samples_leaf'])
print('Best splitter:', model.best_estimator_.get_params()['splitter'])

Best max_depth: 2
Best min_samples_leaf: 5
Best splitter: random


In [148]:
#K-nearest neighbors

hyperparameters = {
    'leaf_size': [1,2,3,4,5],
    'n_neighbors': [1,2,3,4,5],
    "p":[1,2],
}

knn = KNeighborsRegressor()
clf = GridSearchCV(knn, hyperparameters, cv=10)

model = clf.fit(train_X, train_y)
print('Best leaf_size:', model.best_estimator_.get_params()['leaf_size'])
print('Best p:', model.best_estimator_.get_params()['p'])
print('Best n_neighbors:', model.best_estimator_.get_params()['n_neighbors'])

Best leaf_size: 2
Best p: 2
Best n_neighbors: 3


In [None]:
# SVM

hyperparameters = {
    'C': [0.1], 
    'gamma': [1,], 
    'kernel': ['rbf', 'poly', 'sigmoid']
}

svr = SVR()
clf = GridSearchCV(svr, hyperparameters, cv=10)

model = clf.fit(train_X, train_y)
print('Best C:', model.best_estimator_.get_params()['C'])
print('Best gamma:', model.best_estimator_.get_params()['gamma'])
print('Best kernel:', model.best_estimator_.get_params()['kernel'])