In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn. neighbors import LocalOutlierFactor
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures

from scipy.stats import expon, reciprocal
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier
from missingpy import MissForest

import warnings
warnings.filterwarnings('ignore')

In [2]:
def reduce_mem_usage(props):
    start_mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in props.columns:
        if props[col].dtype != object:  # Exclude strings
            
            # Print current column type
            #print("******************************")
            #print("Column: ",col)
            #print("dtype before: ",props[col].dtype)
            
            # make variables for Int, max and min
            IsInt = False
            mx = props[col].max()
            mn = props[col].min()
            
            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(props[col]).all(): 
                NAlist.append(col)
                props[col].fillna(mn-1,inplace=True)  
                   
            # test if column can be converted to an integer
            asint = props[col].fillna(0).astype(np.int64)
            result = (props[col] - asint)
            result = result.sum()
            if result > -0.01 and result < 0.01:
                IsInt = True

            
            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if mn >= 0:
                    if mx < 255:
                        props[col] = props[col].astype(np.uint8)
                    elif mx < 65535:
                        props[col] = props[col].astype(np.uint16)
                    elif mx < 4294967295:
                        props[col] = props[col].astype(np.uint32)
                    else:
                        props[col] = props[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        props[col] = props[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        props[col] = props[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        props[col] = props[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        props[col] = props[col].astype(np.int64)    
            
            # Make float datatypes 32 bit
            else:
                props[col] = props[col].astype(np.float32)
            
            # Print new column type
            #print("dtype after: ",props[col].dtype)
            #print("******************************")
    
    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist

In [3]:
df = pd.read_csv('./data/galaxy_final.csv', index_col=0)
df.info()
df, na_li = reduce_mem_usage(df)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1485 entries, 0 to 1484
Data columns (total 14 columns):
index                        1485 non-null int64
BuyItNow                     1485 non-null int64
startprice                   1485 non-null float64
productSeries_imputed        1485 non-null int64
product_isNote_imputed       1485 non-null int64
hasDescription               1485 non-null int64
charCountDescriptionBins     1485 non-null int64
upperCaseDescription_rate    1485 non-null float64
startprice_point9            1485 non-null int64
sold                         1485 non-null int64
color_sentiment_0            1485 non-null int64
color_sentiment_1            1485 non-null int64
carrier_none_0               1485 non-null int64
carrier_none_1               1485 non-null int64
dtypes: float64(2), int64(12)
memory usage: 174.0 KB
Memory usage of properties dataframe is : 0.16994476318359375  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  0.041069984436035156  MB
This

In [4]:
X = df.drop('sold', axis=1)
y = df.sold
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11,
                                                       stratify=y, shuffle=True)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=11,
                                                       stratify=y_train, shuffle=True)

# Logistic Regression

In [None]:
LogisticRegression()

In [7]:
pipe = Pipeline([
                ('scale', MinMaxScaler()),
#                 ('poly', PolynomialFeatures()),
#                 ('feature_selection', RFE(LGBMClassifier())),
                ('classifier', LogisticRegression())
                ])

param_grid1 = [              
              {'classifier': [LogisticRegression()],
               'classifier__penalty':['l1', 'l2'],
               'classifier__tol':[1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10, 100],
               'classifier__C':[1e-2, 0.1, 0.0, 1.0, 1.0, 100.0],
              'classifier__fit_intercept':[True],
              'classifier__intercept_scaling':[0.01, 0.1, 1, 10],
               'classifier__solver':['liblinear'],
               'classifier__max_iter':[25, 50, 100, 150, 200],
               'classifier__multi_class':['ovr'],
              'classifier__l1_ratio': [0.1*i for i in range(1, 10)],
               'classifier__n_jobs':[-1],
               'scale':[MinMaxScaler(), StandardScaler(), RobustScaler()],
#                'poly':[PolynomialFeatures()],
#                'poly__degree':[3],
#               'feature_selection' : [RFE(LGBMClassifier(objective='binary',
#                                                         metric='binary_logloss'))],
#                 'feature_selection__n_features_to_select' : [140, 70, 35]
              }
             ]
grid1 = GridSearchCV(pipe, param_grid1, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid1.fit(X_train, y_train)
print(grid1.best_params_)
print(grid1.best_score_)

Fitting 5 folds for each of 51840 candidates, totalling 259200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 2380 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done 6380 tasks      | elapsed:   24.6s
[Parallel(n_jobs=-1)]: Done 11980 tasks      | elapsed:   48.7s
[Parallel(n_jobs=-1)]: Done 19180 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 27980 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 38380 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 50380 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 63980 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 79180 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 95980 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 104900 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done 109900 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done 115604 tasks      | elapsed: 11.9min
[Parallel(n_jobs=-1)]: Done 1

{'classifier': LogisticRegression(C=100.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=0.1, max_iter=25,
                   multi_class='ovr', n_jobs=-1, penalty='l1',
                   random_state=None, solver='liblinear', tol=1, verbose=0,
                   warm_start=False), 'classifier__C': 100.0, 'classifier__fit_intercept': True, 'classifier__intercept_scaling': 1, 'classifier__l1_ratio': 0.1, 'classifier__max_iter': 25, 'classifier__multi_class': 'ovr', 'classifier__n_jobs': -1, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear', 'classifier__tol': 1, 'scale': MinMaxScaler(copy=True, feature_range=(0, 1))}
0.7789473684210526


# Random Forest
- https://www.analyticsvidhya.com/blog/2020/03/beginners-guide-random-forest-hyperparameter-tuning/
- https://www.upgrad.com/blog/random-forest-hyperparameter-tuning/

In [None]:
pipe1 = Pipeline([
                ('scale', MinMaxScaler()),
                 ('poly', PolynomialFeatures()),
                ('feature_selection', RFE(XGBClassifier())),
                ('classifier', XGBClassifier())
                ])

param_grid1 = [              
              {'classifier': [XGBClassifier()],
              'classifier__booster': ['dart'],
              'classifier__rate_drop': [0.15],
              'classifier__skip_drop': [0.33],
             'classifier__learning_rate':[0.1],
             'classifier__n_estimators':[1000],
               'classifier__max_depth':[5],
               'classifier__min_child_weight':[1],
               'classifier__gamma':[0],
               'classifier__subsample':[0.8],
               'classifier__colsample_bytree':[0.8],
               'classifier__objective':['binary:logistic'],
               'classifier__nthread':[-1],
               'classifier__scale_pos_weight':[1],
               'classifier__seed':[2021],
               'classifier__eval_metric':['error'],
               'classifier__n_jobs':[-1],
               'scale':[MinMaxScaler(), StandardScaler(), RobustScaler()],
               'poly':[PolynomialFeatures()],
               'poly__degree': [3],
               'feature_selection' : [RFE(RandomForestClassifier())],
               'feature_selection__n_features_to_select' : [140, 70]
              }
             ]
grid1 = GridSearchCV(pipe1, param_grid1, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid1.fit(X_valid, y_valid)
print(grid1.best_params_)
print(grid1.best_score_)

In [None]:
param_grid = {
            'bootstrap': [True, False],
            'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
            'max_features': ['auto', 'sqrt'],
            'min_samples_leaf': [1, 2, 4],
            'min_samples_split': [2, 5, 10],
            'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf,
                               param_distributions = random_grid,
                               n_iter = 100, cv = 3,
                               verbose=2, random_state=2021, n_jobs = -1)
# Fit the random search model
rf_random.fit(train_features, train_labels)

In [None]:
params = {
    'n_estimators':[50, 100, 150],
    'criterion':['gini', 'entropy'],
    'max_depth':[6, 8, 10, 12],
    'min_samples_leaf':[8, 12, 18],
    'min_samples_split':[8, 16, 20]
}

In [None]:
Random

In [None]:
pipe = Pipeline([
                ('scale', MinMaxScaler()),
#                 ('poly', PolynomialFeatures()),
#                 ('feature_selection', RFE(LGBMClassifier())),
                ('classifier', LGBMClassifier())
                ])

param_grid1 = [              
              {'classifier': [LGBMClassifier()],
               'classifier__categorical_feature':[cat_features_idx],
               'classifier__objective':['binary'],
               'classifier__metric':['binary_logloss'],
              'classifier__boosting_type':['gbdt', 'dart'],
              'classifier__drop_rate':[0.1],
               'classifier__skip_drop':[0.5],
               'classifier__learning_rate':[0.01, 0.03, 0.1],
               'classifier__num_iterations':[500, 1000, 2000, 3000, 5000],
              'classifier__bagging_fraction': [0.8],
               'classifier__feature_fraction':[0.8],
               'classifier__early_stopping_round':[0],
               'classifier__max_depth': [5],
               'classifier__num_leaves':[2**3],
               'classifier__min_data_in_leaf':[20],
               'classifier__max_bin':[255],
               'classifier__n_estimators':[1000],
               'classifier__lambda_l1':[0],
               'classifier__lambda_l2':[0],
               'classifier__scale_pos_weight':[1.0],
               'scale':[MinMaxScaler(), StandardScaler(), RobustScaler()],
#                'poly':[PolynomialFeatures()],
#                'poly__degree':[3],
#               'feature_selection' : [RFE(LGBMClassifier(objective='binary',
#                                                         metric='binary_logloss'))],
#                 'feature_selection__n_features_to_select' : [140, 70, 35]
              }
             ]
grid1 = GridSearchCV(pipe, param_grid1, scoring = 'accuracy',
                    cv=StratifiedKFold(n_splits=5),
                    verbose=1, n_jobs=-1)
grid1.fit(X_train, y_train)
print(grid1.best_params_)
print(grid1.best_score_)

# Catboost vs. LGBM vs. XGBOOST
https://www.kdnuggets.com/2018/03/catboost-vs-light-gbm-vs-xgboost.html