In [29]:
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler,MinMaxScaler,RobustScaler, MaxAbsScaler
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report,roc_curve
from sklearn.feature_selection import SelectKBest, SelectFromModel, chi2, mutual_info_classif,f_classif
from sklearn.metrics import classification_report,confusion_matrix, RocCurveDisplay, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, roc_auc_score,balanced_accuracy_score,f1_score,precision_score,recall_score
from lightgbm  import LGBMClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import time
from mrmr import mrmr_classif
import seaborn as sns
import pickle
import shap

pd.set_option('display.max_columns', None)

In [3]:
df=pd.read_csv('train.csv')

In [4]:
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0,1


In [5]:
# Let's copy some basic preprocess from the previous notebook
s1 = df.groupby('sc_h')["sc_w"].mean()

def fillbv (col_h,col_w,ser):
    if col_w==0:
        return s1[col_h] 
    else:
        return col_w

df['sc_w'] = df.apply((lambda x: fillbv(x['sc_h'],x['sc_w'],s1)),axis=1)

In [7]:
df.describe()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,1238.5185,0.495,1.52225,0.5095,4.3095,0.5215,32.0465,0.50175,140.249,4.5205,9.9165,645.108,1251.5155,2124.213,12.3065,6.19824,11.011,0.7615,0.503,0.507,1.5
std,439.418206,0.5001,0.816004,0.500035,4.341444,0.499662,18.145715,0.288416,35.399655,2.287837,6.064315,443.780811,432.199447,1084.732044,4.213245,4.042239,5.463955,0.426273,0.500116,0.500076,1.118314
min,501.0,0.0,0.5,0.0,0.0,0.0,2.0,0.1,80.0,1.0,0.0,0.0,500.0,256.0,5.0,1.0,2.0,0.0,0.0,0.0,0.0
25%,851.75,0.0,0.7,0.0,1.0,0.0,16.0,0.2,109.0,3.0,5.0,282.75,874.75,1207.5,9.0,3.0,6.0,1.0,0.0,0.0,0.75
50%,1226.0,0.0,1.5,1.0,3.0,1.0,32.0,0.5,141.0,4.0,10.0,564.0,1247.0,2146.5,12.0,5.0,11.0,1.0,1.0,1.0,1.5
75%,1615.25,1.0,2.2,1.0,7.0,1.0,48.0,0.8,170.0,7.0,15.0,947.25,1633.0,3064.5,16.0,9.0,16.0,1.0,1.0,1.0,2.25
max,1998.0,1.0,3.0,1.0,19.0,1.0,64.0,1.0,200.0,8.0,20.0,1960.0,1998.0,3998.0,19.0,18.0,20.0,1.0,1.0,1.0,3.0


### 1.Choose model and scaler

In [9]:
X = df.drop(['price_range'], axis = 1)
y = df['price_range']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

res = pd.DataFrame(columns = [ 'Scaler', 'Classifier', 'Accuracy Score', 'F1-Score', 'T-Run'])

scalers=[RobustScaler(), MinMaxScaler(),StandardScaler(),None]
classifiers = [SVC(random_state = 42), 
               RandomForestClassifier(n_estimators=500, random_state = 42),
              KNeighborsClassifier(n_neighbors=5)]


for scaler in scalers:
    for classi in classifiers:
            pipe = Pipeline([
                ("Scaler",scaler),
                ("Classifier",classi)
            ])
            t0=time.time()
            pipe.fit(X_train, y_train)
            y_pred = pipe.predict(X_test)
            t1=time.time()
            results= {
                      'Scaler' : scaler,
                      'Classifier' : classi,
                      'Accuracy Score' : accuracy_score(y_test, y_pred),
                      'F1-Score' : f1_score(y_test, y_pred, average='macro'), 
                      'T-Run': t1-t0}
            res = res.append(results, ignore_index = True)

In [10]:
res

Unnamed: 0,Scaler,Classifier,Accuracy Score,F1-Score,T-Run
0,RobustScaler(),SVC(random_state=42),0.91,0.907531,0.387251
1,RobustScaler(),"(DecisionTreeClassifier(max_features='sqrt', r...",0.89,0.887252,4.285965
2,RobustScaler(),KNeighborsClassifier(),0.5425,0.538809,0.425703
3,MinMaxScaler(),SVC(random_state=42),0.865,0.861549,0.474799
4,MinMaxScaler(),"(DecisionTreeClassifier(max_features='sqrt', r...",0.8875,0.8845,3.967226
5,MinMaxScaler(),KNeighborsClassifier(),0.4025,0.399758,0.113888
6,StandardScaler(),SVC(random_state=42),0.89,0.885978,0.300061
7,StandardScaler(),"(DecisionTreeClassifier(max_features='sqrt', r...",0.8875,0.8845,3.983592
8,StandardScaler(),KNeighborsClassifier(),0.515,0.514232,0.115846
9,,SVC(random_state=42),0.965,0.964391,0.12254


So we will continue with no scaler and we will use SVC classifier since it has the best F1-Score and also was between the more quick.

### 2.Feature Selection

#### K-Best

In [13]:
bestfeatures = SelectKBest(score_func=chi2) 
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1) 
featureScores.columns = ['Specs','Score'] #naming the dataframe columns

featureScores.nlargest(10,'Score')

Unnamed: 0,Specs,Score
13,ram,931267.519053
11,px_height,17363.569536
0,battery_power,14129.866576
12,px_width,9810.58675
8,mobile_wt,95.972863
6,int_memory,89.839124
16,talk_time,13.2364
15,sc_w,12.407489
4,fc,10.135166
14,sc_h,9.614878


In [15]:
K_best = featureScores.nlargest(4,'Score')['Specs'].values
X_kbest = X[K_best]
X_train_k, X_test_k, y_train_k, y_test_k = train_test_split(X_kbest,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

svm = SVC(random_state=42)
svm_k=svm.fit(X_train_k, y_train_k)
svm_predict_k =svm_k.predict(X_test_k)

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test_k, svm_predict_k))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test_k, svm_predict_k))

=== Confusion Matrix ===
[[103   2   0   0]
 [  0  91   0   0]
 [  0   4  87   1]
 [  0   0   6 106]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       105
           1       0.94      1.00      0.97        91
           2       0.94      0.95      0.94        92
           3       0.99      0.95      0.97       112

    accuracy                           0.97       400
   macro avg       0.97      0.97      0.97       400
weighted avg       0.97      0.97      0.97       400



#### MRMR

In [16]:
selected_cols = mrmr_classif(pd.DataFrame(X_train, columns = X.columns), y_train.values, K=4, n_jobs=1)
selected_cols

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 42.99it/s]


['ram', 'px_width', 'battery_power', 'mobile_wt']

In [17]:
selected_cols10 = mrmr_classif(pd.DataFrame(X_train, columns = X.columns), y_train.values, K=10,n_jobs=1)
selected_cols10

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 38.59it/s]


['ram',
 'px_width',
 'battery_power',
 'mobile_wt',
 'int_memory',
 'px_height',
 'talk_time',
 'n_cores',
 'sc_h',
 'fc']

In [18]:
X_mrmr = X[selected_cols]
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(X_mrmr,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

svm = SVC(random_state=42)
svm_m=svm.fit(X_train_m, y_train_m)
svm_predict_m =svm_m.predict(X_test_m)

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test_m, svm_predict_m))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test_m, svm_predict_m))

=== Confusion Matrix ===
[[101   4   0   0]
 [  3  83   5   0]
 [  0   9  78   5]
 [  0   0  12 100]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.97      0.96      0.97       105
           1       0.86      0.91      0.89        91
           2       0.82      0.85      0.83        92
           3       0.95      0.89      0.92       112

    accuracy                           0.91       400
   macro avg       0.90      0.90      0.90       400
weighted avg       0.91      0.91      0.91       400



So we keep Select K Best

In [19]:
pipe = Pipeline([
    ('feat_selection', SelectKBest(chi2)),
    ('classifier', SVC(random_state = 42))
])

parameters = {
    'feat_selection__k':[4]      
    }

CV = GridSearchCV(pipe, parameters, 
                  scoring='f1_macro', n_jobs=-1, verbose=1, cv=3) # cv 3 * 2 neighbors = 6 fits
CV.fit(X, y)
pd.DataFrame(CV.cv_results_)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_feat_selection__k,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.035358,0.002833,0.040675,0.005834,4,{'feat_selection__k': 4},0.954979,0.944196,0.950367,0.949848,0.004417,1
