In [15]:
import numpy as np
import pandas as pd

In [16]:
import pickle

In [17]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib_venn import venn2, venn2_circles 
from tqdm import tqdm

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.svm import SVC

In [20]:
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif,SelectFromModel
from mrmr import mrmr_classif, mrmr_regression

In [21]:
from sklearn.metrics import f1_score, balanced_accuracy_score
from sklearn.metrics import classification_report,confusion_matrix

In [22]:
# Location of the input CSV.
# NOTE(review): this is an absolute, machine-specific Windows path; the
# notebook will not run elsewhere until it is pointed at a local copy.
DATA_PATH = r'C:\Users\USER\Downloads\final_df.csv'
df = pd.read_csv(DATA_PATH)

In [23]:
# Never truncate the column list when a DataFrame is rendered.
pd.options.display.max_columns = None

In [24]:
# Tally how many rows carry each value of `is_canceled`.
df.loc[:, 'is_canceled'].value_counts()

0    72476
1    42292
Name: is_canceled, dtype: int64

# Grid Search

In [25]:
# Draw a 100,000-row random subsample to work with.
# The draw is seeded so that a "Restart & Run All" reproduces the same rows
# (and therefore the same split and scores further down).
# NOTE: despite its name, `df_test` is the pool that is subsequently split
# into both the training and the test partitions.
df_test = df.sample(n=100000, random_state=42)

In [26]:
# Feature matrix: every column of the subsample except the last one.
# NOTE(review): assumes the target is the final column of the CSV — confirm,
# otherwise the label would leak into the features.
X = df_test.iloc[:,:-1]

In [27]:
# Target vector: the last column of the subsample.
# NOTE(review): presumably this is `is_canceled`; verify the column order.
y = df_test.iloc[:,-1]

In [28]:
# Hold out a quarter of the rows for evaluation (scikit-learn's default
# proportion, spelled out here), with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [29]:
# Candidate classifiers to compare.
# The stochastic learners are seeded so repeated runs yield comparable scores;
# SVC is left at its defaults.
classifiers = [
    RandomForestClassifier(class_weight='balanced', random_state=42),
    xgb.XGBClassifier(random_state=42),
    SVC()]

In [30]:
# Feature-scaling strategies to try in front of each classifier.
scalers = [
    RobustScaler(),
    MinMaxScaler(),
    StandardScaler(),
]

In [None]:
# For every (scaler, classifier) pair, grid-search the number of features kept
# by SelectKBest (k = 15..29, mutual-information scoring) and record the
# cross-validated macro-F1 of each k.
# Each entry of `lstresults` is [classifier, scaler, list of param dicts,
# array of mean test scores]; later cells rely on that positional layout.
lstresults = []
parameters = {'feat_selection__k': list(range(15, 30))}

for scaler in tqdm(scalers):
    for classifier in tqdm(classifiers):
        pipe = Pipeline(steps=[
            ('scaler', scaler),
            ('feat_selection', SelectKBest(mutual_info_classif)),
            ('classifier', classifier),
        ])
        CV = GridSearchCV(pipe, parameters, scoring='f1_macro')
        CV.fit(X_train, y_train)
        search_results = CV.cv_results_
        lstresults.append([
            classifier,
            scaler,
            search_results['params'],
            search_results['mean_test_score'],
        ])

  0%|                                                                                            | 0/3 [00:00<?, ?it/s]
  0%|                                                                                            | 0/3 [00:00<?, ?it/s][A

In [None]:
# Show the raw grid-search records: one
# [classifier, scaler, params, mean test scores] entry per combination.
lstresults

In [None]:
# Tabulate the grid-search records, then unpack the per-k lists held in
# columns 2 (params) and 3 (mean score) so each row is a single
# (classifier, scaler, k, score) combination.
res = pd.DataFrame(lstresults).explode([2, 3])

In [None]:
# Show the exploded grid-search table.
res

In [None]:
# Rank the combinations by mean cross-validated macro-F1 (column 3), best first.
res.sort_values(by=3, ascending=False)

In [None]:
# Fit one fixed configuration — RobustScaler, the 29 features with the highest
# mutual information, and a seeded random forest — then score it on the
# held-out rows.
# NOTE(review): this forest does not set class_weight='balanced', unlike the
# forest in the `classifiers` candidate list — confirm that is intended.
pipe = Pipeline(steps=[
    ('scaler', RobustScaler()),
    ('feat_select', SelectKBest(score_func=mutual_info_classif, k=29)),
    ('classifier', RandomForestClassifier(random_state=42)),
])
pipe.fit(X_train, y_train)
predictions = pipe.predict(X_test)

balanced_acc = balanced_accuracy_score(y_test, predictions)
macro_f1 = f1_score(y_test, predictions, average='macro')
print(' - balanced accuracy:', round(balanced_acc, 2))
print(' - macro f1:', round(macro_f1, 2))

In [None]:
# Fit a stand-alone selector on the training data so the chosen columns can be
# inspected. Note that `X_new` holds the fitted SelectKBest object itself, not
# a transformed feature matrix.
X_new = SelectKBest(
    score_func=mutual_info_classif,
    k=29,
).fit(X_train, y_train)

In [None]:
# Integer positions (not a boolean mask) of the columns the selector kept.
mask = X_new.get_support(indices=True)

In [None]:
# Show the positions of the selected columns.
mask

In [None]:
# Column labels of the training features, in positional order.
lstcolumns = X_train.columns

In [None]:
# Lookup table from column position to column label.
dic = dict(enumerate(lstcolumns))

In [None]:
# Show the position -> column-label lookup table.
dic

In [None]:
# Translate the selected column positions into column labels.
importantkbest = [dic[position] for position in mask]

In [None]:
# Show the current contents of `importantkbest`.
importantkbest

In [None]:
# Labels of the features kept by the fitted SelectKBest selector (`X_new`),
# in column order.
# This cell used to assign list(X_train.columns), i.e. *every* feature, which
# discarded the selection and made the later overlap with the mRMR features
# trivially equal to the mRMR list itself.
importantkbest = list(X_train.columns[X_new.get_support()])

# MRMR

In [None]:
# Show the column labels of the full dataframe.
df.columns

In [None]:
# Fresh, unfitted candidate classifiers for the mRMR comparison.
# The stochastic learners are seeded so repeated runs yield comparable scores;
# SVC is left at its defaults.
classifiers = [
    RandomForestClassifier(class_weight='balanced', random_state=42),
    xgb.XGBClassifier(random_state=42),
    SVC()]

In [None]:
# Score every classifier on the top-K mRMR feature subsets for K = 5..29.
#
# The mRMR ranking is computed from the training data alone and does not
# involve the classifier, so each K is ranked once up front instead of being
# recomputed for every classifier.
mrmr_subsets = {}
for k in range(5, 30):
    mrmr_subsets[k] = mrmr_classif(X_train, y_train.values, K=k)
    print(mrmr_subsets[k])

# Each row is [n_features, feature list, classifier repr, balanced accuracy,
# macro F1]. The layout is positional on purpose: later cells address
# columns 1 and 4 by number. Rows are ordered classifier-major, K-minor.
# NOTE(review): these scores are computed on X_test, so choosing a feature
# subset from them yields an optimistic estimate.
results = []
for classifier in classifiers:
    for lst in mrmr_subsets.values():
        classifier.fit(X_train.loc[:, lst], y_train)
        y_pred = classifier.predict(X_test.loc[:, lst])
        results.append([
            len(lst),
            lst,
            str(classifier),
            balanced_accuracy_score(y_test, y_pred),
            f1_score(y_test, y_pred, average='macro'),
        ])

In [None]:
# Tabulate the mRMR experiment; columns stay numbered 0..4 because later cells
# refer to them by position.
res = pd.DataFrame(data=results)

In [None]:
# Show the mRMR experiment table.
res

In [None]:
# Rank the mRMR runs by macro-F1 (column 4), best first.
res.sort_values(by=4, ascending=False)

In [None]:
# Feature list (column 1) of the run with the highest macro-F1 (column 4).
# Selecting by score replaces a hard-coded row position (23) that was read off
# a previous run's table and would silently point at a different, possibly
# worse, run whenever the scores change.
importantmrmr = res.loc[res[4].idxmax(), 1]

In [None]:
# Show the feature list chosen from the mRMR experiment.
importantmrmr

In [None]:
# Show the feature list attributed to SelectKBest, for comparison.
importantkbest

In [None]:
# Features that appear in both the mRMR list and the SelectKBest list.
set2 = set(importantmrmr).intersection(importantkbest)

In [None]:
# venn2 / venn2_circles take the sizes of the three *regions*
# (A only, B only, A and B), not the total size of each set. Passing the full
# set sizes counts the shared features twice and inflates both circles.
mrmr_features = set(importantmrmr)
kbest_features = set(importantkbest)
venn_regions = (
    len(mrmr_features - kbest_features),   # mRMR only
    len(kbest_features - mrmr_features),   # SelectKBest only
    len(mrmr_features & kbest_features),   # selected by both
)

venn2(subsets=venn_regions,
      set_labels=('MRMR Important', 'Select K Best Important'),
      set_colors=("orange", "blue"), alpha=0.7)

# add outline
venn2_circles(subsets=venn_regions,
              linestyle="dashed",
              linewidth=2)

# assign title of the venn diagram
plt.title("Venn Diagram in Feature Importance")
plt.show()

In [None]:
# Final model: RobustScaler -> 29 best features by mutual information ->
# seeded random forest, fitted on the training rows and scored on the
# held-out rows.
finalmodel = Pipeline(steps=[
    ('scaler', RobustScaler()),
    ('feat_select', SelectKBest(score_func=mutual_info_classif, k=29)),
    ('classifier', RandomForestClassifier(random_state=42)),
])

predictions = finalmodel.fit(X_train, y_train).predict(X_test)

final_scores = (
    (' - balanced accuracy:', balanced_accuracy_score(y_test, predictions)),
    (' - macro f1:', f1_score(y_test, predictions, average='macro')),
)
for label, score in final_scores:
    print(label, round(score, 2))

In [None]:
# Confusion matrix laid out with rows = predicted class and columns = actual
# class, i.e. the transpose of scikit-learn's (true x predicted) convention.
cnf_matrix = confusion_matrix(y_test, predictions).T
cnf_matrix

In [None]:
# Heat-map of the confusion matrix.
# `cnf_matrix` is laid out with rows = predicted class and columns = actual
# class, which is what the axis labels below describe.
class_names = [0, 1]
fig, ax = plt.subplots()

# Label rows/columns through the DataFrame: seaborn derives the tick labels
# from it and resets any ticks set beforehand, so the manual
# plt.xticks/plt.yticks calls had no effect and are dropped.
cnf_frame = pd.DataFrame(cnf_matrix, index=class_names, columns=class_names)

#Create Heatmap
sns.heatmap(cnf_frame, annot=True, cmap="OrRd", fmt="g", ax=ax)
ax.set_title("Confusion matrix")
ax.set_xlabel("Actual Label")
ax.set_ylabel("Predicted Label")

# Lay out after the title and labels exist so they are included in the fit.
fig.tight_layout()
plt.show()