In [42]:
import pandas as pd 
import numpy as np 
import os
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit
import seaborn as sns 
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_validate
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [26]:
train=pd.read_csv('./data/bank_marketing_train.csv')
train['y'].value_counts()/len(train)
### distribution of y is No:0.88, Yes:0.11
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(train, train["y"]):
    strat_train_set = train.loc[train_index]
    strat_test_set = train.loc[test_index]
print('shape of strat train',strat_train_set.shape)
print('shape of strat test',strat_test_set.shape)
strat_train_set.to_csv('./data/strat_train_set.csv',index=False)
strat_test_set.to_csv('./data/strat_test_set.csv',index=False)

shape of strat train (20996, 20)
shape of strat test (5250, 20)


In [27]:
class campaign_age_unknown_trans(BaseEstimator,TransformerMixin):
    def fit(self,df,y=None):
        return self
    def transform(self,df):
        df=df.replace(to_replace={"unknown":np.nan})
        df.loc[(df['age']<0)|(df['age']>100),'age'] =  np.nan
        df['marital'] = df['marital'].replace(to_replace={"sungle":"single"})
        df.loc[(df['pdays'] ==999) & (df['poutcome'] !='nonexistent'),'pdays'] = np.nan
        q1,q3 = df['campaign'].quantile([0.25,0.75])
        lower = q1 - 3*(q3-q1)
        upper = q3 + 3*(q3-q1)
        df.loc[(df['campaign']<lower)|(df['campaign']>upper),'campaign'] = np.nan
        df.set_index('previous',inplace=True)
        df['campaign'] = df['campaign'].interpolate('linear')
        df.reset_index(inplace=True)
        df = df.assign(contacts_daily=(df['campaign']/(df['pdays']+1)).values)
        df[(df['age']>=60)&(pd.isnull(df.job))]['job']='retired'
        return df

class assign_educ_job_marital(BaseEstimator,TransformerMixin):
    def fit(self,df,y=None):
        return self 
    def transform(self,df):
        imp=SimpleImputer(strategy='most_frequent')
        df[['job']]=imp.fit_transform(df[['job']])
        df[['education']]=imp.fit_transform(df[['education']])
        df[['loan']]=imp.fit_transform(df[['loan']])
        df[['housing']]=imp.fit_transform(df[['housing']])
        return df

class fix_imbalance(BaseEstimator,TransformerMixin):
    def fit(self,df,y=None):
        return self 
    def transform(self,df):
        self.class_priors_pos = (df['y']  == 'yes').sum()
        self.class_priors_neg = (df['y']  == 'no').sum()
        self.df_pos = df[df['y'] == 'yes']
        self.df_neg = df[df['y']  == 'no']
        self.df_pos_over = self.df_pos.sample(int(0.5*self.class_priors_neg), replace=True)
        df = pd.concat([self.df_pos_over,self.df_neg])
        return df

In [28]:
handle_pipeline = Pipeline([
        ('step1', campaign_age_unknown_trans()),
        ('step2', assign_educ_job_marital()),
        ('step3', fix_imbalance()),
    ])
handle_pipeline_test = Pipeline([
        ('step1', campaign_age_unknown_trans()),
        ('step2', assign_educ_job_marital()),
    ])

strat_train_set=pd.read_csv('/Users/zhangdi/ACT4311/individualproject/data/strat_train_set.csv',index_col=False)
strat_test_set=pd.read_csv('/Users/zhangdi/ACT4311/individualproject/data/strat_test_set.csv',index_col=False)

train=handle_pipeline.fit_transform(strat_train_set)
test=handle_pipeline_test.fit_transform(strat_test_set)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [29]:
# train['age']=pd.cut(train['age'],bins=np.linspace(15,100,num=18))
# test['age']=pd.cut(test['age'],bins=np.linspace(15,100,num=18))
num_list=['age','campaign','pdays','previous','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed','contacts_daily']
train_num_group=train[num_list]
test_num_group=test[num_list]
num_imp=SimpleImputer(strategy='mean')
train_num_group=num_imp.fit_transform(train_num_group)
test_num_group=num_imp.fit_transform(test_num_group)
std_=StandardScaler()
train_num_group=std_.fit_transform(train_num_group)
test_num_group=std_.fit_transform(test_num_group)

In [30]:
train['y']=train['y'].replace({'yes':True,'no':False})
test['y']=test['y'].replace({'yes':True,'no':False})

In [31]:
train=train.drop(num_list,axis=1)
test=test.drop(num_list,axis=1)

In [32]:
one_hot_groupname=['marital','day_of_week']
one_hot_train=train[one_hot_groupname]
one_hot_test=test[one_hot_groupname]
train=train.drop(one_hot_groupname,axis=1)
test=test.drop(one_hot_groupname,axis=1)
imp=SimpleImputer(strategy='most_frequent')
one_hot_train=imp.fit_transform(one_hot_train)
one_hot_test=imp.transform(one_hot_test)
one_hot=OneHotEncoder(handle_unknown='ignore')
one_hot_train=one_hot.fit_transform(one_hot_train)
one_hot_train=one_hot_train.toarray()
one_hot_test=one_hot.transform(one_hot_test)
one_hot_test=one_hot_test.toarray()

In [33]:
encoder_columns=list(train.columns)
encoder_columns=encoder_columns[:-1]
for col_name in encoder_columns:
    df_change=train[[col_name,'y']]
    df_change=df_change.groupby(col_name).mean().sort_values('y').reset_index()
    num=1
    match_dict=dict()
    for i in df_change.iloc[:,0]:
        match_dict[i]=num
        num+=1
        train[col_name]=train[col_name].replace(match_dict)
        test[col_name]=test[col_name].replace(match_dict)

In [34]:
train_label=train['y']
test_label=test['y']
train=train.drop('y',axis=1)
test=test.drop('y',axis=1)
train_data=np.concatenate([train_num_group,one_hot_train,train.values],axis=1)
test_data=np.concatenate([test_num_group,one_hot_test,test.values],axis=1)
knn_imp=KNNImputer(n_neighbors=200)
train_data=knn_imp.fit_transform(train_data)
test_data=knn_imp.transform(test_data)

In [45]:
clf = LogisticRegression()
best_clf = GridSearchCV(clf,scoring='roc_auc',cv=5,n_jobs=-1,
                        param_grid={'C': [0.001,0.01,0.1,1,10,100]})
best_clf.fit(train_data,train_label)
print("Select best Logistic Regression model with C = {} with best_score={}".format(
    best_clf.best_params_['C'],
    best_clf.best_score_))

Select best Logistic Regression model with C = 1 with best_score=0.7914831794158763


In [37]:
for c in  [0.001,0.01,0.1,1,10,100]:
    test_clf=LogisticRegression(C=c,max_iter=500)
    test_clf.fit(train_data,train_label)
    print("The avarage AUC_ROC of the best logistic regression with {} from 5-fold CV on test data is".format(c),
        roc_auc_score(test_label.tolist(),test_clf.predict(test_data).tolist()))

The avarage AUC_ROC of the best logistic regression with 0.001 from 5-fold CV on test data is 0.7226119508463866
The avarage AUC_ROC of the best logistic regression with 0.01 from 5-fold CV on test data is 0.7211995743124148
The avarage AUC_ROC of the best logistic regression with 0.1 from 5-fold CV on test data is 0.7208306312148725
The avarage AUC_ROC of the best logistic regression with 1 from 5-fold CV on test data is 0.7194442365891782
The avarage AUC_ROC of the best logistic regression with 10 from 5-fold CV on test data is 0.7195523213276132
The avarage AUC_ROC of the best logistic regression with 100 from 5-fold CV on test data is 0.7195523213276132


In [46]:
clf=MLPClassifier(hidden_layer_sizes=(30,10),random_state=42,activation='logistic')
best_clf = GridSearchCV(clf,scoring='roc_auc',cv=5,n_jobs=-1,
                        param_grid={'alpha': [0.05,0.08,0.1,0.12]})
best_clf.fit(train_data,train_label)
print("Select best neural network model with alpha = {} with best_score={}".format(
    best_clf.best_params_['alpha'],
    best_clf.best_score_))

Select best neural network model with alpha = 0.05 with best_score=0.8132290150499811


In [40]:
for lay in [(40,20),(50,20),(45,15)]:
    for a in [0.001,0.05,0.08,0.1,0.12,0.5,1]:
        nn_clf=MLPClassifier(random_state = 42,activation='logistic',alpha=a,hidden_layer_sizes=lay,max_iter=500)
        nn_clf.fit(train_data,train_label)
        print("The avarage AUC_ROC of the best nn and lay={}, alpha={} from 5-fold CV on test data is".format(lay,a),
            roc_auc_score(test_label.tolist(),nn_clf.predict(test_data).tolist()))

The avarage AUC_ROC of the best nn and lay=(40, 20), alpha=0.001 from 5-fold CV on test data is 0.6736246217034154
The avarage AUC_ROC of the best nn and lay=(40, 20), alpha=0.05 from 5-fold CV on test data is 0.7325557467824004
The avarage AUC_ROC of the best nn and lay=(40, 20), alpha=0.08 from 5-fold CV on test data is 0.7338226246300177
The avarage AUC_ROC of the best nn and lay=(40, 20), alpha=0.1 from 5-fold CV on test data is 0.7326711064551532
The avarage AUC_ROC of the best nn and lay=(40, 20), alpha=0.12 from 5-fold CV on test data is 0.7339941052246499
The avarage AUC_ROC of the best nn and lay=(40, 20), alpha=0.5 from 5-fold CV on test data is 0.7253847400977751
The avarage AUC_ROC of the best nn and lay=(40, 20), alpha=1 from 5-fold CV on test data is 0.7230889786823639




The avarage AUC_ROC of the best nn and lay=(50, 20), alpha=0.001 from 5-fold CV on test data is 0.6223020386444511
The avarage AUC_ROC of the best nn and lay=(50, 20), alpha=0.05 from 5-fold CV on test data is 0.7235628886893479
The avarage AUC_ROC of the best nn and lay=(50, 20), alpha=0.08 from 5-fold CV on test data is 0.7309084937975989
The avarage AUC_ROC of the best nn and lay=(50, 20), alpha=0.1 from 5-fold CV on test data is 0.7315642771625261
The avarage AUC_ROC of the best nn and lay=(50, 20), alpha=0.12 from 5-fold CV on test data is 0.732887275932023
The avarage AUC_ROC of the best nn and lay=(50, 20), alpha=0.5 from 5-fold CV on test data is 0.7238642788253683
The avarage AUC_ROC of the best nn and lay=(50, 20), alpha=1 from 5-fold CV on test data is 0.7209834048355449




The avarage AUC_ROC of the best nn and lay=(45, 15), alpha=0.001 from 5-fold CV on test data is 0.6432611244138482
The avarage AUC_ROC of the best nn and lay=(45, 15), alpha=0.05 from 5-fold CV on test data is 0.7278114503309056
The avarage AUC_ROC of the best nn and lay=(45, 15), alpha=0.08 from 5-fold CV on test data is 0.7316536549270012
The avarage AUC_ROC of the best nn and lay=(45, 15), alpha=0.1 from 5-fold CV on test data is 0.7347319914197346
The avarage AUC_ROC of the best nn and lay=(45, 15), alpha=0.12 from 5-fold CV on test data is 0.734262238518075
The avarage AUC_ROC of the best nn and lay=(45, 15), alpha=0.5 from 5-fold CV on test data is 0.7264021916259271
The avarage AUC_ROC of the best nn and lay=(45, 15), alpha=1 from 5-fold CV on test data is 0.7217025840566696


In [43]:
nn_clf_1=MLPClassifier(random_state = 42,activation='logistic',alpha=0.1,hidden_layer_sizes=(50,20))
nn_clf_2=MLPClassifier(random_state = 42,activation='logistic',alpha=0.1,hidden_layer_sizes=(40,20))
nn_clf_3=MLPClassifier(random_state = 42,activation='logistic',alpha=0.08,hidden_layer_sizes=(40,20))
voting_clf=VotingClassifier(estimators=[('nn1',nn_clf_1),('nn2',nn_clf_2),('nn3',nn_clf_3)],voting='soft',weights=[0.33,0.33,0.34])
cv_res=cross_validate(voting_clf,train_data,train_label,scoring='roc_auc',cv=5)

In [44]:
sc=cv_res['test_score']
print(np.mean(sc))
print(np.std(sc))

0.8078833775324445
0.005109766486203547
