In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import eli5
from scipy import stats

In [None]:
# List every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
# The CSV is semicolon-separated, hence the explicit `sep`.
train = pd.read_csv('/kaggle/input/bank-marketing/bank-additional-full.csv', sep = ';')

# Preview the first rows of the raw frame.
train.head()


In [None]:
# Target: 'no' -> 0, 'yes' -> 1. Features: every remaining column.
target_codes = {"no": 0, "yes": 1}
y = train["y"].map(target_codes)
X = train.drop(columns="y")
X.columns
X.dtypes

# **Data exploration**

In [None]:
count = len(X)
print(count)
X.shape

In [None]:
# Bar chart of the target classes.
fig, ax = plt.subplots(figsize=(3, 4))
sns.countplot(x='y', data=train, ax=ax)
ax.set_xlabel('y', fontsize=15)
ax.set_ylabel('Count', fontsize=15)
sns.despine()

# Absolute and relative size of each class (`count` is the total row count).
is_no = train['y'] == 'no'
is_yes = train['y'] == 'yes'
no_ = len(train[is_no])
yes_ = len(train[is_yes])
print("NO:", no_, "which is", (no_/count)*100, "%")
print("YES:", yes_, "which is", (yes_/count)*100, "%")
      

The dataset is highly imbalanced! The response rate is only 11.6%, so the Y variable has a high class imbalance. Because of this, accuracy will not be a reliable model performance measure. 

1. One solution is to do over-sampling using SMOTE and create a balanced dataset
2. Other solution is to use precision or recall as the performance measures, and decide which one is more important (to have less FN or less FP)

In this particular problem, where the model is trying to predict whether a person will subscribe or not, more damage would be caused by false positives. This is because the bank might be counting on more subscribed clients when there are actually fewer than predicted. This is the reason that precision will be used as the evaluation metric.

From dataset description:

* duration: 

"last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model."

Because of this we will drop this column!

In [None]:
# `duration` is only known once the call is over, so it is removed from both
# frames. errors="ignore" makes the cell safe to re-run: the in-place version
# raised KeyError on a second execution because the column was already gone.
X = X.drop(columns="duration", errors="ignore")
train = train.drop(columns="duration", errors="ignore")

**1. AGE**

In [None]:
# One bar per distinct age value.
fig, ax = plt.subplots(figsize=(23, 8))
sns.countplot(x='age', data=train, ax=ax)
ax.set_title('Age Count Distribution', fontsize=15)
ax.set_xlabel('Age', fontsize=15)
ax.set_ylabel('Count', fontsize=15)
sns.despine()

# Summary statistics of the age column.
age_column = train['age']
age_column.describe()

In [None]:
# # Quartiles
# print('1º Quartile: ', train['age'].quantile(q = 0.25))
# print('2º Quartile: ', train['age'].quantile(q = 0.50))
# print('3º Quartile: ', train['age'].quantile(q = 0.75))
# print('4º Quartile: ', train['age'].quantile(q = 1.00))
# #Calculate the outliers:
#   # Interquartile range, IQR = Q3 - Q1
#   # lower 1.5*IQR whisker = Q1 - 1.5 * IQR 
#   # Upper 1.5*IQR whisker = Q3 + 1.5 * IQR
    
# print('Ages above: ', train['age'].quantile(q = 0.75) + 
#                       1.5*(train['age'].quantile(q = 0.75) - train['age'].quantile(q = 0.25)), 'are outliers')

In [None]:
# print('Numerber of outliers: ', len(train[train['age'] > 69.6]))
# print('Outliers are:', round(len(train[train['age'] > 69.6])*100/count,2), '%')

In [None]:
# sns.distplot(train['age']);
# fig = plt.figure()
# res = stats.probplot(train['age'], plot=plt)

In [None]:
# Mean age of subscribers versus non-subscribers.
subscribed = train['y'] == 'yes'
declined = train['y'] == 'no'
print("YES age mean:", train.loc[subscribed, 'age'].mean())
print("NO age mean:", train.loc[declined, 'age'].mean())

In [None]:
#Bivariate analysis using crosstab:
pd.crosstab(train['age'], train['y'], normalize='index').sort_values(by='yes',ascending=False )

The average age of customers who bought the term deposit is slightly higher than that of the customers who didn’t.

We will transform this feature using RobustScaler to reduce the influence of the outliers instead of dropping them, since we might lose some useful information!

**2. JOB**

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(23, 8)
sns.countplot(x = 'job', data = train)
ax.set_xlabel('Job', fontsize=15)
ax.set_ylabel('Count', fontsize=15)
ax.set_title('Job Count Distribution', fontsize=15)
ax.tick_params(labelsize=15)
sns.despine()

In [None]:
# Per-job mean of every numeric column. numeric_only=True skips the string
# columns, which pandas >= 2.0 otherwise rejects with a TypeError.
train.groupby('job').mean(numeric_only=True)

This is a categorical variable and since it does not form any logical sequence, we will use OneHotEncoder!

In [None]:
#Bivariate analysis using crosstab:
pd.crosstab(train['job'], train['y'], normalize='index').sort_values(by='yes',ascending=False )

The highest conversion is for students (31%) and lowest is for blue-collar(7%)!

In [None]:
pd.crosstab(train.job,train.y).plot(kind='bar')
plt.title('Purchase Frequency for Job Title')
plt.xlabel('Job')
plt.ylabel('Frequency of Purchase')
plt.savefig('purchase_fre_job')

The frequency of purchase of the deposit vastly depends on the job title. Thus, the job title can be a good predictor of the outcome variable.

**3. MARITAL**

In [None]:
# Bar chart of marital status.
fig, ax = plt.subplots(figsize=(5, 5))
sns.countplot(x='marital', data=train, ax=ax)
ax.set_title('Marital', fontsize=15)
ax.set_xlabel('Marital', fontsize=15)
ax.set_ylabel('Count', fontsize=15)
ax.tick_params(labelsize=15)
sns.despine()

# Row count for each marital status.
marital_labels = [('Married:', 'married'), ('Single:', 'single'),
                  ('Divorced:', 'divorced'), ('Unknown:', 'unknown')]
for marital_label, marital_value in marital_labels:
    print(marital_label, len(train[train['marital'] == marital_value]))

In [None]:
# Per-marital-status mean of every numeric column. numeric_only=True skips the
# string columns, which pandas >= 2.0 otherwise rejects with a TypeError.
train.groupby('marital').mean(numeric_only=True)

Same as before, for this variable we will use OneHotEncoder!

In [None]:
#Bivariate analysis using crosstab:
pd.crosstab(train['marital'], train['y'], normalize='index').sort_values(by='yes',ascending=False )

In [None]:
# Counts of each outcome per marital status, converted to row-wise proportions.
table = pd.crosstab(train['marital'], train['y'])
row_totals = table.sum(axis=1).astype(float)
marital_ax = table.div(row_totals, axis=0).plot(kind='bar', stacked=True)
marital_ax.set_title('Stacked Bar Chart of Marital Status vs Purchase')
marital_ax.set_xlabel('Marital Status')
marital_ax.set_ylabel('Proportion of Customers')
plt.savefig('mariral_vs_pur_stack')

The marital status does not seem a strong predictor for the outcome variable.

**4. EDUCATION**

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(17, 5)
sns.countplot(x = 'education', data = train)
ax.set_xlabel('Education', fontsize=15)
ax.set_ylabel('Count', fontsize=15)
ax.set_title('Education Count Distribution', fontsize=15)
ax.tick_params(labelsize=15)
sns.despine()

In [None]:
#Bivariate analysis using crosstab:
pd.crosstab(train['education'], train['y'], normalize='index').sort_values(by='yes',ascending=False )

In [None]:
table=pd.crosstab(train.education,train.y)
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of Education vs Purchase')
plt.xlabel('Education')
plt.ylabel('Proportion of Customers')
plt.savefig('edu_vs_pur_stack')

basic.4y, basic.6y and basic.9y have a similar meaning and behaviour when compared to the target variable, so we can merge them into one value - basic.

In [None]:
# Collapse the three "basic.*" schooling levels into a single 'basic' value.
basic_levels = ['basic.9y', 'basic.6y', 'basic.4y']
is_basic = X['education'].isin(basic_levels)
X['education'] = np.where(is_basic, 'basic', X['education'])

In [None]:
# Per-education mean of every numeric column. numeric_only=True skips the
# string columns, which pandas >= 2.0 otherwise rejects with a TypeError.
X.groupby('education').mean(numeric_only=True)

In [None]:
#Bivariate analysis using crosstab:
pd.crosstab(X['education'], train['y'], normalize='index').sort_values(by='yes',ascending=False )

Again, we will use OneHotEncoder!

In [None]:
table=pd.crosstab(X.education,train.y)
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of Education vs Purchase')
plt.xlabel('Education')
plt.ylabel('Proportion of Customers')
plt.savefig('edu_vs_pur_stack')

Education seems a good predictor of the outcome variable.

**5. DEFAULT**

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(3, 4)
sns.countplot(x = 'default', data = train, order = ['no', 'yes', 'unknown'])
ax.set_xlabel('')
ax.set_ylabel('Count', fontsize=15)
ax.set_title('Default', fontsize=15)
ax.tick_params(labelsize=15)
sns.despine()

In [None]:
# Row counts per `default` value; the 'yes' share is relative to `count`.
default_no = len(train[train['default'] == 'no'])
default_yes = len(train[train['default'] == 'yes'])
default_unknown = len(train[train['default'] == 'unknown'])
print('No credit in default:', default_no)
print('Yes to credit in default:' , default_yes, "which is:", 100*(default_yes/count),"%")
print('Unknown credit in default:', default_unknown)

In [None]:
#Bivariate analysis using crosstab:
pd.crosstab(train['default'], train['y'], normalize='index').sort_values(by='yes',ascending=False )

Since default = yes accounts for only 0.073% of the data and the conversion is also comparatively lower for default = yes, we can remove this column!

In [None]:
# Remove the `default` feature. errors="ignore" keeps the cell re-runnable:
# the in-place drop raised KeyError once the column had already been removed.
X = X.drop(columns="default", errors="ignore")

The solution was tested with and without this column, and it showed that this approach was right: the results are better without this column!

**6. HOUSING**

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(3,4)
sns.countplot(x = 'housing', data = train)
ax.set_xlabel('')
ax.set_ylabel('Count', fontsize=15)
ax.set_title('Housing', fontsize=15)
ax.tick_params(labelsize=15)
sns.despine()

print('No housing in loan:', len(train[train['housing'] == 'no']))
print('Yes housing in loan:' , len(train[train['housing'] == 'yes']))
print('Unknown housing in loan:', len(train[train['housing'] == 'unknown']))

We will also encode this feature with OneHotEncoder!

In [None]:
#Bivariate analysis using crosstab:
pd.crosstab(train['housing'], train['y'], normalize='index').sort_values(by='yes',ascending=False )

In [None]:
table=pd.crosstab(train.housing,train.y)
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of Housing vs Purchase')
plt.xlabel('Housing')
plt.ylabel('Proportion of Customers')
plt.savefig('housing_vs_pur_stack')

Housing does not seem like a good predictor.

**7. LOAN**

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(3,4)
sns.countplot(x = 'loan', data = train)
ax.set_xlabel('')
ax.set_ylabel('Count', fontsize=15)
ax.set_title('Loan', fontsize=15)
ax.tick_params(labelsize=15)
sns.despine()

print('No to personal loan:', len(train[train['loan'] == 'no']))
print('Yes to personal loan:' , len(train[train['loan'] == 'yes']))
print('Unknown to personal loan:', len(train[train['loan'] == 'unknown']))

Encoding will be done with OneHotEncoder!

In [None]:
#Bivariate analysis using crosstab:
pd.crosstab(train['loan'], train['y'], normalize='index').sort_values(by='yes',ascending=False )

In [None]:
table=pd.crosstab(train.loan,train.y)
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of Loan vs Purchase')
plt.xlabel('Loan')
plt.ylabel('Proportion of Customers')
plt.savefig('loan_vs_pur_stack')

Loan does not seem like a good predictor!

**8. CONTACT**

In [None]:
# Bar chart of the contact channel.
fig, ax = plt.subplots(figsize=(3, 4))
sns.countplot(x='contact', data=train, ax=ax)
ax.set_title('Contact', fontsize=15)
ax.set_xlabel('')
ax.set_ylabel('Count', fontsize=15)
ax.tick_params(labelsize=15)
sns.despine()

# Rows per channel; "Missing" is whatever is left of `count` after both channels.
telephone_rows = len(train[train['contact'] == 'telephone'])
cellular_rows = len(train[train['contact'] == 'cellular'])
cellular_rows_in_X = len(X[X['contact'] == 'cellular'])
print('Telephone:', telephone_rows)
print('Celular:' , cellular_rows)
print('Missing:', count - telephone_rows - cellular_rows_in_X)

OneHotEncoder!

In [None]:
#Bivariate analysis using crosstab:
pd.crosstab(train['contact'], train['y'], normalize='index').sort_values(by='yes',ascending=False )

In [None]:
table=pd.crosstab(train.contact,train.y)
table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of Contact vs Purchase')
plt.xlabel('Contact')
plt.ylabel('Proportion of Customers')
plt.savefig('contact_vs_pur_stack')

Contact variable could be a good predictor.

**9. Month and day of last contact**

In [None]:
# Two side-by-side count plots: month of last contact and day of week.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))

panels = ((ax1, 'month', 'Month'), (ax2, 'day_of_week', 'Day of Week'))
for panel_ax, panel_column, panel_title in panels:
    sns.countplot(x=panel_column, data=train, ax=panel_ax)
    panel_ax.set_title(panel_title, fontsize=15)
    panel_ax.set_xlabel('')
    panel_ax.set_ylabel('Count', fontsize=15)
    panel_ax.tick_params(labelsize=15)

# Leave some horizontal room between the two panels.
plt.subplots_adjust(wspace=0.25)

In contrast to previous categorical features, these two do form some sequence so we will try our solution with LabelEncoder and with OneHotEncoder!

In [None]:
#Bivariate analysis using crosstab:
pd.crosstab(train['day_of_week'], train['y'], normalize='index').sort_values(by='yes',ascending=False )

In [None]:
pd.crosstab(train.day_of_week,train.y).plot(kind='bar')
plt.title('Purchase Frequency for Day of Week')
plt.xlabel('Day of Week')
plt.ylabel('Frequency of Purchase')
plt.savefig('pur_dayofweek_bar')

Day of week may not be a good predictor of the outcome.

In [None]:
#Bivariate analysis using crosstab:
pd.crosstab(train['month'], train['y'], normalize='index').sort_values(by='yes',ascending=False )

In [None]:
pd.crosstab(train.month,train.y).plot(kind='bar')
plt.title('Purchase Frequency for Month')
plt.xlabel('Month')
plt.ylabel('Frequency of Purchase')
plt.savefig('pur_fre_month_bar')

Month might be a good predictor of the outcome variable.

**10. Previous Outcome**

In [None]:
# Bar chart of the previous campaign outcome.
fig, ax = plt.subplots(figsize=(5, 4))
sns.countplot(x='poutcome', data=train, ax=ax)
ax.set_title('Poutcome', fontsize=15)
ax.set_xlabel('')
ax.set_ylabel('Count', fontsize=15)
ax.tick_params(labelsize=15)
sns.despine()

# Rows per outcome; the 'nonexistent' share is relative to `count`.
nonexistent_rows = len(train[train['poutcome'] == 'nonexistent'])
nonexistent_share = (len(X[X['poutcome'] == 'nonexistent'])/count)*100
print('Nonexistent outcome:', nonexistent_rows, "which is", nonexistent_share, "%")
print('Failed outcome:', len(train[train['poutcome'] == 'failure']))
print('Success outcome:', len(train[train['poutcome'] == 'success']))

There are a lot of nonexistent values in this column (86.3%) but we hope that the values that are known carry important information about the client. We will encode this with OneHotEncoder.

In [None]:
#Bivariate analysis using crosstab:
pd.crosstab(train['poutcome'], train['y'], normalize='index').sort_values(by='yes',ascending=False )

In [None]:
pd.crosstab(train.poutcome,train.y).plot(kind='bar')
plt.title('Purchase Frequency for Poutcome')
plt.xlabel('Poutcome')
plt.ylabel('Frequency of Purchase')
plt.savefig('pur_fre_pout_bar')

Poutcome seems to be a good predictor of the outcome variable.

**11. PDAYS**

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(20,4)
sns.countplot(x = 'pdays', data = train)
ax.set_xlabel('')
ax.set_ylabel('Count', fontsize=15)
ax.set_title('pdays', fontsize=15)
ax.tick_params(labelsize=15)
sns.despine()

In [None]:
# Share of rows whose pdays equals 999, the value this notebook treats as "unknown".
count = len(X)
unknown_pdays_mask = train['pdays'] == 999
count_unknown_pdays = len(train[unknown_pdays_mask])
print('Percent of unknown pdays: ', (count_unknown_pdays/count)*100, "%")

Since almost all entries have unknown pdays attribute we could conclude to drop this column since it seems like it does not carry a lot of information. 

In [None]:
#Bivariate analysis using crosstab:
pd.crosstab(train['pdays'], train['y'], normalize='index').sort_values(by='yes',ascending=False )

The pdays (days since the customer was last contacted) is understandably lower for the customers who bought it. The lower the pdays, the better the memory of the last call and hence the better chances of a sale.

From bivariate analysis we see that pdays seems important for the resulting column, but this may be the product of many missing values. The results will be compared with and without this column.

In [None]:
#Certain variables are more relevant if they are categorical variable than numerical variables. 
#Because of that we will try our results when we convert this variable to categoric!
#X['pdays']=X['pdays'].astype('category')

**12. PREVIOUS**

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(5,4)
sns.countplot(x = 'previous', data = train)
ax.set_xlabel('')
ax.set_ylabel('Count', fontsize=15)
ax.set_title('previous', fontsize=15)
ax.tick_params(labelsize=15)
sns.despine()

From dataset description:
> Previous: number of contacts performed before this campaign and for this client (numeric)

In [None]:
no_previous = len(train[train['previous'] == 0])
print('Percent of no previous: ', (no_previous/count)*100, "%")

We see that not many clients had been contacted before this campaign!

In [None]:
#Bivariate analysis using crosstab:
pd.crosstab(train['previous'], train['y'], normalize='index').sort_values(by='yes',ascending=False )

**13. CAMPAIGN**

> Campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)

In [None]:
# Distribution of the number of contacts per client.
# `distplot` is deprecated in recent seaborn releases, so use `histplot` when
# this seaborn version provides it and fall back to `distplot` otherwise.
if hasattr(sns, "histplot"):
    sns.histplot(train['campaign'], kde=True, stat="density");
else:
    sns.distplot(train['campaign']);
# The extra figure below only existed to host the probability plot; with that
# plot disabled it rendered an empty figure, so it is disabled together with it.
#fig = plt.figure()
#res = stats.probplot(train['campaign'], plot=plt)

train['campaign'].describe()

In [None]:
sns.boxplot(x=train['campaign'])

In [None]:
# fig, ax = plt.subplots(figsize=(10,5))
# ax.scatter(train['campaign'], y)
# plt.show()

In [None]:
# Share of rows with exactly 1, 2 or 3 contacts, then everything from 4 upwards.
for contacts in (1, 2, 3):
    exact_share = (len(train[train['campaign'] == contacts])/count)*100
    print(f'Percent of campaign = {contacts}: ', exact_share, "%")
tail_share = (len(train[train['campaign'] >= 4])/count)*100
print('Percent of campaign >= 4: ', tail_share, "%")

We see that most of the values for the campaign are 1, 2 and 3, which we also see from the boxplot. That is why it is a good idea to bin these values into 4 categorical values.

In [None]:
#Bivariate analysis using crosstab:
pd.crosstab(train['campaign'], train['y'], normalize='index').sort_values(by='yes',ascending=False )

This feature has many outliers!

From the boxplot we also see that approximately all values X['campaign'] > 10 are potential outliers, and there is 2% of such data, but dropping all of them could cause a loss of information, so we won't drop them for now!

To reduce the importance of the outliers we can transform this column to be a categorical variable and to have only 4 possible values:
* campaign <= 2
* campaign = 3
* campaign = 4
* campaign > 4


In [None]:
#Binning campaign into four ordered buckets: <=2, 3, 4, >4
col = X['campaign']
cut_points = [2,3,4]
labels = ["<=2","3","4",">4"]

# Open-ended outer edges instead of the column's min/max: the resulting
# intervals (-inf, 2], (2, 3], (3, 4], (4, inf) assign the same bucket to every
# value, but the edges stay strictly increasing even when the data's minimum is
# >= 2 or its maximum is <= 4 (min/max edges make pd.cut raise in those cases).
break_points = [-np.inf] + cut_points + [np.inf]

#Binning using cut function of pandas
colBin = pd.cut(col,bins=break_points,labels=labels)
X['campaign_new'] = colBin
# The raw count is replaced by its binned version.
X.drop(['campaign'], axis=1, inplace=True)

In [None]:
#fig, ax = plt.subplots()
#fig.set_size_inches(5,4)
#sns.countplot(x = 'campaign_new', data = X)
#ax.set_xlabel('')
#ax.set_ylabel('Count', fontsize=15)
#ax.set_title('campaign_new', fontsize=15)
#ax.tick_params(labelsize=15)
#sns.despine()

In [None]:
#X.drop(X[X['campaign'] == 56].index, inplace=True)
#y = pd.DataFrame(y)
#y.drop(y.index[4107], inplace=True)
#y = y['y']

**14. Social and economic context attributes**

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(8,4)
sns.countplot(x = 'emp.var.rate', data = train)
ax.set_xlabel('')
ax.set_ylabel('Count', fontsize=15)
ax.set_title('employment variation rate', fontsize=15)
ax.tick_params(labelsize=15)
sns.despine()

X['emp.var.rate'].describe()

In [None]:
#Bivariate analysis using crosstab:
pd.crosstab(train['emp.var.rate'], train['y'], normalize='index').sort_values(by='yes',ascending=False )

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(30,4)
sns.countplot(x = 'cons.price.idx', data = train)
ax.set_xlabel('')
ax.set_ylabel('Count', fontsize=15)
ax.set_title('consumer price index', fontsize=15)
ax.tick_params(labelsize=15)
sns.despine()

X['cons.price.idx'].describe()

In [None]:
#Bivariate analysis using crosstab:
pd.crosstab(train['cons.price.idx'], train['y'], normalize='index').sort_values(by='yes',ascending=False )

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(30,4)
sns.countplot(x = 'cons.conf.idx', data = train)
ax.set_xlabel('')
ax.set_ylabel('Count', fontsize=15)
ax.set_title('consumer confidence index', fontsize=15)
ax.tick_params(labelsize=15)
sns.despine()

X['cons.conf.idx'].describe()

In [None]:
#Bivariate analysis using crosstab:
pd.crosstab(train['cons.conf.idx'], train['y'], normalize='index').sort_values(by='yes',ascending=False )

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(35,4)
sns.countplot(x = 'euribor3m', data = train)
ax.set_xlabel('')
ax.set_ylabel('Count', fontsize=15)
ax.set_title('euribor 3 month rate', fontsize=15)
ax.tick_params(labelsize=15)
sns.despine()

X['euribor3m'].describe()

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(12,4)
sns.countplot(x = 'nr.employed', data = train)
ax.set_xlabel('')
ax.set_ylabel('Count', fontsize=15)
ax.set_title('number of employees', fontsize=15)
ax.tick_params(labelsize=15)
sns.despine()

X['nr.employed'].describe()

In [None]:
#Bivariate analysis using crosstab:
pd.crosstab(train['nr.employed'], train['y'], normalize='index').sort_values(by='yes',ascending=False )

In [None]:
#with and without columns:
#X.drop("pdays", inplace=True, axis=1)
#X.drop("emp.var.rate", inplace=True, axis=1)

# PREPROCESSING

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer

In [None]:
# Column groups consumed by the preprocessing steps below.

#numeric features
# 'campaign_new' is the binned version of `campaign`; it is listed here because
# it is converted to integer codes before the numeric columns are scaled.
num_features = ["age", "previous", "emp.var.rate",'pdays','campaign_new',
                 "cons.price.idx", "cons.conf.idx","euribor3m", "nr.employed"]

#categorical features to be encoded with OneHotEncoder:
cat_oh_features = ["job", "marital", "education", "housing", "loan", "contact", "poutcome", "month", "day_of_week"]

#categorical features to be encoded with LabelEncoder:
# Currently empty; the trailing comments keep the candidates that were tried.
cat_le_features = []#, "month", "day_of_week"]#, "pdays"]

In [None]:
# min_max = ["previous", "emp.var.rate", "cons.price.idx", "cons.conf.idx","euribor3m", "nr.employed"]
# m_scaler = MinMaxScaler()
# X[min_max] = m_scaler.fit_transform(X[min_max])
# robust = ["age", 'pdays','campaign']
# r_scaler = RobustScaler()
# X[robust] = r_scaler.fit_transform(X[robust])

In [None]:
# `le` is kept for the optional experiments that are commented out below.
le = LabelEncoder()
#X['month'] = le.fit_transform(X['month'])
#X['day_of_week'] = le.fit_transform(X['day_of_week'])

# Encode the campaign buckets by their ordinal position. LabelEncoder sorts the
# labels lexicographically ("3" < "4" < "<=2" < ">4"), which would give the
# smallest bucket the code 2 and scramble the order before scaling.
campaign_order = ["<=2", "3", "4", ">4"]
# Guard so that re-running the cell does not re-encode already numeric codes.
if not pd.api.types.is_numeric_dtype(X['campaign_new']):
    X['campaign_new'] = pd.Categorical(X['campaign_new'],
                                       categories=campaign_order,
                                       ordered=True).codes
#X['pdays'] = le.fit_transform(X['pdays'])

In [None]:
sc = StandardScaler()
X[num_features] = sc.fit_transform(X[num_features])

In [None]:
preprocessor = ColumnTransformer([("numerical", "passthrough", num_features+cat_le_features), 
                                  ("categorical", OneHotEncoder(sparse=False, handle_unknown="ignore"),
                                   cat_oh_features)])

# **PREDICTION AND EXPLAINABLE AI**

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, precision_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
import shap

shap.initjs()

In [None]:
# Hold out 30% of the rows for testing, with a fixed seed for reproducibility.
# NOTE(review): the split is not stratified on y, so the class ratio of the
# two parts may differ slightly; consider stratify=y for this imbalanced target.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)

1. Logistic regression


In [None]:
from sklearn.linear_model import LogisticRegression

lr_model = Pipeline([("preprocessor", preprocessor),
                    ("model", LogisticRegression(class_weight="balanced", solver="liblinear", random_state=42))])

In [None]:
lr_model.fit(X_train, y_train)
y_pred1 = lr_model.predict(X_test)

In [None]:
# First row of the model comparison table: test-set scores of the logistic regression.
lr_scores = {
    'Method': ['Logistic Regression'],
    'accuracy': accuracy_score(y_test, y_pred1),
    'precision': precision_score(y_test, y_pred1, pos_label=1),
    'recall': recall_score(y_test, y_pred1, pos_label=1),
    'f1': f1_score(y_test, y_pred1, pos_label=1),
}
results = pd.DataFrame(lr_scores)

In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred1))
print(classification_report(y_test, y_pred1))
print(confusion_matrix(y_test, y_pred1))

We need to show real feature names, to get real insight in this explanation!

In [None]:
# Rebuild readable column names for the preprocessor output:
# numeric columns, label-encoded columns, then one "<column>__<category>" per one-hot column.
preprocessor = lr_model.named_steps["preprocessor"]
ohe_categories = preprocessor.named_transformers_["categorical"].categories_
new_ohe_features = []
for ohe_column, ohe_values in zip(cat_oh_features, ohe_categories):
    new_ohe_features.extend(f"{ohe_column}__{ohe_value}" for ohe_value in ohe_values)
all_features = num_features+ cat_le_features + new_ohe_features

In [None]:
pd.DataFrame(lr_model.named_steps["preprocessor"].transform(X_train), columns=all_features).head()

ELI5

In [None]:
eli5.show_weights(lr_model.named_steps["model"], feature_names=all_features)

In [None]:
# Transform the test rows with the preprocessor as it was fitted during
# training. Calling fit_transform here would refit the (shared) preprocessor
# on X_test, which changes the fitted pipeline and can change the column
# layout that `all_features` describes.
prep_instances = lr_model.named_steps['preprocessor'].transform(X_test)
eli5.explain_prediction(lr_model.named_steps["model"], prep_instances[0] ,feature_names=all_features)

In [None]:
eli5.explain_prediction(lr_model.named_steps["model"], prep_instances[42] ,feature_names=all_features)

SHAP

In [None]:
# Background data for the explainer: the training rows as the model saw them.
# Only transform() is used because the preprocessor is already fitted; refitting
# it here would silently modify the pipeline after the model was trained.
prep1 = lr_model.named_steps['preprocessor'].transform(X_train)
explainer1 = shap.LinearExplainer(lr_model.named_steps["model"], prep1)
# Explain a fixed random sample of 1000 test rows.
observations1 = lr_model.named_steps["preprocessor"].transform(X_test.sample(1000, random_state=42))
shap_values1 = explainer1.shap_values(observations1)

In [None]:
i = 0
shap.force_plot(explainer1.expected_value, shap_values1[i],
                features=observations1[i], feature_names=all_features)

In [None]:
shap.force_plot(explainer1.expected_value, shap_values1,
                features=observations1, feature_names=all_features)

In [None]:
shap.summary_plot(shap_values1, features=observations1, feature_names=all_features, max_display=15)

2. Decision trees

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree on top of the shared preprocessor. random_state is fixed, as
# for the logistic regression and LinearSVC models, so that tie-breaking
# between equally good splits is reproducible across runs.
dt_model = Pipeline([("preprocessor", preprocessor), 
                     ("model", DecisionTreeClassifier(class_weight="balanced", random_state=42))])


Using GridSearch to find best params for decision tree model!

In [None]:
# Tune tree depth and split size with 5-fold cross-validation.
# Scored on precision, the metric chosen for this problem and the one used by
# the other grid searches in this notebook (this search previously used accuracy).
gs2 = GridSearchCV(dt_model, {"model__max_depth": [3, 5, 7], 
                             "model__min_samples_split": [2, 5]}, 
                  cv=5,
                  n_jobs = -1,
                  scoring="precision")

gs2.fit(X_train, y_train)
print(gs2.best_params_)
print(gs2.best_score_)

In [None]:
dt_model.set_params(**gs2.best_params_)
dt_model.fit(X_train, y_train)
y_pred2 = dt_model.predict(X_test)

In [None]:
# Test-set scores of the decision tree, appended to the comparison table.
temp = pd.DataFrame({'Method':['Decision Trees'], 
                        'accuracy': accuracy_score(y_test, y_pred2), 
                        'precision': precision_score(y_test, y_pred2, pos_label=1),
                        'recall': recall_score(y_test, y_pred2, pos_label=1),
                        'f1': f1_score(y_test, y_pred2, pos_label=1)
                      })
# ignore_index gives the table a clean 0..n-1 index instead of repeating 0.
results = pd.concat([results, temp], ignore_index=True)
print("Accuracy:", accuracy_score(y_test, y_pred2))
print(classification_report(y_test, y_pred2))
print(confusion_matrix(y_test, y_pred2))

ELI5

In [None]:
eli5.explain_weights(dt_model.named_steps["model"],feature_names=all_features)

In [None]:
# Transform the test rows with the already-fitted preprocessor; fit_transform
# would refit it on X_test and alter the pipeline the tree was trained with.
prep_instances = dt_model.named_steps['preprocessor'].transform(X_test)
eli5.explain_prediction(dt_model.named_steps["model"], prep_instances[0] ,feature_names=all_features)

In [None]:
eli5.explain_prediction(dt_model.named_steps["model"], prep_instances[42] ,feature_names=all_features)

SHAP

In [None]:
explainer2 = shap.TreeExplainer(dt_model.named_steps["model"])
observations2 = dt_model.named_steps["preprocessor"].transform(X_train.sample(1000, random_state=42))
shap_values2 = explainer2.shap_values(observations2)

In [None]:
# Force plot over all sampled observations at once (features=observations2).
# NOTE(review): `i` indexes the first axis of expected_value / shap_values2,
# presumably the class (0 = "no") when shap returns one array per class;
# confirm against the installed shap version.
i = 0
shap.force_plot(explainer2.expected_value[i], shap_values2[i], 
                features=observations2, feature_names=all_features)

In [None]:
shap.summary_plot(shap_values2, features=observations2, feature_names=all_features, max_display=15)

3. Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest on top of the shared preprocessor. random_state is fixed, as
# for the logistic regression and LinearSVC models, so that the bootstrap
# samples and feature subsets (and therefore the scores) are reproducible.
rf_model = Pipeline([("preprocessor", preprocessor), 
                     ("model", RandomForestClassifier(class_weight="balanced", n_estimators=100,
                                                      n_jobs=-1, random_state=42))])

In [None]:
gs3 = GridSearchCV(rf_model, {"model__max_depth": [10, 15], 
                             "model__min_samples_split": [5, 10]},  
                  cv=5,
                  n_jobs = -1,
                  scoring="precision")

gs3.fit(X_train, y_train)
print(gs3.best_params_)
print(gs3.best_score_)

In [None]:
rf_model.set_params(**gs3.best_params_)
rf_model.fit(X_train, y_train)
y_pred3 = rf_model.predict(X_test)

In [None]:
# Test-set scores of the random forest, appended to the comparison table.
temp = pd.DataFrame({'Method':['Random Forest'], 
                        'accuracy': accuracy_score(y_test, y_pred3), 
                        'precision': precision_score(y_test, y_pred3, pos_label=1),
                        'recall': recall_score(y_test, y_pred3, pos_label=1),
                        'f1': f1_score(y_test, y_pred3, pos_label=1)
                      })
# ignore_index gives the table a clean 0..n-1 index instead of repeating 0.
results = pd.concat([results, temp], ignore_index=True)
print("Accuracy:", accuracy_score(y_test, y_pred3))
print(classification_report(y_test, y_pred3))
print(confusion_matrix(y_test, y_pred3))

ELI5

In [None]:
eli5.explain_weights(rf_model.named_steps["model"], feature_names=all_features)

In [None]:
# Transform the test rows with the already-fitted preprocessor; fit_transform
# would refit it on X_test and alter the pipeline the forest was trained with.
prep_instances = rf_model.named_steps['preprocessor'].transform(X_test)
eli5.explain_prediction(rf_model.named_steps["model"], prep_instances[42] ,feature_names=all_features)

In [None]:
eli5.explain_prediction(rf_model.named_steps["model"], prep_instances[0] ,feature_names=all_features)

SHAP


In [None]:
explainer3 = shap.TreeExplainer(rf_model.named_steps["model"])
observations3 = rf_model.named_steps["preprocessor"].transform(X_train.sample(1000, random_state=42))
shap_values3 = explainer3.shap_values(observations3)

In [None]:
# Force plot over all sampled observations at once (features=observations3).
# NOTE(review): `i` indexes the first axis of expected_value / shap_values3,
# presumably the class (0 = "no") when shap returns one array per class;
# confirm against the installed shap version.
i = 0
shap.force_plot(explainer3.expected_value[i], shap_values3[i], 
                features=observations3, feature_names=all_features)

In [None]:
shap.summary_plot(shap_values3, features=observations3, feature_names=all_features, max_display=15)

4. XGBoost 

In [None]:
from xgboost.sklearn import XGBClassifier

# To balance the classes, scale_pos_weight must be (#negatives / #positives).
# The previous value, 1 - y.mean(), is below 1 and therefore down-weighted the
# minority "yes" class. The ratio is taken from the training labels only.
positive_rate = y_train.mean()
xgb_model = Pipeline([("preprocessor", preprocessor), 
                     ("model", XGBClassifier(scale_pos_weight=(1 - positive_rate) / positive_rate,
                                             n_jobs=-1, random_state=42))])

In [None]:
gs4 = GridSearchCV(xgb_model, {"model__max_depth": [5, 10],
                              "model__min_child_weight": [10, 15],
                              "model__n_estimators": [15, 25]},
                  n_jobs=-1, 
                  cv=5,
                  scoring="precision")

gs4.fit(X_train, y_train)
print(gs4.best_params_)
print(gs4.best_score_)

In [None]:
xgb_model.set_params(**gs4.best_params_)
xgb_model.fit(X_train, y_train)
y_pred4 = xgb_model.predict(X_test)

In [None]:
# Test-set scores of XGBoost, appended to the comparison table.
temp = pd.DataFrame({'Method':['XGBoost'], 
                        'accuracy': accuracy_score(y_test, y_pred4), 
                        'precision': precision_score(y_test, y_pred4, pos_label=1),
                        'recall': recall_score(y_test, y_pred4, pos_label=1),
                        'f1': f1_score(y_test, y_pred4, pos_label=1)
                      })
# ignore_index gives the table a clean 0..n-1 index instead of repeating 0.
results = pd.concat([results, temp], ignore_index=True)

In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred4))
print(classification_report(y_test, y_pred4))
print(confusion_matrix(y_test, y_pred4))

ELI5

In [None]:
eli5.show_weights(xgb_model.named_steps["model"], feature_names=all_features)

In [None]:
# Transform the test rows with the already-fitted preprocessor; fit_transform
# would refit it on X_test and alter the pipeline the booster was trained with.
prep_instances4 = xgb_model.named_steps['preprocessor'].transform(X_test)
eli5.explain_prediction(xgb_model.named_steps["model"], prep_instances4[42] ,feature_names=all_features)

In [None]:
eli5.explain_prediction(xgb_model.named_steps["model"], prep_instances4[0] ,feature_names=all_features)

SHAP

Since passing the whole dataset to SHAP would require too much computation, I sampled 1000 random rows.

In [None]:
explainer4 = shap.TreeExplainer(xgb_model.named_steps["model"])
observations4 = xgb_model.named_steps["preprocessor"].transform(X_train.sample(1000, random_state=42))
shap_values4 = explainer4.shap_values(observations4)

In [None]:
i = 0
shap.force_plot(explainer4.expected_value, shap_values4[i], 
                features=observations4[i], feature_names=all_features)

In [None]:
shap.force_plot(explainer4.expected_value, shap_values4,
                features=observations4, feature_names=all_features)

In [None]:
shap.summary_plot(shap_values4, features=observations4, feature_names=all_features, max_display=15)

5. SVM

In [None]:
from sklearn.svm import LinearSVC

svc_model = Pipeline([("preprocessor", preprocessor), 
                      ("model", LinearSVC(max_iter=5000, random_state=42, dual = False))])

In [None]:
svc_model.fit(X_train, y_train)
y_pred5 = svc_model.predict(X_test)

In [None]:
# Test-set scores of the linear SVM, appended to the comparison table.
temp = pd.DataFrame({'Method':['LinearSVC'], 
                        'accuracy': accuracy_score(y_test, y_pred5), 
                        'precision': precision_score(y_test, y_pred5, pos_label=1),
                        'recall': recall_score(y_test, y_pred5, pos_label=1),
                        'f1': f1_score(y_test, y_pred5, pos_label=1)
                      })
# ignore_index gives the table a clean 0..n-1 index instead of repeating 0.
results = pd.concat([results, temp], ignore_index=True)

In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred5))
print(classification_report(y_test, y_pred5))
print(confusion_matrix(y_test, y_pred5))

In [None]:
eli5.show_weights(svc_model.named_steps["model"], feature_names=all_features)

In [None]:
# show test instances that are predicted to be 1
# for i in range(0, len(y_pred6)):
#     if y_pred6[i] == 1:
#         print(i)

In [None]:
# Transform the test rows with the already-fitted preprocessor; fit_transform
# would refit it on X_test and alter the pipeline the SVM was trained with.
prep_instances = svc_model.named_steps['preprocessor'].transform(X_test)
eli5.explain_prediction(svc_model.named_steps["model"], prep_instances[42] ,feature_names=all_features)

In [None]:
eli5.explain_prediction(svc_model.named_steps["model"], prep_instances[0] ,feature_names=all_features)

SHAP

In [None]:
# Background data for the explainer: the training rows as the model saw them.
# Only transform() is used because the preprocessor is already fitted; refitting
# it here would silently modify the pipeline after the model was trained.
prep3 = svc_model.named_steps['preprocessor'].transform(X_train)
explainer6 = shap.LinearExplainer(svc_model.named_steps["model"], prep3)
# Explain a fixed random sample of 1000 test rows.
observations6 = svc_model.named_steps["preprocessor"].transform(X_test.sample(1000, random_state=42))
shap_values6 = explainer6.shap_values(observations6)

In [None]:
i = 641
shap.force_plot(explainer6.expected_value, shap_values6[i],
                features=observations6[i], feature_names=all_features)

In [None]:
shap.force_plot(explainer6.expected_value, shap_values6,
                features=observations6, feature_names=all_features)

In [None]:
shap.summary_plot(shap_values6, features=observations6, feature_names=all_features, max_display=15)

In [None]:
results

In [None]:
# Scratch cell unrelated to the bank-marketing analysis above: Louvain
# community detection on networkx's built-in karate club graph.
import community as community_louvain
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import networkx as nx

# load the karate club graph
G = nx.karate_club_graph()

#first compute the best partition
partition = community_louvain.best_partition(G)


print(partition)
# NOTE(review): this writes community 3 under key 34. Presumably the graph's
# nodes are labelled 0-33, in which case this adds an entry for a node that is
# not in G (and the drawing code below would then fail) - confirm the intent.
partition[34] = 3
print(partition)

# draw the graph
# pos = nx.spring_layout(G)
# # color the nodes according to their partition
# cmap = cm.get_cmap('viridis', max(partition.values()) + 1)
# nx.draw_networkx_nodes(G, pos, partition.keys(), node_size=40,
#                        cmap=cmap, node_color=list(partition.values()))
# nx.draw_networkx_edges(G, pos, alpha=0.5)
# plt.show()

In [None]:
import contextlib
import io

from nltk.sentiment.util import demo_liu_hu_lexicon


def _liu_hu_label(text):
    """Return the sentiment label ("Positive", "Negative" or "Neutral") for `text`.

    demo_liu_hu_lexicon prints its verdict instead of returning it, so the
    printed output is captured and returned as a string. If a version of the
    function does return a value, that value is used instead.
    """
    captured = io.StringIO()
    with contextlib.redirect_stdout(captured):
        returned = demo_liu_hu_lexicon(text)
    return returned if returned is not None else captured.getvalue().strip()


line = "Hello, you are amazing"
# Position of each label in the counter list below.
label_to_slot = {"Positive": 0, "Negative": 1, "Neutral": 2}
# Named `sentiment_counts` rather than `count` so that the dataset row count
# used by the analysis cells above is not overwritten.
sentiment_counts = [0, 0, 0]
# Iterate over whitespace-separated words; iterating the string directly
# would visit single characters.
for word in line.split():
    res = _liu_hu_label(word)
    print(res)
    sentiment_counts[label_to_slot[res]] += 1

print(sentiment_counts)