In [70]:
import pandas as pd
import numpy as np

from IPython.display import Image
from io import StringIO
import pydotplus
from sklearn.tree import export_graphviz

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score

from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)



In [71]:
df_train = pd.read_csv('train.csv', encoding='utf8', index_col='_id')
df_test = pd.read_csv('test.csv', encoding='utf8', index_col='_id')

In [77]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24712 entries, df7489733b004bbe40d3d37b34f82419 to 46d0d25dfd1def79632dd437261d0b5c
Data columns (total 21 columns):
age               24712 non-null int64
job               24712 non-null object
marital           24712 non-null object
education         24712 non-null object
default           24712 non-null object
housing           24712 non-null object
loan              24712 non-null object
contact           24712 non-null object
month             24712 non-null object
day_of_week       24712 non-null object
duration          24712 non-null int64
campaign          24712 non-null int64
pdays             24712 non-null int64
previous          24712 non-null int64
poutcome          24712 non-null object
emp.var.rate      24712 non-null float64
cons.price.idx    24712 non-null float64
cons.conf.idx     24712 non-null float64
euribor3m         24712 non-null float64
nr.employed       24712 non-null float64
target            24712 non-null int64

In [97]:
def preproc_data(df_input):
    df_output = df_input.copy()
    
    df_output['default'] = df_output['default'].apply(lambda x: 0 if x == 'no' else 1)
    
    df_output = df_output.drop(['month', 'day_of_week', 'contact', 'pdays', 'nr.employed'], axis = 1)

    # category code
    df_output = pd.get_dummies(df_output, columns=['job', 'marital', 'education', 'housing', 'loan', 'poutcome'])
        
    return df_output

In [98]:
x = preproc_data(df_train).drop(['target'], axis=1)
y = df_train['target']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33)

In [99]:
lr = LogisticRegression(n_jobs=-1, class_weight='balanced', solver='newton-cg', multi_class='multinomial')
lr.fit(x_train, y_train)
roc_auc_score(y_test, lr.predict_proba(x_test)[:,1])

0.9267761696914006

In [102]:
lr.fit(x, y)

df_test = pd.read_csv('test.csv', encoding='utf8')
ans = lr.predict(preproc_data(df_test.drop(['_id'], axis=1)))
result = pd.DataFrame()
result['_id'] = df_test['_id']
result['target'] = ans
#result.to_csv('submit.csv', index=False)

In [103]:
result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16476 entries, 0 to 16475
Data columns (total 2 columns):
_id       16476 non-null object
target    16476 non-null int64
dtypes: int64(1), object(1)
memory usage: 193.1+ KB


In [91]:
df_train[df_train['target'] == 1].groupby('default')['target'].agg(['count'])

Unnamed: 0_level_0,count
default,Unnamed: 1_level_1
no,2526
unknown,261


In [94]:
df_train.groupby('default')['target'].agg(['count'])

Unnamed: 0_level_0,count
default,Unnamed: 1_level_1
no,19567
unknown,5144
yes,1


In [9]:
train[train['campaign'] == 0] # means all clients have at least one call from bank

Unnamed: 0,_id,age,job,marital,education,default,housing,loan,contact,month,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,target


In [10]:
train['campaign'].describe() # means all clients have at least one call from bank

count    24712.000000
mean         2.580163
std          2.776757
min          1.000000
25%          1.000000
50%          2.000000
75%          3.000000
max         43.000000
Name: campaign, dtype: float64

In [13]:
train[train['target'] == 1]['campaign'].describe() # looks like count of calls doesn't matter a lot. 75% with target = 1 has just 2 calls

count    2787.000000
mean        2.049157
std         1.617637
min         1.000000
25%         1.000000
50%         2.000000
75%         2.000000
max        17.000000
Name: campaign, dtype: float64

In [12]:
train['duration'].describe()

count    24712.000000
mean       258.345298
std        261.573344
min          0.000000
25%        102.000000
50%        180.000000
75%        319.000000
max       4918.000000
Name: duration, dtype: float64

In [14]:
train[train['target'] == 1]['duration'].describe() # but duration means a lot as described in overview

count    2787.000000
mean      551.896304
std       404.420029
min        37.000000
25%       255.000000
50%       442.000000
75%       739.500000
max      4199.000000
Name: duration, dtype: float64

In [58]:
train[(train['previous'] == 0)].groupby('target')['_id'].agg(['count']) # looks like if there were no calls on previous campaigns, probability  for target = 1 is low

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,2480
1,886


In [59]:
train[(train['previous'] != 0)].groupby('target')['_id'].agg(['count']) 

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,2480
1,886


In [57]:
train.groupby('target')['_id'].agg(['count'])

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,21925
1,2787


In [64]:
# 11.28% from total poulation has target = 1
train[train['target'] == 1]['_id'].agg(['count']) /train['_id'].agg(['count']) 

count    0.112779
Name: _id, dtype: float64

In [65]:
# 88.72% for target =0
train[train['target'] == 0]['_id'].agg(['count']) /train['_id'].agg(['count']) 

count    0.887221
Name: _id, dtype: float64

In [60]:
# Target = 1  total count = 2787,  those who doesn't have personal loan and agreed for deposit 2285
train[train['target'] == 1].groupby('loan')['_id'].agg(['count'])

Unnamed: 0_level_0,count
loan,Unnamed: 1_level_1
no,2285
unknown,70
yes,432


In [66]:
# Target = 1  total count = 2787,  but for mortgage splitted almost half and half.
train[train['target'] == 1].groupby('housing')['_id'].agg(['count'])

Unnamed: 0_level_0,count
housing,Unnamed: 1_level_1
no,1213
unknown,70
yes,1504


In [69]:
# Let's check if unknown are the same people (with unknown loan and mortgage)
# They are the same
train[(train['loan'] == 'unknown') & (train['housing'] == 'unknown') & (train['target'] == 1)]['_id'].agg(['count'])

count    70
Name: _id, dtype: int64