In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn-whitegrid')
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_colwidth', -1)
import warnings
warnings.simplefilter('ignore')

In [None]:
# Load the training data into `train`.
# NOTE(review): absolute, machine-specific path; the cell only runs where this exact location exists.
train = pd.read_csv(r'C:\Users\Rajat\Downloads\Hackathon 1\Train_eP48B9k.csv')

In [None]:
# Load the test data into `test`.
# NOTE(review): absolute, machine-specific path; the cell only runs where this exact location exists.
test = pd.read_csv(r'C:\Users\Rajat\Downloads\Hackathon 1\Test_jPKyvmK.csv')

In [None]:
# `ss` was used here without ever being created in this notebook, so the cell
# raised NameError on a fresh kernel. Build the submission skeleton from the
# test identifiers instead.
# NOTE(review): assumes `test` has an 'id' column; swap in the official
# sample-submission file if one is available.
ss = test[['id']].copy()
ss.head()

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Name of the row-identifier column.
ID_COL = 'id'
# Name of the label column the models predict.
TARGET_COL = 'term_deposit_subscribed'

In [None]:
# Report the size of both frames.
print(f'\nTrain contains {train.shape[0]} samples and {train.shape[1]} variables')
print(f'\nTest contains {test.shape[0]} samples and {test.shape[1]} variables')

# Every training column except the identifier and the label is a model input.
non_feature_cols = (ID_COL, TARGET_COL)
features = [col for col in train.columns if col not in non_feature_cols]
print(f'\nThe dataset contains {len(features)} features')

In [None]:
# Relative frequency of each target value in the training data.
train[TARGET_COL].value_counts(normalize=True)

In [None]:
# Bar chart of how often each target value occurs.
plt.figure(figsize = (10,10))
# Pass the series as `x=`: since seaborn 0.12 the only positional argument is
# `data`, so a positional Series is no longer interpreted as the x variable.
sns.countplot(x=train[TARGET_COL])
plt.title("Target Distribution", fontsize=14)

In [None]:
# Print the structural summary of the training frame.
train.info()

In [None]:
# Share of missing values per column, rounded to three decimals and expressed
# in percent, listed from most to least incomplete.
missing_fraction = train.isnull().sum() / train.shape[0]
null_values_per_variable = 100 * missing_fraction.round(3)
null_values_per_variable.sort_values(ascending=False)

In [None]:
# Number of distinct values in each column of the training data.
train.nunique()

In [None]:
train.columns
# Columns this notebook handles as categorical; every other entry of
# `features` is treated as numeric.
cat_cols = [
    'job_type', 'marital', 'education',
    'default', 'housing_loan', 'personal_loan',
    'communication_type', 'month', 'prev_campaign_outcome',
]
num_cols = [name for name in features if name not in cat_cols]
num_cols

In [None]:
# Print each categorical column name together with its position in the list.
indexed_cat_cols = list(enumerate(train[cat_cols].columns))
print(indexed_cat_cols)

In [None]:
# Show a single row of the training data.
train.head(1)

In [None]:
# Value counts of every categorical column, in reversed value_counts() order.
# NOTE(review): nothing here prints explicitly; any output presumably relies on
# InteractiveShell.ast_node_interactivity = "all" being active.
for c in train[cat_cols].columns:
    train[c].value_counts()[::-1]

In [None]:
# One count plot per categorical column, each on its own figure.
for c in train[cat_cols]:
    plt.figure(figsize = (10,10))
    # Pass the series as `x=`: since seaborn 0.12 the only positional argument
    # is `data`, so a positional Series is no longer read as the x variable.
    sns.countplot(x=train[c])
    plt.title(c, fontsize=14)
    # Vertical tick labels keep long category names from overlapping.
    plt.xticks(rotation='vertical')

In [None]:
# Pie chart of the category shares for each categorical column (3 x 3 grid).
fig, axes = plt.subplots(3, 3, figsize=(14, 22))
# Flatten the grid row by row so that axes[i] pairs with the i-th column.
axes = list(axes.flat)

for i, c in enumerate(train[cat_cols].columns):
    category_counts = train[c].value_counts()[::-1]
    category_counts.plot(kind = 'pie', ax=axes[i], autopct='%.0f', title=c, fontsize=12)
    # Blank y-label: the column name is already shown as the title.
    axes[i].set_ylabel('')
plt.tight_layout()

In [None]:
# Horizontal bar chart of the category counts for each categorical column.
fig, axes = plt.subplots(3, 3, figsize=(16, 16))
# Flatten the grid row by row so that axes[i] pairs with the i-th column.
axes = list(axes.flat)

for i, c in enumerate(train[cat_cols].columns):
    category_counts = train[c].value_counts()[::-1]
    category_counts.plot(kind = 'barh', ax=axes[i], title=c, fontsize=14)

plt.tight_layout()

In [None]:
# Absolute number of rows for each target value.
train['term_deposit_subscribed'].value_counts()

In [None]:
# For every categorical column, compare the within-class share of each
# category between target == 0 and the remaining rows.
fig, axes = plt.subplots(5, 2, figsize=(16, 24))
axes = [ax for axes_row in axes for ax in axes_row]

# The class mask does not depend on the column, so compute it once.
fltr = train[TARGET_COL] == 0
for i, c in enumerate(train[cat_cols]):
    # rename_axis + reset_index(name=...) yields the columns [c, 'count'] on
    # every pandas version. The previous rename({'index': c, c: 'count'})
    # relied on pre-2.0 naming and produced the wrong columns on pandas >= 2.0,
    # where value_counts() names the index after the column.
    vc_a = train[fltr][c].value_counts(normalize=True).rename_axis(c).reset_index(name='count')
    vc_b = train[~fltr][c].value_counts(normalize=True).rename_axis(c).reset_index(name='count')
    vc_a[TARGET_COL] = 0
    vc_b[TARGET_COL] = 1
    df = pd.concat([vc_a, vc_b]).reset_index(drop = True)
    sns.barplot(y = c, x = 'count', data =df , hue=TARGET_COL, ax=axes[i])

# The grid has more panels than there are categorical columns; hide the spares.
for ax in axes[len(cat_cols):]:
    ax.set_visible(False)
plt.tight_layout()

In [None]:
# Display the list of categorical column names.
cat_cols

In [None]:
# Same per-class category comparison as a 3 x 3 grid (one panel per
# categorical column), drawn without tight_layout.
fig, axes = plt.subplots(3, 3, figsize=(16, 24))
axes = [ax for axes_row in axes for ax in axes_row]

# The class mask does not depend on the column, so compute it once.
fltr = train[TARGET_COL] == 0
for i, c in enumerate(train[cat_cols]):
    # rename_axis + reset_index(name=...) yields the columns [c, 'count'] on
    # every pandas version. The previous rename({'index': c, c: 'count'})
    # relied on pre-2.0 naming and produced the wrong columns on pandas >= 2.0,
    # where value_counts() names the index after the column.
    vc_a = train[fltr][c].value_counts(normalize=True).rename_axis(c).reset_index(name='count')
    vc_b = train[~fltr][c].value_counts(normalize=True).rename_axis(c).reset_index(name='count')
    vc_a[TARGET_COL] = 0
    vc_b[TARGET_COL] = 1
    df = pd.concat([vc_a, vc_b]).reset_index(drop = True)
    sns.barplot(y = c, x = 'count', data =df , hue=TARGET_COL, ax=axes[i])
plt.show()

In [None]:
# Mean of the target per marital status, plotted in ascending order.
target_mean_by_marital = train.groupby('marital')[TARGET_COL].mean().sort_values()
target_mean_by_marital.plot(kind = 'barh')

In [None]:
# One horizontal box plot per numeric column, stacked vertically.
# The row count follows len(num_cols) instead of a hard-coded 7, and
# squeeze=False keeps `axes` two-dimensional even for a single column.
fig, axes = plt.subplots(len(num_cols), 1, figsize=(8, 20), squeeze=False)
for ax, c in zip(axes.ravel(), num_cols):
    train[[c]].boxplot(ax=ax, vert=False)

In [None]:
# Show a single row of the training data.
train.head(1)

In [None]:
# Display the list of numeric column names.
num_cols

In [None]:
# Median of every numeric column per target value, one panel per column.
sns.set(font_scale=1.3)
fig, axes = plt.subplots(4, 2, figsize=(18, 14))
# Flatten the grid row by row so that axes[i] pairs with the i-th column.
axes = list(axes.flat)
for i, c in enumerate(num_cols):
    median_by_target = train.groupby(TARGET_COL)[c].median()
    plot = median_by_target.plot(kind = 'barh', title=f'Median_{c}', ax=axes[i])
plt.tight_layout()

In [None]:
# Median 'balance' for each target value.
median_balance_by_target = train.groupby(TARGET_COL)['balance'].median()
median_balance_by_target.plot(kind = 'barh', title=f'Median_balance')

In [None]:
# Median 'last_contact_duration' for each target value.
median_duration_by_target = train.groupby(TARGET_COL)['last_contact_duration'].median()
median_duration_by_target.plot(kind = 'barh', title=f'Median_last_contact_duration')

In [None]:
# Flag rows whose customer_age is not <= 50 (a missing age therefore counts
# as "old", exactly as before).
# The flag is kept as a standalone Series rather than written into `train`:
# a new `train` column would be carried into the later train/test concat,
# where the test rows have no value for it, be filled with the -999 sentinel
# and then silently become a model feature.
is_old = (~(train['customer_age'] <= 50)).rename('is_old')

train.groupby(is_old)[TARGET_COL].mean().sort_values().plot(kind = 'barh', title='Probability of subscribing to a term deposit')

In [None]:
# Annotated heat map of the pairwise correlations between the numeric columns.
num_corr = train[num_cols].corr()
plt.figure(figsize=(14, 8))
sns.heatmap(num_corr, annot=True)

In [None]:
def download_preds(preds_test, file_name = r'C:\Users\Rajat\Downloads\Hackathon 1\hacklive_sub.csv'):
    """Write test-set predictions to a submission CSV.

    The submission frame is built from the identifier column of `test`
    instead of a global `ss` frame: `ss` is never created in this notebook,
    so the previous version failed with NameError (and mutated a global when
    it did exist).

    Parameters
    ----------
    preds_test : array-like
        One prediction per row of `test`, in the same row order.
    file_name : str
        Destination path of the CSV file.

    Raises
    ------
    ValueError
        Raised by pandas if `preds_test` does not have one entry per test row.
    """
    # 1. Pair every test identifier with its prediction.
    submission = test[[ID_COL]].copy()
    submission[TARGET_COL] = preds_test

    # 2. Save the predictions without the positional index.
    submission.to_csv(file_name, index = False)

In [None]:
# Keep the training labels in `target` and show how often each value occurs.
target = train[TARGET_COL]
target.value_counts()

In [None]:
# Baseline: predict 0 for every training row and score it against the labels.
preds_target = np.zeros(train.shape[0])

accuracy = accuracy_score(y_true=target, y_pred=preds_target)
f1 = f1_score(y_true=target, y_pred=preds_target)

print(f'Accuracy score is: {accuracy}')
print(f'F1 score is: {f1}')

In [None]:
# All-zero prediction vector with one entry per test row.
preds_test = np.zeros(len(test))

In [None]:
# Write the all-zero baseline predictions to a CSV file (path relative to the working directory).
download_preds(preds_test, file_name = 'haklive_zero_sub.csv')

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Number of rows per 'job_type' value.
train['job_type'].value_counts()

In [None]:
# Number of distinct 'job_type' values.
train['job_type'].nunique()

In [None]:
# Preview the one-hot encoding of 'job_type'; the result is only displayed, not stored.
pd.get_dummies(train[['job_type']])

In [None]:
# (rows, columns) of the training and test frames.
train.shape, test.shape

In [None]:
# Stack train on top of test with a fresh 0..n-1 index so both can be encoded together.
df = pd.concat([train, test], axis=0, ignore_index=True)
df.shape

In [None]:
# One-hot encode the categorical columns of the combined train + test frame.
df = pd.get_dummies(df, columns = cat_cols)

In [None]:
# Preview the encoded frame.
df.head()

In [None]:
# Missing-value count per column of the encoded frame.
df.isnull().sum()

In [None]:
# Replace every remaining missing value with the sentinel -999, then show the
# total number of missing values left.
# NOTE(review): the fill applies to all columns, presumably including the
# target of the appended test rows; confirm that is intended.
df = df.fillna(-999)
df.isnull().sum().sum()

In [None]:
# Undo the concat: the first len(train) rows are the training part, the rest
# is the test part (re-indexed from 0).
n_train = train.shape[0]
train_proc = df.iloc[:n_train]
test_proc = df.iloc[n_train:].reset_index(drop = True)
# Model inputs: every processed column except the identifier and the label.
features = [col for col in train_proc.columns if col not in (ID_COL, TARGET_COL)]

In [None]:
# Number of model input columns after encoding.
len(features)

In [None]:
# Hold out 20% of the processed training rows for validation, stratified on
# the label, with a fixed seed.
trn, val = train_test_split(
    train_proc,
    test_size=0.2,
    random_state=1,
    stratify=train_proc[TARGET_COL],
)

# Model inputs (features) and outputs (TARGET_COL) for each split.
X_trn, y_trn = trn[features], trn[TARGET_COL]
X_val, y_val = val[features], val[TARGET_COL]

# Inputs for the test rows that will be predicted.
X_test = test_proc[features]

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same
# transformation to the training, validation and test inputs.
scaler = StandardScaler()
scaler.fit(X_trn)

X_trn, X_val, X_test = (scaler.transform(part) for part in (X_trn, X_val, X_test))

In [None]:
# Logistic-regression baseline: fit on the scaled training split with a fixed
# seed, predict the validation split and report its F1 score.
clf = LogisticRegression(random_state = 1)
clf.fit(X_trn, y_trn)

preds_val = clf.predict(X_val)

f1_score(y_val, preds_val)

In [None]:
# Decision-tree baseline on the same scaled splits; `clf` and `preds_val` from
# the previous model are overwritten. Reports the validation F1 score.
clf = DecisionTreeClassifier(random_state = 1)
clf.fit(X_trn, y_trn)

preds_val = clf.predict(X_val)

f1_score(y_val, preds_val)