In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Show every float with exactly three decimals in DataFrame output.
def _three_decimals(value):
    """Format a single float for pandas display."""
    return '%.3f' % value


pd.set_option('display.float_format', _three_decimals)

# Account snapshots and deposit transactions are read with their own headers.
account = pd.read_csv('tj_03_account_info.csv')
deposit_txn = pd.read_csv('tj_03_deposit_txn.csv')

# The training/test files are read with header=None, so column names are
# supplied explicitly here.
data_training = pd.read_csv(
    'tj_03_training.csv',
    header=None,
    names=["account_no", "closed_in_month_11"],
)
data_test = pd.read_csv(
    'tj_03_test.csv',
    header=None,
    names=["account_no"],
)

In [2]:
# Sanity check: how many distinct accounts appear in the account table?
# (Call .info() on account / deposit_txn / data_training / data_test here
# when the schemas need inspecting.)
distinct_account_numbers = account['account_no'].unique()
print("Unique account_no =", len(distinct_account_numbers))

# Show the 20 rows with the smallest dormant_days values.
account.sort_values(by='dormant_days').head(n=20)

Unique account_no = 14020


Unnamed: 0,account_no,txn_dt,open_date,customer_type,last_active_date,dormant_days,compound_frq,compound_frq_unit,eff_interest_rate
390700,3450002852,2016-10-16 00:00:00,2016-03-01 00:00:00,704,2016-10-17 00:00:00,-1,6,M,0.5
76935,3450041357,2016-10-11 00:00:00,2012-05-21 00:00:00,704,2016-10-12 00:00:00,-1,6,M,0.5
41837,3450002660,2016-10-10 00:00:00,2016-02-21 00:00:00,704,2016-10-11 00:00:00,-1,6,M,0.5
159080,3450002660,2016-09-30 00:00:00,2016-02-21 00:00:00,704,2016-10-01 00:00:00,-1,6,M,0.5
480975,3450004814,2016-09-16 00:00:00,2016-06-01 00:00:00,704,2016-09-17 00:00:00,-1,6,M,0.5
186445,3450002660,2016-10-13 00:00:00,2016-02-21 00:00:00,704,2016-10-14 00:00:00,-1,6,M,0.5
8443,3450004466,2016-10-16 00:00:00,2016-05-15 00:00:00,702,2016-10-17 00:00:00,-1,6,M,0.5
473358,3450002660,2016-08-03 00:00:00,2016-02-21 00:00:00,704,2016-08-04 00:00:00,-1,6,M,0.5
188049,3450038548,2016-10-31 00:00:00,2015-04-03 00:00:00,704,2016-11-01 00:00:00,-1,6,M,0.5
474004,3450013665,2016-10-08 00:00:00,2015-01-12 00:00:00,704,2016-10-09 00:00:00,-1,6,M,0.5


In [3]:
# Preview the first five deposit transactions.
deposit_txn.head(n=5)

Unnamed: 0,account_no,from_to_account_no,txn_amount,txn_dt,txn_hour,txn_type
0,3450019186,3450377309,50.0,2016-02-14 00:00:00,8,DR
1,3450034484,0,50.0,2016-02-14 00:00:00,16,DR
2,3450021883,0,5000.0,2016-02-14 00:00:00,9,DR
3,3450032248,3450321146,150.0,2016-02-14 00:00:00,19,DR
4,3450029267,3450070632,850.0,2016-02-14 00:00:00,21,DR


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# account columns.
account.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,account_no,customer_type,dormant_days,compound_frq,eff_interest_rate
count,529709.0,529709.0,529709.0,529709.0,529709.0
mean,3450025588.732,713.632,3.107,6.0,0.5
std,13985.755,53.006,34.117,0.0,0.0
min,3450000068.0,702.0,-1.0,6.0,0.5
25%,3450014590.0,704.0,0.0,6.0,0.5
50%,3450026673.0,704.0,0.0,6.0,0.5
75%,3450036576.0,704.0,1.0,6.0,0.5
max,3450049918.0,1003.0,1589.0,6.0,0.5


In [5]:
# Mark rows whose dormant_days equals -1. A vectorised comparison replaces the
# per-row Python lambda, which is needlessly slow on a table of this size
# (~530k rows).
account['is_minus_one'] = (account['dormant_days'] == -1).astype(int)

# Per-account feature consumed by every model below.
# NOTE(review): .count() tallies every non-null row regardless of the
# indicator's value, so this column actually holds "number of account_info
# rows per account", not "number of rows with dormant_days == -1". All
# recorded results were produced with this definition, so it is kept as-is;
# switch to .sum() if the -1 tally was the intended feature. TODO confirm.
num_of_active = (
    account
    .groupby('account_no')['is_minus_one']
    .count()
    .reset_index()
)
num_of_active.head(50)

Unnamed: 0,account_no,is_minus_one
0,3450000068,33
1,3450000070,52
2,3450000073,1
3,3450000075,54
4,3450000083,60
5,3450000087,43
6,3450000089,79
7,3450000091,37
8,3450000092,37
9,3450000094,44


In [6]:
# One row per distinct account number (not used by the cells shown below).
data_features = pd.DataFrame({'account_no': account['account_no'].unique()})

# Attach the training label to each account's feature row. Accounts with no
# label get NaN from the left join and are dropped, leaving only labelled
# accounts.
labels_by_account = data_training.set_index('account_no')
account_active = (
    num_of_active
    .join(labels_by_account, on='account_no', how='left')
    .dropna()
)

# Label-specific subsets of the labelled accounts.
closed_flag = account_active['closed_in_month_11']
account_active_1 = account_active[closed_flag.eq(1)]
account_active_0 = account_active[closed_flag.eq(0)]

account_active.head(n=5)

Unnamed: 0,account_no,is_minus_one,closed_in_month_11
1,3450000070,52,0.0
2,3450000073,1,1.0
4,3450000083,60,0.0
6,3450000089,79,0.0
7,3450000091,37,0.0


In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# KFold comes from sklearn.model_selection, like the other splitters used in
# this notebook. The old sklearn.cross_validation module is deprecated and no
# longer exists in later scikit-learn releases.
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

# Feature matrix: everything except the identifier and the label.
X_all = account_active.drop(['account_no', 'closed_in_month_11'], axis=1)
y_all = account_active['closed_in_month_11']

# 80/20 hold-out split with a fixed seed so it is reproducible.
num_test = 0.20
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=num_test, random_state=23)

# Choose the type of classifier.
clf = RandomForestClassifier()


def run_kfold(clf, n_folds=10):
    """Cross-validate `clf` on (X_all, y_all) and print per-fold accuracy.

    Parameters
    ----------
    clf : estimator exposing fit(X, y) and predict(X)
        Re-fitted in place on every fold; after the call it holds the fit
        from the last fold.
    n_folds : int, default 10
        Number of consecutive, unshuffled folds.

    Returns
    -------
    float
        Mean accuracy over all folds (also printed).
    """
    features = X_all.values
    target = y_all.values

    kf = KFold(n_splits=n_folds)
    outcomes = []
    for fold, (train_index, test_index) in enumerate(kf.split(features), start=1):
        clf.fit(features[train_index], target[train_index])
        fold_predictions = clf.predict(features[test_index])
        accuracy = accuracy_score(target[test_index], fold_predictions)
        outcomes.append(accuracy)
        print("Fold {0} accuracy: {1}".format(fold, accuracy))
    mean_outcome = np.mean(outcomes)
    print("Mean Accuracy: {0}".format(mean_outcome))
    return mean_outcome


run_kfold(clf)

# Final fit on every labelled row; the submission cell predicts with this
# `clf`.
clf.fit(X_all, y_all)

Fold 1 accuracy: 0.8680926916221033
Fold 2 accuracy: 0.8743315508021391
Fold 3 accuracy: 0.9429590017825312
Fold 4 accuracy: 0.9393939393939394
Fold 5 accuracy: 0.9545454545454546
Fold 6 accuracy: 0.9464763603925067
Fold 7 accuracy: 0.9429081177520071
Fold 8 accuracy: 0.9339875111507583
Fold 9 accuracy: 0.9339875111507583
Fold 10 accuracy: 0.9348795718108831
Mean Accuracy: 0.9271561710403082


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [11]:
# `clf` was last fitted on X_all, and X_test is a subset of X_all, so scoring
# `clf` directly on X_test would report accuracy on data it was trained on.
# Evaluate an unfitted copy trained on the training split only, leaving `clf`
# itself untouched.
from sklearn.base import clone

holdout_clf = clone(clf).fit(X_train, y_train)
predictions = holdout_clf.predict(X_test)
accuracy_score(y_test, predictions)

0.93357111012037453

In [12]:
# Hyper-parameter grid explored for the random forest.
parameters = dict(
    n_estimators=[3, 5, 9],
    max_features=['log2', 'sqrt', 'auto'],
    criterion=['entropy', 'gini'],
    max_depth=[2, 3, 5, 10],
    min_samples_split=[2, 3, 5],
    min_samples_leaf=[1, 5, 8],
)

# Parameter combinations are ranked by plain accuracy.
acc_scorer = make_scorer(accuracy_score)

# Exhaustive search over the grid, fitted on the training split only.
grid_obj = GridSearchCV(
    RandomForestClassifier(),
    parameters,
    scoring=acc_scorer,
).fit(X_train, y_train)

# Take the winning configuration and fit it on the training split.
random_forest = grid_obj.best_estimator_
random_forest.fit(X_train, y_train)

# Accuracy of the tuned forest on the held-out split.
predictions = random_forest.predict(X_test)
print(accuracy_score(y_test, predictions))

0.93357111012


In [13]:
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split

# Rebuild the feature matrix, target and hold-out split so this cell can be
# run on its own once `account_active` exists.
X_all = account_active.drop(['account_no', 'closed_in_month_11'], axis=1)
y_all = account_active['closed_in_month_11']

num_test = 0.20
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=num_test, random_state=23)


def fit_and_score(model):
    """Fit `model` on the training split and score it on both splits.

    Parameters
    ----------
    model : estimator exposing fit(X, y) and score(X, y)
        Fitted in place on (X_train, y_train).

    Returns
    -------
    tuple
        (train_score, test_score), each the value of `model.score` times 100,
        rounded to two decimals.
    """
    model.fit(X_train, y_train)
    train_score = round(model.score(X_train, y_train) * 100, 2)
    test_score = round(model.score(X_test, y_test) * 100, 2)
    return train_score, test_score


# Candidate estimators; each keeps its own variable so it can be inspected
# after fitting.
adaboost = AdaBoostClassifier()
logreg = LogisticRegression()
svc = SVC()
# NOTE(review): n_neighbors is set to the number of feature columns. X_all has
# a single feature column here, so this is 1-nearest-neighbour. TODO confirm
# that this is intended.
knn = KNeighborsClassifier(n_neighbors=X_train.shape[1])
gaussian = GaussianNB()
perceptron = Perceptron()
linear_svc = LinearSVC()
sgd = SGDClassifier()
decision_tree = DecisionTreeClassifier()
random_forest = RandomForestClassifier(n_estimators=100)

# (display label, estimator) pairs. The list order fixes the row index of the
# resulting table.
candidates = [
    ('Support Vector Machines', svc),
    ('KNN', knn),
    ('Logistic Regression', logreg),
    ('Random Forest', random_forest),
    ('Naive Bayes', gaussian),
    ('Perceptron', perceptron),
    ('Stochastic Gradient Decent', sgd),
    ('Linear SVC', linear_svc),
    ('Decision Tree', decision_tree),
    ('AdaBoost', adaboost),
]

score_rows = []
for label, model in candidates:
    train_score, test_score = fit_and_score(model)
    score_rows.append({
        'Model': label,
        'Train Score': train_score,
        'Test Score': test_score,
    })

models = pd.DataFrame(score_rows, columns=['Model', 'Train Score', 'Test Score'])

# Best hold-out accuracy first.
models.sort_values(by='Test Score', ascending=False)

Unnamed: 0,Model,Train Score,Test Score
0,Support Vector Machines,92.76,93.36
8,Decision Tree,92.76,93.36
9,AdaBoost,92.74,93.27
6,Stochastic Gradient Decent,92.54,93.18
3,Random Forest,92.76,93.09
7,Linear SVC,92.32,92.87
5,Perceptron,92.32,92.6
2,Logistic Regression,91.56,92.33
4,Naive Bayes,90.19,91.35
1,KNN,90.26,89.97


In [16]:
# Attach the per-account feature to every account that must be scored.
test = data_test.join(num_of_active.set_index('account_no'), on='account_no', how='left')

# A test account with no row in `num_of_active` comes out of the left join
# with NaN, which would make predict() fail. Such an account has no counted
# rows, so its feature value is 0.
test['is_minus_one'] = test['is_minus_one'].fillna(0)

# `clf` is the random forest fitted on all labelled accounts.
predictions = clf.predict(test.drop('account_no', axis=1))

# The predictions are closed_in_month_11 labels, so the column is named after
# the target rather than the unrelated 'is_merchant'. The submission file is
# written without a header, so its contents are unaffected by the rename.
output = pd.DataFrame({'closed_in_month_11': predictions}).astype(int)
output.to_csv('3-3.txt', index=False, header=False)
output.describe()

Unnamed: 0,is_merchant
count,2805.0
mean,0.258
std,0.438
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [17]:
# Summary statistics of the scored accounts and their feature values.
test.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,account_no,is_minus_one
count,2805.0,2805.0
mean,3450025647.354,37.068
std,14047.982,27.027
min,3450000068.0,1.0
25%,3450015448.0,10.0
50%,3450026310.0,37.0
75%,3450036629.0,58.0
max,3450049917.0,92.0
