In [219]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Beautiful format for float type
pd.set_option('display.float_format', lambda x: '%.3f' % x)

account_info = pd.read_csv('account_info.csv')
account_txn = pd.read_csv('account_transaction.csv')
data_training = pd.read_csv('tj_04_training.csv', header=None, names=["account_no", "account_type"])
data_test = pd.read_csv('tj_04_test.csv', header=None, names=["account_no"])

In [220]:
account_info.info()
print('\n')
account_txn.info()
# data_training.info()
# data_test.info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5400 entries, 0 to 5399
Data columns (total 5 columns):
account_no    5400 non-null int64
pos_dt        5400 non-null object
opn_dt        5400 non-null object
cls_dt        95 non-null object
drmt_dys      5400 non-null int64
dtypes: int64(2), object(3)
memory usage: 211.0+ KB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 797089 entries, 0 to 797088
Data columns (total 6 columns):
account_no          797089 non-null int64
fm_to_account_no    797089 non-null int64
txn_amt             797089 non-null int64
txn_dt              797089 non-null object
txn_tm              0 non-null float64
txn_type            797089 non-null object
dtypes: float64(1), int64(3), object(2)
memory usage: 36.5+ MB


In [221]:
account_info = account_info.drop('cls_dt', axis=1) # Drop cls_dt column, they are all NULL value
account_txn = account_txn.drop('txn_tm', axis=1) # Drop txn_tm column, they are all NULL value

In [222]:
account_info['pos_dt'] = pd.to_datetime(account_info['pos_dt']) # Clean timestamp to only date
account_info['opn_dt'] = pd.to_datetime(account_info['opn_dt']) # Clean timestamp to only date
account_info.sort_values('account_no').head(50)

Unnamed: 0,account_no,pos_dt,opn_dt,drmt_dys
3687,3450000004,2016-10-31,2003-07-11,3
5371,3450000020,2016-10-31,2015-07-20,5
3873,3450000023,2016-10-31,2015-07-20,0
4066,3450000025,2016-10-31,2015-07-20,0
2959,3450000026,2016-10-31,2015-07-20,217
3727,3450000042,2016-10-31,2015-07-22,152
5302,3450000045,2016-10-31,2015-07-22,0
4006,3450000061,2016-10-31,2015-07-24,174
5060,3450000063,2016-10-31,2015-07-24,0
3411,3450000064,2016-10-31,2015-07-25,89


In [223]:
account_txn['txn_dt'] = pd.to_datetime(account_txn['txn_dt']) # Clean timestamp to only date
account_txn.sort_values('account_no').head(50)

Unnamed: 0,account_no,fm_to_account_no,txn_amt,txn_dt,txn_type
169006,3450000004,0,500,2016-05-30,CR
355983,3450000004,0,200,2016-04-29,DR
15941,3450000004,0,500,2016-04-28,CR
313630,3450000004,0,200,2016-05-31,DR
79473,3450000004,0,500,2016-06-29,CR
202730,3450000004,0,200,2016-06-30,DR
578636,3450000004,0,500,2016-09-28,CR
515750,3450000004,0,200,2016-09-30,DR
434603,3450000020,0,1800,2016-06-20,DR
395867,3450000020,3450000585,18100,2016-03-11,CR


In [224]:
transaction_per_day = account_txn[['account_no', 'txn_amt', 'txn_type', 'txn_dt']].groupby(['account_no', 'txn_type', 'txn_dt']).sum().reset_index()
transaction_per_day.head(20)

Unnamed: 0,account_no,txn_type,txn_dt,txn_amt
0,3450000004,CR,2016-04-28,500
1,3450000004,CR,2016-05-30,500
2,3450000004,CR,2016-06-29,500
3,3450000004,CR,2016-09-28,500
4,3450000004,DR,2016-04-29,200
5,3450000004,DR,2016-05-31,200
6,3450000004,DR,2016-06-30,200
7,3450000004,DR,2016-09-30,200
8,3450000020,CR,2016-02-25,150900
9,3450000020,CR,2016-03-11,18100


In [225]:
transaction_per_day_pivot = transaction_per_day.pivot_table(values = 'txn_amt'
                                                      , columns = 'txn_type'
                                                      , index = 'account_no'
                                                      , aggfunc = np.sum
                                                      , fill_value = 0
                                                     ).reset_index()
transaction_per_day_pivot.columns.name = None
transaction_per_day_pivot=transaction_per_day_pivot.rename(columns = {'CR':'CR_total', 'DR':'DR_total'})
transaction_per_day_pivot.head()

Unnamed: 0,account_no,CR_total,DR_total
0,3450000004,2000,800
1,3450000020,1336100,1227200
2,3450000023,279900,535200
3,3450000025,727100,2256200
4,3450000026,93900,166400


In [226]:
transaction_per_day_pivot_2 = transaction_per_day.pivot_table(values = 'txn_dt'
                                                      , columns = 'txn_type'
                                                      , index = 'account_no'
                                                      , aggfunc = np.size
                                                      , fill_value = 0
                                                     ).reset_index()
transaction_per_day_pivot_2.columns.name = None
transaction_per_day_pivot_2=transaction_per_day_pivot_2.rename(columns = {'CR':'CR_times', 'DR':'DR_times'})
transaction_per_day_pivot_2.head()

Unnamed: 0,account_no,CR_times,DR_times
0,3450000004,4,4
1,3450000020,14,15
2,3450000023,2,2
3,3450000025,12,21
4,3450000026,3,7


In [227]:
account_txn_summary = transaction_per_day_pivot.join(transaction_per_day_pivot_2.set_index('account_no'), on='account_no', how='left')
account_txn_summary.head()

Unnamed: 0,account_no,CR_total,DR_total,CR_times,DR_times
0,3450000004,2000,800,4,4
1,3450000020,1336100,1227200,14,15
2,3450000023,279900,535200,2,2
3,3450000025,727100,2256200,12,21
4,3450000026,93900,166400,3,7


In [228]:
data_training.loc[data_training["account_type"] == "sa", "account_type"] = 0
data_training.loc[data_training["account_type"] == "ca", "account_type"] = 1
data_training.head(6)

Unnamed: 0,account_no,account_type
0,3450002243,0
1,3450013179,0
2,3450005173,0
3,3450034206,0
4,3450033428,0
5,3450013510,1


In [229]:
data_features = account_txn_summary.join(data_training.set_index('account_no'), on='account_no', how='left')
# data_features = data_features.dropna(axis=0, how='any')
data_features = data_features.reset_index(drop=True)
data_features.head()

Unnamed: 0,account_no,CR_total,DR_total,CR_times,DR_times,account_type
0,3450000004,2000,800,4,4,1.0
1,3450000020,1336100,1227200,14,15,
2,3450000023,279900,535200,2,2,1.0
3,3450000025,727100,2256200,12,21,1.0
4,3450000026,93900,166400,3,7,1.0


In [230]:
training_data_features = data_features.dropna(axis=0, how='any')
X_all = training_data_features.drop(['account_no', 'account_type'], axis=1).astype(int)
y_all = training_data_features['account_type'].astype(int)

num_test = 0.20
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size = num_test, random_state = 23)

In [231]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

In [232]:
# Choose the type of classifier. 
clf = RandomForestClassifier(n_estimators = 300, min_samples_leaf = 5)

# Fit the best algorithm to the data. 
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=5,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [233]:
predictions = clf.predict(X_test)
print(accuracy_score(y_test, predictions))

0.876157407407


In [234]:
clf.score(X_train, y_train)

0.9282407407407407

In [235]:
test_data_features = data_test.join(data_features.set_index('account_no'), on = 'account_no', how = 'left')
test_data_features.head()

Unnamed: 0,account_no,CR_total,DR_total,CR_times,DR_times,account_type
0,3450030692,1540400,1205600,57,104,
1,3450036445,3626000,4496500,23,33,
2,3450034940,2225300,11729000,24,50,
3,3450003291,180700,159400,17,40,
4,3450011774,51500,38300,6,9,


In [242]:
predictions = clf.predict(test.drop(['account_no', 'account_type'], axis = 1))

output = pd.DataFrame({'account_type': predictions })
output.loc[output['account_type'] == 0] = 'sa'
output.loc[output['account_type'] == 1] = 'ca'
output.to_csv('4.txt', index = False, header = None)
output.head(10)

Unnamed: 0,account_type
0,sa
1,ca
2,sa
3,sa
4,ca
5,sa
6,ca
7,sa
8,sa
9,sa


In [246]:
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split

num_test = 0.20
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=num_test, random_state=23)

adaboost = AdaBoostClassifier()
adaboost.fit(X_train, y_train)
predictions = adaboost.predict(X_test)
acc_adaboost_train = round(adaboost.score(X_train, y_train) * 100, 2)
acc_adaboost_test = round(adaboost.score(X_test, y_test) * 100, 2)
acc_adaboost = acc_adaboost_train, acc_adaboost_test

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
Y_pred = logreg.predict(X_test)
acc_log_train = round(logreg.score(X_train, y_train) * 100, 2)
acc_log_test = round(logreg.score(X_test, y_test) * 100, 2)
acc_log = acc_log_train, acc_log_test

svc = SVC()
svc.fit(X_train, y_train)
Y_pred = svc.predict(X_test)
acc_svc_train = round(svc.score(X_train, y_train) * 100, 2)
acc_svc_test = round(svc.score(X_test, y_test) * 100, 2)
acc_svc = acc_svc_train, acc_svc_test

knn = KNeighborsClassifier(n_neighbors = X_train.shape[1])
knn.fit(X_train, y_train)
Y_pred = knn.predict(X_test)
acc_knn_train = round(knn.score(X_train, y_train) * 100, 2)
acc_knn_test = round(knn.score(X_test, y_test) * 100, 2)
acc_knn = acc_knn_train, acc_knn_test

gaussian = GaussianNB()
gaussian.fit(X_train, y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian_train = round(gaussian.score(X_train, y_train) * 100, 2)
acc_gaussian_test = round(gaussian.score(X_test, y_test) * 100, 2)
acc_gaussian = acc_gaussian_train, acc_gaussian_test

perceptron = Perceptron()
perceptron.fit(X_train, y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron_train = round(perceptron.score(X_train, y_train) * 100, 2)
acc_perceptron_test = round(perceptron.score(X_test, y_test) * 100, 2)
acc_perceptron = acc_perceptron_train, acc_perceptron_test


linear_svc = LinearSVC()
linear_svc.fit(X_train, y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc_train = round(linear_svc.score(X_train, y_train) * 100, 2)
acc_linear_svc_test = round(linear_svc.score(X_test, y_test) * 100, 2)
acc_linear_svc = acc_linear_svc_train, acc_linear_svc_test

sgd = SGDClassifier()
sgd.fit(X_train, y_train)
Y_pred = sgd.predict(X_test)
acc_sgd_train = round(sgd.score(X_train, y_train) * 100, 2)
acc_sgd_test = round(sgd.score(X_test, y_test) * 100, 2)
acc_sgd = acc_sgd_train, acc_sgd_test

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree_train = round(decision_tree.score(X_train, y_train) * 100, 2)
acc_decision_tree_test = round(decision_tree.score(X_test, y_test) * 100, 2)
acc_decision_tree = acc_decision_tree_train, acc_decision_tree_test

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, y_train)
Y_pred = random_forest.predict(X_test)
random_forest.score(X_train, y_train)
acc_random_forest_train = round(random_forest.score(X_train, y_train) * 100, 2)
acc_random_forest_test = round(random_forest.score(X_test, y_test) * 100, 2)
acc_random_forest = acc_random_forest_train, acc_random_forest_test

models = pd.DataFrame({
    'Model': ['Support Vector Machines'
              , 'KNN'
              , 'Logistic Regression'
              , 'Random Forest'
              , 'Naive Bayes'
              , 'Perceptron'
              , 'Stochastic Gradient Decent'
              , 'Linear SVC'
              , 'Decision Tree'
              , 'AdaBoost'],
    'Score': [acc_svc
              , acc_knn
              , acc_log
              , acc_random_forest
              , acc_gaussian
              , acc_perceptron
              , acc_sgd
              , acc_linear_svc
              , acc_decision_tree
              , acc_adaboost]})
models['Train Score'] = models.Score.apply(lambda x: x[0])
models['Test Score'] = models.Score.apply(lambda x: x[1])
models = models.drop('Score', axis = 1)

models.sort_values(by = 'Test Score', ascending = False)

Unnamed: 0,Model,Train Score,Test Score
3,Random Forest,99.97,87.5
9,AdaBoost,89.18,87.04
8,Decision Tree,99.97,81.48
1,KNN,78.39,70.02
2,Logistic Regression,61.43,61.81
0,Support Vector Machines,99.97,57.75
5,Perceptron,49.65,51.39
4,Naive Bayes,52.05,49.77
6,Stochastic Gradient Decent,50.35,48.61
7,Linear SVC,44.97,42.94
