In [191]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Beautiful format for float type
pd.set_option('display.float_format', lambda x: '%.3f' % x)

account_info = pd.read_csv('account_info.csv')
account_txn = pd.read_csv('account_transaction.csv')
data_training = pd.read_csv('tj_04_training.csv', header=None, names=["account_no", "account_type"])
data_test = pd.read_csv('tj_04_test.csv', header=None, names=["account_no"])

In [192]:
account_info.info()
print('\n')
account_txn.info()
# data_training.info()
# data_test.info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5400 entries, 0 to 5399
Data columns (total 5 columns):
account_no    5400 non-null int64
pos_dt        5400 non-null object
opn_dt        5400 non-null object
cls_dt        95 non-null object
drmt_dys      5400 non-null int64
dtypes: int64(2), object(3)
memory usage: 211.0+ KB


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 797089 entries, 0 to 797088
Data columns (total 6 columns):
account_no          797089 non-null int64
fm_to_account_no    797089 non-null int64
txn_amt             797089 non-null int64
txn_dt              797089 non-null object
txn_tm              0 non-null float64
txn_type            797089 non-null object
dtypes: float64(1), int64(3), object(2)
memory usage: 36.5+ MB


In [193]:
account_info = account_info.drop('cls_dt', axis=1) # Drop cls_dt column, they are all NULL value
account_txn = account_txn.drop('txn_tm', axis=1) # Drop txn_tm column, they are all NULL value

In [194]:
account_info['pos_dt'] = pd.to_datetime(account_info['pos_dt']) # Clean timestamp to only date
account_info['opn_dt'] = pd.to_datetime(account_info['opn_dt']) # Clean timestamp to only date
account_info.sort_values('account_no').head(50)

Unnamed: 0,account_no,pos_dt,opn_dt,drmt_dys
3687,3450000004,2016-10-31,2003-07-11,3
5371,3450000020,2016-10-31,2015-07-20,5
3873,3450000023,2016-10-31,2015-07-20,0
4066,3450000025,2016-10-31,2015-07-20,0
2959,3450000026,2016-10-31,2015-07-20,217
3727,3450000042,2016-10-31,2015-07-22,152
5302,3450000045,2016-10-31,2015-07-22,0
4006,3450000061,2016-10-31,2015-07-24,174
5060,3450000063,2016-10-31,2015-07-24,0
3411,3450000064,2016-10-31,2015-07-25,89


In [195]:
account_txn['txn_dt'] = pd.to_datetime(account_txn['txn_dt']) # Clean timestamp to only date
account_txn.sort_values('account_no').head(50)

Unnamed: 0,account_no,fm_to_account_no,txn_amt,txn_dt,txn_type
169006,3450000004,0,500,2016-05-30,CR
355983,3450000004,0,200,2016-04-29,DR
15941,3450000004,0,500,2016-04-28,CR
313630,3450000004,0,200,2016-05-31,DR
79473,3450000004,0,500,2016-06-29,CR
202730,3450000004,0,200,2016-06-30,DR
578636,3450000004,0,500,2016-09-28,CR
515750,3450000004,0,200,2016-09-30,DR
434603,3450000020,0,1800,2016-06-20,DR
395867,3450000020,3450000585,18100,2016-03-11,CR


In [196]:
transaction_per_day = account_txn[['account_no', 'txn_amt', 'txn_type', 'txn_dt']].groupby(['account_no', 'txn_type', 'txn_dt']).sum().reset_index()
transaction_per_day.head(20)

Unnamed: 0,account_no,txn_type,txn_dt,txn_amt
0,3450000004,CR,2016-04-28,500
1,3450000004,CR,2016-05-30,500
2,3450000004,CR,2016-06-29,500
3,3450000004,CR,2016-09-28,500
4,3450000004,DR,2016-04-29,200
5,3450000004,DR,2016-05-31,200
6,3450000004,DR,2016-06-30,200
7,3450000004,DR,2016-09-30,200
8,3450000020,CR,2016-02-25,150900
9,3450000020,CR,2016-03-11,18100


In [197]:
transaction_per_day_pivot = transaction_per_day.pivot_table(values = 'txn_amt'
                                                      , columns = 'txn_type'
                                                      , index = 'account_no'
                                                      , aggfunc = np.sum
                                                      , fill_value = 0
                                                     ).reset_index()
transaction_per_day_pivot.columns.name = None
transaction_per_day_pivot=transaction_per_day_pivot.rename(columns = {'CR':'CR_total', 'DR':'DR_total'})
transaction_per_day_pivot.head()

Unnamed: 0,account_no,CR_total,DR_total
0,3450000004,2000,800
1,3450000020,1336100,1227200
2,3450000023,279900,535200
3,3450000025,727100,2256200
4,3450000026,93900,166400


In [198]:
transaction_per_day_pivot_2 = transaction_per_day.pivot_table(values = 'txn_dt'
                                                      , columns = 'txn_type'
                                                      , index = 'account_no'
                                                      , aggfunc = np.size
                                                      , fill_value = 0
                                                     ).reset_index()
transaction_per_day_pivot_2.columns.name = None
transaction_per_day_pivot_2=transaction_per_day_pivot_2.rename(columns = {'CR':'CR_times', 'DR':'DR_times'})
transaction_per_day_pivot_2.head()

Unnamed: 0,account_no,CR_times,DR_times
0,3450000004,4,4
1,3450000020,14,15
2,3450000023,2,2
3,3450000025,12,21
4,3450000026,3,7


In [199]:
transaction_drmt_days = account_info[['account_no', 'drmt_dys',]].groupby(['account_no']).sum().reset_index()
transaction_drmt_days.head()

Unnamed: 0,account_no,drmt_dys
0,3450000004,3
1,3450000020,5
2,3450000023,0
3,3450000025,0
4,3450000026,217


In [200]:
account_txn_summary = transaction_per_day_pivot.join(transaction_per_day_pivot_2.set_index('account_no'), on='account_no', how='left')
account_txn_summary = account_txn_summary.join(transaction_drmt_days.set_index('account_no'), on='account_no', how='left')
account_txn_summary['drmt_dys'].fillna(0, inplace=True)
account_txn_summary["drmt_dys"] = np.sqrt(account_txn_summary["drmt_dys"])
account_txn_summary.head()

  after removing the cwd from sys.path.


Unnamed: 0,account_no,CR_total,DR_total,CR_times,DR_times,drmt_dys
0,3450000004,2000,800,4,4,1.732
1,3450000020,1336100,1227200,14,15,2.236
2,3450000023,279900,535200,2,2,0.0
3,3450000025,727100,2256200,12,21,0.0
4,3450000026,93900,166400,3,7,14.731


In [201]:
data_training.loc[data_training["account_type"] == "sa", "account_type"] = 0
data_training.loc[data_training["account_type"] == "ca", "account_type"] = 1
data_training.head(6)

Unnamed: 0,account_no,account_type
0,3450002243,0
1,3450013179,0
2,3450005173,0
3,3450034206,0
4,3450033428,0
5,3450013510,1


In [202]:
data_features = account_txn_summary.join(data_training.set_index('account_no'), on='account_no', how='left')
# data_features = data_features.dropna(axis=0, how='any')
data_features = data_features.reset_index(drop=True)
data_features.head()

Unnamed: 0,account_no,CR_total,DR_total,CR_times,DR_times,drmt_dys,account_type
0,3450000004,2000,800,4,4,1.732,1.0
1,3450000020,1336100,1227200,14,15,2.236,
2,3450000023,279900,535200,2,2,0.0,1.0
3,3450000025,727100,2256200,12,21,0.0,1.0
4,3450000026,93900,166400,3,7,14.731,1.0


In [203]:
from sklearn.model_selection import train_test_split

training_data_features = data_features.dropna(axis=0, how='any')
X_all = training_data_features.drop(['account_no', 'account_type'], axis=1).astype(int)
y_all = training_data_features['account_type'].astype(int)

num_test = 0.20
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size = num_test, random_state = 23)

In [204]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

In [205]:
##### from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

# Choose the type of classifier. 
clf = RandomForestClassifier(n_estimators=299)
# clf = KNeighborsClassifier()

def run_kfold(clf):
    kf = KFold(training_data_features.shape[0], n_folds=10)
    outcomes = []
    fold = 0
    for train_index, test_index in kf:
        fold += 1
        X_train, X_test = X_all.values[train_index], X_all.values[test_index]
        y_train, y_test = y_all.values[train_index], y_all.values[test_index]
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        outcomes.append(accuracy)
        print("Fold {0} accuracy: {1}".format(fold, accuracy))     
    mean_outcome = np.mean(outcomes)
    print("Mean Accuracy: {0}".format(mean_outcome)) 

run_kfold(clf)

clf.fit(X_all, y_all)

Fold 1 accuracy: 0.8680555555555556
Fold 2 accuracy: 0.8425925925925926
Fold 3 accuracy: 0.9004629629629629
Fold 4 accuracy: 0.8842592592592593
Fold 5 accuracy: 0.8912037037037037
Fold 6 accuracy: 0.8796296296296297
Fold 7 accuracy: 0.9305555555555556
Fold 8 accuracy: 0.877030162412993
Fold 9 accuracy: 0.8677494199535963
Fold 10 accuracy: 0.851508120649652
Mean Accuracy: 0.87930469622755


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=299, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [206]:
predictions = clf.predict(X_test)
print(accuracy_score(y_test, predictions))

1.0


In [207]:
clf.score(X_train, y_train)

0.99971039675644369

In [208]:
test_data_features = data_test.join(data_features.set_index('account_no'), on = 'account_no', how = 'left')
test_data_features.head()

Unnamed: 0,account_no,CR_total,DR_total,CR_times,DR_times,drmt_dys,account_type
0,3450030692,1540400,1205600,57,104,0.0,
1,3450036445,3626000,4496500,23,33,2.236,
2,3450034940,2225300,11729000,24,50,0.0,
3,3450003291,180700,159400,17,40,0.0,
4,3450011774,51500,38300,6,9,3.606,


In [209]:
test = data_test.join(account_txn_summary.set_index('account_no'), on='account_no', how='left')

predictions = clf.predict(test.drop(['account_no'], axis = 1))

output = pd.DataFrame({'account_type': predictions})
output.loc[output['account_type'] == 0] = 'sa'
output.loc[output['account_type'] == 1] = 'ca'
output.to_csv('4.txt', index = False, header = None)
output.head(10)

Unnamed: 0,account_type
0,sa
1,ca
2,sa
3,sa
4,ca
5,sa
6,ca
7,sa
8,sa
9,sa
