# Problem 2: Merchant Prediction

In this technological era, more than ever before, online shopping and social media sites are
becoming more popular than ever. It is becoming a significant influence to the Thai retail trade.
In order to make the interactions and transactions between merchants and buyers easier, people
can now open bank accounts to reduce the use of cash during transactions.
Your job is to create a prediction model that finds individuals who are most likely merchants
that opened bank accounts based on their credit card and deposit transactions.

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

# Beautiful format for float type
pd.set_option('display.float_format', lambda x: '%.3f' % x)

## CSV to DataFrame

In [3]:
acc_x_card = pd.read_csv('tj_02_acc_x_card.csv')
account_txn = pd.read_csv('tj_02_account_transaction.csv')
creditcard_txn = pd.read_csv('tj_02_creditcard_transaction.csv')
data_training = pd.read_csv('tj_02_training.csv', header=None, names=[
                            "account_no", "is_merchant"])
data_test = pd.read_csv('tj_02_test.csv', header=None, names=["account_no"])

## Merge Related DataFrame

## Cleanup

In [4]:
def cleanup_columns(df):
    drop_columns = []
    
    return df.drop(drop_columns, axis = 1)


def make_readable_columns(df):
    df.columns = []
    
    return df


def cleanup_cards(df):
    df = cleanup_columns(df)
    df = make_readable_columns(df)
    
    return df


# clean_card_details = cleanup_cards(card_details)
# clean_card_details.sample()

creditcard_txn['txn_date'] = pd.to_datetime(creditcard_txn['txn_date']) # Clean timestamp to only date

## Visualizing Data

In [5]:
creditcard_txn.head()

Unnamed: 0,card_no,txn_date,txn_hour,txn_amount,mer_cat_code,mer_id
0,1234000000009154,2016-12-07,22,1550,4511,0
1,1234000000017165,2016-12-07,22,250,6011,0
2,1234000000000768,2016-12-07,22,250,8398,0
3,1234000000018716,2016-12-07,22,50,5735,0
4,1234000000010162,2016-12-07,22,200,4121,0


In [6]:
account_txn.head()

Unnamed: 0,account_no,from_to_account_no,txn_amount,txn_dt,txn_hour,txn_type
0,3450045224,0,50,2016-04-03,17,DR
1,3450034734,0,10000,2016-10-07,13,DR
2,3450019126,3450594449,800,2016-10-03,18,CR
3,3450028971,3450596508,8650,2016-10-03,21,DR
4,3450000833,0,10000,2016-06-22,22,DR


## Feature Engineering

In [53]:
transaction_per_day = account_txn[['account_no', 'txn_amount', 'txn_type', 'txn_dt']].groupby(['account_no', 'txn_type', 'txn_dt']).sum().reset_index()
transaction_per_day.head()

Unnamed: 0,account_no,txn_type,txn_dt,txn_amount
0,3450000010,CR,2016-02-14,400
1,3450000010,CR,2016-02-15,1900
2,3450000010,CR,2016-02-16,3500
3,3450000010,CR,2016-02-17,250
4,3450000010,CR,2016-02-18,400


In [8]:
transaction_per_day_pivot = transaction_per_day.pivot_table(values = 'txn_amount'
                                                      , columns = 'txn_type'
                                                      , index = 'account_no'
                                                      , aggfunc = np.sum
                                                      , fill_value = 0
                                                     ).reset_index()
transaction_per_day_pivot.columns.name = None
transaction_per_day_pivot=transaction_per_day_pivot.rename(columns = {'CR':'CR_total', 'DR':'DR_total'})
transaction_per_day_pivot.head()

Unnamed: 0,account_no,CR_total,DR_total
0,3450000010,118800,125400
1,3450000011,565600,522650
2,3450000012,28300,17150
3,3450000014,690200,639300
4,3450000015,750,0


In [9]:
transaction_per_day_pivot_2 = transaction_per_day.pivot_table(values = 'txn_dt'
                                                      , columns = 'txn_type'
                                                      , index = 'account_no'
                                                      , aggfunc = np.size
                                                      , fill_value = 0
                                                     ).reset_index()
transaction_per_day_pivot_2.columns.name = None
transaction_per_day_pivot_2=transaction_per_day_pivot_2.rename(columns = {'CR':'CR_times', 'DR':'DR_times'})
transaction_per_day_pivot_2.head()

Unnamed: 0,account_no,CR_times,DR_times
0,3450000010,236,95
1,3450000011,185,203
2,3450000012,23,43
3,3450000014,261,103
4,3450000015,1,0


In [48]:
account_txn_summary = transaction_per_day_pivot.join(transaction_per_day_pivot_2.set_index('account_no'), on='account_no', how='left')
account_txn_summary.describe()

Unnamed: 0,account_no,CR_total,DR_total,CR_times,DR_times
count,16560.0,16560.0,16560.0,16560.0,16560.0
mean,3450025407.675,306715.806,306443.448,34.348,36.657
std,14925.841,1055073.717,1072027.434,52.074,41.916
min,3450000010.0,0.0,0.0,0.0,0.0
25%,3450013103.75,12450.0,12950.0,3.0,5.0
50%,3450026554.0,63700.0,64725.0,10.0,22.0
75%,3450037871.75,232250.0,229250.0,42.0,55.0
max,3450049920.0,33872400.0,37692600.0,261.0,261.0


### Day Transactions

In [13]:
transaction_per_day.head()

Unnamed: 0,account_no,txn_type,txn_dt,txn_amount
0,3450000010,CR,2016-02-14,400
1,3450000010,CR,2016-02-15,1900
2,3450000010,CR,2016-02-16,3500
3,3450000010,CR,2016-02-17,250
4,3450000010,CR,2016-02-18,400


In [51]:
transaction_per_day[transaction_per_day['txn_type'] == "DR"].describe()

Unnamed: 0,account_no,txn_amount
count,607048.0,607048.0
mean,3450027858.041,8359.641
std,14425.386,41805.41
min,3450000010.0,50.0
25%,3450016687.0,300.0
50%,3450029901.0,1300.0
75%,3450040192.0,5000.0
max,3450049920.0,6000000.0


In [63]:
transaction_cr_of_day = transaction_per_day[transaction_per_day['txn_type'] == "CR"].copy()
transaction_cr_of_day['day'] = pd.to_datetime(transaction_cr_of_day.txn_dt).dt.dayofweek
transaction_cr_of_day['day'] = transaction_cr_of_day['day'].apply(lambda day: "CR_"+{0: 'Sunday'
                                                                         , 1: 'Monday'
                                                                         , 2: 'Thuesday'
                                                                         , 3: 'Wednesday'
                                                                         , 4: 'Thursday'
                                                                         , 5: 'Friday'
                                                                         , 6: 'Saturnday'}[day])
transaction_cr_of_day = transaction_cr_of_day.drop(['txn_type', 'txn_dt'], axis=1)
transaction_cr_of_day = transaction_cr_of_day.pivot_table(values='txn_amount', columns='day', index='account_no', aggfunc=np.sum, fill_value=0).reset_index()

transaction_cr_of_day.head()
transaction_cr_of_day.describe()

day,account_no,CR_Friday,CR_Monday,CR_Saturnday,CR_Sunday,CR_Thuesday,CR_Thursday,CR_Wednesday
count,15296.0,15296.0,15296.0,15296.0,15296.0,15296.0,15296.0,15296.0
mean,3450025414.858,29045.391,59138.775,22933.515,61676.412,51934.646,55401.196,51931.632
std,15007.637,117030.328,246018.514,92394.013,238863.316,200917.808,235425.101,203996.72
min,3450000010.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3450012726.75,0.0,0.0,0.0,50.0,0.0,0.0,0.0
50%,3450026697.0,2500.0,8050.0,1500.0,9450.0,7500.0,9150.0,8100.0
75%,3450038012.25,18500.0,37750.0,15000.0,40962.5,34650.0,39400.0,35450.0
max,3450049920.0,5545450.0,9866600.0,4330750.0,8591300.0,7534850.0,9873100.0,8221650.0


In [64]:
transaction_dr_of_day = transaction_per_day[transaction_per_day['txn_type'] == "DR"].copy()
transaction_dr_of_day['day'] = pd.to_datetime(transaction_dr_of_day.txn_dt).dt.dayofweek
transaction_dr_of_day['day'] = transaction_dr_of_day['day'].apply(lambda day: "DR_"+{0: 'Sunday'
                                                                         , 1: 'Monday'
                                                                         , 2: 'Thuesday'
                                                                         , 3: 'Wednesday'
                                                                         , 4: 'Thursday'
                                                                         , 5: 'Friday'
                                                                         , 6: 'Saturnday'}[day])
transaction_dr_of_day = transaction_dr_of_day.drop(['txn_type', 'txn_dt'], axis=1)
transaction_dr_of_day = transaction_dr_of_day.pivot_table(values='txn_amount', columns='day', index='account_no', aggfunc=np.sum, fill_value=0).reset_index()

transaction_dr_of_day.head()
transaction_dr_of_day.describe()

day,account_no,DR_Friday,DR_Monday,DR_Saturnday,DR_Sunday,DR_Thuesday,DR_Thursday,DR_Wednesday
count,15457.0,15457.0,15457.0,15457.0,15457.0,15457.0,15457.0,15457.0
mean,3450025855.561,31571.0,58828.534,24807.379,59433.574,51571.495,51854.555,50244.488
std,14839.172,116384.562,248134.452,104581.285,227601.248,199189.855,223630.336,204066.001
min,3450000010.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3450013966.0,0.0,500.0,0.0,750.0,350.0,500.0,450.0
50%,3450027118.0,5050.0,8300.0,4000.0,9550.0,7550.0,8950.0,8100.0
75%,3450038197.0,22200.0,35800.0,17750.0,38900.0,32600.0,34500.0,33200.0
max,3450049920.0,4878050.0,10064050.0,6150000.0,7551000.0,6845750.0,11151800.0,7825250.0


### Merge

In [None]:
data_features = account_txn_summary.join(data_training.set_index('account_no'), on='account_no', how='left')
# data_features = data_features.dropna(axis=0, how='any')
data_features = data_features.reset_index(drop=True)
data_features.head()

In [77]:
data_features = data_training.join(account_txn_summary.set_index('account_no'), on='account_no', how='left') \
                             .join(transaction_dr_of_day.set_index('account_no'), on='account_no', how='left') \
                             .join(transaction_cr_of_day.set_index('account_no'), on='account_no', how='left') \
                             .fillna(0)
# data_features = data_features.dropna(axis=0, how='any')
data_features = data_features.reset_index(drop=True)
data_features.describe()

Unnamed: 0,account_no,is_merchant,CR_total,DR_total,CR_times,DR_times,DR_Friday,DR_Monday,DR_Saturnday,DR_Sunday,DR_Thuesday,DR_Thursday,DR_Wednesday,CR_Friday,CR_Monday,CR_Saturnday,CR_Sunday,CR_Thuesday,CR_Thursday,CR_Wednesday
count,9321.0,9321.0,9321.0,9321.0,9321.0,9321.0,9321.0,9321.0,9321.0,9321.0,9321.0,9321.0,9321.0,9321.0,9321.0,9321.0,9321.0,9321.0,9321.0,9321.0
mean,3450024729.421,0.291,290440.811,291863.909,29.698,35.131,27359.607,53025.378,21992.254,52985.742,45106.045,45601.641,45793.241,24494.877,51525.984,19335.093,55203.176,45912.826,48572.723,45396.132
std,14561.315,0.454,1008233.773,1049651.551,45.284,38.44,96067.288,241842.783,95772.572,221197.894,173749.505,208186.618,205124.745,96041.751,209756.07,79731.838,223015.774,191881.714,206771.72,193037.847
min,3450000010.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3450013634.0,0.0,16000.0,17550.0,3.0,6.0,0.0,250.0,0.0,500.0,200.0,250.0,250.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3450025592.0,0.0,65300.0,66700.0,10.0,22.0,4350.0,6950.0,3050.0,8050.0,6250.0,7550.0,6950.0,1200.0,6100.0,500.0,7300.0,5450.0,7000.0,6050.0
75%,3450036389.0,1.0,219550.0,217400.0,34.0,51.0,19300.0,30700.0,15100.0,34600.0,27500.0,29850.0,28850.0,14300.0,31950.0,11500.0,34700.0,29500.0,33600.0,29850.0
max,3450049920.0,1.0,33872400.0,37692600.0,261.0,261.0,2869950.0,10064050.0,6150000.0,7551000.0,5227050.0,11151800.0,7825250.0,3104850.0,8553250.0,3192950.0,7936800.0,6265950.0,8062050.0,8221650.0


## Splitting up the training Data

In [68]:
from sklearn.model_selection import train_test_split

training_data_features = data_features.dropna(axis=0, how='any')
X_all = training_data_features.drop(['account_no', 'is_merchant'], axis=1).astype(int)
y_all = training_data_features['is_merchant'].astype(int)

num_test = 0.20
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size = num_test, random_state = 23)

In [42]:
X_all.describe()

Unnamed: 0,CR_total,DR_total,CR_times,DR_times,Friday,Monday,Saturnday,Sunday,Thuesday,Thursday,Wednesday
count,9321.0,9321.0,9321.0,9321.0,9321.0,9321.0,9321.0,9321.0,9321.0,9321.0,9321.0
mean,290440.811,291863.909,29.698,35.131,51854.484,104551.363,41327.347,108188.917,91018.871,94174.364,91189.373
std,1008233.773,1049651.551,45.284,38.44,177932.102,425442.493,149827.093,422838.409,340853.126,388028.074,378776.025
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,16000.0,17550.0,3.0,6.0,500.0,2500.0,250.0,3000.0,2050.0,2500.0,2350.0
50%,65300.0,66700.0,10.0,22.0,8800.0,17150.0,7100.0,19100.0,15650.0,18450.0,16300.0
75%,219550.0,217400.0,34.0,51.0,35950.0,67300.0,30000.0,72650.0,62150.0,67800.0,62600.0
max,33872400.0,37692600.0,261.0,261.0,5722900.0,16645750.0,6150650.0,14109150.0,10436550.0,15212850.0,15807100.0


## Fitting and Tuning an Classifier

In [69]:
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split

num_test = 0.20
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=num_test, random_state=23)

adaboost = AdaBoostClassifier()
adaboost.fit(X_train, y_train)
predictions = adaboost.predict(X_test)
acc_adaboost_train = round(adaboost.score(X_train, y_train) * 100, 2)
acc_adaboost_test = round(adaboost.score(X_test, y_test) * 100, 2)
acc_adaboost = acc_adaboost_train, acc_adaboost_test

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
Y_pred = logreg.predict(X_test)
acc_log_train = round(logreg.score(X_train, y_train) * 100, 2)
acc_log_test = round(logreg.score(X_test, y_test) * 100, 2)
acc_log = acc_log_train, acc_log_test

svc = SVC()
svc.fit(X_train, y_train)
Y_pred = svc.predict(X_test)
acc_svc_train = round(svc.score(X_train, y_train) * 100, 2)
acc_svc_test = round(svc.score(X_test, y_test) * 100, 2)
acc_svc = acc_svc_train, acc_svc_test

knn = KNeighborsClassifier(n_neighbors = X_train.shape[1])
knn.fit(X_train, y_train)
Y_pred = knn.predict(X_test)
acc_knn_train = round(knn.score(X_train, y_train) * 100, 2)
acc_knn_test = round(knn.score(X_test, y_test) * 100, 2)
acc_knn = acc_knn_train, acc_knn_test

gaussian = GaussianNB()
gaussian.fit(X_train, y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian_train = round(gaussian.score(X_train, y_train) * 100, 2)
acc_gaussian_test = round(gaussian.score(X_test, y_test) * 100, 2)
acc_gaussian = acc_gaussian_train, acc_gaussian_test

perceptron = Perceptron()
perceptron.fit(X_train, y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron_train = round(perceptron.score(X_train, y_train) * 100, 2)
acc_perceptron_test = round(perceptron.score(X_test, y_test) * 100, 2)
acc_perceptron = acc_perceptron_train, acc_perceptron_test


linear_svc = LinearSVC()
linear_svc.fit(X_train, y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc_train = round(linear_svc.score(X_train, y_train) * 100, 2)
acc_linear_svc_test = round(linear_svc.score(X_test, y_test) * 100, 2)
acc_linear_svc = acc_linear_svc_train, acc_linear_svc_test

sgd = SGDClassifier()
sgd.fit(X_train, y_train)
Y_pred = sgd.predict(X_test)
acc_sgd_train = round(sgd.score(X_train, y_train) * 100, 2)
acc_sgd_test = round(sgd.score(X_test, y_test) * 100, 2)
acc_sgd = acc_sgd_train, acc_sgd_test

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree_train = round(decision_tree.score(X_train, y_train) * 100, 2)
acc_decision_tree_test = round(decision_tree.score(X_test, y_test) * 100, 2)
acc_decision_tree = acc_decision_tree_train, acc_decision_tree_test

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, y_train)
Y_pred = random_forest.predict(X_test)
random_forest.score(X_train, y_train)
acc_random_forest_train = round(random_forest.score(X_train, y_train) * 100, 2)
acc_random_forest_test = round(random_forest.score(X_test, y_test) * 100, 2)
acc_random_forest = acc_random_forest_train, acc_random_forest_test

models = pd.DataFrame({
    'Model': ['Support Vector Machines'
              , 'KNN'
              , 'Logistic Regression'
              , 'Random Forest'
              , 'Naive Bayes'
              , 'Perceptron'
              , 'Stochastic Gradient Decent'
              , 'Linear SVC'
              , 'Decision Tree'
              , 'AdaBoost'],
    'Score': [acc_svc
              , acc_knn
              , acc_log
              , acc_random_forest
              , acc_gaussian
              , acc_perceptron
              , acc_sgd
              , acc_linear_svc
              , acc_decision_tree
              , acc_adaboost]})
models['Train Score'] = models.Score.apply(lambda x: x[0])
models['Test Score'] = models.Score.apply(lambda x: x[1])
models = models.drop('Score', axis = 1)

models.sort_values(by = 'Test Score', ascending = False)

Unnamed: 0,Model,Train Score,Test Score
3,Random Forest,99.97,84.02
2,Logistic Regression,82.59,83.97
9,AdaBoost,83.76,83.27
8,Decision Tree,99.97,76.84
1,KNN,77.9,75.82
4,Naive Bayes,71.59,73.24
0,Support Vector Machines,99.97,72.44
5,Perceptron,70.52,72.44
7,Linear SVC,65.69,67.13
6,Stochastic Gradient Decent,57.43,60.16


## Validate with KFold

In [83]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()

from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

# Choose the type of classifier. 
clf = RandomForestClassifier()

def run_kfold(clf):
    kf = KFold(training_data_features.shape[0], n_folds=10)
    outcomes = []
    fold = 0
    for train_index, test_index in kf:
        fold += 1
        X_train, X_test = X_all.values[train_index], X_all.values[test_index]
        y_train, y_test = y_all.values[train_index], y_all.values[test_index]
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        outcomes.append(accuracy)
        print("Fold {0} accuracy: {1}".format(fold, accuracy))     
    mean_outcome = np.mean(outcomes)
    print("Mean Accuracy: {0}".format(mean_outcome)) 

run_kfold(clf)

clf.fit(X_all, y_all)

Fold 1 accuracy: 0.8295819935691319
Fold 2 accuracy: 0.8036480686695279
Fold 3 accuracy: 0.8283261802575107
Fold 4 accuracy: 0.825107296137339
Fold 5 accuracy: 0.8175965665236051
Fold 6 accuracy: 0.8122317596566524
Fold 7 accuracy: 0.8186695278969958
Fold 8 accuracy: 0.8090128755364807
Fold 9 accuracy: 0.8186695278969958
Fold 10 accuracy: 0.8218884120171673
Mean Accuracy: 0.8184732208161405


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [72]:
predictions = clf.predict(X_test)
accuracy_score(y_test, predictions)

0.98498659517426268

In [74]:
# Choose the type of classifier. 
random_forest = RandomForestClassifier()

# Choose some parameter combinations to try
parameters = {'n_estimators': [50], 
              'max_features': ['log2', 'sqrt', 'auto'], 
              'criterion': ['entropy', 'gini'],
              'max_depth': [2, 3, 5, 10], 
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1,5,8]
             }

# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(accuracy_score)

# Run the grid search
grid_obj = GridSearchCV(random_forest, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

# Set the clf to the best combination of parameters
random_forest = grid_obj.best_estimator_

# Fit the best algorithm to the data. 
random_forest.fit(X_train, y_train)

predictions = random_forest.predict(X_test)
print(accuracy_score(y_test, predictions))

0.841286863271


In [85]:
test = data_test.join(account_txn_summary.set_index('account_no'), on='account_no', how='left') \
                             .join(transaction_dr_of_day.set_index('account_no'), on='account_no', how='left') \
                             .join(transaction_cr_of_day.set_index('account_no'), on='account_no', how='left') \
                             .fillna(0)
# test = test.drop('closed_in_month_11', 1) # Drop closed_in_month_11 column
test.head(5)
# test = test.fillna(0)

clf = random_forest
predictions = clf.predict(test.drop('account_no', axis=1))

output = pd.DataFrame({'is_merchant': predictions }).astype(int)
output.to_csv('2.txt', index=False, header=None)
output.describe()

Unnamed: 0,is_merchant
count,2331.0
mean,0.26
std,0.439
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [75]:
output.describe()

NameError: name 'output' is not defined

## Model Version
### Save model to file

In [86]:
from sklearn.externals import joblib

version_name = input('version name: ')

joblib.dump(clf, 'model_{}.pkl'.format(version_name)) 

version name: 841285


['model_841285.pkl']

### Load model from file

In [None]:
from sklearn.externals import joblib

version_name = input('version name: ')

clf = joblib.load('model_{}.pkl'.format(version_name)) 

## Predict the Actual Test Data

In [None]:
test = data_test.join(final_card_details.set_index('card_no'), on = 'card_no', how = 'left')
predictions = clf.predict(test.drop('card_no', axis = 1))

output = pd.DataFrame({'npl_flag': predictions })
output.to_csv('1.txt', index = False, header = None)
output.head(10)