# Gender Prediction from Credit Card Transactions

Exploring the ways customers use their credit cards will give banks, researchers, marketers
and economists more information to understand their spending habits. With this kind of
knowledge, banks or marketing strategists could tailor their marketing and communication
programs according to customers’ consumption behaviors or patterns.
 
This problem focuses on credit card spending transactions to answer a simple question: could
you train an algorithm to predict the gender of the owner of a credit card?


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

# Render floats with three decimal places in DataFrame output
def _three_decimals(value):
    return '%.3f' % value

pd.set_option('display.float_format', _three_decimals)

## CSV to DataFrame

In [110]:
# Raw card transactions; rows with any missing value are discarded.
transactions = pd.read_csv('tj_05_credit_card_transaction.csv').dropna()

# Lookup table keyed by merchant category code (joined on 'mcc' later).
mcc_codes = pd.read_csv('mcc_codes.csv')

# Labelled cards. Column names are supplied explicitly; a card_no that
# occurs more than once is removed entirely (keep=False keeps no copy).
labelled_cards = pd.read_csv('tj_05_training.csv', names=['card_no', 'gender'])
data_train = labelled_cards.drop_duplicates(subset='card_no', keep=False)

# Cards whose gender has to be predicted.
data_test = pd.read_csv('tj_05_test.csv', names=['card_no'])

In [111]:
# Summary statistics of the transaction columns (sanity check after loading).
transactions.describe()

Unnamed: 0,card_no,txn_hour,txn_amount,mer_cat_code,mer_id
count,884013.0,884013.0,884013.0,884013.0,884013.0
mean,1234000000014117.5,14.466,1712.669,5803.844,7032.409
std,8077.745,4.67,26015.682,877.35,10584.628
min,1234000000000001.0,0.0,50.0,742.0,0.0
25%,1234000000007269.0,11.0,250.0,5411.0,0.0
50%,1234000000014186.0,15.0,500.0,5631.0,0.0
75%,1234000000021105.0,18.0,1000.0,6011.0,13173.0
max,1234000000028171.0,23.0,19201000.0,9405.0,28727.0


## Merge Related DataFrames

In [112]:
# Attach the MCC description columns to every transaction by looking up its
# merchant category code. join() defaults to a left join, so every
# transaction row is kept.
mcc_by_code = mcc_codes.set_index('mcc')
transactions_with_mcc = transactions.join(mcc_by_code, on='mer_cat_code')

transactions_with_mcc.head()

Unnamed: 0,card_no,txn_date,txn_hour,txn_amount,mer_cat_code,mer_id,edited_description,combined_description,usda_description,irs_description,irs_reportable
0,1234000000009154,2016-12-07 00:00:00,22.0,1550.0,4511.0,0,"Airlines, Air Carriers ( not listed elsewhere)","Airlines, Air Carriers ( not listed elsewhere)","Airlines, Air Carriers ( not listed elsewhere)","Airlines, Air Carriers",Yes
1,1234000000017165,2016-12-07 00:00:00,22.0,250.0,6011.0,0,Financial Institutions – Manual Cash Disbursem...,Financial Institutions – Manual Cash Disbursem...,Financial Institutions – Manual Cash Disbursem...,Automated Cash Disburse,No1.6041-3(c)
2,1234000000000768,2016-12-07 00:00:00,22.0,250.0,8398.0,0,Charitable and Social Service Organizations,Charitable and Social Service Organizations,Charitable and Social Service Organizations,Charitable and Social Service Organizations - ...,No1.6041-3(p)(2)
3,1234000000018716,2016-12-07 00:00:00,22.0,50.0,5735.0,0,Record Shops,Record Shops,Record Shops,Record Stores,No1.6041-3(c)
4,1234000000016652,2016-12-07 00:00:00,22.0,50.0,5735.0,0,Record Shops,Record Shops,Record Shops,Record Stores,No1.6041-3(c)


## Transform Features

In [113]:
def cleanup_columns(df):
    """Return a new frame without the unused merchant columns.

    The columns listed below are dropped, then every row that still
    contains a missing value is removed. *df* itself is not modified.
    """
    unused = [
        'mer_cat_code',
        'mer_id',
        'edited_description',
        'combined_description',
        'usda_description',
        'irs_reportable',
    ]
    trimmed = df.drop(unused, axis=1)
    return trimmed.dropna()


def make_readable_columns(df):
    """Give the five columns of *df* short names, in place, and return *df*.

    Names are assigned by position, so *df* must have exactly five columns
    in the order: card number, date, hour, amount, merchant.
    """
    short_names = ['card_no', 'date', 'hour', 'amount', 'merchant']
    df.columns = short_names
    return df


def simplify_hour(df):
    """Replace the numeric ``hour`` column with a part-of-day category.

    The bin edges are right-inclusive: 0-4 -> LateNight, 5-12 -> Morning,
    13-17 -> Afternoon, 18-21 -> Evening, 22-25 -> EarlyNight.
    The column is overwritten in place and *df* is returned.
    """
    edges = (-1, 4, 12, 17, 21, 25)
    parts_of_day = ['LateNight', 'Morning', 'Afternoon', 'Evening', 'EarlyNight']
    df['hour'] = pd.cut(df.hour, edges, labels=parts_of_day)
    return df


def build_day(df):
    df['day'] = pd.to_datetime(df.date).dt.dayofweek
    df['day'] = df['day'].apply(lambda day: {0: 'Sunday'
                                             , 1: 'Monday'
                                             , 2: 'Thuesday'
                                             , 3: 'Wednesday'
                                             , 4: 'Thursday'
                                             , 5: 'Friday'
                                             , 6: 'Saturnday'}[day])
    return df


def simplify_date(df):
    """Rewrite the ``date`` column as ``YYYY-MM-DD`` strings.

    The column is overwritten in place and *df* is returned.
    """
    def _as_iso_day(stamp):
        return stamp.strftime('%Y-%m-%d')

    parsed = pd.to_datetime(df['date'])
    df['date'] = parsed.map(_as_iso_day)
    return df


def transform_features(df):
    """Run every cleaning step on *df*, in order, and return the result.

    Each step receives the frame returned by the previous one.
    """
    pipeline = (
        cleanup_columns,
        make_readable_columns,
        simplify_hour,
        build_day,
        simplify_date,
    )
    for step in pipeline:
        df = step(df)
    return df


# Run the cleaning pipeline on the MCC-enriched transactions and show
# summary statistics of the result.
clean_transactions = transform_features(transactions_with_mcc)
clean_transactions.describe()

Unnamed: 0,card_no,amount
count,882745.0,882745.0
mean,1234000000014116.5,1714.763
std,8076.299,26034.293
min,1234000000000001.0,50.0
25%,1234000000007269.0,250.0
50%,1234000000014186.0,500.0
75%,1234000000021104.0,1050.0
max,1234000000028171.0,19201000.0


## Visualizing Data

## Feature Engineering

### Number and amount of transactions per day

In [150]:
# Share of each card's transactions that falls on each weekday.

# Step 1: number of transactions per (card_no, day).
transaction_per_day = (clean_transactions[['card_no', 'amount', 'day']]
                       .groupby(['card_no', 'day'])
                       .count())

# Step 2: divide every count by the card's total number of transactions.
# transform('sum') broadcasts the per-card total back onto each
# (card_no, day) row in one vectorised pass, instead of re-filtering the
# whole table once per card from inside a row-wise apply.
card_totals = transaction_per_day.groupby(level='card_no').transform('sum')
transaction_per_day = transaction_per_day / card_totals

# Step 3: one row per card, one column per weekday; missing days become 0.
transaction_per_day = transaction_per_day.pivot_table(values='amount',
                                                      columns='day',
                                                      index='card_no',
                                                      aggfunc=np.sum,
                                                      fill_value=0)
# NOTE: a log transform of the shares was tried here and is currently disabled.
transaction_per_day = transaction_per_day.reset_index()
transaction_per_day.head()

day,card_no,Friday,Monday,Saturnday,Sunday,Thuesday,Thursday,Wednesday
0,1234000000000001,0.058,0.135,0.058,0.192,0.135,0.231,0.192
1,1234000000000004,0.019,0.192,0.154,0.212,0.077,0.154,0.192
2,1234000000000005,0.429,0.429,0.143,0.0,0.0,0.0,0.0
3,1234000000000006,0.156,0.219,0.094,0.125,0.031,0.156,0.219
4,1234000000000010,0.095,0.181,0.238,0.114,0.152,0.114,0.105


In [149]:
# Total amount spent by each card on each weekday.
amount_by_card_day = (clean_transactions[['card_no', 'amount', 'day']]
                      .groupby(['card_no', 'day'])
                      .sum()
                      .reset_index())

# One row per card, one column per weekday; missing days become 0.
transaction_amount_per_day = amount_by_card_day.pivot_table(values='amount',
                                                            columns='day',
                                                            index='card_no',
                                                            aggfunc=np.sum,
                                                            fill_value=0)
# The "A_" prefix keeps these amount columns distinct from the share columns.
transaction_amount_per_day.columns = ["A_" + str(day) for day in transaction_amount_per_day.columns]
transaction_amount_per_day = transaction_amount_per_day.reset_index()
transaction_amount_per_day.head()

Unnamed: 0,card_no,A_Friday,A_Monday,A_Saturnday,A_Sunday,A_Thuesday,A_Thursday,A_Wednesday
0,1234000000000001,1550,3350,1550,4400,5950,6100,107850
1,1234000000000004,350,9850,5600,6800,1550,5750,14400
2,1234000000000005,650,500,100,0,0,0,0
3,1234000000000006,17200,17850,17500,20500,3000,17550,19200
4,1234000000000010,3850,6350,9750,4150,9050,9200,16100


### Number and amount of transactions per hour bucket

In [148]:
# Share of each card's transactions that falls in each part of the day.

# Step 1: number of transactions per (card_no, hour bucket).
transaction_per_hour = (clean_transactions[['card_no', 'amount', 'hour']]
                        .groupby(['card_no', 'hour'])
                        .count())

# Step 2: divide every count by the card's total number of transactions.
# transform('sum') broadcasts the per-card total back onto each row in one
# vectorised pass, instead of re-filtering the whole table once per card.
card_totals = transaction_per_hour.groupby(level='card_no').transform('sum')
transaction_per_hour = transaction_per_hour / card_totals

# Step 3: one row per card, one column per hour bucket; missing -> 0.
transaction_per_hour = transaction_per_hour.pivot_table(values='amount',
                                                        columns=['hour'],
                                                        index=['card_no'],
                                                        aggfunc=np.sum,
                                                        fill_value=0)
# Use plain string labels taken from the pivot itself, so a label can never
# be attached to the wrong bucket. The hour buckets come from pd.cut, so the
# pivot's column index is categorical; presumably plain labels are needed
# for reset_index() to insert 'card_no' - verify before removing.
transaction_per_hour.columns = [str(name) for name in transaction_per_hour.columns]
# NOTE: a log transform of the shares was tried here and is currently disabled.
transaction_per_hour = transaction_per_hour.reset_index()
transaction_per_hour.head()

Unnamed: 0,card_no,LateNight,Morning,Afternoon,Evening,EarlyNight
0,1234000000000001,0.0,0.212,0.442,0.327,0.019
1,1234000000000004,0.0,0.327,0.404,0.231,0.038
2,1234000000000005,0.0,0.0,0.857,0.143,0.0
3,1234000000000006,0.031,0.438,0.531,0.0,0.0
4,1234000000000010,0.0,0.305,0.248,0.429,0.019


In [147]:
# Total amount spent by each card in each part of the day.
transaction_amount_per_hour = (clean_transactions[['card_no', 'amount', 'hour']]
                               .groupby(['card_no', 'hour'])
                               .sum()
                               .fillna(0)
                               .reset_index())

# One row per card, one column per hour bucket; missing buckets become 0.
transaction_amount_per_hour = transaction_amount_per_hour.pivot_table(values='amount',
                                                                      columns=['hour'],
                                                                      index=['card_no'],
                                                                      aggfunc=np.sum,
                                                                      fill_value=0)
# Derive the "A_" labels from the pivot's own columns (as the per-day and
# per-merchant amount tables do) rather than assigning a fixed list by
# position, which would mislabel or fail if the buckets ever differed.
transaction_amount_per_hour.columns = ["A_" + str(name) for name in transaction_amount_per_hour.columns]
transaction_amount_per_hour = transaction_amount_per_hour.reset_index()
transaction_amount_per_hour.head()

Unnamed: 0,card_no,A_LateNight,A_Morning,A_Afternoon,A_Evening,A_EarlyNight
0,1234000000000001,0,104800,15200,7700,3050
1,1234000000000004,0,17250,11100,11200,4750
2,1234000000000005,0,0,1150,100,0
3,1234000000000006,3500,75250,34050,0,0
4,1234000000000010,0,23650,15900,18800,100


### Number and amount of transactions per merchant category

In [151]:

# Share of each card's transactions that goes to each merchant category.

# Step 1: number of transactions per (card_no, merchant).
transaction_per_merchant = (clean_transactions[['card_no', 'amount', 'merchant']]
                            .groupby(['card_no', 'merchant'])
                            .count())

# Step 2: divide every count by the card's total number of transactions.
# transform('sum') broadcasts the per-card total back onto each row in one
# vectorised pass, instead of re-filtering the whole table once per card.
card_totals = transaction_per_merchant.groupby(level='card_no').transform('sum')
transaction_per_merchant = transaction_per_merchant / card_totals

# Step 3: one row per card, one column per merchant category; missing -> 0.
transaction_per_merchant = transaction_per_merchant.pivot_table(values='amount',
                                                                columns=['merchant'],
                                                                index=['card_no'],
                                                                aggfunc=np.sum,
                                                                fill_value=0)

transaction_per_merchant = transaction_per_merchant.reset_index()
transaction_per_merchant.head()

merchant,card_no,"A/C, Refrigeration Repair",Accounting/Bookkeeping Services,Advertising Services,Agricultural Cooperative,Airlines,"Airlines, Air Carriers","Airports, Flying Fields",Amusement Parks/Carnivals,Antique Reproductions,...,Video Game Arcades,Video Tape Rental Stores,Vocational/Trade Schools,Watch/Jewelry Repair,Welding Repair,Wholesale Clubs,Wig and Toupee Stores,"Wires, Money Orders",Women’s Accessory and Specialty Shops,Women’s Ready-To-Wear Stores
0,1234000000000001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1234000000000004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.019
2,1234000000000005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1234000000000006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1234000000000010,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0


In [152]:

# Total amount spent by each card at each merchant category.
amount_by_card_merchant = (clean_transactions[['card_no', 'amount', 'merchant']]
                           .groupby(['card_no', 'merchant'])
                           .sum()
                           .reset_index())

# One row per card, one column per merchant category; missing -> 0.
transaction_amount_per_merchant = amount_by_card_merchant.pivot_table(values='amount',
                                                                      columns=['merchant'],
                                                                      index=['card_no'],
                                                                      aggfunc=np.sum,
                                                                      fill_value=0)

# The "A_" prefix keeps these amount columns distinct from the share columns.
transaction_amount_per_merchant.columns = ["A_" + str(merchant) for merchant in transaction_amount_per_merchant.columns]
transaction_amount_per_merchant = transaction_amount_per_merchant.reset_index()
transaction_amount_per_merchant.head()

Unnamed: 0,card_no,"A_A/C, Refrigeration Repair",A_Accounting/Bookkeeping Services,A_Advertising Services,A_Agricultural Cooperative,A_Airlines,"A_Airlines, Air Carriers","A_Airports, Flying Fields",A_Amusement Parks/Carnivals,A_Antique Reproductions,...,A_Video Game Arcades,A_Video Tape Rental Stores,A_Vocational/Trade Schools,A_Watch/Jewelry Repair,A_Welding Repair,A_Wholesale Clubs,A_Wig and Toupee Stores,"A_Wires, Money Orders",A_Women’s Accessory and Specialty Shops,A_Women’s Ready-To-Wear Stores
0,1234000000000001,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1234000000000004,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,350
2,1234000000000005,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1234000000000006,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1234000000000010,0,0,0,0,0,50,0,0,0,...,0,0,0,0,0,0,0,0,500,0


### Sum Amount

In [153]:
# Total amount spent by each card over the whole data set.
amount_columns = clean_transactions[['card_no', 'amount']]
transaction_amount = amount_columns.groupby(['card_no']).sum().reset_index()

transaction_amount.head()

Unnamed: 0,card_no,amount
0,1234000000000001,130750.0
1,1234000000000004,44300.0
2,1234000000000005,1250.0
3,1234000000000006,112800.0
4,1234000000000010,58450.0


### Merge Features

In [155]:
# Assemble one feature row per card: start from the weekday shares and
# left-join every other per-card table on card_no, in this order.
feature_tables = [
    transaction_per_hour,
    transaction_per_merchant,
    transaction_amount_per_day,
    transaction_amount_per_hour,
    transaction_amount_per_merchant,
    transaction_amount,
]

final_data = transaction_per_day
for table in feature_tables:
    final_data = final_data.join(table.set_index('card_no'), on='card_no')

# A card that is absent from one of the joined tables gets 0 for those features.
final_data = final_data.fillna(0)
final_data.describe()

Unnamed: 0,card_no,Friday,Monday,Saturnday,Sunday,Thuesday,Thursday,Wednesday,LateNight,Morning,...,A_Video Tape Rental Stores,A_Vocational/Trade Schools,A_Watch/Jewelry Repair,A_Welding Repair,A_Wholesale Clubs,A_Wig and Toupee Stores,"A_Wires, Money Orders",A_Women’s Accessory and Specialty Shops,A_Women’s Ready-To-Wear Stores,amount
count,12572.0,12572.0,12572.0,12572.0,12572.0,12572.0,12572.0,12572.0,12572.0,12572.0,...,12572.0,12572.0,12572.0,12572.0,12572.0,12572.0,12572.0,12572.0,12572.0,12572.0
mean,1234000000014099.5,0.159,0.135,0.151,0.135,0.136,0.148,0.136,0.024,0.297,...,1.392,5.679,4.709,1.344,3474.738,4.844,0.099,732.747,520.975,120402.366
std,8119.197,0.137,0.116,0.133,0.112,0.115,0.118,0.114,0.065,0.204,...,63.469,214.306,124.52,101.474,41965.197,286.662,7.964,4869.767,3151.657,428679.628
min,1234000000000001.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0
25%,1234000000007120.8,0.083,0.073,0.072,0.076,0.077,0.089,0.077,0.0,0.159,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28237.5
50%,1234000000014087.0,0.142,0.123,0.132,0.123,0.125,0.136,0.125,0.0,0.267,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,63000.0
75%,1234000000021015.5,0.205,0.172,0.197,0.172,0.172,0.187,0.172,0.021,0.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,126950.0
max,1234000000028171.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,5000.0,16500.0,7750.0,10400.0,1806600.0,28450.0,850.0,245000.0,154200.0,28245000.0


In [119]:
# Preview the first rows of the merged per-card feature table.
final_data.head()

Unnamed: 0,card_no,Friday,Monday,Saturnday,Sunday,Thuesday,Thursday,Wednesday,LateNight,Morning,...,Video Game Arcades,Video Tape Rental Stores,Vocational/Trade Schools,Watch/Jewelry Repair,Welding Repair,Wholesale Clubs,Wig and Toupee Stores,"Wires, Money Orders",Women’s Accessory and Specialty Shops,Women’s Ready-To-Wear Stores
0,1234000000000001,0.058,0.135,0.058,0.192,0.135,0.231,0.192,0.0,0.212,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1234000000000004,0.019,0.192,0.154,0.212,0.077,0.154,0.192,0.0,0.327,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.019
2,1234000000000005,0.429,0.429,0.143,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1234000000000006,0.156,0.219,0.094,0.125,0.031,0.156,0.219,0.031,0.438,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1234000000000010,0.095,0.181,0.238,0.114,0.152,0.114,0.105,0.0,0.305,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0


## Fitting and Tuning a Classifier

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

In [156]:
# Attach the engineered features to the labelled cards. Rows left with a
# missing value after the left join (e.g. cards without any features) are dropped.
features_by_card = final_data.set_index('card_no')
data_train_with_features = data_train.join(features_by_card, on='card_no', how='left').dropna()

# Target vector and feature matrix (identifier and label columns removed).
y_all = data_train_with_features['gender']
X_all = data_train_with_features.drop(['card_no', 'gender'], axis=1)

# Hold out 20% of the labelled cards; the seed is fixed so the split repeats.
num_test = 0.20
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=num_test, random_state=23)

In [157]:
# (number of labelled cards, number of feature columns)
X_all.shape

(9852, 527)

In [76]:
from sklearn.neighbors import KNeighborsClassifier
# Baseline: k-nearest-neighbours with the library's default settings.
# `clf` is rebound to a random forest in a later cell.
clf = KNeighborsClassifier()
clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [175]:
# Random forest with 500 trees; every leaf must hold at least 5 training samples.
clf = RandomForestClassifier(n_estimators = 500, min_samples_leaf = 5)

# Fit on the training split. No hyper-parameter search is run here, and no
# random_state is set, so scores can differ between runs.
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=5,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [176]:
# Accuracy on the training split vs. the held-out split.
clf.score(X_train, y_train), clf.score(X_test, y_test)

(0.92335997969800787, 0.68138001014713345)

## Validate with KFold

In [81]:
from sklearn.model_selection import KFold


def run_kfold(clf):
    """Print the accuracy of *clf* on 10 consecutive folds of the labelled data.

    The estimator is re-fitted in place on every fold using the notebook-level
    ``X_all`` / ``y_all``, so after the call it holds the fit from the last
    fold. Per-fold accuracy and the mean are printed; nothing is returned.
    """
    # KFold is taken from sklearn.model_selection, the module the rest of the
    # notebook already imports from; the folds are consecutive (not shuffled).
    kf = KFold(n_splits=10)
    features, labels = X_all.values, y_all.values
    outcomes = []
    for fold, (train_index, test_index) in enumerate(kf.split(features), start=1):
        clf.fit(features[train_index], labels[train_index])
        predictions = clf.predict(features[test_index])
        accuracy = accuracy_score(labels[test_index], predictions)
        outcomes.append(accuracy)
        print("Fold {0} accuracy: {1}".format(fold, accuracy))
    mean_outcome = np.mean(outcomes)
    print("Mean Accuracy: {0}".format(mean_outcome))


# NOTE: this re-fits the notebook-level `clf`; it ends up trained on the
# last fold's training portion.
run_kfold(clf)


Fold 1 accuracy: 0.665314401622718
Fold 2 accuracy: 0.6582150101419878
Fold 3 accuracy: 0.6639593908629442
Fold 4 accuracy: 0.6558375634517767
Fold 5 accuracy: 0.6873096446700507
Fold 6 accuracy: 0.6802030456852792
Fold 7 accuracy: 0.6720812182741117
Fold 8 accuracy: 0.6903553299492385
Fold 9 accuracy: 0.6588832487309645
Fold 10 accuracy: 0.6690355329949239
Mean Accuracy: 0.6701194386383994


In [162]:
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split


def _train_and_score(model):
    """Fit *model* on the training split; return (train, test) accuracy in percent.

    Uses the notebook-level X_train / y_train / X_test / y_test. Both scores
    are rounded to two decimals.
    """
    model.fit(X_train, y_train)
    train_score = round(model.score(X_train, y_train) * 100, 2)
    test_score = round(model.score(X_test, y_test) * 100, 2)
    return train_score, test_score


# Candidate classifiers in the row order of the comparison table. All use
# the library defaults except where an argument is given.
candidates = [
    ('Support Vector Machines', SVC()),
    # NOTE(review): n_neighbors is set to the number of feature columns
    # (X_train.shape[1]); kept unchanged, but confirm this is intended.
    ('KNN', KNeighborsClassifier(n_neighbors=X_train.shape[1])),
    ('Logistic Regression', LogisticRegression()),
    ('Random Forest', RandomForestClassifier(n_estimators=100)),
    ('Naive Bayes', GaussianNB()),
    ('Perceptron', Perceptron()),
    ('Stochastic Gradient Decent', SGDClassifier()),
    ('Linear SVC', LinearSVC()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('AdaBoost', AdaBoostClassifier()),
]

# One (label, train %, test %) row per classifier.
rows = []
for label, model in candidates:
    train_score, test_score = _train_and_score(model)
    rows.append((label, train_score, test_score))

models = pd.DataFrame(rows, columns=['Model', 'Train Score', 'Test Score'])

# Best held-out accuracy first.
models.sort_values(by='Test Score', ascending=False)

Unnamed: 0,Model,Train Score,Test Score
3,Random Forest,99.97,67.78
9,AdaBoost,67.8,65.75
2,Logistic Regression,63.1,61.75
5,Perceptron,59.4,59.16
4,Naive Bayes,58.09,57.43
8,Decision Tree,99.97,56.77
1,KNN,57.16,56.32
6,Stochastic Gradient Decent,58.48,56.22
0,Support Vector Machines,99.97,55.66
7,Linear SVC,54.13,53.48


## Model Version
### Save model to file

In [19]:
# NOTE(review): sklearn.externals.joblib is a deprecated import path in newer
# scikit-learn releases - confirm against the installed version.
from sklearn.externals import joblib

# The tag typed here becomes part of the output file name.
version_name = input('version name: ')

# Persist the current `clf` as model_<version_name>.pkl in the working directory.
joblib.dump(clf, 'model_{}.pkl'.format(version_name)) 

version name: 687468


['model_687468.pkl']

### Load model from file

In [None]:
# NOTE(review): sklearn.externals.joblib is a deprecated import path in newer
# scikit-learn releases - confirm against the installed version.
from sklearn.externals import joblib

# Tag of a previously saved model (the part after "model_" in the file name).
version_name = input('version name: ')

# Rebinds `clf` to the stored estimator. NOTE(review): joblib files are
# pickle-based, so only load files from a trusted source.
clf = joblib.load('model_{}.pkl'.format(version_name)) 

## Predict the Actual Test Data

In [174]:
# Look up the features of every submission card; a card without any
# features gets 0 in every column.
features_by_card = final_data.set_index('card_no')
test = data_test.join(features_by_card, on='card_no', how='left').fillna(0)

# Predict with the current `clf` on everything except the identifier column.
test_features = test.drop('card_no', axis=1)
predictions = clf.predict(test_features)

# Write one predicted label per line, without index or header row.
output = pd.DataFrame({'gender': predictions})
output.to_csv('5-try-2.txt', index=False, header=None)
output.describe()

Unnamed: 0,gender
count,4623.0
mean,0.188
std,0.391
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [None]:
# Summary statistics of the predicted labels.
output.describe()