# Gender Prediction from Credit Card Transactions

Exploring the ways customers’ use their credit cards will give banks, researchers, marketers
and economists more information to understand their spending habits. With this kind of
knowledge, banks or marketing strategists could tailor their marketing and communication
programs according to customers’ consumption behaviors or patterns.
 
This problem focuses on credit card spending transactions to answer a simple question: could
you train an algorithm to predict the gender of the owner of a credit card?


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

# Beautiful format for float type
pd.set_option('display.float_format', lambda x: '%.3f' % x)

## CSV to DataFrame

In [2]:
transactions = pd.read_csv('tj_05_credit_card_transaction.csv')

mcc_codes = pd.read_csv('mcc_codes.csv')

data_train = pd.read_csv('tj_05_training.csv', names = ['card_no', 'gender'])
data_train = data_train.drop_duplicates(keep = False, subset = 'card_no')

data_test = pd.read_csv('tj_05_test.csv', names = ['card_no'])

In [3]:
data_train.describe()

Unnamed: 0,card_no,gender
count,17414.0,17414.0
mean,1234000000014123.2,0.439
std,8104.257,0.496
min,1234000000000001.0,0.0
25%,1234000000007127.2,0.0
50%,1234000000014129.5,0.0
75%,1234000000021158.5,1.0
max,1234000000028172.0,1.0


## Merge Related DataFrame

In [7]:
transactions_with_mcc = transactions.join(mcc_codes.set_index('mcc'), on = 'mer_cat_code')

transactions_with_mcc.describe()

Unnamed: 0,card_no,txn_hour,txn_amount,mer_id
count,893761.0,884013.0,893761.0,893761.0
mean,1234000000014118.5,14.466,1694.535,7269.036
std,8078.405,4.67,25873.996,10765.232
min,1234000000000001.0,0.0,50.0,0.0
25%,1234000000007268.0,11.0,250.0,0.0
50%,1234000000014186.0,15.0,450.0,0.0
75%,1234000000021106.0,18.0,1000.0,14070.0
max,1234000000028172.0,23.0,19201000.0,28728.0


## Transform Features

In [99]:
def cleanup_columns(df):
    drop_columns = ['mer_cat_code'
                    , 'mer_id'
                    , 'combined_description'
                    , 'usda_description'
                    , 'irs_description'
                    , 'irs_reportable']
    
    return df.drop(drop_columns, axis = 1).dropna()


def make_readable_columns(df):
    df.columns = ['card_no'
                  , 'date'
                  , 'hour'
                  , 'amount'
                  , 'merchant']
    
    return df


def simplify_hour(df):
    bins = (-1, 4, 12, 17, 21, 25)
    group_names = ['LateNight', 'Morning', 'Afternoon', 'Evening', 'EarlyNight']
    hours = pd.cut(df.hour, bins, labels=group_names)
    df.hour = hours
    
    return df


def build_day(df):
    df['day'] = pd.to_datetime(df.date).dt.dayofweek
    df['day'] = df['day'].apply(lambda day: {0: 'Sunday'
                                             , 1: 'Monday'
                                             , 2: 'Thuesday'
                                             , 3: 'Wednesday'
                                             , 4: 'Thursday'
                                             , 5: 'Friday'
                                             , 6: 'Saturnday'}[day])
    return df


def simplify_date(df):
    df['date'] = pd.to_datetime(df['date']).map(lambda x: x.strftime('%Y-%m-%d'))
    
    return df


def transform_features(df):
    df = cleanup_columns(df)
    df = make_readable_columns(df)
    df = simplify_hour(df)
    df = build_day(df)
    df = simplify_date(df)
    
    return df


clean_transactions = transform_features(transactions_with_mcc)
clean_transactions.describe()

Unnamed: 0,card_no,amount
count,883990.0,883990.0
mean,1234000000014117.5,1712.699
std,8077.697,26016.017
min,1234000000000001.0,50.0
25%,1234000000007269.0,250.0
50%,1234000000014186.0,500.0
75%,1234000000021105.0,1000.0
max,1234000000028171.0,19201000.0


## Visualizing Data

## Feature Engineering

### Amount of transaction per day

In [193]:
transaction_per_day = clean_transactions[['card_no', 'amount', 'day']].groupby(['card_no', 'day']).sum().reset_index()
transaction_per_day = transaction_per_day.pivot_table(values = 'amount'
                                                      , columns = 'day'
                                                      , index = 'card_no'
                                                      , aggfunc = np.sum
                                                      , fill_value = 0
                                                     )
transaction_per_day = transaction_per_day.apply(lambda column: column.apply(lambda x: np.log(x) if x > 0 else 0))
transaction_per_day.head()

day,Friday,Monday,Saturnday,Sunday,Thuesday,Thursday,Wednesday
card_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1234000000000001,7.346,8.117,7.346,8.389,8.691,8.716,11.588
1234000000000004,5.858,9.195,8.631,8.825,7.346,8.657,9.575
1234000000000005,6.477,6.215,4.605,0.0,0.0,0.0,0.0
1234000000000006,9.753,9.79,9.77,9.928,8.006,9.773,9.863
1234000000000010,8.256,8.756,9.185,8.331,9.111,9.127,9.687


### Amount of transaction per hour

In [194]:
transaction_per_hour = clean_transactions[['card_no', 'amount', 'hour']].groupby(['card_no', 'hour']).sum().fillna(0).reset_index()
transaction_per_hour = transaction_per_hour.pivot_table(values = 'amount'
                                                        , columns = ['hour']
                                                        , index = ['card_no']
                                                        , aggfunc = np.sum
                                                        , fill_value = 0
                                                       )
transaction_per_hour.columns = ['LateNight', 'Morning', 'Afternoon', 'Evening', 'EarlyNight']
transaction_per_hour = transaction_per_hour.apply(lambda column: column.apply(lambda x: np.log(x) if x > 0 else 0))
transaction_per_hour = transaction_per_hour.reset_index()
transaction_per_hour.head()

Unnamed: 0,card_no,LateNight,Morning,Afternoon,Evening,EarlyNight
0,1234000000000001,0.0,11.56,9.629,8.949,8.023
1,1234000000000004,0.0,9.756,9.315,9.324,8.466
2,1234000000000005,0.0,0.0,7.048,4.605,0.0
3,1234000000000006,8.161,11.229,10.436,0.0,0.0
4,1234000000000010,0.0,10.071,9.674,9.842,4.605


### Amount of transaction per merchant

In [262]:
def test(a):
    print(a, end=" ")
    return a

test2['amount'][test2['card_no'] == 1234000000000005].sum()

7

In [266]:
test = clean_transactions[['card_no', 'amount', 'merchant']].groupby(['card_no', 'merchant']).count()
test2 = test.reset_index()
test.apply(lambda x: x / test2['amount'][test2['card_no'] == x.name[0]].sum(), axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,amount
card_no,merchant,Unnamed: 2_level_1
1234000000000001,Department Stores,0.019
1234000000000001,Eating places and Restaurants,0.115
1234000000000001,"Grocery Stores, Supermarkets",0.058
1234000000000001,Hospitals,0.038
1234000000000001,Men’s and Women’s Clothing Stores,0.019
1234000000000001,Security Brokers/Dealers,0.019
1234000000000001,Service Stations ( with or without ancillary services),0.558
1234000000000001,Sporting Goods Stores,0.096
1234000000000001,Telecommunications Equipment including telephone sales,0.077
1234000000000004,Bicycle Shops – Sales and Service,0.135


In [269]:

transaction_per_merchant = clean_transactions[['card_no', 'amount', 'merchant']].groupby(['card_no', 'merchant']).count()
transaction_per_merchant_reset = transaction_per_merchant.reset_index()
transaction_per_merchant = transaction_per_merchant.apply(lambda x: x / transaction_per_merchant_reset['amount'][transaction_per_merchant_reset['card_no'] == x.name[0]].sum(), axis=1)

transaction_per_merchant = transaction_per_merchant.pivot_table(values = 'amount'
                                                            , columns = ['merchant']
                                                            , index = ['card_no']
                                                            , aggfunc = np.sum
                                                            , fill_value = 0
                                                           )

transaction_per_merchant = transaction_per_merchant.reset_index()
transaction_per_merchant.head()

merchant,card_no,AEORFLOT,AEROMEXICO,AFFILIATED AUTO RENTAL,AIR CANADA,AIR CHINA,AIR FRANCE,AIR MALTA,AIR NEW ZEALAND,AIR-INDIA,...,Video Tape Rental Stores,Vocational Schools and Trade Schools,WESTIN HOTELS,"Watch, Clock, Jewelry, and Silverware Stores","Watch, Clock, and Jewelry Repair",Welding Repair,Wholesale Clubs,Wig and Toupee Stores,Women’s Accessory and Specialty Shops,Women’s Ready-to-Wear Stores
0,1234000000000001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1234000000000004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.019
2,1234000000000005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1234000000000006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1234000000000010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0


### Merge Feature

In [270]:
final_data = transaction_per_day.reset_index()
final_data = final_data.join(transaction_per_hour.set_index('card_no'), on='card_no')
final_data = final_data.join(transaction_per_merchant.set_index('card_no'), on='card_no')
final_data = final_data.fillna(0)
            
final_data.describe()

Unnamed: 0,card_no,Friday,Monday,Saturnday,Sunday,Thuesday,Thursday,Wednesday,LateNight,Morning,...,Video Tape Rental Stores,Vocational Schools and Trade Schools,WESTIN HOTELS,"Watch, Clock, Jewelry, and Silverware Stores","Watch, Clock, and Jewelry Repair",Welding Repair,Wholesale Clubs,Wig and Toupee Stores,Women’s Accessory and Specialty Shops,Women’s Ready-to-Wear Stores
count,12576.0,12576.0,12576.0,12576.0,12576.0,12576.0,12576.0,12576.0,12576.0,12576.0,...,12576.0,12576.0,12576.0,12576.0,12576.0,12576.0,12576.0,12576.0,12576.0,12576.0
mean,1234000000014098.2,7.929,7.829,7.746,7.853,7.85,8.012,7.864,2.581,8.869,...,0.0,0.0,0.0,0.004,0.0,0.0,0.015,0.0,0.008,0.006
std,8119.1,3.125,3.113,3.172,3.09,3.076,3.038,3.086,3.576,2.872,...,0.001,0.001,0.001,0.028,0.003,0.001,0.099,0.001,0.027,0.024
min,1234000000000001.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1234000000007120.8,7.576,7.409,7.378,7.467,7.438,7.65,7.467,0.0,8.412,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1234000000014085.5,8.839,8.657,8.691,8.687,8.691,8.814,8.7,0.0,9.575,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1234000000021014.2,9.793,9.661,9.655,9.665,9.649,9.761,9.678,6.238,10.486,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1234000000028171.0,15.368,15.803,15.249,15.099,15.7,15.265,16.778,14.12,16.035,...,0.083,0.035,0.14,1.0,0.333,0.081,1.0,0.031,1.0,1.0


## Fitting and Tuning an Classifier

In [166]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

In [271]:
data_train_with_features = data_train.join(final_data.set_index('card_no'), on = 'card_no', how='left').dropna()

X_all = data_train_with_features.drop(['card_no', 'gender'], axis = 1)
y_all = data_train_with_features['gender']

num_test = 0.20
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size = num_test, random_state = 23)

In [168]:
X_all.shape

(9856, 376)

In [272]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()
clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [273]:
# Choose the type of classifier. 
clf = RandomForestClassifier(n_estimators = 300, min_samples_leaf = 5)

# Fit the best algorithm to the data. 
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=5,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [274]:
predictions = clf.predict(X_test)

print(accuracy_score(y_test, predictions))

0.668356997972


In [220]:
clf.score(X_train, y_train)

0.86542364282090312

## Validate with KFold

In [216]:
from sklearn.cross_validation import KFold

clf = logreg
def run_kfold(clf):
    kf = KFold(X_all.shape[0], n_folds=10)
    outcomes = []
    fold = 0
    for train_index, test_index in kf:
        fold += 1
        X_train, X_test = X_all.values[train_index], X_all.values[test_index]
        y_train, y_test = y_all.values[train_index], y_all.values[test_index]
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        outcomes.append(accuracy)
        print("Fold {0} accuracy: {1}".format(fold, accuracy))     
    mean_outcome = np.mean(outcomes)
    print("Mean Accuracy: {0}".format(mean_outcome)) 

run_kfold(clf)


Fold 1 accuracy: 0.6501014198782962
Fold 2 accuracy: 0.6612576064908722
Fold 3 accuracy: 0.6592292089249493
Fold 4 accuracy: 0.6551724137931034
Fold 5 accuracy: 0.6937119675456389
Fold 6 accuracy: 0.6490872210953347
Fold 7 accuracy: 0.6710659898477157
Fold 8 accuracy: 0.6649746192893401
Fold 9 accuracy: 0.6527918781725889
Fold 10 accuracy: 0.649746192893401
Mean Accuracy: 0.660713851793124


## Model Version
### Save model to file

In [186]:
from sklearn.externals import joblib

version_name = input('version name: ')

joblib.dump(clf, 'model_{}.pkl'.format(version_name)) 

version name: 661120


['model_661120.pkl']

### Load model from file

In [None]:
from sklearn.externals import joblib

version_name = input('version name: ')

clf = joblib.load('model_{}.pkl'.format(version_name)) 

## Predict the Actual Test Data

In [221]:
test = data_test.join(final_data.set_index('card_no'), on = 'card_no', how = 'left').fillna(0)
predictions = clf.predict(test.drop('card_no', axis = 1))

output = pd.DataFrame({'gender': predictions })
output.to_csv('5.txt', index = False, header = None)
output.head(10)

Unnamed: 0,gender
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


In [None]:
output.describe()