# Data Cleaning

In [66]:
# Import libraries
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

In [67]:
# Import data
train_df = pd.read_csv('data/fraudTrain.csv')
test_df = pd.read_csv('data/fraudTest.csv')

In [68]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [69]:
# This column holds no significance so it can be dropped off the bat
train_df.drop('Unnamed: 0', axis=1, inplace=True)

In [71]:
test_df.drop('Unnamed: 0', axis=1, inplace=True)

In [84]:
def datetime_column(df, col_name: str, hour: bool =False, new_col_prefix: str =''):
    df[col_name] = pd.to_datetime(df[col_name])

    if hour:
        new_col = new_col_prefix + '_hour'
        df[new_col] = df[col_name].dt.hour
    df[new_col_prefix + '_weekday'] = df[col_name].dt.weekday
    df[new_col_prefix + '_month'] = df[col_name].dt.strftime("%m")
    df[new_col_prefix + '_year'] = df[col_name].dt.year

    df.drop(col_name, axis=1, inplace=True)

In [85]:
datetime_column(train_df, 'trans_date_trans_time', True, 'trans')

In [86]:
datetime_column(test_df, 'trans_date_trans_time', True, 'trans')

In [87]:
datetime_column(train_df, 'dob', new_col_prefix='dob')

In [88]:
datetime_column(test_df, 'dob', new_col_prefix='dob')

In [42]:
train_df.columns

Index(['cc_num', 'merchant', 'category', 'amt', 'first', 'last', 'gender',
       'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job',
       'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud',
       'trans_hour', 'trans_day_of_week', 'trans_month', 'trans_year',
       'dob_day', 'dob_month', 'dob_age'],
      dtype='object')

In [78]:
train_df.drop(['merchant', 'first', 'last','street','zip', 'trans_num'], axis=1, inplace=True)
test_df.drop(['merchant', 'first', 'last','street','zip', 'trans_num'], axis=1, inplace=True)

In [44]:
train_df.dtypes

cc_num                 int64
category              object
amt                  float64
gender                object
city                  object
state                 object
lat                  float64
long                 float64
city_pop               int64
job                   object
unix_time              int64
merch_lat            float64
merch_long           float64
is_fraud               int64
trans_hour             int64
trans_day_of_week      int64
trans_month           object
trans_year             int64
dob_day                int64
dob_month             object
dob_age                int64
dtype: object

In [79]:
categorical_column_names = ['gender', 'city', 'state', 'job', 'category']

for cat_name in categorical_column_names:
    train_df[cat_name] = pd.factorize(train_df[cat_name])[0]
    test_df[cat_name] = pd.factorize(test_df[cat_name])[0]

In [46]:
train_df.head()

Unnamed: 0,cc_num,category,amt,gender,city,state,lat,long,city_pop,job,...,merch_lat,merch_long,is_fraud,trans_hour,trans_day_of_week,trans_month,trans_year,dob_day,dob_month,dob_age
0,2703186189652095,0,4.97,0,0,0,36.0788,-81.1781,3495,0,...,36.011293,-82.048315,0,0,1,1,2019,2,3,1988
1,630423337322,1,107.23,0,1,1,48.8878,-118.2105,149,1,...,49.159047,-118.186462,0,0,1,1,2019,2,6,1978
2,38859492057661,2,220.11,1,2,2,42.1808,-112.262,4154,2,...,43.150704,-112.154481,0,0,1,1,2019,4,1,1962
3,3534093764340240,3,45.0,1,3,3,46.2306,-112.1138,1939,3,...,47.034331,-112.561071,0,0,1,1,2019,3,1,1967
4,375534208663984,4,41.96,1,4,4,38.4207,-79.4629,99,4,...,38.674999,-78.632459,0,0,1,1,2019,4,3,1986


In [93]:
train_y = train_df['is_fraud']
train_X = train_df.drop('is_fraud', axis=1)

test_y = test_df['is_fraud']
test_X = test_df.drop(['is_fraud'], axis=1)

In [94]:
# estimator = SVR(kernel="linear")
selector = RFE(DecisionTreeClassifier(), n_features_to_select=10, step=1)
selector.fit(train_X, train_y)

RFE(estimator=DecisionTreeClassifier(), n_features_to_select=10)

In [50]:
selected_features = []
for i, feature_selection in enumerate(selector.support_):
    if feature_selection:
        selected_features += [train_X.columns[i]]
        print(train_X.columns[i])

category
amt
gender
city
city_pop
unix_time
merch_lat
merch_long
trans_hour
dob_age


Comment out why you're randomly dropping columns
a little bit about RFE


In [52]:
train_selected = train_X[selected_features]
test_selected = test_X[selected_features]

In [53]:
clf = DecisionTreeClassifier(random_state=3000)

In [54]:
clf.fit(train_X, train_y)

DecisionTreeClassifier(random_state=3000)