In [1]:
#Import the basic numpy and pandas libraries
import numpy as np 
import pandas as pd 

In [2]:
#Read the four raw CSV files: transaction and identity tables for the train and test splits
train_transaction, test_transaction, train_identity, test_identity = (
    pd.read_csv(csv_name)
    for csv_name in (
        'train_transaction.csv',
        'test_transaction.csv',
        'train_identity.csv',
        'test_identity.csv',
    )
)

In [3]:
#Left-join each identity table onto its transaction table by TransactionID,
#so every transaction row is kept even when it has no identity record
train_merged = train_transaction.merge(train_identity, how='left', on='TransactionID')
test_merged = test_transaction.merge(test_identity, how='left', on='TransactionID')

In [4]:
#Report the (rows, columns) shape of both merged frames
for label, frame in (('train:', train_merged), ('test:', test_merged)):
    print(label, frame.shape)

train: (590540, 434)
test: (506691, 433)


In [5]:
#Keep only the columns whose share of missing values in the training data is below 10%
missing_pct = train_merged.isnull().sum() / len(train_merged) * 100
is_valid = missing_pct < 10
#One-column frame whose index holds the names of the retained columns
valid_features = is_valid[is_valid].to_frame()

In [6]:
#Restrict both frames to the valid columns; 'isFraud' is left out of the test selection
kept_columns = valid_features.index.tolist()
train = train_merged[kept_columns]
test = test_merged[[name for name in kept_columns if name != 'isFraud']]

In [7]:
#Report the shape of both datasets after filtering for valid columns
for label, frame in (('train:', train), ('test:', test)):
    print(label, frame.shape)

train: (590540, 112)
test: (506691, 111)


In [8]:
#Preview the first five rows of the filtered training frame
train.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,135.0,0.0,0.0,0.0,50.0,1404.0,790.0,0.0,0.0,0.0
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
#Preview the first five rows of the filtered test frame
test.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,3663549,18403224,31.95,W,10409,111.0,150.0,visa,226.0,debit,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3663550,18403263,49.0,W,4272,111.0,150.0,visa,226.0,debit,...,77.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3663551,18403310,171.0,W,4476,574.0,150.0,visa,226.0,debit,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,263.0,0.0
3,3663552,18403310,284.95,W,10989,360.0,150.0,visa,166.0,debit,...,0.0,282.540009,282.540009,282.540009,0.0,0.0,0.0,0.0,0.0,0.0
4,3663553,18403317,67.95,W,18018,452.0,150.0,mastercard,117.0,debit,...,67.949997,67.949997,183.850006,67.949997,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
#Count the training transactions in each isFraud class
train['TransactionID'].groupby(train['isFraud']).count()

isFraud
0    569877
1     20663
Name: TransactionID, dtype: int64

In [14]:
#Stack the train features (label removed) on top of the test features so that both go through
#the same cleaning steps. ignore_index=True gives the stacked rows unique 0..n-1 labels;
#without it the train and test row labels (each starting at 0) would be duplicated.
combined = pd.concat([train.drop('isFraud', axis=1), test], ignore_index=True)
#Mark the product code and card fields as categorical so they are not treated as numbers
categorical_fields = ['ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6']
combined = combined.astype({field: 'category' for field in categorical_fields})

In [15]:
#Report the (rows, columns) shape of the combined frame
combined_shape = combined.shape
print(combined_shape)

(1097231, 111)


In [16]:
#Separate the numeric columns from the rest so each group can be imputed with its own strategy later
combined_num = combined.select_dtypes(include='number')
combined_cat = combined.select_dtypes(exclude='number')

In [17]:
#Fill the missing values of every numeric column with that column's median
from sklearn.impute import SimpleImputer
imputer_median = SimpleImputer(strategy="median")
median_filled_values = imputer_median.fit_transform(combined_num)
#The imputer returns a bare array, so the column names are re-attached here
combined_filled_median = pd.DataFrame(median_filled_values, columns=combined_num.columns)

In [18]:
#Fill the missing values of every categorical column with that column's most frequent value
imputer_cat = SimpleImputer(strategy="most_frequent")
mode_filled_values = imputer_cat.fit_transform(combined_cat)
#The imputer returns a bare array, so the column names are re-attached here
combined_filled_cat = pd.DataFrame(mode_filled_values, columns=combined_cat.columns)

In [19]:
#Put the imputed numeric and categorical columns back side by side
filled_parts = (combined_filled_median, combined_filled_cat)
combined_num_cat = pd.concat(filled_parts, axis='columns')

In [20]:
#Checking the number of unique values in the categorical columns before doing one-hot encoding
def unique_values_counter(cols, frame=None):
    """Print ``<column>: <number of distinct non-null values>`` for each column in ``cols``.

    Parameters
    ----------
    cols : iterable of str
        Column labels to report on, in the order they should be printed.
    frame : pandas.DataFrame, optional
        Frame that holds the columns. When omitted, the notebook-level
        ``combined_cat`` frame is used, which is the frame this function
        always read before the parameter was added.
    """
    if frame is None:
        # Fall back to the notebook global so existing one-argument calls keep working
        frame = combined_cat
    for col in cols:
        print("{}: {}".format(col, frame[col].nunique()))
    
#Report the number of unique values of every categorical column
unique_values_counter(list(combined_cat.columns))

ProductCD: 5
card1: 17091
card2: 501
card3: 133
card4: 4
card5: 138
card6: 4


In [22]:
#Drop the TransactionID key together with the card columns that have too many unique values
columns_to_drop = ['TransactionID', 'card1', 'card2', 'card3', 'card5']
combined_dropped = combined_num_cat.drop(columns=columns_to_drop)

In [23]:
#Collect the names of the numeric columns, which are the ones that get standard-scaled
numerical_column_names = combined_dropped.select_dtypes(include='number').columns

In [24]:
#Standard-scale the numeric columns; the remaining columns are carried over untouched
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_values = scaler.fit_transform(combined_dropped[numerical_column_names])
#Work on a copy so that combined_dropped keeps the unscaled values
combined_scaled = combined_dropped.copy()
combined_scaled.loc[:, numerical_column_names] = scaled_values

In [25]:
#Collect the names of the object-dtype columns, which are the ones that get one-hot encoded
categorical_column_names = combined_scaled.select_dtypes(include='object').columns

In [27]:
#One-hot encode the categorical columns into a sparse-backed DataFrame
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(handle_unknown="ignore")
combined_encoded_columns = encoder.fit_transform(combined_scaled[categorical_column_names])
#Reuse the row labels of combined_scaled so the encoded block lines up with it
combined_encoded_df = pd.DataFrame.sparse.from_spmatrix(
    combined_encoded_columns,
    index=combined_scaled.index,
    columns=encoder.get_feature_names_out(),
)

In [28]:
#Report the (rows, columns) shape of the one-hot encoded block
encoded_shape = combined_encoded_df.shape
print(encoded_shape)

(1097231, 13)


In [29]:
#Swap the raw categorical columns for their one-hot encoded counterparts
combined_dropcat = combined_scaled.drop(columns=categorical_column_names)
final_parts = (combined_dropcat, combined_encoded_df)
combined_final = pd.concat(final_parts, axis='columns')
print(combined_final.shape)

(1097231, 116)


In [30]:
#The combined_final dataset has more than 100 columns, which is too many features to model simply,
#so PCA is used to reduce them to 10 components.
from sklearn.decomposition import PCA
#random_state is fixed so that the randomized SVD solver, which PCA may select automatically
#for a matrix of this size, returns the same components on every run.
pca = PCA(n_components=10, random_state=42)
pca.fit(combined_final)
combined_pca = pca.transform(combined_final)
print(combined_pca.shape)



(1097231, 10)


In [31]:
# Split the PCA output back into its train and test parts
len_train = len(train) #Rows up to this position are the ones that carry an 'isFraud' label
pca_frame = pd.DataFrame(combined_pca)
X_train, X_test = pca_frame.iloc[:len_train], pca_frame.iloc[len_train:]
y_train = train['isFraud']

In [32]:
#Report the shape of the train and test feature sets after PCA
for split_features in (X_train, X_test):
    print(split_features.shape)

(590540, 10)
(506691, 10)


In [33]:
#Our dataset is also imbalanced: only ~3.5% of the labels are fraudulent.
#SMOTE oversamples the minority class so that both classes end up with the same number of rows.
from imblearn.over_sampling import SMOTE
#Fixed seed so that the synthetic minority samples are the same on every run
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print(X_train_smote.shape, y_train_smote.shape)
#Series.value_counts is used because the top-level pd.value_counts is deprecated
print(y_train_smote.value_counts())

(1139754, 10) (1139754,)
0    569877
1    569877
Name: isFraud, dtype: int64
