In [70]:
import pandas as pd
import numpy as np

from xgboost import XGBClassifier
from feature_engine.selection import DropConstantFeatures
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import mutual_info_score
from sklearn.model_selection import train_test_split

#### Reading the data

In [7]:
fraud_df = pd.read_csv("fraud_detection.csv")


#### Basic EDA

In [28]:
# fraud_df.head()
# fraud_df.shape
fraud_df.columns

Index(['transaction_id', 'user_id', 'transaction_amount',
       'transaction_date_and_time', 'merchant_id', 'payment_method',
       'country_code', 'transaction_type', 'device_type', 'ip_address',
       'browser_type', 'operating_system', 'merchant_category', 'user_age',
       'user_occupation', 'user_income', 'user_gender', 'user_account_status',
       'transaction_status', 'location_distance', 'time_taken_for_transaction',
       'transaction_time_of_day', 'user's_transaction_history',
       'merchant's_reputation_score', 'user's_device_location',
       'transaction_currency', 'transaction_purpose', 'user's_credit_score',
       'user's_email_domain', 'merchant's_business_age',
       'transaction_authentication_method', 'fraudulent_flag'],
      dtype='object')

In [29]:
features = ["transaction_amount", "payment_method", "country_code", "transaction_type",
 "device_type", "ip_address", "browser_type", "operating_system", "merchant_category",
            "user_age", "user_occupation", "user_income", "user_gender", 'user_account_status',
            'transaction_status', 'location_distance', 'time_taken_for_transaction',
            'transaction_time_of_day', 'user\'s_transaction_history',
            'merchant\'s_reputation_score', 'user\'s_device_location',
            'transaction_currency', 'transaction_purpose', 'user\'s_credit_score',
            'user\'s_email_domain', 'merchant\'s_business_age',
            'transaction_authentication_method']


In [47]:
selected_features = ["ip_address", "user_income", "transaction_amount",
                     "location_distance", "time_taken_for_transaction",
                     "user's_credit_score"]

In [17]:
# formatting the column names to be more consistent
fraud_df.columns = fraud_df.columns.str.lower()
fraud_df.columns = fraud_df.columns.str.replace(" ", "_")

### Feature Engineering

In [48]:
# seperating the data into target (y) and predictor (x) variables.
y = fraud_df["fraudulent_flag"]
X = fraud_df[selected_features]

In [31]:
# X.head()

In [38]:
for feature in features:
    score = mutual_info_score(fraud_df[feature], y)
    print(f"{feature} score: {score}")




transaction_amount score: 0.008438973328968282
payment_method score: 2.5987580564157398e-06
country_code score: 4.687493972477252e-06
transaction_type score: 3.3740052077374516e-06
device_type score: 3.241385290793375e-06
ip_address score: 0.6926499624915344
browser_type score: 2.8168583201398922e-06
operating_system score: 3.859630139907422e-06
merchant_category score: 4.300995773337535e-06
user_age score: 3.5765949986665824e-06
user_occupation score: 1.7762668802490111e-06




user_income score: 0.5175025935456573
user_gender score: 4.111489797231638e-07
user_account_status score: 1.1851414232627588e-06
transaction_status score: 2.7601645790423235e-06




location_distance score: 0.0008197739407848333




time_taken_for_transaction score: 0.00048606497360688356
transaction_time_of_day score: 1.1841384206512373e-07
user's_transaction_history score: 7.72316001679263e-06




merchant's_reputation_score score: 3.270236507881951e-05
user's_device_location score: 2.3434770907021507e-06
transaction_currency score: 3.881151857003384e-06
transaction_purpose score: 3.0466044109236923e-06
user's_credit_score score: 4.9009240722788956e-05
user's_email_domain score: 3.736723703025302e-06
merchant's_business_age score: 1.9052730870217083e-06
transaction_authentication_method score: 2.8808243364569774e-06


In [50]:
mutual_info_classif(fraud_df[['user_income']], y)

array([4.24473543e-05])

#### Splitting the dataset into train and test

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [66]:
X_train.reset_index(drop=True, inplace=True)

In [74]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
X_train.head()

(4200000, 6)
(1800000, 6)
(4200000,)
(1800000,)


Unnamed: 0,ip_address,user_income,transaction_amount,location_distance,time_taken_for_transaction,user's_credit_score
0,94.237.188.214,30075.67,933.49,45.16,57.79,588
1,123.19.242.76,23156.32,900.21,89.23,55.93,842
2,87.151.164.111,16861.05,488.9,15.42,35.08,459
3,94.27.111.0,33844.56,587.04,52.29,57.1,688
4,225.56.40.102,7825.53,633.01,3.46,11.89,634


### Model selection and fitting

In [71]:
XGmodel = XGBClassifier(n_estimators=2, max_depth=2,
                        learning_rate=1, objective='binary:logistic')

XGmodel.fit(X_train, y_train)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\Users\Latitude E7270\AppData\Local\Programs\Python\Python311\Lib\site-packages\IPython\core\interactiveshell.py", line 3508, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\Latitude E7270\AppData\Local\Temp\ipykernel_12984\3289419558.py", line 4, in <module>
    XGmodel.fit(X_train, y_train)
  File "c:\Users\Latitude E7270\AppData\Local\Programs\Python\Python311\Lib\site-packages\xgboost\core.py", line 620, in inner_f
    ):
      
  File "c:\Users\Latitude E7270\AppData\Local\Programs\Python\Python311\Lib\site-packages\xgboost\sklearn.py", line 1471, in fit
  File "c:\Users\Latitude E7270\AppData\Local\Programs\Python\Python311\Lib\site-packages\xgboost\sklearn.py", line 448, in _wrap_evaluation_matrices
  File "c:\Users\Latitude E7270\AppData\Local\Programs\Python\Python311\Lib\site-packages\xgboost\sklearn.py", line 908, in _create_dmatrix
    from_fit = True
       ^^^^^^^^^^^^
  File "c:\Users\Latitud