# Fraud detection Feature Engieering

### Import Libraries

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
plt.style.use("fivethirtyeight")

In [4]:
# Set theme for all charts in the notebook for both matplotlib and seaborn
# sns.set_theme()

### Read Data

In [5]:
def read_data():
    requests_data = pd.read_csv("data/requests.csv")
    device_data = pd.read_csv("data/device_info.csv")

    # Joining datasets
    df = pd.merge(requests_data, device_data, on='accountid', how='left')
    return df

In [6]:
def clean_data(df):
    df.drop_duplicates(inplace=True)
    df.drop(['count_feat_16','count_feat_17'], axis=1, inplace=True)
    cols_dtype_clean = ['anomaly_feat_0', 'anomaly_feat_1', 'anomaly_feat_2', 'interaction_feat_0']
    df[cols_dtype_clean] = df[cols_dtype_clean].astype('bool')

    df.loc[df['count_feat_13']==12130802.0, 'count_feat_13'] = np.nan
    df.loc[df['count_feat_14']==12130802.0, 'count_feat_14'] = np.nan
    df.loc[df['count_feat_15']==12130802.0, 'count_feat_15'] = np.nan
    
    # Encode boolean values to integers
    bool_cols = df.select_dtypes(include='bool').columns
    df[bool_cols] = df[bool_cols].replace({True:1, False:0})
    return df

In [7]:
def handle_nulls(df, value=0):
    # Distribution of records containing null with respect to target variable. 
    # Since most are normal transactions, we can impute with median (most features have outliers).
    
    float_cols = df.select_dtypes(include='float').columns
    col_median = df[float_cols].median()
    
    if value!=0:
        df[float_cols] = df[float_cols].fillna(value)
    else:
        df[float_cols] = df[float_cols].fillna(col_median)
    
    bool_cols = df.select_dtypes(include='bool').columns
    col_mode = df[bool_cols].mode()
    df[bool_cols] = df[bool_cols].fillna(col_mode)
    
    # Encode null device features as a separate category
    df['device_feat_1'].fillna('unknown', inplace=True)
    df['device_feat_2'].fillna('unknown', inplace=True)

    return df

In [8]:
from sklearn.feature_selection import SelectKBest
def feature_selection(X, y, choice='drop'):
    if choice=='drop':
        df_k = X.drop(['count_feat_3', 'count_feat_8', 'count_feat_12', 
                      'count_feat_11', 'count_feat_14', 'count_feat_15', 
                      'anomaly_feat_2', 'interaction_feat_1'], axis=1)
        
    elif choice=='top-k':
        clf = SelectKBest(10)
        df_k = clf.fit_transform(X, y)
        print(df_k.shape)
    return df_k

In [65]:
from category_encoders import BinaryEncoder, TargetEncoder
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler

def encode_categorical_data(encoder_df, df):
    # Encoding categorial variables using binary encoder
    print(f"Input df : {df.shape}")
    encoder = TargetEncoder(type)
#     encoder=BinaryEncoder(cols=['device_feat_1', 'device_feat_2'], return_df=True, drop_invariant=False).fit(encoder_df[['device_feat_1', 'device_feat_2']])
    encoded_df = encoder.transform(df)
    print(f"Encoder df : {encoded_df.shape}")
    dff = pd.concat([df, encoded_df], axis=1).drop(['device_feat_1', 'device_feat_2'], axis=1)
    print(f"Output df : {dff.shape}")
    return dff

def scale_data(df, scaler='std'):
    # Scale numerical features
    if scaler=='minmax':
        scaler = MinMaxScaler()
    elif scaler=='robust':
        scaler = RobustScaler()
    else:
        scaler=StandardScaler()
        
    df_num_cols = df.select_dtypes(include='float')
    scaled_df = scaler.fit_transform(df_num_cols)
    scaled_df = pd.DataFrame(scaled_df, columns=df_num_cols.columns)
    dff = pd.concat([scaled_df.reset_index(drop=True), df.select_dtypes(exclude='float').reset_index(drop=True)], axis=1)
#     print(df.select_dtypes(exclude='float').shape, dff.shape)
    return dff

In [66]:
from sklearn.model_selection import train_test_split

def make_dataset(scale=True, scaler='std', encode=True, select_features=False, sel_choice='top-k', num_null_value=0):
    data = read_data()
    df = clean_data(data)
   
    X = df.drop(['timestamp', 'accountid', 'is_attack'], axis=1)
    y = df['is_attack']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y)

    X_train_f = handle_nulls(X_train, num_null_value)  
    X_test_f = handle_nulls(X_test, num_null_value)
   
    
    if scale:
        scaled_X_train = scale_data(X_train_f, scaler=scaler)
        X_train_f = scaled_X_train
        scaled_X_test = scale_data(X_test_f, scaler=scaler)
        X_test_f = scaled_X_test
#         print("Scale: \n")
#         print(X_train_f.columns, X_train_f.shape)
#         print(X_test_f.columns, X_test_f.shape)
    
    if encode:
        encoded_X_train = encode_categorical_data(X_train_f, X_train_f)
        encoded_X_test = encode_categorical_data(X_train_f, X_test_f)
        X_train_f = encoded_X_train
        X_test_f = encoded_X_test
#         print("Encoding: \n")
#         print(X_train_f.columns, X_train_f.shape)
#         print(X_test_f.columns, X_test_f.shape)
    
    if select_features:
        X_train_f_k = feature_selection(X_train_f, y_train, choice=sel_choice)
        X_train_f = X_train_f_k
        X_test_f_k = feature_selection(X_test_f, y_test, choice=sel_choice)
        X_test_f = X_test_f_k
#         print("Feature Selection: \n")
#         print(X_train_f.columns, X_train_f.shape)
#         print(X_test_f.columns, X_test_f.shape)
        
    return X_train_f, X_test_f, y_train, y_test

In [67]:
# Sampling on X_train

In [68]:
# data = read_data()
# df = clean_data(data)

# X = df.drop(['timestamp', 'accountid', 'is_attack'], axis=1)
# y = df['is_attack']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y)


In [69]:
#Make different datasets

X_train, X_test, y_train, y_test = make_dataset(scale=True, encode=True, select_features=False, num_null_value=-999)
# X_train, X_test, y_train, y_test = make_dataset(scale=True, encode=True, select_features=False, num_null_value=0)
# X_train, X_test, y_train, y_test = make_dataset(scale=False, encode=True, select_features=False)
# X_train, X_test, y_train, y_test = make_dataset(scale=True, encode=True, select_features=True, sel_choice='top-k', X3_train, y3_train)
# X_train, X_test, y_train, y_test = make_dataset(scale=True, encode=True, select_features=True, sel_choice='drop')

Input df : (22479, 26)


ValueError: Unexpected input dimension 26, expected 2

In [63]:
X_train

Unnamed: 0,count_feat_0,count_feat_1,count_feat_2,count_feat_3,count_feat_4,count_feat_5,count_feat_6,count_feat_7,count_feat_8,count_feat_9,...,device_feat_2_2,device_feat_2_3,device_feat_2_4,device_feat_2_5,device_feat_2_6,device_feat_2_7,device_feat_2_8,device_feat_2_9,device_feat_2_10,device_feat_2_11
0,-0.136837,-0.031884,0.026630,-0.139352,0.022915,-0.142626,-0.977321,-0.988916,-0.184776,-0.012695,...,0,0,0,0,0,0,0,0,0,1
1,-0.136837,-0.031884,0.026630,-0.139352,0.022915,-0.134759,0.072952,0.158272,-0.184776,-0.012695,...,0,0,0,0,0,0,0,0,1,0
2,-0.136837,-0.031884,0.026630,-0.139352,0.022915,-0.142626,-0.642611,-0.652457,-0.184776,-0.012695,...,0,0,0,0,0,0,0,0,0,1
3,-0.136837,-0.031884,0.026630,-0.139352,0.022915,-0.142626,2.187562,1.930889,-0.184776,-0.012695,...,0,0,0,0,0,0,0,0,1,1
4,-0.136837,-0.031884,0.026630,-0.139352,0.022915,-0.142626,-0.391026,-0.357900,-0.184776,0.019550,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22474,8.121701,-0.031884,0.026630,8.032805,0.022915,-0.134759,-0.860378,-1.010643,11.239682,0.019550,...,0,0,0,0,0,0,0,0,0,1
22475,-0.114989,-0.031884,0.026630,-0.139352,0.022915,-0.099356,0.458547,0.318121,-0.160339,-0.012695,...,1,1,0,1,0,0,1,0,1,0
22476,-0.136837,-0.031884,0.164043,-0.139352,0.057324,-0.142626,-0.906365,-0.820066,-0.184776,0.051795,...,0,0,0,0,0,0,0,0,0,1
22477,-0.136837,0.116659,0.026630,-0.139352,0.022915,-0.142626,-0.805541,-0.811375,-0.160339,-0.012695,...,0,0,1,1,0,0,0,1,1,1


In [64]:
X_test

Unnamed: 0,count_feat_0,count_feat_1,count_feat_2,count_feat_3,count_feat_4,count_feat_5,count_feat_6,count_feat_7,count_feat_8,count_feat_9,...,device_feat_2_2,device_feat_2_3,device_feat_2_4,device_feat_2_5,device_feat_2_6,device_feat_2_7,device_feat_2_8,device_feat_2_9,device_feat_2_10,device_feat_2_11
0,-0.159998,-0.345425,0.026392,-0.148632,0.021477,-0.149621,1.396276,1.195964,-0.197695,-0.009000,...,0,0,0,0,0,0,0,0,0,1
1,-0.159998,0.757992,0.026392,-0.129268,0.021477,-0.082339,-0.669505,-0.553682,-0.140803,-0.009000,...,0,0,0,0,0,0,0,0,1,0
2,-0.159998,-0.345425,0.026392,-0.148632,0.021477,0.154918,-0.809924,-0.823356,-0.174938,0.019712,...,0,0,0,0,0,0,0,0,0,1
3,-0.159998,0.757992,0.026392,-0.129268,0.021477,-0.142539,-0.496463,-0.263667,-0.174938,-0.009000,...,0,0,0,0,0,0,0,0,1,1
4,-0.159998,-0.345425,0.026392,-0.148632,0.021477,-0.149621,-0.541536,-0.596213,-0.197695,-0.009000,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22474,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,1
22475,,,,,,,,,,,...,1,1,0,1,0,0,1,0,1,0
22476,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,1
22477,,,,,,,,,,,...,0,0,1,1,0,0,0,1,1,1


# Handling mislabelling

Anomaly detection - knn, svm, isolation forest

Semi supervised learning to improve labels 

In [74]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from sklearn.semi_supervised import LabelPropagation
# from sklearn.metrics import classification_report

# # Load and preprocess the data
# X = df.drop(['accountid', 'timestamp', 'is_attack', 'device_feat_1', 'device_feat_2'], axis=1)
# y = df['is_attack']

# # split into train and test
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, stratify=y, shuffle=True)

# # split train into labeled and unlabeled
# X_train_lab, X_test_unlab, y_train_lab, y_test_unlab = train_test_split(X_train, y_train, test_size=0.5, stratify=y_train)

# # Standardize features
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train_lab)
# X_valid_scaled = scaler.transform(X_test_unlab)
# X_unlabeled_scaled = scaler.transform(X_test)

# # Semi-Supervised Learning with Label Propagation
# lp_model = LabelPropagation(kernel='knn', n_neighbors=10)
# lp_model.fit(X_train_scaled, y_train)

# # Predict on the validation set
# y_pred_valid = lp_model.predict(X_valid_scaled)

# # Evaluate the model
# print("Classification Report for Labeled Data:")
# print(classification_report(y_valid, y_pred_valid))

# # Predict on the unlabeled data
# y_pred_unlabeled = lp_model.predict(X_unlabeled_scaled)

# # Assign pseudo-labels to the unlabeled data
# unlabeled_data['pseudo_label'] = y_pred_unlabeled

# # Combine pseudo-labeled data with labeled data
# combined_data = pd.concat([labeled_data, unlabeled_data], ignore_index=True)

# # Retrain the model on the combined dataset (including pseudo-labeled data)
# X_combined_scaled = scaler.transform(combined_data.drop(['accounted', 'is_attack', 'pseudo_label'], axis=1))
# y_combined = combined_data['is_attack']
# lp_model.fit(X_combined_scaled, y_combined)

In [70]:
# # Handling outliers 
# # Try isolation forest to identify outliers
# from sklearn.model_selection import train_test_split
# X = dff.drop(['timestamp', 'accountid', 'is_attack'], axis=1)
# y = dff['is_attack']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [71]:
# X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [72]:
# from sklearn.preprocessing import StandardScaler
# from sklearn.decomposition import PCA

# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# pca = PCA(n_components=20)
# X_pca = pca.fit_transform(X_scaled)

In [73]:
# # Calculate the explained variance ratio
# explained_variance_ratio = pca.explained_variance_ratio_

# # Calculate the cumulative explained variance
# cumulative_variance = np.cumsum(explained_variance_ratio)


In [74]:
# cumulative_variance

In [75]:
# # Plot the explained variance
# plt.figure(figsize=(10, 6))
# plt.plot(np.arange(1, len(cumulative_variance) + 1), cumulative_variance, marker='o')
# plt.title('Explained Variance by Number of Principal Components')
# plt.xlabel('Number of Principal Components')
# plt.ylabel('Cumulative Explained Variance')
# plt.grid(True)
# plt.show()

In [76]:
# plt.scatter(X_pca[:,0], X_pca[:,1])
# plt.xlabel('Principal Component 1')
# plt.ylabel('Principal Component 2')
# plt.title('PCA')
# plt.show()

In [77]:
# from sklearn.ensemble import IsolationForest

# clf = IsolationForest()
# clf.fit(X)
# dff['is_anomaly'] = clf.predict(X)
# anomaly_labels = clf.predict(X)

In [78]:
# dff.is_anomaly.value_counts()

In [79]:
# len(anomaly_labels[anomaly_labels==-1])

In [80]:
# # Visualize anomalies using scatter plot
# plt.scatter(X_pca[anomaly_labels == 1][:, 0], X_pca[anomaly_labels == 1][:, 1], color='green', label='Normal')
# plt.scatter(X_pca[anomaly_labels == -1][:, 0], X_pca[anomaly_labels == -1][:, 1], color='red', label='Anomaly')

In [81]:
# from sklearn.svm import OneClassSVM
# # Alternatively, use One-Class SVM
# model_one_class_svm = OneClassSVM(nu=0.01)
# model_one_class_svm.fit(X_pca)
# anomaly_labels_svm = model_one_class_svm.predict(X_pca)

In [82]:
# # Visualize anomalies using scatter plot
# plt.scatter(X_pca[anomaly_labels_svm == 1][:, 0], X_pca[anomaly_labels_svm == 1][:, 1], color='red', label='Anomaly')
# plt.scatter(X_pca[anomaly_labels_svm == -1][:, 0], X_pca[anomaly_labels_svm == -1][:, 1], color='green', label='Normal')
# plt.title('PCA + Isolation Forest Anomaly Detection')
# plt.xlabel('PC1')
# plt.ylabel('PC2')
# plt.legend()
# plt.show()

In [83]:
# sns.scatterplot(dff, x='count_feat_1', y='count_feat_2', hue='is_anomaly')