In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the output of the previous feature-engineering step; every later cell
# works on (and mutates) this `df`.
df = pd.read_csv(filepath_or_buffer='../data/dataset_feature_engineering.csv')

#### Data Transformations

In [None]:
# Lower-case the text columns listed below so that the same value written
# with different capitalisation ends up identical.
text_columns = [
    'merchant', 'category', 'first', 'last', 'gender',
    'state', 'street', 'city', 'job',
]
for column in text_columns:
    df[column] = df[column].str.lower()

In [None]:
# Parse 'dob' with an explicit 'YYYY-MM-DD' format.
birth_dates = pd.to_datetime(df['dob'], format='%Y-%m-%d')
df['dob'] = birth_dates

# Convert 'unix_time' from a number of seconds (unit='s') into timestamps.
# Later cells rely on this column being datetime-like (they use `.dt`).
transaction_times = pd.to_datetime(df['unix_time'], unit='s')
df['unix_time'] = transaction_times

In [None]:
# Persist the cleaned frame (lower-cased text, parsed dates) as CSV.
# index=False keeps the row index out of the file.
df.to_csv(path_or_buf='../data/dataset_cleaned.csv', index=False)

In [7]:
# Display (n_rows, n_columns) as a quick sanity check before the
# feature-engineering cells start adding columns.
df.shape

(1852394, 35)

### Feature Engineering

In [8]:
# Order the rows by card and then by transaction time, so that "previous row
# within a card" means "previous transaction of that card".
df = df.sort_values(by=['cc_num', 'unix_time'])

# Seconds elapsed since the previous transaction of the same card.
# The first transaction of each card has no predecessor and is set to 0.
seconds_since_previous = df.groupby('cc_num')['unix_time'].diff().dt.total_seconds()
df['time_diff_seconds'] = seconds_since_previous.fillna(0)

# Timestamp truncated to the start of its hour; used as an hourly bucket key.
df['hour_window'] = df['unix_time'].dt.floor('h')

# Number of transactions the card made inside the same hourly bucket.
df['trans_per_hour'] = df.groupby(['cc_num', 'hour_window'])['trans_num'].transform('count')

# Share of the card's transactions that have the same value in the 'hour'
# column (presumably hour of day, created upstream - confirm).
card_hour_count = df.groupby(['cc_num', 'hour'])['trans_num'].transform('count')
card_total_count = df.groupby('cc_num')['trans_num'].transform('count')
df['hour_trans_ratio'] = card_hour_count / card_total_count

In [9]:
# Flag transactions whose client-to-merchant distance is above 100
# (assumes 'dist_between_client_and_merch' is expressed in km - confirm upstream).
df['unusual_distance'] = (df['dist_between_client_and_merch'] > 100).astype(int)

# Change in client-to-merchant distance relative to the previous row of the
# same card. This is only "the previous transaction" while the frame is still
# sorted by card and time. The first row of each card is set to 0.
df['distance_diff'] = df.groupby('cc_num')['dist_between_client_and_merch'].diff().fillna(0)

# Distance change divided by the elapsed time in hours. A zero time gap gives
# +/-inf (or NaN for 0/0); both are mapped to 0. The value is signed because
# 'distance_diff' is signed.
df['velocity_km_h'] = (df['distance_diff'] / (df['time_diff_seconds'] / 3600)).replace([float('inf'), -float('inf')], 0).fillna(0)

# Per-merchant mean and standard deviation of the client-to-merchant distance.
df['mean_dist_merchant'] = df.groupby('merchant')['dist_between_client_and_merch'].transform('mean')
df['std_dist_merchant'] = df.groupby('merchant')['dist_between_client_and_merch'].transform('std')

# z-score of each transaction's distance within its merchant. The divisor is
# forced to 1 when the standard deviation is 0 and also when it is NaN (a
# merchant with a single row has no sample standard deviation), so the score
# is 0 instead of inf/NaN in those cases.
merchant_dist_std = df['std_dist_merchant'].replace(0, 1).fillna(1)
df['dist_z_score'] = (df['dist_between_client_and_merch'] - df['mean_dist_merchant']) / merchant_dist_std

In [None]:
# Transaction amount relative to 'amt_month' / 'amt_year' (aggregates created
# upstream - presumably monthly/yearly spend; confirm). A zero denominator is
# replaced by 1 to avoid dividing by zero.
df['amt_month_ratio'] = df['amt'] / df['amt_month'].replace(0, 1)
df['amt_year_ratio'] = df['amt'] / df['amt_year'].replace(0, 1)

# Mean and standard deviation of the amount per category (across all cards).
df['mean_amt_category'] = df.groupby('category')['amt'].transform('mean')
df['std_amt_category'] = df.groupby('category')['amt'].transform('std')

# z-score of the amount within its category. The divisor is forced to 1 when
# the standard deviation is 0 and also when it is NaN (a category with a
# single row has no sample standard deviation), so the score stays finite.
category_amt_std = df['std_amt_category'].replace(0, 1).fillna(1)
df['amt_z_score'] = (df['amt'] - df['mean_amt_category']) / category_amt_std

# "High amount" means above the 90th percentile of all amounts in the frame.
amt_threshold = df['amt'].quantile(0.9)
# 1 when the row is flagged as a first purchase at the merchant AND the amount
# is above that threshold; such a combination may point to fraud.
df['high_amt_first_time'] = (df['first_time_at_merchant'] & (df['amt'] > amt_threshold)).astype(int)

In [None]:
# Per-merchant mean and standard deviation of 'times_shopped_at_merchant_day'.
df['mean_times_day_merchant'] = df.groupby('merchant')['times_shopped_at_merchant_day'].transform('mean')
df['std_times_day_merchant'] = df.groupby('merchant')['times_shopped_at_merchant_day'].transform('std')

# z-score of the daily shopping count within its merchant. The divisor is
# forced to 1 when the standard deviation is 0 and also when it is NaN (a
# merchant with a single row has no sample standard deviation), so the score
# is 0 instead of inf/NaN in those cases.
# High values mark purchases at a merchant that occurred more often than
# usual, a pattern often associated with fraudulent activity.
merchant_times_std = df['std_times_day_merchant'].replace(0, 1).fillna(1)
df['times_day_z_score'] = (df['times_shopped_at_merchant_day'] - df['mean_times_day_merchant']) / merchant_times_std

In [12]:
# Both features below are computed per (merchant, hourly bucket) group.
by_merchant_hour = df.groupby(['merchant', 'hour_window'])
distinct_cards = by_merchant_hour['cc_num'].transform('nunique')
amount_spread = by_merchant_hour['amt'].transform('std')

# Number of distinct cards seen at the merchant within the hour. An unusually
# high count could indicate a coordinated attack during that window.
df['unique_cards_per_hour'] = distinct_cards

# Spread of the amounts at the merchant within the hour. Despite the column
# name, this is the standard deviation, not the variance. Groups where the
# standard deviation is undefined (e.g. a single transaction) get 0.
# A very high value can point to suspiciously large purchases; a very low one
# can point to card testing (many small, similar charges on stolen cards).
df['amt_variance_hour'] = amount_spread.fillna(0)

time_diff_seconds: Diferencia de tiempo, en segundos, entre transacciones consecutivas de la misma tarjeta
- Indica qué tan seguido gasta el cliente entre transacciones

hour_window: Ventana horaria de la transacción (marca de tiempo redondeada hacia abajo a la hora)
trans_per_hour: Cantidad de transacciones por hora que se hacen con la tarjeta
- Indica cuántas transacciones son hechas por hora por cliente

hour_trans_ratio: Ratio de transacciones por hora de cada tarjeta
- Nos indica la continuidad de las transacciones por hora por cliente

unusual_distance: Indica si la distancia entre el cliente y el vendedor es inusual (mayor a 100 km)
- Nos ayuda a saber qué tan lejos está la transacción hecha por el usuario

distance_diff: Diferencia de la distancia cliente-vendedor entre transacciones consecutivas de la misma tarjeta
velocity_km_h: Velocidad requerida entre transacciones consecutivas
- Indica qué tan rápido se tuvo que mover el cliente para realizar la transacción

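mean_dist_merchant: Distancia media entre el cliente y el vendedor, calculada por vendedor
std_dist_merchant: Desviación estándar de la distancia entre el cliente y el vendedor, calculada por vendedor
dist_z_score: Puntuación z de la distancia de la transacción respecto a la media y la desviación estándar del vendedor
- Indica si existe alguna distancia inusual entre el cliente y el vendedor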

amt_month_ratio: Ratio entre el monto de la transacción y la cantidad mensual que gasta el cliente en transacciones
amt_year_ratio: Ratio entre el monto de la transacción y la cantidad anual que gasta el cliente en transacciones
- Identifica si existen cantidades que son más grandes o más pequeñas de lo acostumbrado por el cliente

mean_amt_category: Monto medio de las transacciones en cada categoría
std_amt_category: Desviación estándar del monto de las transacciones en cada categoría
amt_z_score: Puntuación z del monto de la transacción respecto a la media y la desviación estándar de su categoría
- Identifica si existen montos más altos o más bajos de lo habitual en la categoría

high_amt_first_time: Primera transacción de alto monto en un vendedor
- Identifica si el usuario hizo por primera vez una compra a un vendedor con un monto alto (por encima del percentil 90 de todos los montos)

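mean_times_day_merchant: Media de la cantidad de veces por día que el cliente compra al vendedor, calculada por vendedor
std_times_day_merchant: Desviación estándar de la cantidad de veces por día que el cliente compra al vendedor, calculada por vendedor
times_day_z_score: Puntuación z de la cantidad de compras diarias respecto a la media y la desviación estándar del vendedor
- Nos ayuda a identificar si el usuario realizó transacciones más frecuentes de lo normal a un vendedor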

unique_cards_per_hour: Cantidad de tarjetas de crédito diferentes utilizadas en cada vendedor por ventana horaria
- Nos ayuda a identificar si un vendedor tiene una cantidad de tarjetas utilizadas en su comercio inusualmente grande respecto a lo normal.

amt_variance_hour: Desviación estándar de los montos gastados en cada vendedor dentro de cada ventana horaria.
- Nos ayuda a identificar si los montos de las compras son inusualmente dispersos o inusualmente uniformes dentro de una ventana de tiempo en específico.

In [13]:
# Display (n_rows, n_columns) again to confirm how many feature columns the
# feature-engineering cells added.
df.shape

(1852394, 57)

In [None]:
# Persist the frame with all engineered features as CSV.
# index=False keeps the row index out of the file.
df.to_csv(path_or_buf='../data/data_engineered.csv', index=False)