In [1]:
import os
import dask.dataframe as dd
import datetime as dt
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the credit-card transactions table.
cct_path = "/home/onyxia/work/cct.parquet"
df_cct = pd.read_parquet(cct_path)


In [3]:
# Basic cleaning.

# Convert "Amount" from "$12.34"-style strings to floats. The dtype guard makes
# the cell safe to re-run (the .str accessor fails on a column that is already
# numeric), and regex=False guarantees "$" is treated as a literal character
# rather than the end-of-string regex anchor, whatever the pandas version.
if not pd.api.types.is_numeric_dtype(df_cct["Amount"]):
    df_cct["Amount"] = df_cct["Amount"].str.replace("$", "", regex=False).astype(float)

# Encode the label as 1 ("Yes") / 0 ("No"); any other value is left untouched.
# The result is assigned back to the frame instead of calling
# mask(..., inplace=True) on a column selection: that form is chained
# assignment and does not update df_cct when Copy-on-Write is enabled.
is_fraud = df_cct["Is Fraud?"]
df_cct["Is Fraud?"] = is_fraud.mask(is_fraud == "Yes", 1).mask(is_fraud == "No", 0)

In [4]:
# Transactions labelled as fraud.
df_fraud = df_cct.loc[df_cct["Is Fraud?"].eq(1)]

In [5]:
# Transactions labelled as not fraud.
df_not_fraud = df_cct.loc[df_cct["Is Fraud?"].eq(0)]

## I. Time intervals between transactions

In [6]:
# Build the 'timestamp', 'delta_t_s' and 'delta_t_s_card' variables.
def time(dataframe):
    """Derive time features and return the transactions in per-user chronological order.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain 'User', 'Card', 'Year', 'Month', 'Day', 'Time' (an
        "HH:MM" string) and every pass-through column listed in the body.
        The input frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ('User', timestamp) with a fresh RangeIndex,
        holding the pass-through columns plus:

        * 'date'           -- calendar date of the transaction (datetime64)
        * 'timestamp'      -- date + hour + minute (datetime64)
        * 'Hours'          -- hour of day as int
        * 'delta_t_s'      -- seconds since the same user's previous
                              transaction (0 for each user's first one)
        * 'delta_t_s_card' -- seconds since the previous transaction on the
                              same (user, card) pair (0 for each card's first one)
    """
    # Split "HH:MM" into integer hour and minute components.
    clock = dataframe["Time"].str.split(":", n=1, expand=True)
    hours = clock[0].astype(int)
    minutes = clock[1].astype(int)

    # pd.to_datetime assembles datetimes from component columns; no `format`
    # is passed because it does not apply to this assembly path.
    parts = dataframe[["Year", "Month", "Day"]].assign(Hours=hours, Minutes=minutes)
    timestamp = pd.to_datetime(parts)
    date = pd.to_datetime(parts[["Year", "Month", "Day"]])

    passthrough = ['User', 'Card', 'Amount', 'Use Chip', 'Merchant Name',
                   'Merchant City', 'Merchant State', 'Zip', 'MCC',
                   'Errors?', 'Is Fraud?']
    # Work on a copy so the caller's frame is left untouched.
    enriched = dataframe[passthrough].copy()
    enriched["Hours"] = hours
    enriched["date"] = date
    enriched["timestamp"] = timestamp

    # Order each user's transactions chronologically (equivalent to sorting
    # on User, Year, Month, Day, Hours, Minutes).
    enriched = enriched.sort_values(["User", "timestamp"]).reset_index(drop=True)

    # Gaps are computed *within* each group. A plain shift() over the whole
    # frame would subtract the last transaction of one user/card from the
    # first transaction of the next, producing spurious (often negative) gaps.
    stamps = enriched["timestamp"]
    gap_user = stamps - enriched.groupby("User")["timestamp"].shift()
    gap_card = stamps - enriched.groupby(["User", "Card"])["timestamp"].shift()

    # The first transaction of each group has no predecessor (NaT) -> 0 seconds.
    enriched["delta_t_s"] = gap_user.dt.total_seconds().fillna(0).astype(int)
    enriched["delta_t_s_card"] = gap_card.dt.total_seconds().fillna(0).astype(int)

    # Select and order the output columns.
    return enriched[['User', 'Card', 'date', 'timestamp', 'Hours', 'Amount', 'Use Chip',
                     'Merchant Name', 'Merchant City', 'Merchant State', 'Zip', 'MCC',
                     'Errors?', 'delta_t_s', 'delta_t_s_card', 'Is Fraud?']]



In [7]:
# Replace the raw table with the time-feature table returned by time().
# NOTE: this cell cannot be re-run on its own output: time() reads the 'Time'
# column, which is not among the columns of the frame it returns.
df_cct = time(df_cct)

## II. Ratio of transaction amount to daily income

In [8]:
import numpy as np

In [9]:
# Per-user daily income, then the ratio of each transaction amount to it.
df_users = pd.read_csv("/home/onyxia/work/sd254_users.csv")
# regex=False: "$" must be removed as a literal character, not read as a regex anchor.
df_users['Yearly Income - Person'] = df_users['Yearly Income - Person'].str.replace("$", "", regex=False)
# The builtin int replaces np.int, which was removed in NumPy 1.24.
df_users["day_income"] = df_users['Yearly Income - Person'].astype(int) / 365
# The row position in the users file becomes the 'User' key matched against df_cct.
df_users = df_users.reset_index().rename(columns={"index": "User"})
# Look up each transaction's daily income by user id. Unlike dividing by the
# column of a merged copy, this does not depend on df_cct having a default
# RangeIndex and does not materialise a second full-size table.
day_income_by_user = df_users.set_index("User")["day_income"]
df_cct["amt/day_income"] = df_cct["Amount"] / df_cct["User"].map(day_income_by_user)


## III. Daily amount

In [10]:
def calc_rolling_sum(dataf, column=None, setting='1D'):
    """Rolling sum of ``column`` computed separately for each 'User'.

    ``setting`` is handed to ``Series.rolling`` as the window and
    ``min_periods=1`` is used, so a window with a single row still yields a
    value. The result is a Series aligned with the rows of ``dataf``.
    """
    def _windowed_total(values):
        return values.rolling(setting, min_periods=1).sum()

    by_user = dataf.groupby('User')
    return by_user[column].transform(_windowed_total)

In [11]:
# Rolling 1-day sum of 'Amount' per user, computed over the 'date' index.
daily_totals = calc_rolling_sum(df_cct.set_index('date'), column='Amount')
df_cct["daily_amount"] = daily_totals.values

## IV. Daily number of declines per card

In [12]:
def calc_rolling_decline(dataf, column=None, setting='1D'):
    """Rolling count of non-null ``column`` values per ('User', 'Card') pair.

    ``setting`` is the window passed to ``Series.rolling``; ``min_periods=1``
    keeps a value for every row. Returns a Series aligned with ``dataf``.
    """
    def _windowed_count(values):
        window = values.rolling(setting, min_periods=1)
        return window.count()

    per_card = dataf.groupby(['User', "Card"])[column]
    return per_card.transform(_windowed_count)

In [13]:
# Rolling 1-day count of non-null 'Errors?' entries per (user, card), over the 'date' index.
daily_declines = calc_rolling_decline(df_cct.set_index('date'), column="Errors?")
df_cct["nb_daily_declines_card"] = daily_declines.values

## V. Ratio of the number of transactions per hour to the average number of transactions per hour over the last 30 days

In [14]:
def calc_rolling_nbt_h(dataf, column=None, setting='1h'):
    """Rolling count of non-null ``column`` values per ('User', 'Card') pair.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Frame with 'User' and 'Card' columns; the callers in this notebook
        pass it indexed by the transaction timestamp.
    column : str
        Column whose non-null values are counted.
    setting : str, default '1h'
        Window passed to ``Series.rolling``. The default is spelled with the
        lowercase hour alias: the uppercase 'H' alias is deprecated since
        pandas 2.2, and '1h' denotes the same one-hour window.

    Returns
    -------
    pandas.Series
        One count per row of ``dataf``, in the same order.
    """
    return (dataf
            .groupby(['User', 'Card'])[column]
            .transform(lambda d: d.rolling(setting, min_periods=1).count()))

In [15]:
# Rolling 1-hour transaction count per (user, card), over the 'timestamp' index.
hourly_counts = calc_rolling_nbt_h(df_cct.set_index('timestamp'), column= "Amount")
df_cct["hourly_nbt"] = hourly_counts.values

In [16]:
def calc_rolling_av_30d_nbt(dataf, column=None, setting='30D'):
    """Rolling mean of ``column`` per ('User', 'Card') pair.

    ``setting`` is the window passed to ``Series.rolling`` with
    ``min_periods=1``. Returns a Series aligned with the rows of ``dataf``.
    """
    def _windowed_mean(values):
        return values.rolling(setting, min_periods=1).mean()

    grouped = dataf.groupby(['User', 'Card'])
    selected = grouped[column]
    return selected.transform(_windowed_mean)

In [17]:
# Rolling 30-day mean of 'hourly_nbt' per (user, card), over the 'timestamp' index.
monthly_mean = calc_rolling_av_30d_nbt(df_cct.set_index('timestamp'), column= "hourly_nbt")
df_cct["last_30_days_av_hourly_nbt"] = monthly_mean.values

In [18]:
# Hourly transaction count relative to its rolling 30-day mean.
df_cct["hr_nbt/last_30d_av_hr_nbt"] = df_cct["hourly_nbt"].div(df_cct["last_30_days_av_hourly_nbt"])

## VI. Ratio of the number of transactions to the total amount of transactions over the last 3 days

In [19]:
def calc_rolling_3d_nbt(dataf, column=None, setting='3D'):
    """Rolling count of non-null ``column`` values per ('User', 'Card') pair.

    ``setting`` is the window passed to ``Series.rolling`` with
    ``min_periods=1``. Returns a Series aligned with the rows of ``dataf``.
    """
    card_keys = ['User', 'Card']

    def _count_in_window(values):
        return values.rolling(setting, min_periods=1).count()

    return dataf.groupby(card_keys)[column].transform(_count_in_window)

In [20]:
# Rolling 3-day transaction count per (user, card), over the 'date' index.
three_day_counts = calc_rolling_3d_nbt(df_cct.set_index('date'), column= "Amount")
df_cct["last_3d_nbt"] = three_day_counts.values

In [21]:
def calc_rolling_3d_sum(dataf, column=None, setting='3D'):
    """Rolling sum of ``column`` computed separately for each 'User'.

    ``setting`` is the window passed to ``Series.rolling`` with
    ``min_periods=1``. Returns a Series aligned with the rows of ``dataf``.
    """
    def _sum_in_window(values):
        window = values.rolling(setting, min_periods=1)
        return window.sum()

    user_column = dataf.groupby('User')[column]
    return user_column.transform(_sum_in_window)

In [22]:
# Rolling 3-day sum of 'Amount' per user, over the 'date' index.
three_day_totals = calc_rolling_3d_sum(df_cct.set_index('date'), column='Amount')
df_cct["last_3d_amt"] = three_day_totals.values

In [23]:
# Ratio of the 3-day transaction count to the 3-day amount total, then display the table.
df_cct["last_3d_nbt/amt"] = df_cct["last_3d_nbt"].div(df_cct["last_3d_amt"])
df_cct

Unnamed: 0,User,Card,date,timestamp,Hours,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,...,Is Fraud?,amt/day_income,daily_amount,nb_daily_declines_card,hourly_nbt,last_30_days_av_hourly_nbt,hr_nbt/last_30d_av_hr_nbt,last_3d_nbt,last_3d_amt,last_3d_nbt/amt
0,0,0,2002-09-01,2002-09-01 06:21:00,6,134.09,Swipe Transaction,3527213246127876953,La Verne,CA,...,0,0.819868,134.09,0.0,1.0,1.000000,1.000000,1.0,134.09,0.007458
1,0,0,2002-09-01,2002-09-01 06:42:00,6,38.48,Swipe Transaction,-727612092139916043,Monterey Park,CA,...,0,0.235279,172.57,0.0,2.0,1.500000,1.333333,2.0,172.57,0.011589
2,0,0,2002-09-02,2002-09-02 06:22:00,6,120.34,Swipe Transaction,-727612092139916043,Monterey Park,CA,...,0,0.735796,120.34,0.0,1.0,1.333333,0.750000,3.0,292.91,0.010242
3,0,0,2002-09-02,2002-09-02 17:45:00,17,128.95,Swipe Transaction,3414527459579106770,Monterey Park,CA,...,0,0.788441,249.29,0.0,1.0,1.250000,0.800000,4.0,421.86,0.009482
4,0,0,2002-09-03,2002-09-03 06:23:00,6,104.71,Swipe Transaction,5817218446178736267,La Verne,CA,...,0,0.640230,104.71,0.0,1.0,1.200000,0.833333,5.0,526.57,0.009495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24386895,1999,1,2020-02-27,2020-02-27 22:23:00,22,-54.00,Chip Transaction,-5162038175624867091,Merrimack,NH,...,0,-0.299049,261.71,0.0,2.0,1.241667,1.610738,17.0,951.36,0.017869
24386896,1999,1,2020-02-27,2020-02-27 22:24:00,22,54.00,Chip Transaction,-5162038175624867091,Merrimack,NH,...,0,0.299049,315.71,0.0,3.0,1.256198,2.388158,18.0,1005.36,0.017904
24386897,1999,1,2020-02-28,2020-02-28 07:43:00,7,59.15,Chip Transaction,2500998799892805156,Merrimack,NH,...,0,0.327569,59.15,0.0,1.0,1.256198,0.796053,14.0,630.77,0.022195
24386898,1999,1,2020-02-28,2020-02-28 20:10:00,20,43.12,Chip Transaction,2500998799892805156,Merrimack,NH,...,0,0.238796,102.27,0.0,1.0,1.258333,0.794702,15.0,673.89,0.022259


In [25]:
# Final features to be used for the ML models.
final_features = [
    'User', 'Card', 'Hours', 'Amount', 'Use Chip', 'Merchant State', 'MCC',
    'delta_t_s', 'delta_t_s_card', 'amt/day_income',
    'daily_amount', 'nb_daily_declines_card', 'hr_nbt/last_30d_av_hr_nbt',
    'last_3d_nbt/amt', 'Is Fraud?',
]
df_cct_final = df_cct[final_features]

In [26]:
# Display the final feature table.
df_cct_final

Unnamed: 0,User,Card,Hours,Amount,Use Chip,Merchant State,MCC,delta_t_s,delta_t_s_card,amt/day_income,daily_amount,nb_daily_declines_card,hr_nbt/last_30d_av_hr_nbt,last_3d_nbt/amt,Is Fraud?
0,0,0,6,134.09,Swipe Transaction,CA,5300,0,0,0.819868,134.09,0.0,1.000000,0.007458,0
1,0,0,6,38.48,Swipe Transaction,CA,5411,1260,1260,0.235279,172.57,0.0,1.333333,0.011589,0
2,0,0,6,120.34,Swipe Transaction,CA,5411,85200,85200,0.735796,120.34,0.0,0.750000,0.010242,0
3,0,0,17,128.95,Swipe Transaction,CA,5651,40980,40980,0.788441,249.29,0.0,0.800000,0.009482,0
4,0,0,6,104.71,Swipe Transaction,CA,5912,45480,45480,0.640230,104.71,0.0,0.833333,0.009495,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24386895,1999,1,22,-54.00,Chip Transaction,NH,5541,300,300,-0.299049,261.71,0.0,1.610738,0.017869,0
24386896,1999,1,22,54.00,Chip Transaction,NH,5541,60,60,0.299049,315.71,0.0,2.388158,0.017904,0
24386897,1999,1,7,59.15,Chip Transaction,NH,4121,33540,33540,0.327569,59.15,0.0,0.796053,0.022195,0
24386898,1999,1,20,43.12,Chip Transaction,NH,4121,44820,44820,0.238796,102.27,0.0,0.794702,0.022259,0
