In [4]:
import os
import dask.dataframe as dd
import datetime as dt
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [5]:
# Load the transaction data
df_cct = pd.read_parquet("/home/onyxia/work/cct.parquet")


In [6]:
# Basic cleaning

# Strip the currency symbol as a literal character: "$" is the end-of-string
# anchor in a regular expression, and the default value of `regex` differs
# between pandas versions, so the intent is made explicit here.
df_cct["Amount"] = df_cct["Amount"].str.replace("$", "", regex=False).astype(float)

# Recode the label ("Yes" -> 1, "No" -> 0; any other value is left untouched).
# The result is assigned back to the frame: calling `mask(..., inplace=True)`
# on a selected column is chained assignment, which does not update `df_cct`
# when pandas Copy-on-Write is active.
df_cct["Is Fraud?"] = (
    df_cct["Is Fraud?"]
    .mask(df_cct["Is Fraud?"] == "Yes", 1)
    .mask(df_cct["Is Fraud?"] == "No", 0)
)

In [7]:
# Rows whose label was recoded to 1 (fraudulent transactions)
df_fraud = df_cct.loc[df_cct["Is Fraud?"].eq(1)]

In [8]:
# Rows whose label was recoded to 0 (non-fraudulent transactions)
df_not_fraud = df_cct.loc[df_cct["Is Fraud?"].eq(0)]

## I. Time intervals between transactions

In [9]:
# Build the 'date', 'timestamp', 'Hours', 'delta_t_s' and 'delta_t_s_card' columns
def time(dataframe):
    """Add time-based features and return the transactions sorted by user and time.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``User``, ``Card``, ``Year``, ``Month``, ``Day``, ``Time``
        (a string split on ``":"`` into hours and minutes) plus every column
        listed in the final selection below. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index, ordered by ``User`` and then
        chronologically, holding the selected columns and:

        * ``date``           -- datetime built from Year/Month/Day
        * ``timestamp``      -- datetime built from Year/Month/Day/Hours/Minutes
        * ``delta_t_s``      -- seconds since the same user's previous transaction
        * ``delta_t_s_card`` -- seconds since the previous transaction of the
          same user *and* card

        Both deltas are 0 for the first transaction of their group.
    """
    # Split "HH:MM" into two integer columns without touching the caller's frame.
    clock = dataframe["Time"].str.split(":", n=1, expand=True)
    frame = dataframe.assign(
        Hours=clock[0].astype(int),
        Minutes=clock[1].astype(int),
    )

    # The datetimes are assembled from the component columns, so no `format`
    # string is needed.
    frame["date"] = pd.to_datetime(frame[["Year", "Month", "Day"]])
    frame["timestamp"] = pd.to_datetime(
        frame[["Year", "Month", "Day", "Hours", "Minutes"]]
    )

    # Order by user, then chronologically, before computing any interval.
    frame = frame.sort_values(
        ["User", "Year", "Month", "Day", "Hours", "Minutes"]
    ).reset_index(drop=True)

    # Shift *within* each group so that the first transaction of a user (or of
    # a card) is never compared with the last transaction of the previous
    # user/card; a plain `shift()` over the whole column leaks across groups.
    stamps = frame["timestamp"]
    since_prev_user = stamps - frame.groupby("User")["timestamp"].shift()
    since_prev_card = stamps - frame.groupby(["User", "Card"])["timestamp"].shift()

    # First transaction of each group has no predecessor (NaT) -> 0 seconds.
    frame["delta_t_s"] = since_prev_user.dt.total_seconds().fillna(0).astype(int)
    frame["delta_t_s_card"] = since_prev_card.dt.total_seconds().fillna(0).astype(int)

    # Select and order the output columns.
    return frame[['User', 'Card', 'date', 'timestamp', 'Hours', 'Amount', 'Use Chip',
                  'Merchant Name', 'Merchant City', 'Merchant State', 'Zip', 'MCC',
                  'Errors?', 'delta_t_s', 'delta_t_s_card', 'Is Fraud?']]



In [10]:
# Replace the cleaned frame with its time-feature version (re-sorted by User, then chronologically)
df_cct = time(df_cct)

## II. Ratio of transaction amount to daily income

In [11]:
import numpy as np

In [12]:
df_users = pd.read_csv("/home/onyxia/work/sd254_users.csv")

# Strip the currency symbol literally ("$" is a regex anchor, and the default
# of `regex` differs between pandas versions).
df_users['Yearly Income - Person'] = df_users['Yearly Income - Person'].str.replace("$", "", regex=False)

# `np.int` was only an alias of the builtin `int` and has been removed from
# NumPy, so the builtin is used directly.
df_users["day_income"] = df_users['Yearly Income - Person'].astype(int) / 365

# The user id is the row position in the users file.
df_users = df_users.reset_index().rename(columns={"index": "User"})

# Look the daily income up by user id. Mapping keeps df_cct's own index, so the
# division is aligned row by row instead of relying on a merged frame happening
# to share the same positional index.
df_cct["amt/day_income"] = df_cct["Amount"] / df_cct["User"].map(
    df_users.set_index("User")["day_income"]
)


## III. Daily amount

In [13]:
def calc_rolling_sum(dataf, column=None, setting='1D'):
    """Trailing-window sum of ``column``, computed separately for each ``User``.

    ``setting`` is passed to ``rolling`` as the window (an offset string by
    default), so ``dataf`` is expected to be indexed by a datetime-like index.
    The result has the same index and row order as ``dataf``.
    """
    def _window_total(values):
        # min_periods=1: a value is produced as soon as one observation is in the window
        return values.rolling(setting, min_periods=1).sum()

    per_user = dataf.groupby('User')[column]
    return per_user.transform(_window_total)

In [14]:
# Per-user rolling sum of "Amount" over the default 1-day window, using 'date' as the time axis.
# `.values` discards the date index so the result is written back to df_cct by row position.
df_cct["daily_amount"] = calc_rolling_sum(df_cct.set_index('date'), column='Amount').values

## IV. Daily number of declines per card

In [15]:
def calc_rolling_decline(dataf, column=None, setting='1D'):
    """Trailing-window count of ``column`` entries per (``User``, ``Card``) pair.

    ``setting`` is passed to ``rolling`` as the window (an offset string by
    default), so ``dataf`` is expected to be indexed by a datetime-like index.
    The result has the same index and row order as ``dataf``.
    """
    def _window_count(values):
        return values.rolling(setting, min_periods=1).count()

    per_card = dataf.groupby(['User', "Card"])[column]
    return per_card.transform(_window_count)

In [16]:
# Per-card rolling count of "Errors?" entries over the default 1-day window, using 'date' as the time axis.
# `.values` discards the date index so the result is written back to df_cct by row position.
df_cct["nb_daily_declines_card"] = calc_rolling_decline(df_cct.set_index('date'), column="Errors?").values

## V. Bad_pin

In [17]:
# "Errors?" values that are treated as a bad-PIN event
bad_pin = [
    'Bad PIN',
    'Bad PIN,Insufficient Balance',
    'Bad PIN,Technical Glitch',
]

In [19]:
# 1 when the "Errors?" value is one of the bad-PIN labels, 0 otherwise (missing values give 0).
# `isin` performs the membership test in one vectorized pass instead of calling a
# Python lambda once per row.
df_cct["bad_pin"] = df_cct["Errors?"].isin(bad_pin).astype("int64")

## VI. Insufficient balance

In [23]:
# "Errors?" values that are treated as an insufficient-balance event
insufficient_balance = [
    'Insufficient Balance',
    'Bad PIN,Insufficient Balance',
    'Insufficient Balance,Technical Glitch',
    'Bad Card Number,Insufficient Balance',
    'Bad CVV,Insufficient Balance',
    'Bad Expiration,Insufficient Balance',
    'Bad Zipcode,Insufficient Balance',
    'Bad Card Number,Bad Expiration,Insufficient Balance',
]

In [24]:
# 1 when the "Errors?" value is one of the insufficient-balance labels, 0 otherwise
# (missing values give 0). `isin` performs the membership test in one vectorized
# pass instead of calling a Python lambda once per row.
df_cct["insufficient_balance"] = df_cct["Errors?"].isin(insufficient_balance).astype("int64")

## VII. Ratio of the number of transactions per hour to the average number of transactions per hour over the last 30 days

In [27]:
def calc_rolling_nbt_h(dataf, column=None, setting='1h'):
    """Trailing-window count of ``column`` entries per (``User``, ``Card``) pair.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Expected to be indexed by a datetime-like index, because ``setting`` is
        a time-offset window.
    column : str
        Column whose entries are counted.
    setting : str, default ``'1h'``
        Rolling window. The default is the one-hour window spelled with the
        lowercase ``h`` alias; the uppercase ``H`` spelling is deprecated in
        recent pandas releases.

    Returns
    -------
    pandas.Series
        Same index and row order as ``dataf``.
    """
    return (dataf
            .groupby(['User', 'Card'])[column]
            .transform(lambda d: d.rolling(setting, min_periods=1).count()))

In [28]:
# Per-card rolling count of transactions over the default one-hour window, using 'timestamp' as the time axis.
# `.values` discards the timestamp index so the result is written back to df_cct by row position.
df_cct["hourly_nbt"] = calc_rolling_nbt_h(df_cct.set_index('timestamp'), column="Amount").values

In [29]:
def calc_rolling_av_30d_nbt(dataf, column=None, setting='30D'):
    """Trailing-window mean of ``column`` per (``User``, ``Card``) pair.

    ``setting`` is passed to ``rolling`` as the window (an offset string by
    default), so ``dataf`` is expected to be indexed by a datetime-like index.
    The result has the same index and row order as ``dataf``.
    """
    def _window_mean(values):
        return values.rolling(setting, min_periods=1).mean()

    grouped = dataf.groupby(['User', 'Card'])
    return grouped[column].transform(_window_mean)

In [30]:
# Per-card rolling mean of "hourly_nbt" over the default 30-day window, using 'timestamp' as the time axis.
# `.values` discards the timestamp index so the result is written back to df_cct by row position.
df_cct["last_30_days_av_hourly_nbt"] = calc_rolling_av_30d_nbt(
    df_cct.set_index('timestamp'), column="hourly_nbt"
).values

In [31]:
# Current hourly transaction count relative to its 30-day rolling average
df_cct["hr_nbt/last_30d_av_hr_nbt"] = df_cct["hourly_nbt"].div(df_cct["last_30_days_av_hourly_nbt"])

## VIII. Ratio of the number of transactions to the total transaction amount over the last 3 days

In [32]:
def calc_rolling_3d_nbt(dataf, column=None, setting='3D'):
    """Trailing-window count of ``column`` entries per (``User``, ``Card``) pair.

    ``setting`` is passed to ``rolling`` as the window (an offset string by
    default), so ``dataf`` is expected to be indexed by a datetime-like index.
    The result has the same index and row order as ``dataf``.
    """
    def _count_in_window(values):
        windowed = values.rolling(setting, min_periods=1)
        return windowed.count()

    return dataf.groupby(['User', 'Card'])[column].transform(_count_in_window)

In [33]:
# Per-card rolling count of transactions over the default 3-day window, using 'date' as the time axis.
# `.values` discards the date index so the result is written back to df_cct by row position.
df_cct["last_3d_nbt"] = calc_rolling_3d_nbt(df_cct.set_index('date'), column="Amount").values

In [34]:
def calc_rolling_3d_sum(dataf, column=None, setting='3D'):
    """Trailing-window sum of ``column``, computed separately for each ``User``.

    ``setting`` is passed to ``rolling`` as the window (an offset string by
    default), so ``dataf`` is expected to be indexed by a datetime-like index.
    The result has the same index and row order as ``dataf``.
    """
    def _sum_in_window(values):
        windowed = values.rolling(setting, min_periods=1)
        return windowed.sum()

    return dataf.groupby('User')[column].transform(_sum_in_window)

In [35]:
# Per-user rolling sum of "Amount" over the default 3-day window, using 'date' as the time axis.
# `.values` discards the date index so the result is written back to df_cct by row position.
df_cct["last_3d_amt"] = calc_rolling_3d_sum(df_cct.set_index('date'), column='Amount').values

In [36]:
# 3-day transaction count divided by 3-day transaction amount.
# NOTE(review): the count is computed per (User, Card) while the amount is summed
# per User -- confirm the mixed granularity is intended.
# NOTE(review): "Amount" can be negative, so the 3-day sum may be zero or negative,
# giving inf or negative ratios -- confirm downstream handling.
df_cct["last_3d_nbt/amt"] = df_cct["last_3d_nbt"].div(df_cct["last_3d_amt"])


In [39]:
# Final features to be used for the ML models; the label "Is Fraud?" comes last
final_features = [
    'User', 'Card', 'Hours', 'Amount', 'Use Chip', 'Merchant State', 'MCC',
    'delta_t_s', 'delta_t_s_card',
    'amt/day_income',
    'daily_amount', 'nb_daily_declines_card',
    'bad_pin', 'insufficient_balance',
    'hr_nbt/last_30d_av_hr_nbt',
    'last_3d_nbt/amt',
    'Is Fraud?',
]
df_cct_final = df_cct[final_features]

In [40]:
# Display the final feature table (the rendering below is truncated to the first and last rows)
df_cct_final

Unnamed: 0,User,Card,Hours,Amount,Use Chip,Merchant State,MCC,delta_t_s,delta_t_s_card,amt/day_income,daily_amount,nb_daily_declines_card,bad_pin,insufficient_balance,hr_nbt/last_30d_av_hr_nbt,last_3d_nbt/amt,Is Fraud?
0,0,0,6,134.09,Swipe Transaction,CA,5300,0,0,0.819868,134.09,0.0,0,0,1.000000,0.007458,0
1,0,0,6,38.48,Swipe Transaction,CA,5411,1260,1260,0.235279,172.57,0.0,0,0,1.333333,0.011589,0
2,0,0,6,120.34,Swipe Transaction,CA,5411,85200,85200,0.735796,120.34,0.0,0,0,0.750000,0.010242,0
3,0,0,17,128.95,Swipe Transaction,CA,5651,40980,40980,0.788441,249.29,0.0,0,0,0.800000,0.009482,0
4,0,0,6,104.71,Swipe Transaction,CA,5912,45480,45480,0.640230,104.71,0.0,0,0,0.833333,0.009495,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24386895,1999,1,22,-54.00,Chip Transaction,NH,5541,300,300,-0.299049,261.71,0.0,0,0,1.610738,0.017869,0
24386896,1999,1,22,54.00,Chip Transaction,NH,5541,60,60,0.299049,315.71,0.0,0,0,2.388158,0.017904,0
24386897,1999,1,7,59.15,Chip Transaction,NH,4121,33540,33540,0.327569,59.15,0.0,0,0,0.796053,0.022195,0
24386898,1999,1,20,43.12,Chip Transaction,NH,4121,44820,44820,0.238796,102.27,0.0,0,0,0.794702,0.022259,0


In [None]:
# Save the selected feature table to parquet
df_cct_final.to_parquet("/home/onyxia/work/cct_final.parquet")