In [27]:
import os
import dask.dataframe as dd
import datetime as dt
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [28]:
# Load the card-transactions table.
# NOTE(review): the path is absolute and machine-specific; consider a configurable data directory.
df_cct = pd.read_parquet("/home/onyxia/work/data/cct.parquet")


In [29]:
# Light cleaning: strip the currency symbol from "Amount" and convert it to float.
# regex=False is explicit because "$" is a regular-expression anchor and the
# default value of `regex` differs between pandas versions.
df_cct["Amount"] = (
    df_cct["Amount"]
    .str.replace("$", "", regex=False)
    .astype(float)
)


## Variables delta_t

In [30]:
# Build the 'date' and 'timestamp' columns and the 'delta_t_s' / 'delta_t_s_card' variables.
def time(dataframe):
    """Add date/timestamp columns and inter-transaction delays to raw transactions.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Raw transactions. The columns read here are 'User', 'Card', 'Year',
        'Month', 'Day', 'Time' (an "HH:MM" string), 'Amount', 'Use Chip',
        'Merchant Name', 'Merchant City', 'Merchant State', 'Zip', 'MCC',
        'Errors?' and 'Is Fraud?'. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by user and then chronologically, with a fresh
        0..n-1 index and the columns
        'User', 'Card', 'date', 'timestamp', 'Amount', 'Use Chip',
        'Merchant Name', 'Merchant City', 'Merchant State', 'Zip', 'MCC',
        'Errors?', 'Is Fraud?', 'delta_t_s', 'delta_t_s_card', where

        * 'delta_t_s' is the number of seconds since the previous transaction
          of the same user (0 for the user's first transaction);
        * 'delta_t_s_card' is the number of seconds since the previous
          transaction of the same user and card, taken in the input row order
          (0 for the first transaction of a card, and 0 instead of any
          negative value).
    """
    # Split "HH:MM" into its two integer components.
    clock = dataframe["Time"].str.split(":", n=1, expand=True)

    # pd.to_datetime assembles datetimes from a frame whose columns are named
    # after the date/time units; no format string is involved in that mode.
    parts = pd.DataFrame(
        {
            "year": dataframe["Year"],
            "month": dataframe["Month"],
            "day": dataframe["Day"],
            "hour": clock[0].astype(int),
            "minute": clock[1].astype(int),
        }
    )
    timestamp = pd.to_datetime(parts)
    date = pd.to_datetime(parts[["year", "month", "day"]])

    # Work on a new frame so that the caller's frame is left untouched.
    result = dataframe[
        ['User', 'Card', 'Amount', 'Use Chip',
         'Merchant Name', 'Merchant City', 'Merchant State', 'Zip', 'MCC',
         'Errors?', 'Is Fraud?']
    ].copy()
    result["date"] = date.to_numpy()
    result["timestamp"] = timestamp.to_numpy()

    # Delay since the previous transaction of the same user AND card, in the
    # input row order. Differences are taken within each (User, Card) group so
    # that the first transaction of a card is never compared with the last
    # transaction of another card or another user.
    card_gap = result.groupby(["User", "Card"], sort=False)["timestamp"].diff()
    card_gap_s = card_gap.dt.total_seconds().fillna(0).astype(int)
    # Rows that are not chronological inside a card would give negative delays.
    result["delta_t_s_card"] = card_gap_s.clip(lower=0).to_numpy()

    # Reorder by user, then chronologically.
    result = result.sort_values(["User", "timestamp"]).reset_index(drop=True)

    # Delay since the previous transaction of the same user; a user's first
    # transaction has no predecessor and gets 0.
    user_gap = result.groupby("User", sort=False)["timestamp"].diff()
    result["delta_t_s"] = user_gap.dt.total_seconds().fillna(0).astype(int)

    # Select and order the output columns.
    return result[['User', 'Card', 'date', 'timestamp', 'Amount', 'Use Chip',
                   'Merchant Name', 'Merchant City', 'Merchant State', 'Zip', 'MCC',
                   'Errors?', 'Is Fraud?', 'delta_t_s', 'delta_t_s_card']]




In [31]:
# Replace the raw table with the frame returned by time().
# The returned frame no longer has the 'Time', 'Year', 'Month' and 'Day' columns,
# so this cell cannot be run a second time without reloading df_cct.
df_cct = time(df_cct)

## Variables journalières

In [32]:
# Daily features, one row per (User, date):
#   day_amt     : total amount of the user's transactions on that day
#   day_nbt     : number of transactions of the user on that day
#   day_med_nbt : running median of the user's daily transaction counts,
#                 up to and including that day
# A single aggregation pass produces both the sum and the count.
df1 = (
    df_cct.groupby(["User", "date"])["Amount"]
    .agg(day_amt="sum", day_nbt="size")
    .reset_index()
)

# The running median is computed inside each user's own history; an expanding
# window over the whole table would mix one user's days into the next user's.
df1["day_med_nbt"] = (
    df1.groupby("User")["day_nbt"]
    .transform(lambda counts: counts.expanding().median())
)



In [33]:
# Attach the daily features to every transaction of the same user and day
# (DataFrame.merge defaults to an inner join on the given keys).
# Running this cell twice would merge the daily columns a second time.
df_cct = df_cct.merge(df1,on=["User","date"])

## Variables hebdomadaires

In [19]:
#df_cct.groupby(["User", pd.Grouper(key="date",freq="1W")]).Amount.sum()

User  date      
0     2002-09-01     172.57
      2002-09-08    1482.72
      2002-09-15    1949.53
      2002-09-22    1561.70
      2002-09-29    1870.12
                     ...   
1999  2020-02-02    2136.96
      2020-02-09    1529.05
      2020-02-16     987.22
      2020-02-23    1535.12
      2020-03-01    1389.39
Name: Amount, Length: 1161864, dtype: float64

In [34]:
# Weekly features, computed per user. Week 0 of a user starts on the day of the
# user's first transaction and weeks are consecutive 7-day windows:
#   wk_i_nbt     : number of transactions in week i
#   wk_med_nbt   : median weekly number of transactions, up to and including week i
#   wk_i_amt     : total amount of the transactions in week i
#   wk_med_amt   : median weekly total amount, up to and including week i
#   wk_i_med_amt : median transaction amount in week i
weekly_columns = ['wk_i_nbt', 'wk_med_nbt', 'wk_i_amt', 'wk_med_amt', 'wk_i_med_amt']

# Week number of every transaction, counted from the user's first transaction day.
# Every transaction is assigned to its own week explicitly, so a week whose first
# day has no transaction still gets its own values.
first_day = df_cct.groupby("User")["date"].transform("min")
tx_week = df_cct[["User"]].assign(week=(df_cct["date"] - first_day).dt.days // 7)

# One row per (User, week) containing at least one transaction.
weekly = (
    df_cct.groupby([tx_week["User"], tx_week["week"]])["Amount"]
    .agg(wk_i_nbt="size", wk_i_amt="sum", wk_i_med_amt="median")
)

# Add the weeks without any transaction (count 0, amount 0) so that they are
# taken into account by the running medians.
last_week = tx_week.groupby("User")["week"].max()
all_weeks = pd.MultiIndex.from_tuples(
    [(user, week) for user, last in last_week.items() for week in range(last + 1)],
    names=["User", "week"],
)
weekly = weekly.reindex(all_weeks)
weekly[["wk_i_nbt", "wk_i_amt"]] = weekly[["wk_i_nbt", "wk_i_amt"]].fillna(0)

# Running medians inside each user's own sequence of weeks.
weekly_by_user = weekly.groupby(level="User")
wk_med_nbt = weekly_by_user["wk_i_nbt"].transform(lambda counts: counts.expanding().median())
wk_med_amt = weekly_by_user["wk_i_amt"].transform(lambda totals: totals.expanding().median())
weekly = weekly.assign(wk_med_nbt=wk_med_nbt, wk_med_amt=wk_med_amt)

# Bring the weekly values back to transaction level. The join keeps the rows and
# the index of df_cct, so `df` lines up with df_cct row for row.
df = tx_week.join(weekly, on=["User", "week"])[weekly_columns]

In [69]:
# Ratio of the weekly number of transactions to the weekly median transaction amount.
# NOTE(review): the column name says "amt_wk_i" but the denominator is 'wk_i_med_amt',
# not 'wk_i_amt' - confirm which one is intended.
df["nbt_wk_i/amt_wk_i"] = df["wk_i_nbt"]/df["wk_i_med_amt"]

In [70]:
# Append the weekly features to the transactions, column-wise.
# pd.concat(axis=1) aligns rows on index labels, so df and df_cct must share the same index.
# Running this cell twice would append the weekly columns a second time.
df_cct = pd.concat([df_cct,df[['wk_i_nbt', 'wk_med_nbt', 'wk_i_amt',
       'wk_med_amt',"wk_i_med_amt","nbt_wk_i/amt_wk_i"]]], axis=1)

## Variable montant/revenu journalier

In [43]:
import numpy as np

In [55]:
# Load the users table.
# NOTE(review): the path is absolute and machine-specific; consider a configurable data directory.
df_users = pd.read_csv("/home/onyxia/work/data/sd254_users.csv")

In [56]:
# Strip the currency symbol from the yearly income and derive a daily income.
# regex=False is explicit because "$" is a regular-expression anchor and the
# default value of `regex` differs between pandas versions.
df_users['Yearly Income - Person'] = df_users['Yearly Income - Person'].str.replace("$", "", regex=False)
# The builtin `int` replaces `np.int`, an alias of it that recent NumPy releases no longer provide.
df_users["day_income"] = df_users['Yearly Income - Person'].astype(int) / 365

In [57]:
# Move the row index into a regular column (named "index" for an unnamed index).
# The frame is modified in place, so running this cell twice adds another index column.
df_users.reset_index(inplace=True)

In [61]:
# Use the former row index as the user identifier.
# NOTE(review): this assumes the row position in the users file is the 'User' id of df_cct - confirm.
df_users.rename(columns={"index":"User"},inplace=True)

In [66]:
# Transaction amount expressed as a fraction of the user's daily income.
# The income is looked up per transaction with map(), which keeps df_cct's own
# index (so the division is aligned row for row) and avoids merging the whole
# transaction table just to read one column.
day_income_by_user = df_users.set_index("User")["day_income"]
df_cct["amt/day_income"] = df_cct["Amount"] / df_cct["User"].map(day_income_by_user)

In [67]:
# Display the transaction table with the engineered features.
df_cct

Unnamed: 0,User,Card,date,timestamp,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,...,day_amt,day_nbt,day_med_nbt,wk_i_nbt,wk_med_nbt,wk_i_amt,wk_med_amt,wk_i_med_amt,freq/amt_wk_i,amt/day_income
0,0,0,2002-09-01,2002-09-01 06:21:00,134.09,Swipe Transaction,3527213246127876953,La Verne,CA,91750.0,...,172.57,2,2.0,16.0,16.0,1380.96,1380.96,90.015,0.177748,0.819868
1,0,0,2002-09-01,2002-09-01 06:42:00,38.48,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,...,172.57,2,2.0,16.0,16.0,1380.96,1380.96,90.015,0.177748,0.235279
2,0,0,2002-09-02,2002-09-02 06:22:00,120.34,Swipe Transaction,-727612092139916043,Monterey Park,CA,91754.0,...,249.29,2,2.0,16.0,16.0,1380.96,1380.96,90.015,0.177748,0.735796
3,0,0,2002-09-02,2002-09-02 17:45:00,128.95,Swipe Transaction,3414527459579106770,Monterey Park,CA,91754.0,...,249.29,2,2.0,16.0,16.0,1380.96,1380.96,90.015,0.177748,0.788441
4,0,0,2002-09-03,2002-09-03 06:23:00,104.71,Swipe Transaction,5817218446178736267,La Verne,CA,91750.0,...,190.90,2,2.0,16.0,16.0,1380.96,1380.96,90.015,0.177748,0.640230
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24386895,1999,1,2020-02-27,2020-02-27 22:23:00,-54.00,Chip Transaction,-5162038175624867091,Merrimack,NH,3054.0,...,315.71,8,3.0,39.0,32.0,1993.11,1859.19,54.000,0.722222,-0.299049
24386896,1999,1,2020-02-27,2020-02-27 22:24:00,54.00,Chip Transaction,-5162038175624867091,Merrimack,NH,3054.0,...,315.71,8,3.0,39.0,32.0,1993.11,1859.19,54.000,0.722222,0.299049
24386897,1999,1,2020-02-28,2020-02-28 07:43:00,59.15,Chip Transaction,2500998799892805156,Merrimack,NH,3054.0,...,147.40,3,3.0,3.0,32.0,147.40,1852.71,45.130,0.066475,0.327569
24386898,1999,1,2020-02-28,2020-02-28 20:10:00,43.12,Chip Transaction,2500998799892805156,Merrimack,NH,3054.0,...,147.40,3,3.0,3.0,32.0,147.40,1852.71,45.130,0.066475,0.238796
