# Feature Selection

In [1]:
from datetime import datetime
from math import ceil
import numpy as np
import pandas as pd

In [2]:
# Load the data
clean_data=pd.read_csv("data/cleaned_synthetic_data.csv",index_col=0)

sample_df=clean_data.sample(n=10)
prepared_df=sample_df.copy()

  mask |= (ar1 == a)


In [3]:
sample_df.head(5)

Unnamed: 0,is_fraud,unix_time,amt,cc_id,person_id,gender_id,job_category,age,city_pop_cluster_id,merchant_id,merchant_category
609946,0,1347762449,6.52,179,316,1,-1,57,80,70,shopping_net
215714,0,1334912162,90.16,79,810,1,113,50,56,8,shopping_net
100463,0,1330470063,5.96,649,578,1,-1,65,52,339,travel
1011350,0,1361404145,55.04,210,727,0,-1,46,33,61,kids_pets
1108726,0,1365250920,30.23,945,872,0,-1,27,2,41,home


### Time

In the real data set time is given in seconds since the first transaction. I implement the function `standardise_time` which calculates the number of seconds between mid-night on the day of the first transaction and each transaction.

In [4]:
def standardise_time(series) -> pd.Series:
    min_time=datetime.utcfromtimestamp(series.min())
    min_day=min_time.replace(second=0,minute=0,hour=0)
    return ((series-min_day.timestamp())).astype(int)

In [5]:
prepared_df["seconds_from_start"]=standardise_time(sample_df["unix_time"])

In [6]:
prepared_df.head(10)

Unnamed: 0,is_fraud,unix_time,amt,cc_id,person_id,gender_id,job_category,age,city_pop_cluster_id,merchant_id,merchant_category,seconds_from_start
609946,0,1347762449,6.52,179,316,1,-1,57,80,70,shopping_net,17375249
215714,0,1334912162,90.16,79,810,1,113,50,56,8,shopping_net,4524962
100463,0,1330470063,5.96,649,578,1,-1,65,52,339,travel,82863
1011350,0,1361404145,55.04,210,727,0,-1,46,33,61,kids_pets,31016945
1108726,0,1365250920,30.23,945,872,0,-1,27,2,41,home,34863720
1232505,0,1369850276,48.77,557,976,1,225,65,59,556,personal_care,39463076
911141,0,1356767105,72.57,899,968,1,-1,32,10,196,grocery_pos,26379905
309373,0,1338342688,53.14,919,761,1,-1,49,69,299,grocery_net,7955488
126133,0,1331428413,44.13,188,424,0,-1,56,68,622,gas_transport,1041213
338443,0,1339241299,96.02,474,184,1,262,51,65,153,gas_transport,8854099


**Hour of Day**


In [7]:
# TODO
prepared_df["hour_of_day"]=pd.to_datetime(sample_df["unix_time"],unit="s").dt.hour

## Currency Conversion
All `amt` values are in dollars, but the real data we have is from Europe so we need to account for the exchange rate. The functions below allow for the conversion between any currency.

In [8]:
# !pip install forex-python
import forex_python.converter as fx

In [9]:
def convert_currency(amount:float,date:datetime,cur_currency:str,tar_currency) -> float:
    """
    Determine the value of an amount of one currency in another currency at a specified point in time
    
    PARAMETERS
    amount (float) - amount of current currency
    date (datetime) - date of exchange rate to use
    cur_currency (str) - three character code for current currency
    tar_currency (str) - three character code for target currency
    
    RETURNS
    float - amount of target currency
    """
    exchange_rate=fx.get_rate(cur_currency,tar_currency,date)
    return round(amount*exchange_rate,2)

In [10]:
def prepare_amount(df,cur_label,cur_currency="USD",tar_currency="GBP") -> pd.Series:
    """
    Convert amounts in a dataframe between currencies, using the exchange rate at the start of the date on which transaction occurred
    NOTE - conversion rate taken at start of day for speed.
    
    PARAMETERS
    df (pd.Dataframe) - dataframe of transactions with at least ["data",cur_label] columns
    cur_label (str) - name of column which contains amounts to convert
    """
    df_local=df.copy(deep=True)
    df_local["date"]=pd.to_datetime(df["unix_time"],unit="s").dt.date
    
    # determine the exchange rate for each day
    exchange_rates=pd.DataFrame()
    exchange_rates["date"]=pd.to_datetime(df_local["date"].unique(),format="%Y-%m-%d")
    exchange_rates["rate"]=exchange_rates.apply(lambda x:convert_currency(1,x["date"],cur_currency,tar_currency),axis=1)
    
    # merge dataframes
    exchange_rates["date"]=exchange_rates["date"].dt.date
    df_merged=df_local[["date","amt"]].reset_index().merge(exchange_rates[["date","rate"]],on="date",how="left").set_index('index')

    # calculated exchanged amounts
    tar_label="amount_{}".format(tar_currency)
    df_merged[tar_label]=df_merged.apply(lambda x:round(x["amt"]*x["rate"],2),axis=1)

    return df_merged[tar_label]

In [11]:
prepared_df["amount_USD"]=sample_df["amt"].copy()
prepared_df["amount_GBP"]=prepare_amount(sample_df[["unix_time","amt"]],"amt","USD","GBP")

In [12]:
prepared_df.head(5)

Unnamed: 0,is_fraud,unix_time,amt,cc_id,person_id,gender_id,job_category,age,city_pop_cluster_id,merchant_id,merchant_category,seconds_from_start,hour_of_day,amount_USD,amount_GBP
609946,0,1347762449,6.52,179,316,1,-1,57,80,70,shopping_net,17375249,2,6.52,4.04
215714,0,1334912162,90.16,79,810,1,113,50,56,8,shopping_net,4524962,8,90.16,55.9
100463,0,1330470063,5.96,649,578,1,-1,65,52,339,travel,82863,23,5.96,3.75
1011350,0,1361404145,55.04,210,727,0,-1,46,33,61,kids_pets,31016945,23,55.04,35.78
1108726,0,1365250920,30.23,945,872,0,-1,27,2,41,home,34863720,12,30.23,19.95


## Person

In [13]:
# number of transactions performed by each person
def transactions_per_entity(ids) -> pd.Series:
    """ids - series of either `person_id` or `merchant_id`"""
    pp_trans=ids.value_counts()
    return ids.apply(lambda x: pp_trans[x])

prepared_df["transaction_by_person"]=transactions_per_entity(sample_df["person_id"])
prepared_df["transaction_by_merchant"]=transactions_per_entity(sample_df["merchant_id"])

In [14]:
# time since last transaction (merchant and customer)
def time_since_last_transaction(id_col,df) -> pd.Series:
    """
    id_col (str) - name of column which contain ids to group by
    df (pd.DataFrame) - dataframe containing `unix_time` and `id_col`
       NOTE -1 = first transaction on record
    """
    times=df[id_col].copy(deep=True)
    
    clean_data["time_since_last_transaction_person"]=-1
    for id_code in df[id_col].unique():
        trans_times=df[df[id_col]==id_code]["unix_time"]
        times.loc[df[id_col]==id_code]=trans_times.diff()

    return times.replace(np.nan,-1).astype(int)

In [15]:
# clean_data["time_since_last_transaction_person"]=time_since_last_transaction("person_id",clean_data[["person_id","unix_time"]])
# clean_data["time_since_last_transaction_merchant"]=time_since_last_transaction("merchant_id",clean_data[["merchant_id","unix_time"]])

prepared_df["time_since_last_transaction_person"]=time_since_last_transaction("person_id",sample_df[["person_id","unix_time"]])
prepared_df["time_since_last_transaction_merchant"]=time_since_last_transaction("merchant_id",sample_df[["merchant_id","unix_time"]])

In [16]:
# mean/min/max amt per merchange/customer
# NOTE this is USD val so maybe change
def entity_amount_statistic(id_col,df,agg_calc) -> pd.Series:
    
    group_by=df[[id_col,"amt"]].groupby([id_col])
    vals=group_by["amt"].agg(agg_calc)
    return df[id_col].transform(lambda x:vals[x])

prepared_df["mean_amt_person"]=entity_amount_statistic("person_id",sample_df[["person_id","amt"]],"mean")
prepared_df["max_amt_merchant"]=entity_amount_statistic("merchant_id",sample_df[["merchant_id","amt"]],"max")

## Save File

In [17]:
prepared_df

Unnamed: 0,is_fraud,unix_time,amt,cc_id,person_id,gender_id,job_category,age,city_pop_cluster_id,merchant_id,...,seconds_from_start,hour_of_day,amount_USD,amount_GBP,transaction_by_person,transaction_by_merchant,time_since_last_transaction_person,time_since_last_transaction_merchant,mean_amt_person,max_amt_merchant
609946,0,1347762449,6.52,179,316,1,-1,57,80,70,...,17375249,2,6.52,4.04,1,1,-1,-1,6.52,6.52
215714,0,1334912162,90.16,79,810,1,113,50,56,8,...,4524962,8,90.16,55.9,1,1,-1,-1,90.16,90.16
100463,0,1330470063,5.96,649,578,1,-1,65,52,339,...,82863,23,5.96,3.75,1,1,-1,-1,5.96,5.96
1011350,0,1361404145,55.04,210,727,0,-1,46,33,61,...,31016945,23,55.04,35.78,1,1,-1,-1,55.04,55.04
1108726,0,1365250920,30.23,945,872,0,-1,27,2,41,...,34863720,12,30.23,19.95,1,1,-1,-1,30.23,30.23
1232505,0,1369850276,48.77,557,976,1,225,65,59,556,...,39463076,17,48.77,32.19,1,1,-1,-1,48.77,48.77
911141,0,1356767105,72.57,899,968,1,-1,32,10,196,...,26379905,7,72.57,44.99,1,1,-1,-1,72.57,72.57
309373,0,1338342688,53.14,919,761,1,-1,49,69,299,...,7955488,1,53.14,34.01,1,1,-1,-1,53.14,53.14
126133,0,1331428413,44.13,188,424,0,-1,56,68,622,...,1041213,1,44.13,27.8,1,1,-1,-1,44.13,44.13
338443,0,1339241299,96.02,474,184,1,262,51,65,153,...,8854099,11,96.02,62.41,1,1,-1,-1,96.02,96.02


In [18]:
def save_data(df:pd.DataFrame,file_path):
    df.to_csv(file_path)