# Feature Selection

In [1]:
from datetime import datetime
import numpy as np
import pandas as pd

In [2]:
# Load the data
clean_data=pd.read_csv("data/cleaned_synthetic_data.csv",index_col=0)

sample_df=clean_data.sample(n=10)
prepared_df=sample_df.copy()

  mask |= (ar1 == a)


In [3]:
sample_df.head(5)

Unnamed: 0,is_fraud,unix_time,amt,cc_id,person_id,gender_id,job_category,age,city_pop_round,merchant_id,merchant_category
106559,0,1330724482,54.59,139,562,0,Professor Emeritus,65,100,136,personal_care
491218,0,1343949448,16.46,652,302,1,Health and safety adviser,75,900,469,kids_pets
803149,0,1354682851,65.21,162,121,0,"Designer, industrial/product",25,391350,681,grocery_net
743312,0,1352865090,6.77,538,367,1,"Psychologist, clinical",57,2050,101,shopping_pos
934962,0,1357492170,42.18,323,864,0,Wellsite geologist,63,222750,469,kids_pets


### Time

In the real data set time is given in seconds since the first transaction. I implement the function `standardise_time` which calculates the number of seconds between mid-night on the day of the first transaction and each transaction.

In [4]:
def standardise_time(series) -> pd.Series:
    min_time=datetime.utcfromtimestamp(series.min())
    min_day=min_time.replace(second=0,minute=0,hour=0)
    return ((series-min_day.timestamp())).astype(int)

In [5]:
prepared_df["seconds_from_start"]=standardise_time(sample_df["unix_time"])

In [6]:
prepared_df.head(10)

Unnamed: 0,is_fraud,unix_time,amt,cc_id,person_id,gender_id,job_category,age,city_pop_round,merchant_id,merchant_category,seconds_from_start
106559,0,1330724482,54.59,139,562,0,Professor Emeritus,65,100,136,personal_care,78082
491218,0,1343949448,16.46,652,302,1,Health and safety adviser,75,900,469,kids_pets,13303048
803149,0,1354682851,65.21,162,121,0,"Designer, industrial/product",25,391350,681,grocery_net,24036451
743312,0,1352865090,6.77,538,367,1,"Psychologist, clinical",57,2050,101,shopping_pos,22218690
934962,0,1357492170,42.18,323,864,0,Wellsite geologist,63,222750,469,kids_pets,26845770
592805,0,1347124941,104.63,96,966,0,"Conservation officer, historic buildings",22,200,224,shopping_pos,16478541
608278,0,1347723752,2.84,665,430,0,Farm manager,26,4500,31,shopping_net,17077352
714366,0,1351781073,89.16,145,13,1,Network engineer,63,200,397,personal_care,21134673
581619,0,1346655862,1.56,699,22,0,Public relations account executive,79,50,174,shopping_net,16009462
1193976,0,1368463300,96.77,157,548,0,Arboriculturist,26,2650,376,entertainment,37816900


## `city_pop`
Categorising `city_pop` seems a useful step to take as it helps group cities and it is intuitive to think that the size of the city will affect whether it is targeted by scammers. In the function `city_pop_cats` I use the foollowing categories, however it is justifiable to use any number of categories
 * `small` - 0 - 9,999
 * `medium` - 10,000 - 99,999
 * `large` - 100,000 - 999,999
 * `huge` - 1mn +
 
This step also increases anonymity and decreases the degrees of freedom of our model.

In [7]:
def city_pop_cats(series) -> pd.DataFrame:
    cats=pd.DataFrame()
    cats["small"]=(series<=9999).astype(int)
    cats["medium"]=((series>=10000) & (series<=99999)).astype(int)
    cats["large"]=((series>=100000) & (series<=999999)).astype(int)
    cats["huge"]=(series>=1000000).astype(int)
    return cats

In [8]:
prepared_df=prepared_df.join(city_pop_cats(sample_df["city_pop_round"]))

In [9]:
prepared_df.head(1)

Unnamed: 0,is_fraud,unix_time,amt,cc_id,person_id,gender_id,job_category,age,city_pop_round,merchant_id,merchant_category,seconds_from_start,small,medium,large,huge
106559,0,1330724482,54.59,139,562,0,Professor Emeritus,65,100,136,personal_care,78082,1,0,0,0


## Currency Conversion
All `amt` values are in dollars, but the real data we have is from Europe so we need to account for the exchange rate. The functions below allow for the conversion between any currency.

In [10]:
# !pip install forex-python
import forex_python.converter as fx

In [11]:
def convert_currency(amount:float,date:datetime,cur_currency:str,tar_currency) -> float:
    """
    Determine the value of an amount of one currency in another currency at a specified point in time
    
    PARAMETERS
    amount (float) - amount of current currency
    date (datetime) - date of exchange rate to use
    cur_currency (str) - three character code for current currency
    tar_currency (str) - three character code for target currency
    
    RETURNS
    float - amount of target currency
    """
    exchange_rate=fx.get_rate(cur_currency,tar_currency,date)
    return round(amount*exchange_rate,2)

In [12]:
def prepare_amount(df,cur_label,cur_currency="USD",tar_currency="GBP") -> pd.Series:
    """
    Convert amounts in a dataframe between currencies, using the exchange rate at the start of the date on which transaction occurred
    NOTE - conversion rate taken at start of day for speed.
    
    PARAMETERS
    df (pd.Dataframe) - dataframe of transactions with at least ["data",cur_label] columns
    cur_label (str) - name of column which contains amounts to convert
    """
    df_local=df.copy(deep=True)
    df_local["date"]=pd.to_datetime(df["unix_time"],unit="s").dt.date
    
    # determine the exchange rate for each day
    exchange_rates=pd.DataFrame()
    exchange_rates["date"]=pd.to_datetime(df_local["date"].unique(),format="%Y-%m-%d")
    exchange_rates["rate"]=exchange_rates.apply(lambda x:convert_currency(1,x["date"],cur_currency,tar_currency),axis=1)
    
    # merge dataframes
    exchange_rates["date"]=exchange_rates["date"].dt.date
    df_merged=df_local[["date","amt"]].reset_index().merge(exchange_rates[["date","rate"]],on="date",how="left").set_index('index')

    # calculated exchanged amounts
    tar_label="amount_{}".format(tar_currency)
    df_merged[tar_label]=df_merged.apply(lambda x:round(x["amt"]*x["rate"],2),axis=1)

    return df_merged[tar_label]

In [13]:
prepared_df["amount_USD"]=sample_df["amt"].copy()
prepared_df["amount_GBP"]=prepare_amount(sample_df[["unix_time","amt"]],"amt","USD","GBP")

In [14]:
prepared_df.head(5)

Unnamed: 0,is_fraud,unix_time,amt,cc_id,person_id,gender_id,job_category,age,city_pop_round,merchant_id,merchant_category,seconds_from_start,small,medium,large,huge,amount_USD,amount_GBP
106559,0,1330724482,54.59,139,562,0,Professor Emeritus,65,100,136,personal_care,78082,1,0,0,0,54.59,34.39
491218,0,1343949448,16.46,652,302,1,Health and safety adviser,75,900,469,kids_pets,13303048,1,0,0,0,16.46,10.53
803149,0,1354682851,65.21,162,121,0,"Designer, industrial/product",25,391350,681,grocery_net,24036451,0,0,1,0,65.21,40.43
743312,0,1352865090,6.77,538,367,1,"Psychologist, clinical",57,2050,101,shopping_pos,22218690,1,0,0,0,6.77,4.27
934962,0,1357492170,42.18,323,864,0,Wellsite geologist,63,222750,469,kids_pets,26845770,0,0,1,0,42.18,26.15
