In [1]:
import pandas as pd
import numpy as np

In [2]:
def reduce_data_size(df):
    
    intcols = [c for c in df.columns if df[c].dtypes == 'int64']
    
    for c in intcols:
        
        mx = df[c].max()
        
        if mx < 255:
            df[c] = df[c].astype(np.uint8)
        elif mx < 65535:
            df[c] = df[c].astype(np.uint16)
        elif mx < 4294967295:
            df[c] = df[c].astype(np.uint32)
        
def top_k_categorical(df, feats, k, target):

    df = df.copy()
    top_k_feat = {}
    for feat in feats:
        top_k_values = (
                df.loc[df[target] == 1, feat]
                    .value_counts()[:k]
                    .index
        )
        top_k_feat[feat] = top_k_values
        df.loc[~df[feat].isin(top_k_values), feat] = 'other'

    dummy_df = pd.get_dummies(df[feats])
    return dummy_df, top_k_feat

def top_k_categorical_test(df, feats_dict):
    
    df = df.copy()
    for k, v in feats_dict.items():
        df.loc[~df[k].isin(v),k] = 'other'
        
    dummy_df = pd.get_dummies(df[list(feats_dict.keys())])
    return dummy_df

def create_ip_profile(df, key):
    df['ip_event_number'] = df.groupby(key).cumcount()
    df['ip_event_count'] = df.groupby(key).transform('count')
    return df 

def create_ts_dummy(df, col):
    df = pd.to_datetime(df[col]).reset_index()
    df['hour'] = df[col].dt.hour
    df['minute'] = df[col].dt.minute
    df['seconds'] = df[col].dt.second
    df.drop(['index', col],axis=1, inplace=True)
    return df

In [3]:
## Filepath to raw data
datapath = '/Users/daniellee/Desktop/Data/Datasets/talkingdata_fraud/'

## Load data 
cols = ['app','device','os','channel','is_attributed']
train = pd.read_csv(datapath+'train.csv', usecols=cols, nrows=10000000)
test = pd.read_csv(datapath+'test.csv', usecols=cols[:-1], nrows=10000000)    

## Get dummy data
dummy_train, feats_dict = top_k_categorical(train, cols[:-1], 10, 'is_attributed')
del train 

dummy_test = top_k_categorical_test(test, feats_dict)
del test

# Reduce size and pickle data
# reduce_data_size(dummy_train)
# reduce_data_size(dummy_test)

In [4]:
train_profile_raw, test_profile_raw = (
                            [pd.read_csv(datapath+'{}.csv'.format(n), usecols=['ip'], nrows=10000000) 
                               for n in ['train','test']]
)

train_profile, test_profile = (
                    [create_ip_profile(df, 'ip') 
                     for df in [train_profile_raw, test_profile_raw]]
)

In [5]:
train_ts_raw, test_ts_raw = (
                            [pd.read_csv(datapath+'{}.csv'.format(n), usecols=['click_time'], nrows=10000000) 
                               for n in ['train','test']]
)

train_ts, test_ts = (
                    [create_ts_dummy(df, 'click_time') 
                     for df in [train_ts_raw, test_ts_raw]]
)

In [6]:
new_train = pd.concat([dummy_train, train_profile_raw, train_profile_raw], axis=1)
new_test = pd.concat([dummy_test, test_profile_raw, test_profile_raw], axis=1)

reduce_data_size(new_train)
reduce_data_size(new_test)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
new_train.to_pickle(datapath+'m3_categorical_train')
new_test.to_pickle(datapath+'m3_categorical_test')