In [1]:
import datetime

import numpy as np
import pandas as pd

import scipy
import sklearn
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt

In [2]:
# Load the merged problem dataset; the path is relative to the notebook's working directory.
data_path = "../problem_merged_data.csv"
df = pd.read_csv(data_path)

# Preview the first two rows as a quick sanity check of the parsed columns.
df.head(2)

Unnamed: 0,group,item,channel,date,bid,budget,engagements,page_views,clicks,active_days,...,storySummary,IABCategory,targetGeo,targetInterest,targetAge,targetOs,targetDevices,targetGender,targetLanguages,CATEGORY_1
0,37,997,YAHOO,2021-12-21,0.263696,660.923913,138,388,411,4,...,Peace of Mind with AAA Roadside Service. Join ...,Travel,"{""channelId"":""YAHOO"",""channelName"":""Yahoo"",""re...",,,,,,,
1,37,1372,YAHOO,2021-12-21,0.263043,235.217391,80,253,270,4,...,Peace of Mind with AAA Roadside Service. Join ...,Travel,"{""channelId"":""YAHOO"",""channelName"":""Yahoo"",""re...",,,,,,,


In [3]:
# Row count first, then the column index of the loaded frame.
print(df.shape[0])
df.columns

1682


Index(['group', 'item', 'channel', 'date', 'bid', 'budget', 'engagements',
       'page_views', 'clicks', 'active_days', 'media_spend', 'media_cpc',
       'cpe', 'headline', 'storySummary', 'IABCategory', 'targetGeo',
       'targetInterest', 'targetAge', 'targetOs', 'targetDevices',
       'targetGender', 'targetLanguages', 'CATEGORY_1'],
      dtype='object')

In [4]:
# Columns treated as numerical features.
NUMERICAL_KEYS = [
    'bid',
    'budget',
    'engagements',
    'page_views',
    'clicks',
    'active_days',
    'media_spend',
    'media_cpc',
    'cpe',
]

# Columns treated as categorical features (identifiers, free text, targeting fields).
CATEGORICAL_KEYS = [
    'group',
    'item',
    'channel',
    'date',
    'headline',
    'storySummary',
    'IABCategory',
    'targetGeo',
    'targetInterest',
    'targetAge',
    'targetOs',
    'targetDevices',
    'targetGender',
    'targetLanguages',
    'CATEGORY_1',
]

In [5]:
# Split the raw frame into independent copies, one per feature family,
# so that later edits never write back into `df`.
df_num = df.loc[:, NUMERICAL_KEYS].copy()
df_cat = df.loc[:, CATEGORICAL_KEYS].copy()

### Numerical Feature Processing

In [6]:
# Preview the numerical subset before scaling.
df_num.head(2)

Unnamed: 0,bid,budget,engagements,page_views,clicks,active_days,media_spend,media_cpc,cpe
0,0.263696,660.923913,138,388,411,4,158.31,0.39,1.147174
1,0.263043,235.217391,80,253,270,4,90.48,0.34,1.131


In [7]:
# Standardise every numerical column (StandardScaler defaults: remove the mean,
# scale to unit variance). The import lives in the notebook's top import cell.
# NOTE(review): the scaler is fitted on all rows of df_num; if a train/test split
# is made later, it should be fitted on the training rows only — confirm.
standard_scaler = StandardScaler()

num_scaled = standard_scaler.fit_transform(df_num)

# Show the scaled matrix together with its (rows, columns) shape.
num_scaled, num_scaled.shape

(array([[-0.93033987,  2.89502353,  1.67493547, ...,  1.83290219,
         -0.4615066 , -0.22715236],
        [-0.93347963,  0.53877777,  0.50153163, ...,  0.55202735,
         -0.72246743, -0.24615987],
        [-0.43392439,  5.10740525,  3.96104985, ...,  6.95998942,
         -0.04396926,  0.43713683],
        ...,
        [-1.14070353, -0.50473574, -0.4897923 , ..., -0.6712525 ,
         -0.35712226, -0.6010323 ],
        [-1.04441768, -0.64291839, -0.71233441, ..., -0.90937474,
         -1.19219694, -0.80614259],
        [-1.01748056, -0.62063034, -0.22678799, ..., -0.61686772,
         -1.0878126 , -0.81196518]]),
 (1682, 9))

In [8]:
# NOTE(review): scratch check, not used by the pipeline — a non-empty list is
# truthy, so this displays True. Candidate for removal.
bool(["bid", "budget"])

True

### Categorical Feature Processing

In [9]:
# Preview the categorical subset before any cleaning.
df_cat.head(2)

Unnamed: 0,group,item,channel,date,headline,storySummary,IABCategory,targetGeo,targetInterest,targetAge,targetOs,targetDevices,targetGender,targetLanguages,CATEGORY_1
0,37,997,YAHOO,2021-12-21,Get Valuable Discounts with AAA. Join for 50% Off,Peace of Mind with AAA Roadside Service. Join ...,Travel,"{""channelId"":""YAHOO"",""channelName"":""Yahoo"",""re...",,,,,,,
1,37,1372,YAHOO,2021-12-21,Peace of Mind with AAA Roadside Service. Join Now,Peace of Mind with AAA Roadside Service. Join ...,Travel,"{""channelId"":""YAHOO"",""channelName"":""Yahoo"",""re...",,,,,,,


In [10]:
# Identifier columns are labels, not quantities: store them as text.
df_cat['group'] = df_cat['group'].map(str)
df_cat['item'] = df_cat['item'].map(str)

In [11]:
# Process DATE
def date_to_day(date):
    """Convert a ``YYYY-MM-DD`` date string into its weekday code, as a string.

    Parameters
    ----------
    date : str or missing value
        Date formatted as ``'%Y-%m-%d'``. A missing value (``None``/NaN) is
        returned unchanged so that the later NaN -> "UNK" fill can handle it,
        instead of crashing ``strptime`` with a ``TypeError``.

    Returns
    -------
    str or missing value
        ``'0'`` (Monday) through ``'6'`` (Sunday), or the untouched missing value.

    Raises
    ------
    ValueError
        If ``date`` is a string that does not match ``'%Y-%m-%d'``.
    """
    # Missing dates reach this function because the fill step runs afterwards.
    if pd.isna(date):
        return date
    dt = datetime.datetime.strptime(date, '%Y-%m-%d')
    return str(dt.weekday())

# Swap each date string for its weekday code, then preview the result.
df_cat['date'] = df_cat['date'].apply(date_to_day)
df_cat.head(2)

Unnamed: 0,group,item,channel,date,headline,storySummary,IABCategory,targetGeo,targetInterest,targetAge,targetOs,targetDevices,targetGender,targetLanguages,CATEGORY_1
0,37,997,YAHOO,1,Get Valuable Discounts with AAA. Join for 50% Off,Peace of Mind with AAA Roadside Service. Join ...,Travel,"{""channelId"":""YAHOO"",""channelName"":""Yahoo"",""re...",,,,,,,
1,37,1372,YAHOO,1,Peace of Mind with AAA Roadside Service. Join Now,Peace of Mind with AAA Roadside Service. Join ...,Travel,"{""channelId"":""YAHOO"",""channelName"":""Yahoo"",""re...",,,,,,,


In [12]:
# Every missing categorical value becomes the "UNK" placeholder.
df_cat = df_cat.fillna("UNK")
df_cat.head(2)

Unnamed: 0,group,item,channel,date,headline,storySummary,IABCategory,targetGeo,targetInterest,targetAge,targetOs,targetDevices,targetGender,targetLanguages,CATEGORY_1
0,37,997,YAHOO,1,Get Valuable Discounts with AAA. Join for 50% Off,Peace of Mind with AAA Roadside Service. Join ...,Travel,"{""channelId"":""YAHOO"",""channelName"":""Yahoo"",""re...",UNK,UNK,UNK,UNK,UNK,UNK,UNK
1,37,1372,YAHOO,1,Peace of Mind with AAA Roadside Service. Join Now,Peace of Mind with AAA Roadside Service. Join ...,Travel,"{""channelId"":""YAHOO"",""channelName"":""Yahoo"",""re...",UNK,UNK,UNK,UNK,UNK,UNK,UNK


In [13]:
# Add UNK line for Encoder: one sentinel row holding "UNK" in every column, so
# that "UNK" is among the values the encoder sees for each column.
unk_data = ["UNK"] * len(CATEGORICAL_KEYS)

df_unk_line = pd.DataFrame([unk_data], columns=df_cat.columns.to_list())

# ignore_index=True renumbers the rows; without it the sentinel row keeps label 0
# and duplicates the first data row's index label.
# NOTE(review): re-running this cell appends one more sentinel row each time,
# while the later `[:-1]` slice on the encoded matrix removes exactly one.
df_cat = pd.concat([df_cat, df_unk_line], ignore_index=True)
df_cat.tail(2)

Unnamed: 0,group,item,channel,date,headline,storySummary,IABCategory,targetGeo,targetInterest,targetAge,targetOs,targetDevices,targetGender,targetLanguages,CATEGORY_1
1681,83,1612,YAHOO,6,How little changes can lead to big results,Learn about how micro-habits and implementing ...,Medical Health,"{""channelId"":""YAHOO"",""channelName"":""Yahoo"",""co...","{""channelId"":""YAHOO"",""channelName"":""Yahoo"",""in...",65+;,IOS;,SMARTPHONES;,MALE;,en;,/Health/Women's Health
0,UNK,UNK,UNK,UNK,UNK,UNK,UNK,UNK,UNK,UNK,UNK,UNK,UNK,UNK,UNK


In [14]:
# Distinct values observed in each categorical column, keyed by column name.
dict_sets = {_key: list(set(df_cat[_key])) for _key in CATEGORICAL_KEYS}

In [15]:
##################################################
# -- To be used on test data --
# Check for unseen input values and replace them with UNK
##################################################

def replacement(_elem, _list, value="UNK"):
    """Return ``_elem`` when it is a member of ``_list``; otherwise return ``value``.

    Parameters
    ----------
    _elem : object
        Candidate value to check.
    _list : container
        Known values; any object supporting the ``in`` operator.
    value : object, default "UNK"
        Substitute returned when ``_elem`` is not a known value.
    """
    return _elem if _elem in _list else value


def unk_replacement(df, dict_sets):
    """Replace values absent from ``dict_sets`` with ``"UNK"``, column by column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sanitise. It is modified in place; columns that are not keys
        of ``dict_sets`` are left untouched. Cell values are expected to be
        hashable (e.g. strings or numbers).
    dict_sets : dict
        Maps a column name to the collection of values accepted for it.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after replacement.

    Raises
    ------
    KeyError
        If a key of ``dict_sets`` is not a column of ``df``.
    """
    for _key, _known in dict_sets.items():
        try:
            # Build a hash set once per column: each membership test is then a
            # constant-time lookup instead of a scan over the whole list.
            _lookup = frozenset(_known)
        except TypeError:
            # Unhashable known values: fall back to the caller's container.
            _lookup = _known
        # Bind _lookup as a default argument so the lambda does not depend on
        # the loop variable.
        df[_key] = df[_key].apply(lambda x, _lookup=_lookup: replacement(x, _lookup))
    return df

In [16]:
# Fit the ordinal encoder on the categorical frame, sentinel "UNK" row included.
# NOTE(review): OrdinalEncoder's `handle_unknown='use_encoded_value'` option may
# make the sentinel row unnecessary — it would change the produced codes, so
# confirm before switching.
ordinal_encoder = OrdinalEncoder()

cat_encoded = ordinal_encoder.fit_transform(df_cat)
# Drop the last encoded row: it is the sentinel "UNK" line appended to df_cat
# earlier, not a real sample.
cat_encoded = cat_encoded[:-1,:].copy()

# Show the encoded matrix together with its (rows, columns) shape.
cat_encoded, cat_encoded.shape

(array([[  0., 155.,   3., ...,   2.,   0.,   7.],
        [  0.,  40.,   3., ...,   2.,   0.,   7.],
        [  0.,  40.,   3., ...,   2.,   0.,   7.],
        ...,
        [  9., 101.,   3., ...,   0.,   1.,   6.],
        [  9.,  60.,   3., ...,   1.,   1.,   6.],
        [  9.,  60.,   3., ...,   1.,   1.,   6.]]),
 (1682, 15))