# Handling non-missing categorical data

In [1]:
import pandas as pd
import category_encoders as ce

In [2]:
def read_data(path='data/kickstarter.csv'):
    """Load the Kickstarter campaigns CSV into a DataFrame.

    Parameters
    ----------
    path : str, path object or file-like, default 'data/kickstarter.csv'
        Source of the CSV; anything ``pd.read_csv`` accepts. The default keeps
        the original behaviour of reading the bundled data file.

    Returns
    -------
    pandas.DataFrame
        The file contents, with the ``deadline`` and ``launched`` columns
        parsed as datetimes.
    """
    return pd.read_csv(path, parse_dates=['deadline', 'launched'])

def read_data_for_encoding(path='data/kickstarter.csv'):
    """Load the Kickstarter CSV and prepare it for the encoding examples.

    On top of the raw columns, the returned frame gets:

    * ``week_day`` - name of the weekday on which the campaign was launched;
    * ``outcome``  - 1 when ``state`` is ``'successful'``, 0 otherwise.

    The ``name`` and ``state`` columns are removed afterwards.

    Parameters
    ----------
    path : str, path object or file-like, default 'data/kickstarter.csv'
        Source of the CSV; anything ``pd.read_csv`` accepts.

    Returns
    -------
    pandas.DataFrame
        The prepared frame, with ``week_day`` and ``outcome`` as the last two
        columns.
    """
    raw = pd.read_csv(path, parse_dates=['deadline', 'launched'])
    prepared = raw.assign(
        # Vectorised datetime accessor instead of a Python-level apply per row.
        week_day=raw['launched'].dt.day_name(),
        outcome=(raw['state'] == 'successful').astype(int),
    )
    # Non-mutating drop: returns a new frame rather than editing one in place.
    return prepared.drop(columns=['name', 'state'])

## Aggregate label

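Merging several raw labels into fewer classes (here, every `state` other than `successful` becomes a single negative class) can make the model perform better because it limits the number of choices the model has to learn. Only do this when the merged labels genuinely mean the same thing for the problem at hand.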

In [3]:
# Load the raw Kickstarter data and preview the first rows.
df = read_data()
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [4]:
# Distinct values of the raw `state` label.
label_choice = df["state"].unique()
label_choice

array(['failed', 'canceled', 'successful', 'live', 'undefined',
       'suspended'], dtype=object)

In [5]:
# Number of campaigns (non-null IDs) for each raw `state` value.
df.groupby('state')['ID'].count()

state
canceled       38779
failed        197719
live            2799
successful    133956
suspended       1846
undefined       3562
Name: ID, dtype: int64

In [6]:
# Binary target: 1 for 'successful' campaigns, 0 for every other state.
df = df.assign(
    outcome=lambda frame: frame['state'].eq('successful').astype(int)
)
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,outcome
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,0
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0,0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0,0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0,0


In [7]:
# Class balance of the aggregated binary label.
df.groupby('outcome')['ID'].count()

outcome
0    244705
1    133956
Name: ID, dtype: int64

In [8]:
# `state` is now redundant with `outcome`. Rebind to a new frame instead of
# mutating in place, so the object shown by earlier cells is left untouched.
df = df.drop(columns=['state'])
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,backers,country,usd pledged,usd_pledged_real,usd_goal_real,outcome
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,0,GB,0.0,0.0,1533.95,0
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,15,US,100.0,2421.0,30000.0,0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,3,US,220.0,220.0,45000.0,0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,1,US,1.0,1.0,5000.0,0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,14,US,1283.0,1283.0,19500.0,0


## Convert timestamps

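Timestamps can be parsed when reading the CSV (as `read_data` does with `parse_dates`) or afterwards. Once they are real datetimes, we can derive features such as hour, day, month and year, which may be more useful to the model than the raw timestamp.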

In [9]:
# Reload the raw data so this section starts again from the unmodified columns.
df = read_data()
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [10]:
# Names of the columns that were parsed as datetimes.
datetime_column = df.select_dtypes(include="datetime64").columns.tolist()
datetime_column

['deadline', 'launched']

In [11]:
# Split the launch timestamp into separate numeric components.
df = df.assign(
    launched_hour=lambda frame: frame["launched"].dt.hour,
    launched_day=lambda frame: frame["launched"].dt.day,
    launched_month=lambda frame: frame["launched"].dt.month,
    launched_year=lambda frame: frame["launched"].dt.year,
)
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,launched_hour,launched_day,launched_month,launched_year
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,12,11,8,2015
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,4,2,9,2017
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0,0,12,1,2013
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0,3,17,3,2012
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0,8,4,7,2015


## Category encoding

**One Hot Encoding**: creates one 0/1 column per distinct value of the category, where 1 marks the rows that have that value

* only use this method if the category doesn't have lots of unique values otherwise this will create lots of extra feature dimensions
* use if there is no ordinal relationship between the values

**Label Encoding**: replaces each text value with a number. This approach implies that the categories have an order, i.e. that the numbers carry a mathematical meaning.

* if a category has too many unique values, it is okay to use label encoding in order to limit the number of dimensions

**Count Encoding**: replaces each categorical value with the number of times it appears in the dataset

**Target Encoding**: replaces a categorical value with the average value of the target for that value of the feature. This reduces the variance of values with few occurrences. Call `fit()` on the training set only, otherwise the target leaks into the features; `transform()` must then be applied to every split.

**CatBoost Encoding**: similar to target encoding, except that the target probability for a row is calculated only from the rows that come before it. Call `fit()` on the training set only, otherwise the target leaks into the features; `transform()` must then be applied to every split.

In [12]:
# Fresh copy of the data with `week_day` and `outcome` added and `name`/`state` removed.
df = read_data_for_encoding()
df.head()

Unnamed: 0,ID,category,main_category,currency,deadline,goal,launched,pledged,backers,country,usd pledged,usd_pledged_real,usd_goal_real,week_day,outcome
0,1000002330,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,0,GB,0.0,0.0,1533.95,Tuesday,0
1,1000003930,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,15,US,100.0,2421.0,30000.0,Saturday,0
2,1000004038,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,3,US,220.0,220.0,45000.0,Saturday,0
3,1000007540,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,1,US,1.0,1.0,5000.0,Saturday,0
4,1000011046,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,14,US,1283.0,1283.0,19500.0,Saturday,0


In [13]:
# Inspect dtypes and non-null counts to spot the text (object) columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378661 entries, 0 to 378660
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   ID                378661 non-null  int64         
 1   category          378661 non-null  object        
 2   main_category     378661 non-null  object        
 3   currency          378661 non-null  object        
 4   deadline          378661 non-null  datetime64[ns]
 5   goal              378661 non-null  float64       
 6   launched          378661 non-null  datetime64[ns]
 7   pledged           378661 non-null  float64       
 8   backers           378661 non-null  int64         
 9   country           378661 non-null  object        
 10  usd pledged       374864 non-null  float64       
 11  usd_pledged_real  378661 non-null  float64       
 12  usd_goal_real     378661 non-null  float64       
 13  week_day          378661 non-null  object        
 14  outc

In [14]:
# Names of the text (object dtype) columns, i.e. the candidates for encoding.
object_columns = df.select_dtypes(include="object").columns.tolist()
object_columns

['category', 'main_category', 'currency', 'country', 'week_day']

In [15]:
def print_unique(df=pd.DataFrame(), columns=list()):
    """Return the number of distinct values for each requested column.

    Despite its name, this function prints nothing; it builds and returns
    the summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    columns : list
        Column labels of ``df`` to summarise, in the order they should appear
        in the result.

    Returns
    -------
    list of tuple
        One ``(column, distinct_count)`` pair per entry in ``columns``.
    """
    return [(name, len(df[name].unique())) for name in columns]

In [16]:
# Cardinality of each categorical column; helps decide which encoding suits which column.
print_unique(df=df, columns=["category", "main_category", "country", "week_day", "currency"])

[('category', 159),
 ('main_category', 15),
 ('country', 23),
 ('week_day', 7),
 ('currency', 14)]

In [17]:
# Column passed to the ordinal (label) encoder.
label_encoding_columns = ["week_day"]
# Column used for the count, target and CatBoost encoders (159 distinct values).
count_target_catboost_encoding_columns = ["category"]
# Columns passed to the one-hot encoder (15, 14 and 23 distinct values).
one_hot_encoding_columns = ["main_category", "currency", "country"] 
# Raw columns removed in the cleanup step once the encoded versions exist.
# NOTE(review): the one-hot encoded columns appear to be replaced by their
# indicator columns already, presumably why the cleanup drops with errors="ignore".
columns_to_drop_after_encoding = ["category", "main_category", "country", "currency", "deadline", "launched"]

### Label encoding

In [18]:
# Replace each `week_day` name with an integer code.
# NOTE(review): in the output below Tuesday -> 1 and Saturday -> 2, so the codes
# appear to follow order of first appearance rather than calendar order; confirm
# before relying on the numeric ordering.
label_encoder = ce.OrdinalEncoder(cols=label_encoding_columns)

df = label_encoder.fit_transform(df)
df.head()

Unnamed: 0,ID,category,main_category,currency,deadline,goal,launched,pledged,backers,country,usd pledged,usd_pledged_real,usd_goal_real,week_day,outcome
0,1000002330,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,0,GB,0.0,0.0,1533.95,1,0
1,1000003930,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,15,US,100.0,2421.0,30000.0,2,0
2,1000004038,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,3,US,220.0,220.0,45000.0,2,0
3,1000007540,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,1,US,1.0,1.0,5000.0,2,0
4,1000011046,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,14,US,1283.0,1283.0,19500.0,2,0


### One Hot Encoding

In [19]:
# Replace each column in `one_hot_encoding_columns` with one 0/1 indicator column
# per distinct value (named `<column>_<n>`, as shown in the output below).
one_hot_encoder = ce.OneHotEncoder(cols=one_hot_encoding_columns)

df = one_hot_encoder.fit_transform(df)
df.head()

Unnamed: 0,ID,category,main_category_1,main_category_2,main_category_3,main_category_4,main_category_5,main_category_6,main_category_7,main_category_8,...,country_19,country_20,country_21,country_22,country_23,usd pledged,usd_pledged_real,usd_goal_real,week_day,outcome
0,1000002330,Poetry,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0.0,0.0,1533.95,1,0
1,1000003930,Narrative Film,0,1,0,0,0,0,0,0,...,0,0,0,0,0,100.0,2421.0,30000.0,2,0
2,1000004038,Narrative Film,0,1,0,0,0,0,0,0,...,0,0,0,0,0,220.0,220.0,45000.0,2,0
3,1000007540,Music,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1.0,1.0,5000.0,2,0
4,1000011046,Film & Video,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1283.0,1283.0,19500.0,2,0


### Count encoding

In [20]:
# The encoder is created without `cols`; it is fitted only on the `category`
# column selected below.
count_encoder = ce.CountEncoder()

# Encode a copy of the column and join it back with a `_count` suffix, so the
# original `category` column stays available for the encoders that follow.
count_encoded = count_encoder.fit_transform(df[count_target_catboost_encoding_columns])
df = df.join(count_encoded.add_suffix("_count"))
df.head()

Unnamed: 0,ID,category,main_category_1,main_category_2,main_category_3,main_category_4,main_category_5,main_category_6,main_category_7,main_category_8,...,country_20,country_21,country_22,country_23,usd pledged,usd_pledged_real,usd_goal_real,week_day,outcome,category_count
0,1000002330,Poetry,1,0,0,0,0,0,0,0,...,0,0,0,0,0.0,0.0,1533.95,1,0,1369
1,1000003930,Narrative Film,0,1,0,0,0,0,0,0,...,0,0,0,0,100.0,2421.0,30000.0,2,0,5188
2,1000004038,Narrative Film,0,1,0,0,0,0,0,0,...,0,0,0,0,220.0,220.0,45000.0,2,0,5188
3,1000007540,Music,0,0,1,0,0,0,0,0,...,0,0,0,0,1.0,1.0,5000.0,2,0,15727
4,1000011046,Film & Video,0,1,0,0,0,0,0,0,...,0,0,0,0,1283.0,1283.0,19500.0,2,0,10108


In [21]:
# Campaigns per category, largest first; a cross-check for `category_count`
# (e.g. Music: 15727 in both outputs).
df.groupby('category')['ID'].count().sort_values(ascending=False)

category
Product Design     22314
Documentary        16139
Music              15727
Tabletop Games     14180
Shorts             12357
                   ...  
Residencies           69
Letterpress           49
Chiptune              35
Literary Spaces       27
Taxidermy             13
Name: ID, Length: 159, dtype: int64

### Target encoding

In [22]:
target_encoder = ce.TargetEncoder(cols=count_target_catboost_encoding_columns)

# NOTE: in a real pipeline, fit the encoder on the training split only and then
# transform every split. Fitting on the whole dataset, as done here purely for
# demonstration, leaks the target into the encoded feature.
target_encoded = target_encoder.fit_transform(df[count_target_catboost_encoding_columns], df["outcome"])
# Keep the raw column and add the encoded one with a `_target_encoded` suffix.
df = df.join(target_encoded.add_suffix("_target_encoded"))
df.head()

Unnamed: 0,ID,category,main_category_1,main_category_2,main_category_3,main_category_4,main_category_5,main_category_6,main_category_7,main_category_8,...,country_21,country_22,country_23,usd pledged,usd_pledged_real,usd_goal_real,week_day,outcome,category_count,category_target_encoded
0,1000002330,Poetry,1,0,0,0,0,0,0,0,...,0,0,0,0.0,0.0,1533.95,1,0,1369,0.355734
1,1000003930,Narrative Film,0,1,0,0,0,0,0,0,...,0,0,0,100.0,2421.0,30000.0,2,0,5188,0.388011
2,1000004038,Narrative Film,0,1,0,0,0,0,0,0,...,0,0,0,220.0,220.0,45000.0,2,0,5188,0.388011
3,1000007540,Music,0,0,1,0,0,0,0,0,...,0,0,0,1.0,1.0,5000.0,2,0,15727,0.408978
4,1000011046,Film & Video,0,1,0,0,0,0,0,0,...,0,0,0,1283.0,1283.0,19500.0,2,0,10108,0.30372


### CatBoost Encoding

In [23]:
catboost_encoder = ce.CatBoostEncoder(cols=count_target_catboost_encoding_columns)

# NOTE: as with target encoding, fit on the training split only in a real
# pipeline; fitting on the whole dataset here is for demonstration and leaks the
# target. The encoded values also depend on row order (two rows of the same
# category get different values in the output below).
catboost_encoded = catboost_encoder.fit_transform(df[count_target_catboost_encoding_columns], df["outcome"])
# Keep the raw column and add the encoded one with a `_catboost_encoded` suffix.
df = df.join(catboost_encoded.add_suffix("_catboost_encoded"))
df.head()

Unnamed: 0,ID,category,main_category_1,main_category_2,main_category_3,main_category_4,main_category_5,main_category_6,main_category_7,main_category_8,...,country_22,country_23,usd pledged,usd_pledged_real,usd_goal_real,week_day,outcome,category_count,category_target_encoded,category_catboost_encoded
0,1000002330,Poetry,1,0,0,0,0,0,0,0,...,0,0,0.0,0.0,1533.95,1,0,1369,0.355734,0.353762
1,1000003930,Narrative Film,0,1,0,0,0,0,0,0,...,0,0,100.0,2421.0,30000.0,2,0,5188,0.388011,0.353762
2,1000004038,Narrative Film,0,1,0,0,0,0,0,0,...,0,0,220.0,220.0,45000.0,2,0,5188,0.388011,0.176881
3,1000007540,Music,0,0,1,0,0,0,0,0,...,0,0,1.0,1.0,5000.0,2,0,15727,0.408978,0.353762
4,1000011046,Film & Video,0,1,0,0,0,0,0,0,...,0,0,1283.0,1283.0,19500.0,2,0,10108,0.30372,0.353762


## Cleanup

In [24]:
# Remove the raw categorical and datetime columns now that encoded versions exist.
# errors="ignore" skips labels that are no longer present in the frame.
# Rebinding (rather than inplace=True) leaves the previously displayed frame
# object untouched.
df = df.drop(columns=columns_to_drop_after_encoding, errors="ignore")
df.head()

Unnamed: 0,ID,main_category_1,main_category_2,main_category_3,main_category_4,main_category_5,main_category_6,main_category_7,main_category_8,main_category_9,...,country_22,country_23,usd pledged,usd_pledged_real,usd_goal_real,week_day,outcome,category_count,category_target_encoded,category_catboost_encoded
0,1000002330,1,0,0,0,0,0,0,0,0,...,0,0,0.0,0.0,1533.95,1,0,1369,0.355734,0.353762
1,1000003930,0,1,0,0,0,0,0,0,0,...,0,0,100.0,2421.0,30000.0,2,0,5188,0.388011,0.353762
2,1000004038,0,1,0,0,0,0,0,0,0,...,0,0,220.0,220.0,45000.0,2,0,5188,0.388011,0.176881
3,1000007540,0,0,1,0,0,0,0,0,0,...,0,0,1.0,1.0,5000.0,2,0,15727,0.408978,0.353762
4,1000011046,0,1,0,0,0,0,0,0,0,...,0,0,1283.0,1283.0,19500.0,2,0,10108,0.30372,0.353762


In [25]:
# Inspect dtypes and non-null counts of the encoded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378661 entries, 0 to 378660
Data columns (total 64 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   ID                         378661 non-null  int64  
 1   main_category_1            378661 non-null  int64  
 2   main_category_2            378661 non-null  int64  
 3   main_category_3            378661 non-null  int64  
 4   main_category_4            378661 non-null  int64  
 5   main_category_5            378661 non-null  int64  
 6   main_category_6            378661 non-null  int64  
 7   main_category_7            378661 non-null  int64  
 8   main_category_8            378661 non-null  int64  
 9   main_category_9            378661 non-null  int64  
 10  main_category_10           378661 non-null  int64  
 11  main_category_11           378661 non-null  int64  
 12  main_category_12           378661 non-null  int64  
 13  main_category_13           37