In [289]:
import numpy as np 
import pandas as pd

## 1. One hot Encoding
### This can be performed using pd.get_dummies()[](http://)

In [290]:
titanic_df = pd.read_csv('../input/titanic/train.csv',usecols=['PassengerId','Age','Sex','Survived'])
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Sex,Age
0,1,0,male,22.0
1,2,1,female,38.0
2,3,1,female,26.0
3,4,1,female,35.0
4,5,0,male,35.0


In [291]:
#create new dataframe for the dummie variables for 'Sex'
sex = pd.get_dummies(titanic_df['Sex'].dropna(),drop_first=True)

In [292]:
#concatentating new df and removing old column feature from df
titanic_df = pd.concat([titanic_df,sex],axis=1)
titanic_df.drop('Sex',axis=1)

Unnamed: 0,PassengerId,Survived,Age,male
0,1,0,22.0,1
1,2,1,38.0,0
2,3,1,26.0,0
3,4,1,35.0,0
4,5,0,35.0,1
...,...,...,...,...
886,887,0,27.0,1
887,888,1,19.0,0
888,889,0,,0
889,890,1,26.0,1


In [293]:
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Sex,Age,male
0,1,0,male,22.0,1
1,2,1,female,38.0,0
2,3,1,female,26.0,0
3,4,1,female,35.0,0
4,5,0,male,35.0,1


## One hot encoding with many categories in a feature
### with reference to KDD orange cup challenge:


### pd.get_dummies() is not applicable since it takes all categories in a feature and in this case, we only select top 10 features, we have to do manuallu using pd.where()

In [294]:
merc = pd.read_csv('../input/mercedes-benz-greener-manufacturing/train.csv.zip',usecols=['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8'])
merc.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8
0,k,v,at,a,d,u,j,o
1,k,t,av,e,d,y,l,o
2,az,w,n,c,d,x,j,x
3,az,t,n,f,d,x,l,e
4,az,v,n,f,d,h,d,n


In [295]:
for i in merc.columns:
    print('The feature {} has {} categories in it'.format(i,merc[i].nunique()))

The feature X0 has 47 categories in it
The feature X1 has 27 categories in it
The feature X2 has 44 categories in it
The feature X3 has 7 categories in it
The feature X4 has 4 categories in it
The feature X5 has 29 categories in it
The feature X6 has 12 categories in it
The feature X8 has 25 categories in it


In [296]:
#now taking X1 as it has around 47 categories in it and list top 10 categories in it.
lst_10=merc.X1.value_counts().sort_values(ascending=False).head(10).index
lst_10=list(lst_10)
lst_10

['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']

In [297]:
for category in lst_10:
    merc[category] = np.where(merc['X1']==category,1,0)

In [298]:
lst_10.append('X1')
lst_10

['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o', 'X1']

In [299]:
merc[lst_10].head()

Unnamed: 0,aa,s,b,l,v,r,i,a,c,o,X1
0,0,0,0,0,1,0,0,0,0,0,v
1,0,0,0,0,0,0,0,0,0,0,t
2,0,0,0,0,0,0,0,0,0,0,w
3,0,0,0,0,0,0,0,0,0,0,t
4,0,0,0,0,1,0,0,0,0,0,v


## 2. Ordinal Number Encoding

### here now when we apply categories in a ranking order

In [300]:
import datetime as dt

In [301]:
date_today = datetime.datetime.today()

In [302]:
days = [date_today - datetime.timedelta(x) for x in range(0,15)]
days

[datetime.datetime(2020, 8, 1, 18, 3, 57, 961306),
 datetime.datetime(2020, 7, 31, 18, 3, 57, 961306),
 datetime.datetime(2020, 7, 30, 18, 3, 57, 961306),
 datetime.datetime(2020, 7, 29, 18, 3, 57, 961306),
 datetime.datetime(2020, 7, 28, 18, 3, 57, 961306),
 datetime.datetime(2020, 7, 27, 18, 3, 57, 961306),
 datetime.datetime(2020, 7, 26, 18, 3, 57, 961306),
 datetime.datetime(2020, 7, 25, 18, 3, 57, 961306),
 datetime.datetime(2020, 7, 24, 18, 3, 57, 961306),
 datetime.datetime(2020, 7, 23, 18, 3, 57, 961306),
 datetime.datetime(2020, 7, 22, 18, 3, 57, 961306),
 datetime.datetime(2020, 7, 21, 18, 3, 57, 961306),
 datetime.datetime(2020, 7, 20, 18, 3, 57, 961306),
 datetime.datetime(2020, 7, 19, 18, 3, 57, 961306),
 datetime.datetime(2020, 7, 18, 18, 3, 57, 961306)]

In [303]:
#changing days to a dataframe
days_df = pd.DataFrame(days,columns=['Date'])
days_df

Unnamed: 0,Date
0,2020-08-01 18:03:57.961306
1,2020-07-31 18:03:57.961306
2,2020-07-30 18:03:57.961306
3,2020-07-29 18:03:57.961306
4,2020-07-28 18:03:57.961306
5,2020-07-27 18:03:57.961306
6,2020-07-26 18:03:57.961306
7,2020-07-25 18:03:57.961306
8,2020-07-24 18:03:57.961306
9,2020-07-23 18:03:57.961306


In [304]:
days_df['Day'] = days_df['Date'].dt.day_name()
days_df

Unnamed: 0,Date,Day
0,2020-08-01 18:03:57.961306,Saturday
1,2020-07-31 18:03:57.961306,Friday
2,2020-07-30 18:03:57.961306,Thursday
3,2020-07-29 18:03:57.961306,Wednesday
4,2020-07-28 18:03:57.961306,Tuesday
5,2020-07-27 18:03:57.961306,Monday
6,2020-07-26 18:03:57.961306,Sunday
7,2020-07-25 18:03:57.961306,Saturday
8,2020-07-24 18:03:57.961306,Friday
9,2020-07-23 18:03:57.961306,Thursday


In [305]:
#day to number mapping
day_map_dict = {'Sunday':7,'Saturday':6,'Friday':5,'Thursday':4,'Wednesday':3,'Tuesday':2,'Monday':1}
day_map_dict

{'Sunday': 7,
 'Saturday': 6,
 'Friday': 5,
 'Thursday': 4,
 'Wednesday': 3,
 'Tuesday': 2,
 'Monday': 1}

In [306]:
days_df['Day_Ordinal'] = days_df['Day'].map(day_map_dict)
days_df

Unnamed: 0,Date,Day,Day_Ordinal
0,2020-08-01 18:03:57.961306,Saturday,6
1,2020-07-31 18:03:57.961306,Friday,5
2,2020-07-30 18:03:57.961306,Thursday,4
3,2020-07-29 18:03:57.961306,Wednesday,3
4,2020-07-28 18:03:57.961306,Tuesday,2
5,2020-07-27 18:03:57.961306,Monday,1
6,2020-07-26 18:03:57.961306,Sunday,7
7,2020-07-25 18:03:57.961306,Saturday,6
8,2020-07-24 18:03:57.961306,Friday,5
9,2020-07-23 18:03:57.961306,Thursday,4


## 3. Count or Frequency Encoding

In [307]:
ad_df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',header=None,usecols=[1,3,5,6,7,8,9,13])
ad_df.head()

Unnamed: 0,1,3,5,6,7,8,9,13
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [308]:
#assigning new names to columns
ad_df.columns = ['Employment','Degree','Status','Designation','family_job','Race','Sex','Country']

In [309]:
ad_df.head()

Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Sex,Country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [310]:
for i in ad_df.columns:
    print('The feature {} has {} categories'.format(i,ad_df[i].nunique()))

The feature Employment has 9 categories
The feature Degree has 16 categories
The feature Status has 7 categories
The feature Designation has 15 categories
The feature family_job has 6 categories
The feature Race has 5 categories
The feature Sex has 2 categories
The feature Country has 42 categories


In [311]:
#the feature has many categories and it can be done using frequency encoding
frequency_map_dict = ad_df['Country'].value_counts().sort_values(ascending=False).to_dict()
frequency_map_dict

{' United-States': 29170,
 ' Mexico': 643,
 ' ?': 583,
 ' Philippines': 198,
 ' Germany': 137,
 ' Canada': 121,
 ' Puerto-Rico': 114,
 ' El-Salvador': 106,
 ' India': 100,
 ' Cuba': 95,
 ' England': 90,
 ' Jamaica': 81,
 ' South': 80,
 ' China': 75,
 ' Italy': 73,
 ' Dominican-Republic': 70,
 ' Vietnam': 67,
 ' Guatemala': 64,
 ' Japan': 62,
 ' Poland': 60,
 ' Columbia': 59,
 ' Taiwan': 51,
 ' Haiti': 44,
 ' Iran': 43,
 ' Portugal': 37,
 ' Nicaragua': 34,
 ' Peru': 31,
 ' France': 29,
 ' Greece': 29,
 ' Ecuador': 28,
 ' Ireland': 24,
 ' Hong': 20,
 ' Cambodia': 19,
 ' Trinadad&Tobago': 19,
 ' Laos': 18,
 ' Thailand': 18,
 ' Yugoslavia': 16,
 ' Outlying-US(Guam-USVI-etc)': 14,
 ' Hungary': 13,
 ' Honduras': 13,
 ' Scotland': 12,
 ' Holand-Netherlands': 1}

In [312]:
ad_df['country_freq'] = ad_df['Country'].map(frequency_map_dict)
ad_df.head()
# now, last column is replaced by the frequency of times the country present in the dataset.

Unnamed: 0,Employment,Degree,Status,Designation,family_job,Race,Sex,Country,country_freq
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,29170
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,29170
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,29170
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,29170
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,95


## 4. Target Guided Ordinal Encoding

In [313]:
tit_df = pd.read_csv('../input/titanic/train.csv',usecols=['Survived','Cabin'])
tit_df['Cabin'].fillna('Missing',inplace=True)

In [314]:
tit_df.head()

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing


In [315]:
tit_df['Cabin'] = tit_df['Cabin'].astype(str).str[0]
tit_df.head()

Unnamed: 0,Survived,Cabin
0,0,M
1,1,C
2,1,M
3,1,C
4,0,M


In [316]:
ordinal_labels = tit_df.groupby('Cabin')['Survived'].mean().sort_values().index
ordinal_labels

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [317]:
ordinal_labels_2 = {k:i for i,k in enumerate(ordinal_labels)}
ordinal_labels_2

{'T': 0, 'M': 1, 'A': 2, 'G': 3, 'C': 4, 'F': 5, 'B': 6, 'E': 7, 'D': 8}

In [318]:
tit_df['Cabin_ordinal_labels'] = tit_df['Cabin'].map(ordinal_labels_2)
tit_df.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels
0,0,M,1
1,1,C,4
2,1,M,1
3,1,C,4
4,0,M,1


## 5. Mean Encoding

In [319]:
mean_ordinal=tit_df.groupby(['Cabin'])['Survived'].mean().to_dict()
mean_ordinal

{'A': 0.4666666666666667,
 'B': 0.7446808510638298,
 'C': 0.5932203389830508,
 'D': 0.7575757575757576,
 'E': 0.75,
 'F': 0.6153846153846154,
 'G': 0.5,
 'M': 0.29985443959243085,
 'T': 0.0}

In [320]:
tit_df['Mean_Ordinal'] = tit_df['Cabin'].map(mean_ordinal)
tit_df.head()

Unnamed: 0,Survived,Cabin,Cabin_ordinal_labels,Mean_Ordinal
0,0,M,1,0.299854
1,1,C,4,0.59322
2,1,M,1,0.299854
3,1,C,4,0.59322
4,0,M,1,0.299854


## 6. Probability Ratio Encoding

In [321]:
import pandas as pd
ratio_df = pd.read_csv('../input/titanic/train.csv',usecols=['Survived','Cabin'])
ratio_df.head()

Unnamed: 0,Survived,Cabin
0,0,
1,1,C85
2,1,
3,1,C123
4,0,


In [322]:
ratio_df.fillna('Missing',inplace=True)
ratio_df.head()

Unnamed: 0,Survived,Cabin
0,0,Missing
1,1,C85
2,1,Missing
3,1,C123
4,0,Missing


In [323]:
ratio_df['Cabin'] = ratio_df['Cabin'].astype(str).str[0]

In [324]:
prob_df = pd.DataFrame(ratio_df.groupby('Cabin')['Survived'].mean())
prob_df

Unnamed: 0_level_0,Survived
Cabin,Unnamed: 1_level_1
A,0.466667
B,0.744681
C,0.59322
D,0.757576
E,0.75
F,0.615385
G,0.5
M,0.299854
T,0.0


In [325]:
prob_df['died'] = 1 - prob_df['Survived']
prob_df

Unnamed: 0_level_0,Survived,died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25
F,0.615385,0.384615
G,0.5,0.5
M,0.299854,0.700146
T,0.0,1.0


In [326]:
prob_df['Probability_Ratio'] = (prob_df['Survived'])/(prob_df['died'])
prob_df

Unnamed: 0_level_0,Survived,died,Probability_Ratio
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.466667,0.533333,0.875
B,0.744681,0.255319,2.916667
C,0.59322,0.40678,1.458333
D,0.757576,0.242424,3.125
E,0.75,0.25,3.0
F,0.615385,0.384615,1.6
G,0.5,0.5,1.0
M,0.299854,0.700146,0.428274
T,0.0,1.0,0.0


In [327]:
probability_encoded=prob_df['Probability_Ratio'].to_dict()
probability_encoded

{'A': 0.875,
 'B': 2.916666666666666,
 'C': 1.4583333333333333,
 'D': 3.125,
 'E': 3.0,
 'F': 1.6000000000000003,
 'G': 1.0,
 'M': 0.42827442827442824,
 'T': 0.0}

In [328]:
ratio_df['Cabin_encoded'] = ratio_df['Cabin'].map(probability_encoded)
ratio_df.head(20)

Unnamed: 0,Survived,Cabin,Cabin_encoded
0,0,M,0.428274
1,1,C,1.458333
2,1,M,0.428274
3,1,C,1.458333
4,0,M,0.428274
5,0,M,0.428274
6,0,E,3.0
7,0,M,0.428274
8,1,M,0.428274
9,1,M,0.428274
