In [21]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

# Get Dummies

In [22]:
# Build a small, reproducible toy dataset: one categorical column and one numeric column.
np.random.seed(8675309)  # fixed seed so the simulated values are the same on every run

n = 100  # number of simulated rows
x1 = np.random.choice(["A", "B", "C"], size=n)  # group label drawn from three categories
x2 = np.random.normal(loc=0, scale=1, size=n)  # standard-normal score for each row

df = pd.DataFrame(dict(group=x1, happiness=x2))

df.head()

Unnamed: 0,group,happiness
0,B,-0.998062
1,A,1.152839
2,A,0.193641
3,A,0.356045
4,A,-1.046417


In [23]:
# Notice that the categorical variable `group` has more than 2 categories.

# Many algorithms want dummy/binary variables, so expand `group` into one indicator
# column per category. dtype=int keeps the indicators as 0/1 integers; without it,
# pandas >= 2.0 returns True/False booleans, which would not match the 1/0
# description below.
dummies = pd.get_dummies(df["group"], dtype=int)

# We now have 3 columns, A, B, and C, that are all binary variables indicating
# whether each person was A (1-yes, 0-no), B (1-yes, 0-no), or C (1-yes, 0-no).
print(dummies)

    A  B  C
0   0  1  0
1   1  0  0
2   1  0  0
3   1  0  0
4   1  0  0
.. .. .. ..
95  0  1  0
96  0  0  1
97  0  1  0
98  0  1  0
99  1  0  0

[100 rows x 3 columns]


In [24]:
# Add the dummy columns to the original data frame.

# Any dummy columns already present in `df` are dropped before concatenating, so
# re-running this cell does not pile up duplicate A/B/C columns. On the first run
# nothing is dropped (errors="ignore") and the result is the plain concatenation.
df = pd.concat(
    [df.drop(columns=dummies.columns, errors="ignore"), dummies],
    axis=1,
)  # there are other ways to do this too

df.head()

Unnamed: 0,group,happiness,A,B,C
0,B,-0.998062,0,1,0
1,A,1.152839,1,0,0
2,A,0.193641,1,0,0
3,A,0.356045,1,0,0
4,A,-1.046417,1,0,0


In [25]:
# Keeping the original `group` column is optional from here on. Whatever you keep,
# do NOT use both `group` AND the A/B/C dummy columns together as predictors.

df = df.loc[
    :, ["happiness", "A", "B", "C"]
]  # there are other ways to do this too

# BONUS: Dummy Variables with Reference Categories

Because in our example, A, B, and C are *mutually exclusive* (meaning no one can be both A and B, or B and C, or A and C at the same time), instead of having THREE binary/dummy variables, we can get rid of one.

For example, if we get rid of column `A`, then a person in group `A` is represented by having 0's for `B` AND `C`. A person in group `B` has a 1 for `B` and a 0 for `C`. And a person in group `C` has a 0 for `B` and a 1 for `C`.

The column we get rid of is called the **reference class** because we assume that anyone with all 0's for the other categories belongs to that (the reference) category.

For example:

In [26]:
# Each frame leaves out exactly one dummy column; the omitted column is the reference class.
df_referenceA = df.loc[:, ["happiness", "B", "C"]]  # A omitted -> reference class A
df_referenceB = df.loc[:, ["happiness", "A", "C"]]  # B omitted -> reference class B
df_referenceC = df.loc[:, ["happiness", "B", "A"]]  # C omitted -> reference class C

We often use reference classes when doing linear/logistic regression, because then the interpretation of each coefficient is in *comparison* to the reference class. For example:

In [27]:
from sklearn.linear_model import LinearRegression # Linear Regression Model
from sklearn.preprocessing import StandardScaler #Z-score variables

In [32]:
# Regress happiness on the B and C dummies only; A is left out, so it is the reference class.
predictors = ["B", "C"]
lm = LinearRegression().fit(df[predictors], df[["happiness"]])

# The target was passed as a one-column frame, so coef_ is indexed as coef_[0] (one
# row of per-predictor coefficients) and intercept_ as intercept_[0].
# The table is built in a single constructor call: DataFrame.append was removed in
# pandas 2.0, so appending the intercept row afterwards no longer works.
coefficients = pd.DataFrame({
    "Coef": [*lm.coef_[0], lm.intercept_[0]],
    "Name": [*predictors, "intercept"],
})

coefficients

Unnamed: 0,Coef,Name
0,0.000232,B
1,-0.223117,C
2,0.100947,intercept


We would interpret these effects as:

`B`: "Compared to being in group A, being in group B is associated with a 0.000232 increase in predicted happiness"

`C`: "Compared to being in group A, being in group C is associated with a 0.223117 decrease in predicted happiness"


Because A is the *reference* class, we are comparing all other effects to it.