In [1]:
import pandas as pd

- [Encoding of categorical variables](#Encoding-of-categorical-variables)
- [Random variables](#Random-variables)

# Encoding of categorical variables

In [2]:
df_master = pd.DataFrame({
    'color' : ['yellow', 'blue', 'red', 'yellow', 'red', 'red'],
    'label' : [1, 2, 3, 1, 3, 3]
})
df_master

Unnamed: 0,color,label
0,yellow,1
1,blue,2
2,red,3
3,yellow,1
4,red,3
5,red,3


In [3]:
X, y = df_master[['color']], df_master.label
X

Unnamed: 0,color
0,yellow
1,blue
2,red
3,yellow
4,red
5,red


In [4]:
from sklearn.svm import SVC

model = SVC(kernel='linear')
model

SVC(kernel='linear')

## With pandas.get_dummiess

### Encode all data and then split to train and test sets

In [5]:
X_encoded = pd.get_dummies(X)
X_encoded

Unnamed: 0,color_blue,color_red,color_yellow
0,0,0,1
1,1,0,0
2,0,1,0
3,0,0,1
4,0,1,0
5,0,1,0


In [6]:
split_index = 3
X_train, X_test, y_train, y_test = X_encoded[:split_index], X_encoded[split_index:], y[:split_index], y[split_index:]
display(X_train); display(X_test)

Unnamed: 0,color_blue,color_red,color_yellow
0,0,0,1
1,1,0,0
2,0,1,0


Unnamed: 0,color_blue,color_red,color_yellow
3,0,0,1
4,0,1,0
5,0,1,0


- <span style="color:green;">Works fine</span>

### Split data to train and test and then encode each of them

In [7]:
X_train, X_test, y_train, y_test = X[:split_index], X[split_index:], y[:split_index], y[split_index:]
display(X_train); display(X_test)

Unnamed: 0,color
0,yellow
1,blue
2,red


Unnamed: 0,color
3,yellow
4,red
5,red


In [8]:
X_train_encoded = pd.get_dummies(X_train)
X_train_encoded

Unnamed: 0,color_blue,color_red,color_yellow
0,0,0,1
1,1,0,0
2,0,1,0


In [9]:
model.fit(X_train_encoded, y_test)

SVC(kernel='linear')

In [10]:
X_test_encoded = pd.get_dummies(X_test)
X_test_encoded

Unnamed: 0,color_red,color_yellow
3,0,1
4,1,0
5,1,0


In [11]:
model.predict(X_test_encoded)

ValueError: X.shape[1] = 2 should be equal to 3, the number of features at training time

- <span style="color:red;">The shape of the train and test color features is not the same (3 for train and 2 for test)</span>
- <span style="color:red;">The encoding is not the same: [1, 0] = red in test and [0, 1, 0] in train</span>

### New data

**- Known feature values**

In [12]:
new_data_1 = pd.DataFrame({'color': ['red', 'yellow', 'blue']})
new_data_1

Unnamed: 0,color
0,red
1,yellow
2,blue


In [13]:
new_data_1_encoded = pd.get_dummies(new_data_1)
new_data_1_encoded

Unnamed: 0,color_blue,color_red,color_yellow
0,0,1,0
1,0,0,1
2,1,0,0


<span style="color:green;">Works fine</span>

In [14]:
model.predict(new_data_1_encoded)

array([3, 1, 3])

- **Unknown feature values**

In [15]:
X_train.color.unique()

array(['yellow', 'blue', 'red'], dtype=object)

In [16]:
new_data_2 = pd.DataFrame({'color': ['purple', 'blue', 'yellow']})
display(new_data_2)

Unnamed: 0,color
0,purple
1,blue
2,yellow


In [17]:
new_data_2.color.unique()

array(['purple', 'blue', 'yellow'], dtype=object)

In [18]:
set(new_data_2.color.unique()) - set(X_train.color.unique())

{'purple'}

In [21]:
new_data_2_encoded = pd.get_dummies(new_data_2)
new_data_2_encoded

Unnamed: 0,color_blue,color_purple,color_yellow
0,0,1,0
1,1,0,0
2,0,0,1


In [22]:
model.predict(new_data_2_encoded)

array([3, 3, 1])

<span style="color:red;">The encoding worked even though the feature value is unknown</span>

## With OneHotEncoder

In [23]:
split_index = 3
X_train, X_test, y_train, y_test = X[:split_index], X[split_index:], y[:split_index], y[split_index:]
display(X_train); display(X_test)

Unnamed: 0,color
0,yellow
1,blue
2,red


Unnamed: 0,color
3,yellow
4,red
5,red


### Known features

- Train

In [24]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(handle_unknown='error', dtype=int, sparse=False)
encoder.fit(X_train.color.values.reshape(-1,1))

OneHotEncoder(dtype=<class 'int'>, sparse=False)

In [25]:
encoder.categories_

[array(['blue', 'red', 'yellow'], dtype=object)]

In [26]:
encoded_colors = encoder.transform(X_train.color.values.reshape(-1,1))
encoded_colors

array([[0, 0, 1],
       [1, 0, 0],
       [0, 1, 0]])

In [27]:
encoded_colors_columns = encoder.get_feature_names(X_train.columns)
encoded_colors_columns

array(['color_blue', 'color_red', 'color_yellow'], dtype=object)

In [28]:
encoded_colors_df = pd.DataFrame(data=encoded_colors, columns=encoded_colors_columns, index=X_train.index)
encoded_colors_df

Unnamed: 0,color_blue,color_red,color_yellow
0,0,0,1
1,1,0,0
2,0,1,0


In [29]:
X_encoded = X_train.copy().join(encoded_colors_df)
X_encoded

Unnamed: 0,color,color_blue,color_red,color_yellow
0,yellow,0,0,1
1,blue,1,0,0
2,red,0,1,0


In [30]:
X_encoded = X_encoded.drop('color', axis=1)
X_encoded

Unnamed: 0,color_blue,color_red,color_yellow
0,0,0,1
1,1,0,0
2,0,1,0


In [31]:
model.fit(X_encoded, y_train)

SVC(kernel='linear')

- Predict

In [32]:
encoded_colors_test = encoder.transform(X_test.color.values.reshape(-1,1))
encoded_colors_test

array([[0, 0, 1],
       [0, 1, 0],
       [0, 1, 0]])

In [33]:
encoded_colors_columns = encoder.get_feature_names(X_test.columns)
encoded_colors_columns

array(['color_blue', 'color_red', 'color_yellow'], dtype=object)

In [34]:
encoded_colors_test_df = pd.DataFrame(data=encoded_colors_test, columns=encoded_colors_columns, index=X_test.index)
encoded_colors_test_df

Unnamed: 0,color_blue,color_red,color_yellow
3,0,0,1
4,0,1,0
5,0,1,0


In [35]:
X_encoded_test = X_test.copy().join(encoded_colors_test_df).drop('color', axis=1)
X_encoded_test

Unnamed: 0,color_blue,color_red,color_yellow
3,0,0,1
4,0,1,0
5,0,1,0


In [36]:
model.predict(X_encoded_test)

array([1, 3, 3])

### Unknown features

In [37]:
X_train.color.unique()

array(['yellow', 'blue', 'red'], dtype=object)

In [38]:
new_data = pd.DataFrame({'color': ['purple', 'blue', 'yellow']})
display(new_data)

Unnamed: 0,color
0,purple
1,blue
2,yellow


- Raise an error

In [39]:
encoder.transform(new_data.color.values.reshape(-1,1))

ValueError: Found unknown categories ['purple'] in column 0 during transform

- Ignore exception

In [40]:
encoder_ignore = OneHotEncoder(handle_unknown='ignore', dtype=int, sparse=False)
encoder_ignore.fit(X_train.color.values.reshape(-1,1))

OneHotEncoder(dtype=<class 'int'>, handle_unknown='ignore', sparse=False)

In [41]:
encoder_ignore.transform(new_data.color.values.reshape(-1,1))

array([[0, 0, 0],
       [1, 0, 0],
       [0, 0, 1]])

In [42]:
model.predict(encoder_ignore.transform(new_data.color.values.reshape(-1,1)))

array([3, 2, 1])

# Random variables

In [43]:
# fixing seed

# Extracting functions to python modules

- Python path
- Re-importing functions at each import

## Python path

## Importing from python modules while changing code 

In [44]:
%load_ext autoreload

In [45]:
%aimport

Modules to reload:


Modules to skip:



In [46]:
%autoreload 2
%aimport

Modules to reload:
all-except-skipped

Modules to skip:



In [47]:
%autoreload 0
%aimport

Modules to reload:
all-except-skipped

Modules to skip:

