## Encoding Categoricals

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
# Load the dataset from CSV and preview the first five rows.
data = pd.read_csv('regression_data.csv')
data.head(5)

Unnamed: 0,AVGGIFT,HV1_log,IC1_transformed,IC5_transformed,gender,TARGET_D
0,15.5,7.760467,17.343389,4.181353,Male,21.0
1,3.08,6.20859,16.230984,4.150313,Male,3.0
2,7.5,7.113956,18.047227,4.205057,Female,20.0
3,6.7,5.783825,11.73711,4.055333,Male,5.0
4,8.785714,6.64379,12.494862,4.088969,Female,10.0


In [42]:
# X-y split: every column except the target becomes a feature.
# Think about *when* to split: doing it before any transformation is fitted
# keeps information about the target / held-out rows out of the preprocessing.
X = data.drop(columns=['TARGET_D'])
y = data['TARGET_D']

In [4]:
# Split the features by dtype: numeric columns vs. categorical (object) columns.
X_num = X.select_dtypes(include=[np.number])
X_cat = X.select_dtypes(include=['object'])
# Quick one-off encoding: get_dummies encodes the object columns of X and passes
# the numeric ones through. It keeps no fitted state, so it won't do if you need
# to save the encoding and reapply it to new data later.
cat_data = pd.get_dummies(data=X, drop_first=True)
cat_data

Unnamed: 0,AVGGIFT,HV1_log,IC1_transformed,IC5_transformed,gender_Male,gender_U
0,15.500000,7.760467,17.343389,4.181353,1,0
1,3.080000,6.208590,16.230984,4.150313,1,0
2,7.500000,7.113956,18.047227,4.205057,0,0
3,6.700000,5.783825,11.737110,4.055333,1,0
4,8.785714,6.643790,12.494862,4.088969,0,0
...,...,...,...,...,...,...
4665,15.500000,6.552508,15.280561,4.187901,1,0
4666,11.857143,8.118803,15.029617,4.248632,1,0
4667,11.333333,6.551080,15.742430,4.179506,0,0
4668,13.000000,6.398595,13.326031,4.124159,1,0


In [29]:
# Per-gender summary: total AVGGIFT and mean TARGET_D, with gender kept as a column.
(
    data
    .groupby('gender', as_index=False)
    .agg({'AVGGIFT': 'sum', 'TARGET_D': 'mean'})
)

Unnamed: 0,gender,AVGGIFT,TARGET_D
0,Female,30911.090708,15.44646
1,Male,22465.747769,15.490148
2,U,1287.030611,14.837838


In [40]:
# Count the non-null `gender` entries for each (gender, AVGGIFT) pair; the result
# is a Series indexed by both keys.
# Optionally chain .reset_index(level='AVGGIFT') to move AVGGIFT out of the index.
data.groupby(['gender', 'AVGGIFT'])['gender'].count()

gender  AVGGIFT  
Female  2.260870     1
        2.315789     1
        2.439815     1
        2.571429     1
        2.588235     1
                    ..
U       23.000000    1
        24.750000    1
        25.000000    3
        27.285714    1
        57.500000    1
Name: gender, Length: 2140, dtype: int64

In [6]:
# Inspect the categorical subset that will be one-hot encoded (here only `gender`).
X_cat

Unnamed: 0,gender
0,Male
1,Male
2,Female
3,Male
4,Female
...,...
4665,Male
4666,Male
4667,Female
4668,Male


In [37]:
# In case you need to use the encoder somewhere else besides your notebook:
# a fitted OneHotEncoder stores the categories it saw, so the same encoding can
# be reapplied later. drop='first' removes the first category of each feature;
# handle_unknown='error' raises if transform() meets a category not seen in fit().
encoder = OneHotEncoder(drop='first', handle_unknown='error')
encoder.fit(X_cat)

OneHotEncoder(drop='first')

In [38]:
# Conceptual picture of the mapping after the .fit() (illustration only; this
# dict is not used by the encoder). The output columns are [Male, U]; 'Female',
# the first category, is dropped by drop='first' and so is encoded as all zeros.
{'Male': [1,0],
'Female': [0,0],
'U': [0,1]}

{'Male': [1, 0], 'Female': [0, 0], 'U': [0, 1]}

In [39]:
# Apply the fitted encoding; .toarray() converts the transform result into a
# dense NumPy array.
encoded = encoder.transform(X_cat).toarray()
encoded  # use encoded.shape to check the dimensions

array([[1., 0.],
       [1., 0.],
       [0., 0.],
       ...,
       [0., 0.],
       [1., 0.],
       [1., 0.]])

In [40]:
# Categories learned during fit, one array per input column. With drop='first',
# the first entry of each array is the one without its own output column.
encoder.categories_

[array(['Female', 'Male', 'U'], dtype=object)]

In [41]:
# Just for fun (the model can work with the plain NumPy array as well): wrap the
# encoded array in a DataFrame. Because drop='first' removed the first category,
# the remaining categories (everything after index 0) label the columns.
df = pd.DataFrame(encoded, columns=encoder.categories_[0][1:])
df

Unnamed: 0,Male,U
0,1.0,0.0
1,1.0,0.0
2,0.0,0.0
3,1.0,0.0
4,0.0,0.0
...,...,...
4665,1.0,0.0
4666,1.0,0.0
4667,0.0,0.0
4668,1.0,0.0


In [42]:
# Extra: also check out label encoding as an alternative to one-hot encoding.
# Below is the one-hot encoded array again, for reference.
encoded

array([[1., 0.],
       [1., 0.],
       [0., 0.],
       ...,
       [0., 0.],
       [1., 0.],
       [1., 0.]])

In [43]:
# Stack the numeric features and the one-hot columns side by side.
# NOTE: this rebinds X from a DataFrame to a plain NumPy array, so the earlier
# cells that call X.select_dtypes() cannot be re-run after this one without
# re-creating X first.
X = np.hstack([X_num, encoded])
X.shape

(4670, 6)

In [44]:
# train-test split: hold out 30% of the rows; the fixed seed keeps it reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

In [45]:
# Linear regression learns one coefficient per feature plus an intercept:
#   y = a1*x1 + a2*x2 + ... + b
# After fitting, those become concrete numbers, e.g. y = 2*x1 + 3*x2 + 1
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [46]:
# Predict the target for the held-out rows; one prediction per test row.
predictions = model.predict(X_test)
predictions.shape

(1401,)

In [47]:
# Evaluate on the held-out set: (R^2, RMSE, MSE).
# The `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so RMSE is derived from MSE instead;
# this works on every scikit-learn version and computes MSE only once.
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2_score(y_test, predictions), rmse, mse

(0.2705944957872364, 11.414726728148338, 130.29598627830407)

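###### IMPORTANT: to make predictions on new data, we have to process its X features in exactly the same way: the same columns in the same order, encoded with the already-fitted `encoder` (call `transform`, do not fit it again).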

#### Remember how a linear model works under the hood

[[15.5, 7.8, 17.3, 4.2, 1, 0],
 [...],
]

y = (0.7*x1) + (1.3*x2) + ...  + 8.2 = prediction