# One-Hot Encoding

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from seaborn import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Load seaborn's example 'flights' dataset
flights = load_dataset(name='flights')

## Data

In [None]:
# Preview the first five rows of the dataset

flights.head(n=5)

## Split

In [None]:
# Hold out a test set: year and month are the features (X),
# passengers is the target (y). random_state is fixed so the split
# is reproducible.

features = flights[['year', 'month']]
target = flights['passengers']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, random_state=42
)

## One-Hot Encode!

In [None]:
# Create the encoder and fit it on the training data only, restricted
# to the columns we want to one-hot encode

columns_to_encode = ['month']
ohe = OneHotEncoder()
ohe.fit(X_train[columns_to_encode])

In [None]:
# Apply the fitted encoder to the training column(s)

train_columns = X_train[columns_to_encode]
encoded = ohe.transform(train_columns)
encoded

## Inflating

In [None]:
# Inflate! Expand the sparse result into a dense array.
# toarray() returns a regular NumPy ndarray, whereas todense() returns
# the legacy numpy.matrix type that NumPy no longer recommends.

encoded.toarray()

## Getting New Feature Names

In [None]:
# Access the names of the encoded output columns.
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement.

ohe.get_feature_names_out()

## Put into DataFrame

In [None]:
# Turn the encoded training data into a DataFrame, reusing X_train's index
# so the rows line up when this is concatenated with X_train.
# get_feature_names_out() replaces get_feature_names(), which was removed
# in scikit-learn 1.2; toarray() yields a plain ndarray instead of np.matrix.

new_train_df = pd.DataFrame(encoded.toarray(),
                            columns=ohe.get_feature_names_out(),
                            index=X_train.index)
new_train_df.head()

## All Together

In [None]:
# Remove the original 'month' column from X_train, then place the
# one-hot columns alongside what remains

df_train_concat = pd.concat(
    [X_train.drop(columns='month'), new_train_df],
    axis=1,
)
df_train_concat.head()

## Model Training

In [None]:
# Train a linear regression on the encoded training features

lr = LinearRegression()
lr.fit(X=df_train_concat, y=y_train)

In [None]:
# Evaluate the fitted model on the training data (R^2 for regressors)

lr.score(X=df_train_concat, y=y_train)

## Test Set

In [None]:
# Encode the test column(s) with the encoder already fitted on the
# training data (transform only, no re-fitting)
test_columns = X_test[columns_to_encode]
test_encoded = ohe.transform(test_columns)

In [None]:
# Turn the encoded test data into a DataFrame, reusing X_test's index
# so the rows line up when this is concatenated with X_test.
# get_feature_names_out() replaces get_feature_names(), which was removed
# in scikit-learn 1.2; toarray() yields a plain ndarray instead of np.matrix.
new_test_df = pd.DataFrame(test_encoded.toarray(),
                           columns=ohe.get_feature_names_out(),
                           index=X_test.index)
new_test_df.head()

## Final Test Prep

In [None]:
# Remove the original 'month' column from X_test, then place the
# one-hot columns alongside what remains
df_test_concat = pd.concat(
    [X_test.drop(columns='month'), new_test_df],
    axis=1,
)
df_test_concat.head()

## Model Score on Test

In [None]:
# Evaluate the trained model on the held-out test data (R^2 for regressors)
lr.score(X=df_test_concat, y=y_test)