# One-Hot Encoding

In [5]:
import pandas as pd
import numpy as np

# Reproducible toy data: 100 colour labels sampled (with replacement) from three options.
np.random.seed(42)  # fixed seed so the sample is identical on every run
colors = ['Red', 'Green', 'Blue']
random_colors = np.random.choice(colors, 100)  # sampling with replacement is NumPy's default
df = pd.DataFrame(data={'color': random_colors})

# Print a caption, then let the notebook render the frame as the cell's output
print("Original DataFrame:")
df

Original DataFrame:


Unnamed: 0,color
0,Blue
1,Red
2,Blue
3,Blue
4,Red
...,...
95,Red
96,Red
97,Blue
98,Red


In [12]:
# One-hot encode the 'color' column: one indicator column per category,
# named 'color_<value>'. Passing dtype=int makes get_dummies emit 0/1 integers
# directly, so no separate bool -> int conversion pass is needed.
one_hot_encoded_df = pd.get_dummies(df['color'], prefix='color', dtype=int)

# Place the indicator columns next to the original column.
# axis=1 concatenates column-wise, aligning on the shared row index.
df_encoded = pd.concat([df, one_hot_encoded_df], axis=1)


In [13]:
# Display the final DataFrame with one-hot encoding
# (the original 'color' column followed by the color_* indicator columns).
print("\nDataFrame with One-Hot Encoding:")
# Bare last expression: the notebook renders df_encoded as the cell's output.
df_encoded


DataFrame with One-Hot Encoding:


Unnamed: 0,color,color_Blue,color_Green,color_Red
0,Blue,1,0,0
1,Red,0,0,1
2,Blue,1,0,0
3,Blue,1,0,0
4,Red,0,0,1
...,...,...,...,...
95,Red,0,0,1
96,Red,0,0,1
97,Blue,1,0,0
98,Red,0,0,1


# Feature Extraction / Selection

In [3]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so the mock data below is the same on every run
np.random.seed(42)

# Feature matrix X: 100 rows x 5 columns of random floats, columns feature1..feature5
X = pd.DataFrame(
    data=np.random.rand(100, 5),
    columns=[f'feature{i}' for i in range(1, 6)],
)

# Target y: 100 random 0/1 labels, stored as a Series named 'target'
y = pd.Series(data=np.random.randint(0, 2, size=100), name='target')

# Report the dimensions of both objects
print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

# Caption, then preview the first rows of X as the cell's rich output
print("\nMock DataFrame X:")
X.head()


Shape of X: (100, 5)
Shape of y: (100,)

Mock DataFrame X:


Unnamed: 0,feature1,feature2,feature3,feature4,feature5
0,0.37454,0.950714,0.731994,0.598658,0.156019
1,0.155995,0.058084,0.866176,0.601115,0.708073
2,0.020584,0.96991,0.832443,0.212339,0.181825
3,0.183405,0.304242,0.524756,0.431945,0.291229
4,0.611853,0.139494,0.292145,0.366362,0.45607


In [2]:
# Caption for the preview of the target labels
print("\nMock Series y:")
# Bare last expression: the notebook renders the first entries of y as the cell's output.
y.head()


Mock Series y:


0    1
1    0
2    0
3    0
4    0
Name: target, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split

# Sanity-check the inputs before splitting.
# X must have shape (m, n), where m is the number of rows (samples)
# and n is the number of columns (features); neither may be zero.
assert X.shape[0] > 0, "X has no rows"
assert X.shape[1] > 0, "X has no columns"

assert y.shape[0] > 0, "y has no rows"
# Features and labels must line up one-to-one; fail early with a clear message
# rather than deep inside the split.
assert X.shape[0] == y.shape[0], (
    f"X and y disagree on row count: {X.shape[0]} vs {y.shape[0]}"
)
# assert y.ndim == 1

# random_state: you can choose whatever number you want, but fixing it ensures
# that today or tomorrow you will ALWAYS get the same split for the same value,
# so results can be compared across runs.
# test_size=0.3 holds out 30% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=999
)

In [None]:
# Inspect the shapes of all four pieces produced by the split
# (order: X_train, X_test, y_train, y_test).
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
# One last thing to do: ensure no feature column has dtype "object".
# info() prints a per-column summary for a visual check.
X_train.info()

# Enforce the check instead of relying on eyeballing the summary above.
object_columns = X_train.columns[X_train.dtypes == object].tolist()
assert not object_columns, f"object-dtype columns still present: {object_columns}"

In [None]:
# Print the same kind of summary for the training labels.
# NOTE(review): Series.info() appears to exist only in newer pandas releases (1.4+) — confirm the installed version.
y_train.info()