### 1. Import Dependecies

In [42]:
import numpy as np 
import pandas as pd 
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder,OrdinalEncoder

### 2. Load Data

In [43]:
df = pd.read_csv('data/processed/Missing_Values_Handled.csv')
df.head()

Unnamed: 0,Platform,Hashtag,Content_Type,Region,Views,Likes,Shares,Comments,Engagement_Level
0,TikTok,#Challenge,Video,UK,4163464,339431,53135,19346,High
1,Instagram,#Education,Shorts,India,4155940,215240,65860,27239,Medium
2,Twitter,#Challenge,Video,Brazil,3666211,327143,39423,36223,Medium
3,YouTube,#Education,Shorts,Australia,917951,127125,11687,36806,Low
4,TikTok,#Dance,Post,Brazil,64866,171361,69581,6376,Medium


### 3. Applying scalling and encoding

In [44]:
nominal_columns = ['Platform', 'Hashtag', 'Content_Type', 'Region']
numerical_columns = ['Views', 'Likes', 'Shares', 'Comments']
ordinal_columns = ['Engagement_Level']

In [45]:
numerical_transformer = Pipeline(
    steps=[('scaler', MinMaxScaler())]
)

nominal_transformer = Pipeline(
    steps=[('encoder', OneHotEncoder(sparse_output=False))]
)

ordinal_transformer = Pipeline(
    steps=[('encoder', OrdinalEncoder())]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('nom', nominal_transformer, nominal_columns),
        ('ord', ordinal_transformer, ordinal_columns)
    ],
    remainder='drop'
)

transformed_array = preprocessor.fit_transform(df)

# Get correct column names for OneHotEncoded nominal features
nominal_feature_names = preprocessor.named_transformers_['nom'].named_steps['encoder'].get_feature_names_out(nominal_columns)

# Combine all column names
all_feature_names = numerical_columns + list(nominal_feature_names) + ordinal_columns

# Create DataFrame
df_transformed = pd.DataFrame(transformed_array, columns=all_feature_names)

# Optional: reset index to match original DataFrame
df_transformed.index = df.index
df_transformed.head()

Unnamed: 0,Views,Likes,Shares,Comments,Platform_Instagram,Platform_TikTok,Platform_Twitter,Platform_YouTube,Hashtag_#Challenge,Hashtag_#Comedy,...,Content_Type_Video,Region_Australia,Region_Brazil,Region_Canada,Region_Germany,Region_India,Region_Japan,Region_UK,Region_USA,Engagement_Level
0,0.832745,0.678653,0.531223,0.386753,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.83124,0.429988,0.658567,0.544692,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0
2,0.733258,0.654049,0.394002,0.724462,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,0.183404,0.253558,0.116436,0.736128,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.012725,0.342131,0.695805,0.127224,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [None]:
df_transformed.to_csv('data/processed/scalled_and_encoded.csv', index=False)