### 1. Import Dependecies

In [9]:
import numpy as np 
import pandas as pd 
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder,OrdinalEncoder

### 2. Load Data

In [10]:
df = pd.read_csv('data/processed/Outliers_Handled.csv')
df.head()

Unnamed: 0,Platform,Hashtag,Content_Type,Region,Views,Likes,Shares,Comments,Engagement_Level,Like_Rate,Share_Rate,Comment_Rate,dominant_engagement_type
0,TikTok,#Challenge,Video,UK,4163464.0,339431.0,53135.0,19346.0,High,0.081526,0.012762,0.004647,Likes
1,Instagram,#Education,Shorts,India,4155940.0,215240.0,65860.0,27239.0,Medium,0.051791,0.015847,0.006554,Likes
2,Twitter,#Challenge,Video,Brazil,3666211.0,327143.0,39423.0,36223.0,Medium,0.089232,0.010753,0.00988,Likes
3,YouTube,#Education,Shorts,Australia,917951.0,127125.0,11687.0,36806.0,Low,0.138488,0.012732,0.040096,Likes
4,TikTok,#Dance,Post,Brazil,64866.0,171361.0,69581.0,6376.0,Medium,2.641769,1.072688,0.098295,Likes


### 3. Applying scalling and encoding

In [11]:
nominal_columns = ['Platform', 'Hashtag', 'Content_Type', 'Region','dominant_engagement_type']
numerical_columns = ['Views', 'Likes', 'Shares', 'Comments','Like_Rate','Share_Rate','Comment_Rate']
ordinal_columns = ['Engagement_Level']

In [12]:
numerical_transformer = Pipeline(
    steps=[('scaler', MinMaxScaler())]
)

nominal_transformer = Pipeline(
    steps=[('encoder', OneHotEncoder(sparse_output=False))]
)

ordinal_transformer = Pipeline(
    steps=[('encoder', OrdinalEncoder())]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('nom', nominal_transformer, nominal_columns),
        ('ord', ordinal_transformer, ordinal_columns)
    ],
    remainder='drop'
)

transformed_array = preprocessor.fit_transform(df)

# Get correct column names for OneHotEncoded nominal features
nominal_feature_names = preprocessor.named_transformers_['nom'].named_steps['encoder'].get_feature_names_out(nominal_columns)

# Combine all column names
all_feature_names = numerical_columns + list(nominal_feature_names) + ordinal_columns

# Create DataFrame
df_transformed = pd.DataFrame(transformed_array, columns=all_feature_names)

# Optional: reset index to match original DataFrame
df_transformed.index = df.index
df_transformed.head()

Unnamed: 0,Views,Likes,Shares,Comments,Like_Rate,Share_Rate,Comment_Rate,Platform_Instagram,Platform_TikTok,Platform_Twitter,...,Region_Canada,Region_Germany,Region_India,Region_Japan,Region_UK,Region_USA,dominant_engagement_type_Comments,dominant_engagement_type_Likes,dominant_engagement_type_Shares,Engagement_Level
0,0.832745,0.678653,0.531223,0.386753,0.000323,0.000674,0.000162,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.83124,0.429988,0.658567,0.544692,0.000205,0.000837,0.000229,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0
2,0.733258,0.654049,0.394002,0.724462,0.000354,0.000568,0.000346,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0
3,0.183404,0.253558,0.116436,0.736128,0.000549,0.000672,0.001405,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,0.012725,0.342131,0.695805,0.127224,0.010489,0.056699,0.003445,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0


In [13]:
df_transformed.to_csv('data/processed/scalled_and_encoded.csv', index=False)