### 01. Import Dependecies

In [5]:
import numpy as np 
import pandas as pd 
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder

### 02. Load Data

In [2]:
df = pd.read_csv('data/processed/Outliers_Handled.csv')
df.head()

Unnamed: 0,Platform,Hashtag,Content_Type,Region,Views,Likes,Shares,Comments,Engagement_Level,Like_Rate,Share_Rate,Comment_Rate,engagement_rate,like_to_comment_ratio,share_to_like_ratio,Views_norm
0,TikTok,#Challenge,Video,UK,4163464.0,339431.0,53135.0,19346.0,High,0.081526,0.012762,0.004647,0.098935,17.544374,0.156541,1.655443
1,Instagram,#Education,Shorts,India,4155940.0,215240.0,65860.0,27239.0,Medium,0.051791,0.015847,0.006554,0.074192,7.901615,0.305983,1.728703
2,Twitter,#Challenge,Video,Brazil,3666211.0,327143.0,39423.0,36223.0,Medium,0.089232,0.010753,0.00988,0.109865,9.031112,0.120507,1.462971
3,YouTube,#Education,Shorts,Australia,917951.0,127125.0,11687.0,36806.0,Low,0.138488,0.012732,0.040096,0.191315,3.453827,0.091932,0.360596
4,Instagram,#Challenge,Shorts,Australia,1323566.0,136282.0,86979.0,47129.0,Low,0.102966,0.065716,0.035608,0.204289,2.891619,0.638223,0.55055


### 03. Applying scalling and encoding

In [3]:
nominal_columns = ['Platform', 'Hashtag', 'Content_Type', 'Region']
numerical_columns = [
    'Views', 'Likes', 'Shares', 'Comments',
    'Like_Rate', 'Share_Rate', 'Comment_Rate',
    'engagement_rate', 'like_to_comment_ratio',
    'share_to_like_ratio', 'Views_norm'
]
ordinal_columns = ['Engagement_Level']

In [6]:
# === Define transformations ===
numerical_transformer = Pipeline(
    steps=[('scaler', StandardScaler())]   # <-- Standardization (Z-score)
)

nominal_transformer = Pipeline(
    steps=[('encoder', OneHotEncoder(sparse_output=False))]
)

ordinal_transformer = Pipeline(
    steps=[('encoder', OrdinalEncoder())]
)

# === Combine transformations ===
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('nom', nominal_transformer, nominal_columns),
        ('ord', ordinal_transformer, ordinal_columns)
    ],
    remainder='drop'
)

# === Apply transformations ===
transformed_array = preprocessor.fit_transform(df)

# Get encoded column names for nominal features
nominal_feature_names = preprocessor.named_transformers_['nom'] \
    .named_steps['encoder'].get_feature_names_out(nominal_columns)

# Combine all feature names
all_feature_names = numerical_columns + list(nominal_feature_names) + ordinal_columns

# Create new DataFrame
df_transformed = pd.DataFrame(transformed_array, columns=all_feature_names)
df_transformed.index = df.index  # Optional, keep same index
df_transformed.head()

Unnamed: 0,Views,Likes,Shares,Comments,Like_Rate,Share_Rate,Comment_Rate,engagement_rate,like_to_comment_ratio,share_to_like_ratio,...,Content_Type_Video,Region_Australia,Region_Brazil,Region_Canada,Region_Germany,Region_India,Region_Japan,Region_UK,Region_USA,Engagement_Level
0,1.056452,0.645129,0.120362,-0.3864,-0.351125,-0.493257,-0.52852,-0.45057,-0.071705,-0.137667,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.050638,-0.220268,0.558953,0.16709,-0.635808,-0.356033,-0.403659,-0.65022,-0.104481,-0.100999,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0
2,0.672239,0.559503,-0.352247,0.797085,-0.27735,-0.582626,-0.18596,-0.362372,-0.100641,-0.146508,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,-1.451262,-0.834278,-1.308219,0.837968,0.194223,-0.494618,1.791766,0.294857,-0.119598,-0.153519,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-1.137855,-0.770469,1.286858,1.561859,-0.145863,1.862175,1.497994,0.399544,-0.121509,-0.019479,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### 04. Saving Data

In [8]:
df_transformed.to_csv( 'data/processed/Standarlized_data.csv', index=False)