### 01. Import Dependencies

In [6]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder

### 02. Load Data

In [7]:
# Read the processed table and discard the 'Engagement_Level' column, which
# takes no part in the transformations applied further down.
df = (
    pd.read_csv('data/processed/df_cla.csv')
      .drop(columns=['Engagement_Level'])
)
df.head()

Unnamed: 0,Platform,Hashtag,Content_Type,Region,Views,Views_norm
0,TikTok,#Challenge,Video,UK,4163464,1.57157
1,Instagram,#Education,Shorts,India,4155940,1.622473
2,Twitter,#Challenge,Video,Brazil,3666211,1.383053
3,YouTube,#Education,Shorts,Australia,917951,0.347293
4,Instagram,#Challenge,Shorts,Australia,1323566,0.516718


### 03. Applying scaling and encoding

In [8]:
# Feature groups: each list names the columns that receive one kind of
# transformation (scaling, one-hot encoding, ordinal encoding).
numerical_columns = ['Views', 'Views_norm']
nominal_columns = [
    'Platform',
    'Hashtag',
    'Content_Type',
    'Region',
]
ordinal_columns = []  # no ordinal features at present

In [9]:
# --- Per-group transformers ---
# Numeric features are standardized (z-score).
numerical_transformer = Pipeline([('scaler', StandardScaler())])

# Nominal features become dense one-hot indicator columns.
nominal_transformer = Pipeline([('encoder', OneHotEncoder(sparse_output=False))])

# Ordinal features are replaced by ordinal codes.
ordinal_transformer = Pipeline([('encoder', OrdinalEncoder())])

# --- Assemble the column-wise preprocessor ---
# Columns that belong to none of the groups are discarded (remainder='drop').
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_columns),
        ('nom', nominal_transformer, nominal_columns),
        ('ord', ordinal_transformer, ordinal_columns),
    ],
    remainder='drop',
)

# --- Fit on the whole frame and transform it in one pass ---
transformed_array = preprocessor.fit_transform(df)

# Recover the expanded one-hot column names from the fitted encoder.
nominal_feature_names = (
    preprocessor
    .named_transformers_['nom']
    .named_steps['encoder']
    .get_feature_names_out(nominal_columns)
)

# Names are listed in the same order as the transformers: num, nom, ord.
all_feature_names = [*numerical_columns, *nominal_feature_names, *ordinal_columns]

# Wrap the array in a DataFrame that shares the source frame's index.
df_transformed = pd.DataFrame(
    transformed_array,
    columns=all_feature_names,
    index=df.index,
)
df_transformed.head()

Unnamed: 0,Views,Views_norm,Platform_Instagram,Platform_TikTok,Platform_Twitter,Platform_YouTube,Hashtag_#Challenge,Hashtag_#Comedy,Hashtag_#Dance,Hashtag_#Education,...,Content_Type_Tweet,Content_Type_Video,Region_Australia,Region_Brazil,Region_Canada,Region_Germany,Region_India,Region_Japan,Region_UK,Region_USA
0,1.107899,1.082627,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.102474,1.179045,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.74935,0.725553,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-1.232312,-1.236311,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.939839,-0.915398,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 04. Saving Data

In [10]:
# Persist the transformed features twice: as a CSV (index column omitted)
# and as a NumPy archive under artifacts/clustering.
csv_path = Path('data/processed/Standarlized_data.csv')
npz_path = Path('artifacts/clustering/clustering.npz')

# Create the target folders first; neither to_csv nor np.savez creates
# missing directories, so a fresh checkout would otherwise fail on write.
for out_path in (csv_path, npz_path):
    out_path.parent.mkdir(parents=True, exist_ok=True)

df_transformed.to_csv(csv_path, index=False)
# The frame is passed positionally, so its values are stored under the
# default key 'arr_0'; column names are not part of the archive.
np.savez(npz_path, df_transformed)