### 1. Import Dependecies

In [23]:
import numpy as np 
import pandas as pd 
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder,OrdinalEncoder

### 2. Load Data

In [24]:
df = pd.read_csv('data/processed/Outliers_Handled.csv')
df.head()

Unnamed: 0,Platform,Hashtag,Content_Type,Region,Views,Likes,Shares,Comments,Engagement_Level,Like_Rate,Share_Rate,Comment_Rate,dominant_engagement_type
0,TikTok,#Challenge,Video,UK,4163464.0,339431.0,53135.0,19346.0,High,0.081526,0.012762,0.004647,Likes
1,Instagram,#Education,Shorts,India,4155940.0,215240.0,65860.0,27239.0,Medium,0.051791,0.015847,0.006554,Likes
2,Twitter,#Challenge,Video,Brazil,3666211.0,327143.0,39423.0,36223.0,Medium,0.089232,0.010753,0.00988,Likes
3,YouTube,#Education,Shorts,Australia,917951.0,127125.0,11687.0,36806.0,Low,0.138488,0.012732,0.040096,Likes
4,Instagram,#Challenge,Shorts,Australia,1323566.0,136282.0,86979.0,47129.0,Low,0.102966,0.065716,0.035608,Likes


### 3. Applying scalling and encoding

In [25]:
nominal_columns = ['Platform', 'Hashtag', 'Content_Type', 'Region','dominant_engagement_type']
numerical_columns = ['Views', 'Likes', 'Shares', 'Comments','Like_Rate','Share_Rate','Comment_Rate']
ordinal_columns = ['Engagement_Level']

In [26]:
numerical_transformer = Pipeline(
    steps=[('scaler', MinMaxScaler())]
)

nominal_transformer = Pipeline(
    steps=[('encoder', OneHotEncoder(sparse_output=False))]
)

ordinal_transformer = Pipeline(
    steps=[('encoder', OrdinalEncoder())]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('nom', nominal_transformer, nominal_columns),
        ('ord', ordinal_transformer, ordinal_columns)
    ],
    remainder='drop'
)

transformed_array = preprocessor.fit_transform(df)

# Get correct column names for OneHotEncoded nominal features
nominal_feature_names = preprocessor.named_transformers_['nom'].named_steps['encoder'].get_feature_names_out(nominal_columns)

# Combine all column names
all_feature_names = numerical_columns + list(nominal_feature_names) + ordinal_columns

# Create DataFrame
df_transformed = pd.DataFrame(transformed_array, columns=all_feature_names)

# Optional: reset index to match original DataFrame
df_transformed.index = df.index
df_transformed.head()

Unnamed: 0,Views,Likes,Shares,Comments,Like_Rate,Share_Rate,Comment_Rate,Platform_Instagram,Platform_TikTok,Platform_Twitter,...,Region_Canada,Region_Germany,Region_India,Region_Japan,Region_UK,Region_USA,dominant_engagement_type_Comments,dominant_engagement_type_Likes,dominant_engagement_type_Shares,Engagement_Level
0,0.829417,0.678653,0.5315,0.386753,0.04942,0.024569,0.013823,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.827882,0.429988,0.65891,0.544692,0.03136,0.030514,0.019518,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0
2,0.72795,0.654049,0.394207,0.724462,0.0541,0.020698,0.029446,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0
3,0.167153,0.253558,0.116497,0.736128,0.084016,0.024511,0.119642,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,0.249921,0.271893,0.870367,0.942691,0.062441,0.126615,0.106244,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [27]:
df_transformed.to_csv('data/processed/scalled_and_encoded.csv', index=False)

### 4. Data splitting

#### 4.1 Configuring for Logistic regression

In [28]:
X = df_transformed.drop(columns=['Engagement_Level'])
Y = df_transformed['Engagement_Level']

In [29]:
X_train, X_test, Y_train, Y_test = train_test_split(
                                                    X, Y,
                                                    test_size=0.2,
                                                    random_state=42
                                                    )

In [30]:
np.savez('artifacts/logistic/X_train_cat.npz', X_train)
np.savez('artifacts/logistic/Y_train_cat.npz', Y_train)
np.savez('artifacts/logistic/X_test_cat.npz', X_test)
np.savez('artifacts/logistic/Y_test_cat.npz', Y_test)

#### 4.2 Configuring for Linear regression

In [31]:
X = df_transformed.drop(columns=['Shares'])
Y = df_transformed['Shares']

In [32]:
X_train, X_test, Y_train, Y_test = train_test_split(
                                                    X, Y,
                                                    test_size=0.2,
                                                    random_state=42
                                                    )

In [33]:
np.savez('artifacts/linear/X_train_lin.npz', X_train)
np.savez('artifacts/linear/Y_train_lin.npz', Y_train)
np.savez('artifacts/linear/X_test_lin.npz', X_test)
np.savez('artifacts/linear/Y_test_lin.npz', Y_test)