### 1. Import Dependecies

In [76]:
import numpy as np 
import pandas as pd 
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder,OrdinalEncoder
import joblib

### Loading and Splitting data for regression and classification

In [77]:
df = pd.read_csv('data/processed/Outliers_Handled.csv')
df.head()

Unnamed: 0,Platform,Hashtag,Content_Type,Region,Views,Likes,Shares,Comments,Engagement_Level,Like_Rate,Share_Rate,Comment_Rate,engagement_rate,like_to_comment_ratio,share_to_like_ratio,Views_norm
0,TikTok,#Challenge,Video,UK,4163464.0,339431.0,53135.0,19346.0,High,0.081526,0.012762,0.004647,0.098935,17.544374,0.156541,1.655443
1,Instagram,#Education,Shorts,India,4155940.0,215240.0,65860.0,27239.0,Medium,0.051791,0.015847,0.006554,0.074192,7.901615,0.305983,1.728703
2,Twitter,#Challenge,Video,Brazil,3666211.0,327143.0,39423.0,36223.0,Medium,0.089232,0.010753,0.00988,0.109865,9.031112,0.120507,1.462971
3,YouTube,#Education,Shorts,Australia,917951.0,127125.0,11687.0,36806.0,Low,0.138488,0.012732,0.040096,0.191315,3.453827,0.091932,0.360596
4,Instagram,#Challenge,Shorts,Australia,1323566.0,136282.0,86979.0,47129.0,Low,0.102966,0.065716,0.035608,0.204289,2.891619,0.638223,0.55055


In [78]:
df_reg = df.copy()
df_cla = df.copy()

df_reg= df_reg.drop(columns=['Share_Rate','share_to_like_ratio','engagement_rate'])
df_cla = df_cla[['Platform','Hashtag','Content_Type','Region','Views','Views_norm','Engagement_Level']]

df_reg.to_csv( 'data/processed/df_reg.csv', index=False)
df_cla.to_csv( 'data/processed/df_cla.csv', index=False)

### 2. Load Data for Regression and Classification

In [79]:
# Load separate datasets for regression and classification
df_reg = pd.read_csv('data/processed/df_reg.csv')
df_cla = pd.read_csv('data/processed/df_cla.csv')

print("Regression dataset shape:", df_reg.shape)
print("Classification dataset shape:", df_cla.shape)
print("\nRegression columns:", df_reg.columns.tolist())
print("\nClassification columns:", df_cla.columns.tolist())

Regression dataset shape: (4385, 13)
Classification dataset shape: (4385, 7)

Regression columns: ['Platform', 'Hashtag', 'Content_Type', 'Region', 'Views', 'Likes', 'Shares', 'Comments', 'Engagement_Level', 'Like_Rate', 'Comment_Rate', 'like_to_comment_ratio', 'Views_norm']

Classification columns: ['Platform', 'Hashtag', 'Content_Type', 'Region', 'Views', 'Views_norm', 'Engagement_Level']


#### 3.2 Define columns for Classification (target: Engagement_Level)

In [80]:
# Columns for Regression dataset
nominal_columns_reg = ['Platform', 'Hashtag', 'Content_Type', 'Region']
numerical_columns_reg = ['Views', 'Likes', 'Comments', 'Like_Rate', 'Comment_Rate', 'like_to_comment_ratio', 'Views_norm']
ordinal_columns_reg = ['Engagement_Level']  # Can be used as feature for regression

# Columns for Classification dataset
nominal_columns_cla = ['Platform', 'Hashtag', 'Content_Type', 'Region']
numerical_columns_cla = ['Views', 'Views_norm']
# No ordinal columns for classification (target is Engagement_Level)

#### 3.3 Preprocessing Pipeline for Regression

In [81]:
# Create transformers
numerical_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])
nominal_transformer = Pipeline(steps=[('encoder', OneHotEncoder(sparse_output=False, drop='first'))])
ordinal_transformer = Pipeline(steps=[('encoder', OrdinalEncoder())])

# Preprocessor for Regression
preprocessor_reg = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns_reg),
        ('nom', nominal_transformer, nominal_columns_reg),
        ('ord', ordinal_transformer, ordinal_columns_reg)
    ],
    remainder='drop'
)

# Fit and transform regression data
transformed_array_reg = preprocessor_reg.fit_transform(df_reg)

# Get feature names
nominal_feature_names_reg = preprocessor_reg.named_transformers_['nom'].named_steps['encoder'].get_feature_names_out(nominal_columns_reg)
ordinal_feature_names_reg = ordinal_columns_reg  # OrdinalEncoder doesn't change names

# Combine all column names for regression
all_feature_names_reg = numerical_columns_reg + list(nominal_feature_names_reg) + ordinal_feature_names_reg

# Create DataFrame
df_reg_transformed = pd.DataFrame(transformed_array_reg, columns=all_feature_names_reg, index=df_reg.index)

# Scale the target column (Shares) separately
target_scaler = MinMaxScaler()
shares_scaled = target_scaler.fit_transform(df_reg[['Shares']]).flatten()

# Add scaled target column (Shares)
df_reg_transformed['Shares'] = shares_scaled

# Save the target scaler for inverse transformation during prediction
joblib.dump(target_scaler, 'artifacts/linear/target_scaler.joblib')
print("Saved target scaler to artifacts/linear/target_scaler.joblib")

print("Regression processed shape:", df_reg_transformed.shape)
print("Regression columns:", df_reg_transformed.columns.tolist())
print(f"\nShares range - Original: [{df_reg['Shares'].min():.2f}, {df_reg['Shares'].max():.2f}]")
print(f"Shares range - Scaled: [{df_reg_transformed['Shares'].min():.4f}, {df_reg_transformed['Shares'].max():.4f}]")
df_reg_transformed.head()

Saved target scaler to artifacts/linear/target_scaler.joblib
Regression processed shape: (4385, 33)
Regression columns: ['Views', 'Likes', 'Comments', 'Like_Rate', 'Comment_Rate', 'like_to_comment_ratio', 'Views_norm', 'Platform_TikTok', 'Platform_Twitter', 'Platform_YouTube', 'Hashtag_#Comedy', 'Hashtag_#Dance', 'Hashtag_#Education', 'Hashtag_#Fashion', 'Hashtag_#Fitness', 'Hashtag_#Gaming', 'Hashtag_#Music', 'Hashtag_#Tech', 'Hashtag_#Viral', 'Content_Type_Post', 'Content_Type_Reel', 'Content_Type_Shorts', 'Content_Type_Tweet', 'Content_Type_Video', 'Region_Brazil', 'Region_Canada', 'Region_Germany', 'Region_India', 'Region_Japan', 'Region_UK', 'Region_USA', 'Engagement_Level', 'Shares']

Shares range - Original: [52.00, 99953.00]
Shares range - Scaled: [0.0000, 1.0000]


Unnamed: 0,Views,Likes,Comments,Like_Rate,Comment_Rate,like_to_comment_ratio,Views_norm,Platform_TikTok,Platform_Twitter,Platform_YouTube,...,Content_Type_Video,Region_Brazil,Region_Canada,Region_Germany,Region_India,Region_Japan,Region_UK,Region_USA,Engagement_Level,Shares
0,0.82856,0.678653,0.386753,0.134207,0.013823,0.001566,0.791168,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.531356
1,0.827017,0.429988,0.544692,0.085162,0.019518,0.000705,0.827299,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.658732
2,0.726583,0.654049,0.724462,0.146917,0.029446,0.000806,0.696241,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.3941
3,0.16297,0.253558,0.736128,0.228159,0.119642,0.000307,0.152553,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.116465
4,0.246154,0.271893,0.942691,0.169569,0.106244,0.000257,0.246238,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.870131


#### 3.4 Preprocessing Pipeline for Classification


In [82]:
# Preprocessor for Classification
preprocessor_cla = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns_cla),
        ('nom', nominal_transformer, nominal_columns_cla)
    ],
    remainder='drop'
)

# Fit and transform classification data
transformed_array_cla = preprocessor_cla.fit_transform(df_cla)

# Get feature names
nominal_feature_names_cla = preprocessor_cla.named_transformers_['nom'].named_steps['encoder'].get_feature_names_out(nominal_columns_cla)

# Combine all column names for classification
all_feature_names_cla = numerical_columns_cla + list(nominal_feature_names_cla)

# Create DataFrame
df_cla_transformed = pd.DataFrame(transformed_array_cla, columns=all_feature_names_cla, index=df_cla.index)

# Add target column (Engagement_Level) - encode it as ordinal
ordinal_encoder_target = OrdinalEncoder()
df_cla_transformed['Engagement_Level'] = ordinal_encoder_target.fit_transform(df_cla[['Engagement_Level']]).flatten()

print("Classification processed shape:", df_cla_transformed.shape)
print("Classification columns:", df_cla_transformed.columns.tolist())
print("Engagement_Level mapping:", dict(zip(ordinal_encoder_target.categories_[0], range(len(ordinal_encoder_target.categories_[0])))))
df_cla_transformed.head()


Classification processed shape: (4385, 27)
Classification columns: ['Views', 'Views_norm', 'Platform_TikTok', 'Platform_Twitter', 'Platform_YouTube', 'Hashtag_#Comedy', 'Hashtag_#Dance', 'Hashtag_#Education', 'Hashtag_#Fashion', 'Hashtag_#Fitness', 'Hashtag_#Gaming', 'Hashtag_#Music', 'Hashtag_#Tech', 'Hashtag_#Viral', 'Content_Type_Post', 'Content_Type_Reel', 'Content_Type_Shorts', 'Content_Type_Tweet', 'Content_Type_Video', 'Region_Brazil', 'Region_Canada', 'Region_Germany', 'Region_India', 'Region_Japan', 'Region_UK', 'Region_USA', 'Engagement_Level']
Engagement_Level mapping: {'High': 0, 'Low': 1, 'Medium': 2}


Unnamed: 0,Views,Views_norm,Platform_TikTok,Platform_Twitter,Platform_YouTube,Hashtag_#Comedy,Hashtag_#Dance,Hashtag_#Education,Hashtag_#Fashion,Hashtag_#Fitness,...,Content_Type_Tweet,Content_Type_Video,Region_Brazil,Region_Canada,Region_Germany,Region_India,Region_Japan,Region_UK,Region_USA,Engagement_Level
0,0.82856,0.791168,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.827017,0.827299,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0
2,0.726583,0.696241,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,0.16297,0.152553,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.246154,0.246238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


#### 3.5 Save Processed Datasets


In [83]:
# Save processed regression dataset
df_reg_transformed.to_csv('data/processed/regression_scaled_encoded.csv', index=False)
print("Saved: data/processed/regression_scaled_encoded.csv")

# Save processed classification dataset
df_cla_transformed.to_csv('data/processed/classification_scaled_encoded.csv', index=False)
print("Saved: data/processed/classification_scaled_encoded.csv")


Saved: data/processed/regression_scaled_encoded.csv
Saved: data/processed/classification_scaled_encoded.csv


### 4. Data Splitting for Modeling

#### 4.1 Splitting for Regression (target: Shares)

In [84]:
X_reg = df_reg_transformed.drop(columns=['Shares'])
Y_reg = df_reg_transformed['Shares']

In [85]:
X_train_reg, X_test_reg, Y_train_reg, Y_test_reg = train_test_split(
    X_reg, Y_reg,
    test_size=0.2,
    random_state=42
)

print(f"Regression - Train: {X_train_reg.shape}, Test: {X_test_reg.shape}")

Regression - Train: (3508, 32), Test: (877, 32)


In [86]:
# Save regression splits
np.savez('artifacts/linear/X_train_reg.npz', X_train_reg)
np.savez('artifacts/linear/Y_train_reg.npz', Y_train_reg)
np.savez('artifacts/linear/X_test_reg.npz', X_test_reg)
np.savez('artifacts/linear/Y_test_reg.npz', Y_test_reg)
print("Saved regression train/test splits to artifacts/linear/")

Saved regression train/test splits to artifacts/linear/


#### 4.2 Splitting for Classification (target: Engagement_Level)

In [87]:
X_cla = df_cla_transformed.drop(columns=['Engagement_Level'])
Y_cla = df_cla_transformed['Engagement_Level']

In [88]:
X_train_cla, X_test_cla, Y_train_cla, Y_test_cla = train_test_split(
    X_cla, Y_cla,
    test_size=0.2,
    random_state=42,
    stratify=Y_cla  # Stratify for classification to maintain class distribution
)

print(f"Classification - Train: {X_train_cla.shape}, Test: {X_test_cla.shape}")
print(f"Class distribution in train: {pd.Series(Y_train_cla).value_counts().to_dict()}")
print(f"Class distribution in test: {pd.Series(Y_test_cla).value_counts().to_dict()}")

Classification - Train: (3508, 26), Test: (877, 26)
Class distribution in train: {1.0: 1205, 0.0: 1178, 2.0: 1125}
Class distribution in test: {1.0: 301, 0.0: 295, 2.0: 281}


In [89]:
# Save classification splits
np.savez('artifacts/logistic/X_train_cla.npz', X_train_cla)
np.savez('artifacts/logistic/Y_train_cla.npz', Y_train_cla)
np.savez('artifacts/logistic/X_test_cla.npz', X_test_cla)
np.savez('artifacts/logistic/Y_test_cla.npz', Y_test_cla)
print("Saved classification train/test splits to artifacts/logistic/")

Saved classification train/test splits to artifacts/logistic/
