# Program Prediction Model

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
df = pd.read_csv('https://drive.google.com/file/d/1vqxpwQTbh80GILVhrm-id6efxcXjn3mu/view?usp=drive_link')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5865 entries, 0 to 5864
Data columns (total 48 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   id_program                       5865 non-null   int64  
 1   customer_id                      5865 non-null   int64  
 2   program_id                       5865 non-null   int64  
 3   promocode_id                     5864 non-null   float64
 4   paid                             5865 non-null   int64  
 5   paid_amount                      5865 non-null   float64
 6   delivery_start_date              5865 non-null   object 
 7   total_days                       5865 non-null   int64  
 8   free_days                        5865 non-null   int64  
 9   status                           5865 non-null   object 
 10  created_at_program               5865 non-null   object 
 11  diet_program_name                5865 non-null   object 
 12  master_plan_name    

## 2. Target Variable Identification and Feature Separation

In [9]:
# Define target variable
target = "program_id"

# Separate features (X) and target (y)
# Exclude diet_program_name and master_plan_name to avoid data leakage
X = df.drop(columns=[target, 'diet_program_name', 'master_plan_name'])
y = df[target]

print(f"Target variable identified: {target}")
print(f"Shape of features (X): {X.shape}")
print(f"Shape of target (y): {y.shape}")
print(f"Number of unique programs: {y.nunique()}")

Target variable identified: program_id
Shape of features (X): (5865, 45)
Shape of target (y): (5865,)
Number of unique programs: 196


## 3. Feature Identification

In [10]:
# Columns to exclude from automatic type inference for features
exclude_cols = [target, 'id_program', 'customer_id', 'promocode_id', 'paid_amount',
                'created_at_program', 'delivery_start_date', 'created_at_customer', 'date_of_birth',
                'deleted_at', 'created_month_year_str_program', 'created_month_year_str_customer',
                'subscribe_month_name', 'created_month_year']

# Identify numerical features
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
numerical_features = [col for col in numerical_features if col not in exclude_cols]

# Identify categorical features
categorical_features = X.select_dtypes(include='object').columns.tolist()
categorical_features = [col for col in categorical_features if col not in exclude_cols]

print("Numerical Features:", numerical_features)
print("Categorical Features:", categorical_features)

Numerical Features: ['paid', 'total_days', 'free_days', 'subscribe_year', 'subscribe_month', 'subscribe_day', 'subscribe_quarter', 'delivery_duration_days', 'id_customer', 'age', 'height', 'weight', 'birth_year', 'birth_month', 'birth_day', 'birth_quarter', 'created_year', 'created_month', 'created_day', 'created_quarter', 'gender_encoded', 'bmi']
Categorical Features: ['status', 'subscribe_weekday', 'username', 'email', 'nationality', 'gender', 'birth_weekday', 'created_month_name', 'created_weekday', 'email_domain']


## 4. Data Splitting

In [11]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")

Shape of X_train: (4692, 45)
Shape of X_test: (1173, 45)
Shape of y_train: (4692,)
Shape of y_test: (1173,)


## 5. Preprocessing Pipeline

In [12]:
# Numerical pipeline: Imputation + Scaling
numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

# Categorical pipeline: Imputation + One-Hot Encoding
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Create a preprocessor object using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features)
    ])

# Fit and transform the training data
X_train_processed = preprocessor.fit_transform(X_train)

# Transform the test data
X_test_processed = preprocessor.transform(X_test)

# Save the preprocessor for future use
joblib.dump(preprocessor, "program_preprocessor.pkl")

print("Preprocessing pipeline created and applied.")
print(f"Shape of processed X_train: {X_train_processed.shape}")
print(f"Shape of processed X_test: {X_test_processed.shape}")

Preprocessing pipeline created and applied.
Shape of processed X_train: (4692, 4240)
Shape of processed X_test: (1173, 4240)


## 6. Model Training and Evaluation - RandomForestClassifier

In [13]:
# Initialize and train the RandomForestClassifier model
# Using default parameters for now, can tune later if needed
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_processed, y_train)

# Make predictions on the test set
y_pred_rf = rf_model.predict(X_test_processed)

# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
report_rf = classification_report(y_test, y_pred_rf, zero_division=0)

print(f"RandomForestClassifier - Accuracy: {accuracy_rf:.4f}")
print(f"RandomForestClassifier - Classification Report:{report_rf}")

# Save the trained model
joblib.dump(rf_model, "random_forest_classifier_program_model.pkl")
print("RandomForestClassifier model trained and saved.")

RandomForestClassifier - Accuracy: 0.4561
RandomForestClassifier - Classification Report:              precision    recall  f1-score   support

  5637163328       0.57      0.67      0.62         6
  5637163329       0.80      0.44      0.57         9
  5637167076       1.00      0.11      0.20         9
  5637167077       0.44      0.71      0.54        48
  5637167078       0.67      0.33      0.44         6
  5637167079       0.29      0.25      0.27         8
  5637167080       0.42      0.70      0.52        63
  5637167081       0.50      0.20      0.29         5
  5637167082       0.38      0.40      0.39        15
  5637167083       0.33      0.14      0.20         7
  5637167084       0.41      0.39      0.40        18
  5637167099       0.00      0.00      0.00         1
  5637167101       0.59      0.40      0.48        40
  5637167826       0.00      0.00      0.00         1
  5637167828       0.00      0.00      0.00         1
  5637167831       0.33      0.20      0.25   

## 7. Feature Importance Analysis

In [15]:
# Get feature names after preprocessing
feature_names = []

# Add numerical feature names
feature_names.extend(numerical_features)

# Add categorical feature names (after one-hot encoding)
cat_feature_names = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_features)
feature_names.extend(cat_feature_names)

# Get feature importances
feature_importances = rf_model.feature_importances_

# Create a dataframe for better visualization
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importances
}).sort_values('importance', ascending=False)

# Display top 20 most important features
print("Top 20 Most Important Features:")
print(importance_df.head(20))

Top 20 Most Important Features:
                                feature  importance
1                            total_days    0.049536
8                           id_customer    0.038713
3                        subscribe_year    0.034658
5                         subscribe_day    0.034514
4                       subscribe_month    0.033005
7                delivery_duration_days    0.029859
21                                  bmi    0.024649
11                               weight    0.024198
12                           birth_year    0.023892
18                          created_day    0.023867
10                               height    0.023497
9                                   age    0.023419
14                            birth_day    0.023324
13                          birth_month    0.021232
16                         created_year    0.020999
6                     subscribe_quarter    0.020607
2                             free_days    0.016645
15                        birth_

In [17]:
program_mapping = df.set_index('program_id')['diet_program_name'].to_dict()

# Save the mapping dictionary to a pickle file
joblib.dump(program_mapping, 'program_name_mapping.pkl')

['program_name_mapping.pkl']