# Sales Amount Prediction Model

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
df = pd.read_csv('https://drive.google.com/file/d/1vqxpwQTbh80GILVhrm-id6efxcXjn3mu/view?usp=drive_link')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5865 entries, 0 to 5864
Data columns (total 48 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   id_program                       5865 non-null   int64  
 1   customer_id                      5865 non-null   int64  
 2   program_id                       5865 non-null   int64  
 3   promocode_id                     5864 non-null   float64
 4   paid                             5865 non-null   int64  
 5   paid_amount                      5865 non-null   float64
 6   delivery_start_date              5865 non-null   object 
 7   total_days                       5865 non-null   int64  
 8   free_days                        5865 non-null   int64  
 9   status                           5865 non-null   object 
 10  created_at_program               5865 non-null   object 
 11  diet_program_name                5865 non-null   object 
 12  master_plan_name    

## 2. Target Variable Identification and Feature Separation

In [9]:
# Define target variable
target = "paid_amount"

# Separate features (X) and target (y)
X = df.drop(columns=[target])
y = df[target]

print(f"Target variable identified: {target}")
print(f"Shape of features (X): {X.shape}")
print(f"Shape of target (y): {y.shape}")

Target variable identified: paid_amount
Shape of features (X): (5865, 47)
Shape of target (y): (5865,)


## 3. Feature Identification

In [10]:
# Columns to exclude from automatic type inference for features
exclude_cols = [target, 'id_program', 'customer_id', 'program_id', 'promocode_id', 
                'created_at_program', 'delivery_start_date', 'created_at_customer', 'date_of_birth',
                'deleted_at', 'created_month_year_str_program', 'created_month_year_str_customer',
                'subscribe_month_name', 'created_month_year']

# Identify numerical features
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
numerical_features = [col for col in numerical_features if col not in exclude_cols]

# Identify categorical features
categorical_features = X.select_dtypes(include='object').columns.tolist()
categorical_features = [col for col in categorical_features if col not in exclude_cols]

print("Numerical Features:", numerical_features)
print("Categorical Features:", categorical_features)

Numerical Features: ['paid', 'total_days', 'free_days', 'subscribe_year', 'subscribe_month', 'subscribe_day', 'subscribe_quarter', 'delivery_duration_days', 'id_customer', 'age', 'height', 'weight', 'birth_year', 'birth_month', 'birth_day', 'birth_quarter', 'created_year', 'created_month', 'created_day', 'created_quarter', 'gender_encoded', 'bmi']
Categorical Features: ['status', 'diet_program_name', 'master_plan_name', 'subscribe_weekday', 'username', 'email', 'nationality', 'gender', 'birth_weekday', 'created_month_name', 'created_weekday', 'email_domain']


## 4. Data Splitting

In [11]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")

Shape of X_train: (4692, 47)
Shape of X_test: (1173, 47)
Shape of y_train: (4692,)
Shape of y_test: (1173,)


## 5. Preprocessing Pipeline

In [12]:
# Numerical pipeline: Imputation + Scaling
numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

# Categorical pipeline: Imputation + One-Hot Encoding
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Create a preprocessor object using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features)
    ])

# Fit and transform the training data
X_train_processed = preprocessor.fit_transform(X_train)

# Transform the test data
X_test_processed = preprocessor.transform(X_test)

# Save the preprocessor for future use
joblib.dump(preprocessor, "preprocessor.pkl")

print("Preprocessing pipeline created and applied.")
print(f"Shape of processed X_train: {X_train_processed.shape}")
print(f"Shape of processed X_test: {X_test_processed.shape}")

Preprocessing pipeline created and applied.
Shape of processed X_train: (4692, 4512)
Shape of processed X_test: (1173, 4512)


## 6. Model Training and Evaluation - Linear Regression

In [13]:
# Initialize and train the Linear Regression model
linear_model = LinearRegression()
linear_model.fit(X_train_processed, y_train)

# Make predictions on the test set
y_pred_lr = linear_model.predict(X_test_processed)

# Evaluate the model
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print(f"Linear Regression - Mean Squared Error: {mse_lr:.2f}")
print(f"Linear Regression - R-squared: {r2_lr:.2f}")

# Save the trained model
joblib.dump(linear_model, "linear_regression_model.pkl")
print("Linear Regression model trained and saved.")

Linear Regression - Mean Squared Error: 2564.33
Linear Regression - R-squared: 0.31
Linear Regression model trained and saved.


## 7. Model Training and Evaluation - RandomForestRegressor

In [14]:
# Initialize and train the RandomForestRegressor model
# Using default parameters for now, can tune later if needed
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train_processed, y_train)

# Make predictions on the test set
y_pred_rf = rf_model.predict(X_test_processed)

# Evaluate the model
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f"RandomForestRegressor - Mean Squared Error: {mse_rf:.2f}")
print(f"RandomForestRegressor - R-squared: {r2_rf:.2f}")

# Save the trained model
joblib.dump(rf_model, "random_forest_regressor_model.pkl")
print("RandomForestRegressor model trained and saved.")

RandomForestRegressor - Mean Squared Error: 1524.88
RandomForestRegressor - R-squared: 0.59
RandomForestRegressor model trained and saved.
