In [1]:
import pandas as pd

import mlflow
import mlflow.sklearn
import mlflow.xgboost
import mlflow.lightgbm

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

### Preparing X and Y variables

In [2]:
# Load the survey responses into a DataFrame.
SURVEY_CSV = "survey_results_v2.csv"
df = pd.read_csv(SURVEY_CSV)

In [3]:
# Features are every column except the respondent id and the target;
# the target is the price_range column.
TARGET_COL = 'price_range'
Y = df[TARGET_COL]
X = df.drop(columns=['respondent_id', TARGET_COL])

### Data Splitting and Data Encoding

In [4]:
# Hold out 25% of the rows as the test split; random_state pins the shuffle
# so the same split is produced on every run.
split_parts = train_test_split(X, Y, random_state=42, test_size=0.25)
X_train, X_test, Y_train, Y_test = split_parts

In [5]:
label_cols = ["age_group", "income_levels", "health_concerns",
              "consume_frequency(weekly)", "preferable_consumption_size"]


def _normalize_feature_frame(frame, text_cols):
    """Return a cleaned copy of ``frame``; the input frame is left untouched.

    Each column named in ``text_cols`` is cast to ``str``, stripped of
    surrounding whitespace and lower-cased, so the same category written with
    different padding or casing collapses to a single value. Afterwards, spaces
    in the column *names* are replaced with underscores.

    Parameters
    ----------
    frame : pandas.DataFrame
        Feature frame containing every column listed in ``text_cols``.
    text_cols : list of str
        Names of the columns whose values should be normalized.

    Returns
    -------
    pandas.DataFrame
        A new frame with normalized values and renamed columns.
    """
    # The split frames were produced by selecting rows out of X; assigning
    # columns on such a frame can raise pandas' SettingWithCopyWarning, so the
    # cleaning is done on an explicit copy instead.
    cleaned = frame.copy()
    for col in text_cols:
        cleaned[col] = cleaned[col].astype(str).str.strip().str.lower()
    cleaned.columns = cleaned.columns.str.replace(' ', '_')
    return cleaned


# Apply the identical normalization to both splits so their category spellings agree.
X_train = _normalize_feature_frame(X_train, label_cols)
X_test = _normalize_feature_frame(X_test, label_cols)

In [6]:
# For every label-encoded column, report any category value that occurs in the
# test split but never in the training split. LabelEncoder.transform raises on
# labels it was not fitted on, so an empty report here is the expected outcome.
for col in label_cols:
    seen_in_train = set(X_train[col].astype(str).unique())
    seen_in_test = set(X_test[col].astype(str).unique())
    diff = seen_in_test.difference(seen_in_train)
    if len(diff) > 0:
        print(f"Column: {col} ‚Äî Unseen in train: {diff}")


In [7]:
label_cols = [
    "age_group",
    "income_levels",
    "health_concerns",
    "consume_frequency(weekly)",
    "preferable_consumption_size",
]

# Column name -> encoder fitted on the training split. Kept so the same mapping
# can be applied to other data and inverted later.
# NOTE(review): LabelEncoder numbers the classes in sorted order of their values;
# confirm that this ordering is acceptable for these feature columns.
label_encoders = {}

for col in label_cols:
    col_encoder = LabelEncoder().fit(X_train[col])
    X_train[col] = col_encoder.transform(X_train[col])
    label_encoders[col] = col_encoder

In [8]:
# Encode the test split with the encoders fitted on the training split;
# nothing is refitted on test data here.
for col in label_cols:
    fitted_encoder = label_encoders[col]
    X_test[col] = fitted_encoder.transform(X_test[col])

In [9]:
# Encode the target: the encoder is fitted on the training labels only and the
# resulting mapping is then applied to both splits.
le = LabelEncoder().fit(Y_train)
Y_train, Y_test = le.transform(Y_train), le.transform(Y_test)

In [10]:
# Every feature column that is not label-encoded is one-hot encoded instead.
one_hot_cols = X_train.columns.drop(label_cols).tolist()

In [11]:
# Dense output; the first level of each feature is dropped.
# NOTE(review): handle_unknown is left at its default, so transforming a
# category that was not present during fit is expected to raise — confirm
# that this is the desired behaviour for the test split.
oe = OneHotEncoder(drop='first', sparse_output=False)
oe.fit(X_train[one_hot_cols])
encoded_array_train = oe.transform(X_train[one_hot_cols])

In [12]:
# Wrap the encoded array in a DataFrame that reuses X_train's index, so it can
# be concatenated column-wise with the remaining features.
encoded_df_train = pd.DataFrame(
    data=encoded_array_train,
    index=X_train.index,
    columns=oe.get_feature_names_out(one_hot_cols),
)

In [13]:
# Replace the raw categorical columns with their one-hot expansion.
# Built as one non-mutating expression: the previous inplace drop modified the
# existing frame before the reassignment, which is the pandas `inplace=True`
# anti-pattern and can emit SettingWithCopyWarning on frames derived from a split.
X_train = pd.concat(
    [X_train.drop(columns=one_hot_cols), encoded_df_train],
    axis=1,
)

In [14]:
# Encode the test split with the encoder that was fitted on the training split.
encoded_array_test = oe.transform(X_test[one_hot_cols])
encoded_df_test = pd.DataFrame(
    encoded_array_test,
    columns=oe.get_feature_names_out(one_hot_cols),
    index=X_test.index,  # reuse X_test's index so the column-wise concat aligns
)
# Replace the raw categorical columns with their one-hot expansion in one
# non-mutating expression (no `inplace=True` drop on the existing frame).
X_test = pd.concat(
    [X_test.drop(columns=one_hot_cols), encoded_df_test],
    axis=1,
)

#### Model Testing

In [15]:
# Candidate classifiers, keyed by the display name used in reports and MLflow runs.
# The randomized learners get a fixed seed (the same value used for the
# train/test split) so that refitting a model reproduces the same scores;
# without it, Random Forest results change from one fit to the next.
RANDOM_STATE = 42

models = {
    "Gaussian Naive Bayes": GaussianNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Support Vector Machine (SVM)": SVC(),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "XGBoost": XGBClassifier(random_state=RANDOM_STATE),
    "LightGBM": LGBMClassifier(random_state=RANDOM_STATE)
}

In [18]:
# Point the client at the tracking server BEFORE selecting the experiment:
# set_experiment looks up (or creates) the experiment against whichever
# tracking URI is active when it is called, so the URI must be set first.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Beverage Price Prediction Tracking")

In [19]:
# Fit each candidate model on the training split and print its test-set
# accuracy together with the per-class classification report.
for name, model in models.items():
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(Y_test, y_pred)
    report_dict = classification_report(Y_test, y_pred, output_dict=True)
    # Transpose so each class / average is a row and each metric a column.
    report_df = pd.DataFrame(report_dict).transpose()

    banner = f"{'*' * 20} {name.upper()} {'*' * 20}"
    print(f"\n{banner}")
    print(f"‚úÖ Accuracy Score: {accuracy:.4f}\n")
    print("üìä Classification Report:\n")
    print(report_df.round(4))
    # Closing rule is derived from the banner itself so both lines have the
    # same width (the former hard-coded 44 + len(name) was two characters too wide).
    print("*" * len(banner))


******************** GAUSSIAN NAIVE BAYES ********************
‚úÖ Accuracy Score: 0.4541

üìä Classification Report:

              precision  recall  f1-score    support
0                0.2352  0.3000    0.2637  1930.0000
1                0.6067  0.0243    0.0467  2223.0000
2                0.8083  0.7757    0.7917  2430.0000
3                0.3388  0.9746    0.5028   906.0000
accuracy         0.4541  0.4541    0.4541     0.4541
macro avg        0.4973  0.5187    0.4012  7489.0000
weighted avg     0.5440  0.4541    0.3995  7489.0000
****************************************************************

******************** LOGISTIC REGRESSION ********************
‚úÖ Accuracy Score: 0.8151

üìä Classification Report:

              precision  recall  f1-score    support
0                0.7540  0.7736    0.7637  1930.0000
1                0.7706  0.7859    0.7782  2223.0000
2                0.9100  0.8992    0.9046  2430.0000
3                0.8074  0.7494    0.7773   906.0000
accur

In [20]:
# Model names that have a dedicated MLflow flavor; every other model is logged
# through the generic scikit-learn flavor.
flavor_loggers = {
    "XGBoost": mlflow.xgboost.log_model,
    "LightGBM": mlflow.lightgbm.log_model,
}

# One MLflow run per model: refit, score on the test split, then record the
# hyperparameters, the accuracy metric and the fitted model artifact.
for name, model in models.items():
    log_model = flavor_loggers.get(name, mlflow.sklearn.log_model)

    with mlflow.start_run(run_name=name):
        model.fit(X_train, Y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(Y_test, y_pred)

        # Hyperparameters are logged only for estimators exposing get_params
        if hasattr(model, 'get_params'):
            mlflow.log_params(model.get_params())

        mlflow.log_metric("accuracy", accuracy)

        log_model(model, artifact_path="model")




üèÉ View run Gaussian Naive Bayes at: http://127.0.0.1:5000/#/experiments/367472464984165969/runs/46854396004b49c1813ca99a1b835a49
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/367472464984165969




üèÉ View run Logistic Regression at: http://127.0.0.1:5000/#/experiments/367472464984165969/runs/bb2527dd7a3d47a9bcb938fc95079288
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/367472464984165969




üèÉ View run Support Vector Machine (SVM) at: http://127.0.0.1:5000/#/experiments/367472464984165969/runs/2cc73e61fadf4a52bc9cc9464dc656c8
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/367472464984165969




üèÉ View run Random Forest at: http://127.0.0.1:5000/#/experiments/367472464984165969/runs/ebe81cba3b354b64bd646b9babe7f428
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/367472464984165969


  self.get_booster().save_model(fname)


üèÉ View run XGBoost at: http://127.0.0.1:5000/#/experiments/367472464984165969/runs/8075b5a3f1e14ba8895c7788ed9ddad1
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/367472464984165969
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000544 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98
[LightGBM] [Info] Number of data points in the train set: 22467, number of used features: 44
[LightGBM] [Info] Start training from score -1.343386
[LightGBM] [Info] Start training from score -1.228925
[LightGBM] [Info] Start training from score -1.126779
[LightGBM] [Info] Start training from score -2.100810




üèÉ View run LightGBM at: http://127.0.0.1:5000/#/experiments/367472464984165969/runs/b28cb7ded2804a849788d4b7f5a0a069
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/367472464984165969
