In [1]:
import os 
import re
import string
import pandas as pd
import numpy 
import dagshub
import mlflow
import mlflow.sklearn
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score , precision_score , recall_score , f1_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import scipy.sparse

In [2]:
import warnings
# Install a blanket "ignore" filter: no category or module is given, so every
# warning raised after this point is suppressed.
# NOTE(review): this also hides actionable warnings from the libraries used
# below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [None]:
# Central settings for the experiment; the tracking-setup and training cells
# read their values from this dictionary.
CONFIG = {
    # Fraction of the rows held out as the evaluation split.
    "test_size": 0.2,
    # CSV file that holds the reviews and their sentiment labels.
    "data_path": "IMDB Dataset.csv",
    # Passed to mlflow.set_tracking_uri; left empty here.
    "mlflow_tracking_uri": "",
    # DagsHub repository the runs are attached to.
    "dagshub_repo_owner": "codewithkaran-21",
    "dagshub_repo_name": "Capstone-Project-MLOPS",
    # NOTE: the key spelling is kept as-is because the setup cell looks the
    # experiment name up under this exact key.
    "experimnent_name": "BOW vs TFIDF",
}

In [4]:
# Point MLflow at the tracking URI stored in CONFIG.
mlflow.set_tracking_uri(CONFIG["mlflow_tracking_uri"])
# Connect to the DagsHub repository named in CONFIG.
# NOTE(review): mlflow=True presumably lets DagsHub configure the MLflow
# tracking endpoint/credentials itself — confirm against the dagshub docs.
dagshub.init(repo_owner=CONFIG['dagshub_repo_owner'] , repo_name=CONFIG['dagshub_repo_name'],mlflow=True)
# Select the experiment that subsequent runs are recorded under. As the last
# expression of the cell, its return value is what the cell displays.
mlflow.set_experiment(CONFIG['experimnent_name'])

<Experiment: artifact_location='mlflow-artifacts:/f371d00198bf4fc78ce10a16584f3c1a', creation_time=1762078512751, experiment_id='1', last_update_time=1762078512751, lifecycle_stage='active', name='BOW vs TFIDF', tags={}>

In [5]:
def lemmatization(text):
    """Lemmatize every whitespace-separated token of `text`.

    Tokens are passed one by one to WordNetLemmatizer.lemmatize and the
    results are joined back together with single spaces.
    """
    wordnet = WordNetLemmatizer()
    lemmas = map(wordnet.lemmatize, text.split())
    return " ".join(lemmas)

# Lazily-built cache of the English stop-word set, shared by every call so the
# NLTK corpus is read once instead of once per review.
_ENGLISH_STOP_WORDS = None


def remove_stop_words(text):
    """Drop English stop words from `text`.

    Args:
        text: String to filter; it is split on whitespace.

    Returns:
        The remaining tokens joined with single spaces. Matching is exact and
        case-sensitive against NLTK's "english" stop-word list.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        # First call only: load the corpus and keep it for later calls.
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
    return " ".join(
        word for word in text.split() if word not in _ENGLISH_STOP_WORDS
    )

def removing_numbers(text):
    """Return `text` without the characters for which str.isdigit() is true."""
    kept = []
    for ch in text:
        if ch.isdigit():
            continue
        kept.append(ch)
    return ''.join(kept)

def lower_case(text):
    """Return a lower-cased copy of `text`."""
    lowered = text.lower()
    return lowered

def removing_punctuations(text):
    """Replace each character of string.punctuation found in `text` with a space."""
    # Build a character class from the escaped punctuation set.
    punctuation_class = "[" + re.escape(string.punctuation) + "]"
    matcher = re.compile(punctuation_class)
    return matcher.sub(' ', text)

def removing_urls(text):
    """Delete http(s):// links and www.-prefixed links from `text`.

    A link runs up to the next whitespace character; nothing is inserted in
    its place.
    """
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub('', text)

def normalize_text(df):
    """Clean the 'review' column of `df` and return the same DataFrame.

    The column is overwritten on `df` itself after each cleaning step.

    Args:
        df: DataFrame with a 'review' column of strings.

    Returns:
        `df`, with 'review' lower-cased, stripped of URLs, digits and
        punctuation, filtered of English stop words, and lemmatized.

    Raises:
        Exception: any error raised by a cleaning step is reported with
            print() and re-raised unchanged.
    """
    try:
        # Order matters:
        #  * URLs go before punctuation removal, otherwise "://" and "." are
        #    already replaced by spaces and the URL pattern never matches.
        #  * Stop words go after punctuation removal, otherwise tokens such
        #    as "the," or "(and" do not match the stop-word list.
        steps = (
            lower_case,
            removing_urls,
            removing_numbers,
            removing_punctuations,
            remove_stop_words,
            lemmatization,
        )
        for step in steps:
            df['review'] = df['review'].apply(step)
        return df
    except Exception as e:
        print(f"Error during text normalization: {e}")
        raise

In [6]:
def load_data(file_path):
    """Read the review CSV, keep labelled rows, encode labels and clean the text.

    Args:
        file_path: Path of a CSV file with 'review' and 'sentiment' columns.

    Returns:
        DataFrame restricted to rows whose sentiment is 'positive' or
        'negative', with 'sentiment' encoded as int64 (negative=0,
        positive=1) and 'review' passed through normalize_text.

    Raises:
        Exception: any error while reading or processing is reported with
            print() and re-raised unchanged.
    """
    try:
        df = pd.read_csv(file_path)
        # Filter before normalizing so no cleaning work is spent on rows that
        # are dropped anyway; .copy() makes the column assignments below write
        # to an independent frame rather than a slice of the raw one.
        df = df[df['sentiment'].isin(['positive', 'negative'])].copy()
        # Explicit mapping + cast gives an integer column without relying on
        # replace()'s dtype inference.
        df['sentiment'] = df['sentiment'].map({'negative': 0, 'positive': 1}).astype('int64')
        df = normalize_text(df)
        return df
    except Exception as e:
        print(f"Error occured loading data {e}")
        raise

In [7]:
# Text-to-feature encoders to compare, keyed by the label used in run names
# and logged parameters.
VECTORIZERS = {
    "BoW": CountVectorizer(),
    "TF-IDF": TfidfVectorizer(),
}

In [8]:
# Classifiers to compare, keyed by the label used in run names and logged
# parameters. The estimators that draw random numbers during fitting get a
# fixed random_state so repeated runs of the notebook produce the same metrics.
ALGORITHMS = {
    "LogisticRegression": LogisticRegression(),
    "RandomForestClassifier": RandomForestClassifier(random_state=42),
    "GradientBoostingClassifier": GradientBoostingClassifier(random_state=42),
    "MultinomialNB": MultinomialNB(),
    "xgboost": XGBClassifier(random_state=42),
}

In [None]:
def train_and_evaluate(df):
    """Train every algorithm/vectorizer pair and record each as an MLflow run.

    One parent run ("All-Experiment") wraps a nested child run per
    combination of ALGORITHMS and VECTORIZERS. Each child run logs the
    vectorizer/algorithm names, the test size, the estimator's
    hyperparameters, four evaluation metrics and the fitted model.

    Args:
        df: DataFrame with a 'review' text column and a binary 'sentiment'
            label column.
    """
    # Split the raw text once, before any vectorizer is fitted. Fitting the
    # vectorizer on the full corpus would leak test-set vocabulary and IDF
    # statistics into training. The fixed seed keeps the split identical for
    # every combination.
    reviews_train, reviews_test, y_train, y_test = train_test_split(
        df['review'], df['sentiment'], test_size=CONFIG['test_size'], random_state=42
    )

    with mlflow.start_run(run_name="All-Experiment"):
        for algo_name, model in ALGORITHMS.items():
            for vec, vectorizer in VECTORIZERS.items():
                with mlflow.start_run(run_name=f"{algo_name} with {vec}", nested=True):
                    # Learn the vocabulary from training text only, then apply it to the test text.
                    X_train = vectorizer.fit_transform(reviews_train)
                    X_test = vectorizer.transform(reviews_test)

                    mlflow.log_params({
                        "vectorizer": vec,
                        "algorithm": algo_name,
                        "testsize": CONFIG['test_size'],
                    })

                    model.fit(X_train, y_train)

                    # Record the estimator's hyperparameters; the prefix keeps
                    # them apart from the experiment-level parameters above.
                    mlflow.log_params({
                        f"model__{param}": value
                        for param, value in model.get_params().items()
                    })
                    y_pred = model.predict(X_test)

                    metrics = {
                        "Accuracy": accuracy_score(y_test, y_pred),
                        "precision": precision_score(y_test, y_pred),
                        "recall": recall_score(y_test, y_pred),
                        "f1-score": f1_score(y_test, y_pred),
                    }

                    mlflow.log_metrics(metrics)

                    # The example stored with the model must be dense.
                    input_example = X_test[:5] if not scipy.sparse.issparse(X_test) else X_test[:5].toarray()
                    mlflow.sklearn.log_model(model, "model", input_example=input_example)

{"status":"OK","request_id":"04fd5e87-baba-4782-a87e-ed1023393f42","parameters":{"asin":"B07ZPKBL9V","country":"US","language":"en_US"},"data":{"asin":"B07ZPKBL9V","product_title":"Apple iPhone 11, 64GB, (PRODUCT)RED - Fully Unlocked (Renewed)","product_price":"164.89","product_original_price":"$178.00","minimum_order_quantity":null,"currency":"USD","country":"US","product_byline":"Visit the Amazon Renewed Store","product_byline_link":"https://www.amazon.com/Amazon-Renewed/b/ref=bl_dp_s_web_12653393011?ie=UTF8&node=12653393011&field-lbr_brands_browse-bin=Amazon+Renewed","product_byline_links":["https://www.amazon.com/Amazon-Renewed/b/ref=bl_dp_s_web_12653393011?ie=UTF8&node=12653393011&field-lbr_brands_browse-bin=Amazon+Renewed"],"product_star_rating":"4.2","product_num_ratings":56822,"product_url":"https://www.amazon.com/dp/B07ZPKBL9V","product_slug":"Apple-iPhone-11-64GB-Red","product_photo":"https://m.media-amazon.com/images/I/514k7uOBMwL._AC_SL1000_.jpg","product_num_offers":30,"pr

{"location":{"name":"London","region":"City of London, Greater London","country":"United Kingdom","lat":51.5171,"lon":-0.1062,"tz_id":"Europe/London","localtime_epoch":1762176766,"localtime":"2025-11-03 13:32"},"alerts":{"alert":[]}}


{
    "success": true,
    "status": 200,
    "health": "100%"
}


RAW RESPONSE:
 {"success":true,"status":200,"health":"100%"}
{
    "success": true,
    "status": 200,
    "health": "100%"
}


{"id":82,"url":"https://www.tvmaze.com/shows/82/game-of-thrones","name":"Game of Thrones","type":"Scripted","language":"English","genres":["Drama","Adventure","Fantasy"],"status":"Ended","runtime":60,"averageRuntime":61,"premiered":"2011-04-17","ended":"2019-05-19","officialSite":"http://www.hbo.com/game-of-thrones","schedule":{"time":"21:00","days":["Sunday"]},"rating":{"average":8.9},"weight":99,"network":{"id":8,"name":"HBO","country":{"name":"United States","code":"US","timezone":"America/New_York"},"officialSite":"https://www.hbo.com/"},"webChannel":null,"dvdCountry":null,"externals":{"tvrage":24493,"thetvdb":121361,"imdb":"tt0944947"},"image":{"medium":"https://static.tvmaze.com/uploads/images/medium_portrait/498/1245274.jpg","original":"https://static.tvmaze.com/uploads/images/original_untouched/498/1245274.jpg"},"summary":"<p>Based on the bestselling book series <i>A Song of Ice and Fire</i> by George R.R. Martin, this sprawling new HBO drama is set in a world where summers spa

{"status":"OK","request_id":"ea97217d-efea-44a3-bb95-907d20026235","data":[{"location":"United States","job_title":"Software Developer","company":"Amazon","min_salary":174943.22,"max_salary":264149.97,"median_salary":212601.19,"min_base_salary":121828.93,"max_base_salary":165003.3,"median_base_salary":141782.14,"min_additional_pay":53114.29,"max_additional_pay":99146.67,"median_additional_pay":70819.05,"salary_period":"YEAR","salary_currency":"USD","salary_count":1075,"confidence":"CONFIDENT"}]}
