In [1]:
!pip list

Package                            Version
---------------------------------- -----------
alembic                            1.16.4
asttokens                          3.0.0
blinker                            1.9.0
cachetools                         5.5.2
certifi                            2025.7.14
charset-normalizer                 3.4.2
click                              8.2.1
cloudpickle                        3.1.1
colorama                           0.4.6
comm                               0.2.2
contourpy                          1.3.2
cycler                             0.12.1
debugpy                            1.8.15
decorator                          5.2.1
docker                             7.1.0
entrypoints                        0.4
exceptiongroup                     1.3.0
executing                          2.2.0
Flask                              3.1.1
fonttools                          4.59.0
gitdb                              4.0.12
GitPython                          3.1.44


## Import Libraries

In [2]:
import re
import string

import mlflow
import numpy as np
import pandas as pd
import mlflow.sklearn

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


## Import Data

In [6]:
df = pd.read_csv('https://raw.githubusercontent.com/campusx-official/jupyter-masterclass/main/tweet_emotions.csv').drop(columns=['tweet_id'])
df.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


## Data Preprocessing

In [7]:
# Define text preprocessing functions
def lemmatization(text):
    """Lemmatize the text."""
    lemmatizer = WordNetLemmatizer()
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text]
    return " ".join(text)

def remove_stop_words(text):
    """Remove stop words from the text."""
    stop_words = set(stopwords.words("english"))
    text = [word for word in str(text).split() if word not in stop_words]
    return " ".join(text)

def removing_numbers(text):
    """Remove numbers from the text."""
    text = ''.join([char for char in text if not char.isdigit()])
    return text

def lower_case(text):
    """Convert text to lower case."""
    text = text.split()
    text = [word.lower() for word in text]
    return " ".join(text)

def removing_punctuations(text):
    """Remove punctuations from the text."""
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = text.replace('؛', "")
    text = re.sub('\s+', ' ', text).strip()
    return text

def removing_urls(text):
    """Remove URLs from the text."""
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

def normalize_text(df):
    """Normalize the text data."""
    try:
        df['content'] = df['content'].apply(lower_case)
        df['content'] = df['content'].apply(remove_stop_words)
        df['content'] = df['content'].apply(removing_numbers)
        df['content'] = df['content'].apply(removing_punctuations)
        df['content'] = df['content'].apply(removing_urls)
        df['content'] = df['content'].apply(lemmatization)
        return df
    except Exception as e:
        print(f'Error during text normalization: {e}')
        raise

In [8]:
df = normalize_text(df)
df.head()

Unnamed: 0,sentiment,content
0,empty,tiffanylue know listenin bad habit earlier sta...
1,sadness,layin n bed headache ughhhh waitin call
2,sadness,funeral ceremony gloomy friday
3,enthusiasm,want hang friend soon
4,neutral,dannycastillo want trade someone houston ticke...


## Converting to binary classification problem

In [9]:
df['sentiment'].value_counts(normalize = True)

sentiment
neutral       0.215950
worry         0.211475
happiness     0.130225
sadness       0.129125
love          0.096050
surprise      0.054675
fun           0.044400
relief        0.038150
hate          0.033075
empty         0.020675
enthusiasm    0.018975
boredom       0.004475
anger         0.002750
Name: proportion, dtype: float64

In [10]:
df = df[df['sentiment'].isin(['happiness','sadness'])]
df['sentiment'] = df['sentiment'].replace({'sadness':0,'happiness':1})
print(df.shape)
print(df['sentiment'].value_counts())
print(df.head(3))

(10374, 2)
sentiment
1    5209
0    5165
Name: count, dtype: int64
   sentiment                                            content
1          0            layin n bed headache ughhhh waitin call
2          0                     funeral ceremony gloomy friday
6          0  sleep im not thinking old friend want married ...


  df['sentiment'] = df['sentiment'].replace({'sadness':0,'happiness':1})


## Apply Bag of Words

In [17]:
num_features = 1000
vectorizer = CountVectorizer(max_features=num_features)
X = vectorizer.fit_transform(df['content'])
y = df['sentiment']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(8299, 1000)
(8299,)
(2075, 1000)
(2075,)


## Dagshub Integration

In [14]:
import dagshub
dagshub.init(repo_owner='datta-abhi', repo_name='mlops-mini-project', mlflow=True)
mlflow.set_tracking_uri("https://dagshub.com/datta-abhi/mlops-mini-project.mlflow")

mlflow.set_experiment("BOW Logistic Baseline")

2025/07/17 11:46:38 INFO mlflow.tracking.fluent: Experiment with name 'BOW Logistic Baseline' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/3a327dc9a9cb46e08350cb0f110d7d58', creation_time=1752732998391, experiment_id='0', last_update_time=1752732998391, lifecycle_stage='active', name='BOW Logistic Baseline', tags={}>

## MLflow runs

In [20]:
with mlflow.start_run(description="simple BOW based Logistic baseline model to compare against"):
    # model building
    model = LogisticRegression()
    model.fit(X_train,y_train)
    
    # model evaluation
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test,y_pred)
    precision = precision_score(y_test,y_pred)    
    recall = recall_score(y_test,y_pred)
    f1 = f1_score(y_test,y_pred)
    
    # log params
    mlflow.log_params({"vectorizer":"Bag of Words",
                       "num_features": num_features,
                       "test_size":0.2,
                       "model": "Logistic"})
    
    # log metrics
    mlflow.log_metrics({"accuracy":accuracy,
                        "precision": precision,
                        "recall": recall,
                        "f1": f1})
    
    # log notebook
    import os
    notebook_path = "exp1_baseline_model.ipynb"
    os.system(f"jupyter nbconvert --to notebook --execute --inplace {notebook_path}")
    mlflow.log_artifact(notebook_path)
    
    # log model
    mlflow.sklearn.log_model(model,"model")
    
    # log tags
    mlflow.set_tags({"author":"Abhigyan"})
    
    # print for checking
    print({"accuracy":accuracy,
            "precision": precision,
            "recall": recall,
            "f1": f1})
    
    

{'accuracy': 0.7773493975903615, 'precision': 0.7692307692307693, 'recall': 0.7783251231527094, 'f1': 0.7737512242899118}
