In [14]:
import mlflow
import pandas as pd
import mlflow.sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
from bs4 import BeautifulSoup
import html
import time

In [None]:
#df = pd.read_csv('IMDB.csv')
#df = df.sample(500)
#df.to_csv('data.csv',index = False)
#df.head()

Unnamed: 0,review,sentiment
928,pardon my spelling. This is probably the funni...,negative
301,This ranks way up there on my top list of wors...,negative
992,Holy @#%& this movie was still warm and juicy ...,negative
0,Film version of Sandra Bernhard's one-woman of...,negative
591,"The best thing about the movie is the name, as...",negative


In [3]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,review,sentiment
0,pardon my spelling. This is probably the funni...,negative
1,This ranks way up there on my top list of wors...,negative
2,Holy @#%& this movie was still warm and juicy ...,negative
3,Film version of Sandra Bernhard's one-woman of...,negative
4,"The best thing about the movie is the name, as...",negative


In [None]:
def clean_text(text):
    
    text = BeautifulSoup(text,'html.parser').get_text()
    text = re.sub(r'https?://\S+|www\.\S+','',text)
    text = text.lower()
    text = re.sub(r'\d+','',text)
    text = re.sub(r'[^\w\s]',' ',text)
    text = re.sub(r'\s+',' ',text)
    text = text.strip()
    
    return text

import nltk
nltk.download('stopwords')
def remove_stopwords(text):
    
    stopword = set(stopwords.words('english')) 
    text = [word for word in str(text).split() if word not in stopword]
    text = " ".join(text)

    return text

nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
def text_lemmatize(text):

    pos_text = nltk.pos_tag(str(text).split())
    lemma = WordNetLemmatizer()
    text = [lemma.lemmatize(word[0]) for word in pos_text]
    text = " ".join(text)

    return text

def normalize_text(df):
    
    df['review'] = df['review'].apply(clean_text)
    df['review'] = df['review'].apply(remove_stopwords)
    df['review'] = df['review'].apply(text_lemmatize)

    return df

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jhanvi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Jhanvi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Jhanvi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [5]:
df = normalize_text(df)
df.head()

Unnamed: 0,review,sentiment
0,pardon spelling probably funniest horror movie...,negative
1,rank way top list worst movie seen far starz d...,negative
2,holy movie still warm juicy pile made tried wa...,negative
3,film version sandra bernhard one woman broadwa...,negative
4,best thing movie name describes plot acting le...,negative


In [6]:
df['sentiment'].value_counts()

sentiment
negative    262
positive    238
Name: count, dtype: int64

In [7]:
df = df[df['sentiment'].isin(['positive','negative'])] 

In [8]:
df['sentiment'] = df['sentiment'].map({'positive' : 1, 'negative' : 0})
df

Unnamed: 0,review,sentiment
0,pardon spelling probably funniest horror movie...,0
1,rank way top list worst movie seen far starz d...,0
2,holy movie still warm juicy pile made tried wa...,0
3,film version sandra bernhard one woman broadwa...,0
4,best thing movie name describes plot acting le...,0
...,...,...
495,gender bender sex thing bit x file episode int...,1
496,nearly always case britain come entertaining s...,1
497,one favorite movie great cast lead jonathan si...,1
498,mess referring destruction title could go hack...,0


In [9]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [67]:
max_features = 1000
test_size = 0.40

In [68]:
vectorizer = CountVectorizer(max_features = max_features)
x = vectorizer.fit_transform(df['review'])
y = df['sentiment']

In [69]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = test_size, random_state = 42)

In [17]:
import dagshub

mlflow.set_tracking_uri('https://dagshub.com/deepno1/MLOps-sentiment-analysis.mlflow')
dagshub.init(repo_owner='deepno1', repo_name='MLOps-sentiment-analysis', mlflow=True)
mlflow.set_experiment("Logistic Regression Baseline")

2025/11/20 01:48:34 INFO mlflow.tracking.fluent: Experiment with name 'Logistic Regression Baseline' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/867eab57f85a43da982580a2dd8f93b8', creation_time=1763583516020, experiment_id='1', last_update_time=1763583516020, lifecycle_stage='active', name='Logistic Regression Baseline', tags={}>

In [70]:
import mlflow

with mlflow.start_run():

  start = time.time()
  
  mlflow.log_param("vectorizer",'Bag of words')
  mlflow.log_param('max_features',max_features)
  mlflow.log_param('test_size',test_size)

  mlflow.log_param('model',"LogisticRegression")
  model = LogisticRegression(max_iter= 1000)
  model.fit(x_train,y_train)
  y_pred = model.predict(x_test)
  mlflow.log_param('max_iter',1000)
  mlflow.sklearn.log_model(model,'model')

  test_acc = accuracy_score(y_test,y_pred)
  train_acc = accuracy_score(y_train,model.predict(x_train))
  precision = precision_score(y_test,y_pred)
  recall = recall_score(y_test,y_pred)
  f_one = f1_score(y_test,y_pred)
  
  mlflow.log_metric('test_accuracy',test_acc)
  mlflow.log_metric('train_accuracy',train_acc)
  mlflow.log_metric('precision',precision)
  mlflow.log_metric('recall',recall)
  mlflow.log_metric('f1',f_one)

  end = time.time()

print('time : ',end - start)



üèÉ View run capable-quail-396 at: https://dagshub.com/deepno1/MLOps-sentiment-analysis.mlflow/#/experiments/1/runs/eba5e42bbcb94fadaec408b1de686016
üß™ View experiment at: https://dagshub.com/deepno1/MLOps-sentiment-analysis.mlflow/#/experiments/1
time :  10.607516765594482
