In [1]:
import mlflow
import pandas as pd
import mlflow.sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np

In [2]:
cols=['tweetid', 'entity', 'target', 'content']

data = pd.read_csv("data.csv",names=cols)
data.head()

Unnamed: 0,tweetid,entity,target,content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [3]:
data.shape

(74682, 4)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   tweetid  74682 non-null  int64 
 1   entity   74682 non-null  object
 2   target   74682 non-null  object
 3   content  73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [5]:
# drop unnecessary columns
data = data.drop(columns=['tweetid','entity'])

In [6]:
# drop null values 
data = data.dropna()

In [7]:
# check duplicates values in data
data.duplicated().sum()

np.int64(4227)

In [None]:
# drop duplicated values
data = data.drop_duplicates()

In [9]:
data.to_csv("data.csv")

In [51]:
# data['content'][0]

In [10]:
def lemmatization(text):
    lemmitizer = WordNetLemmatizer()
    text = text.split()
    text = [lemmitizer.lemmatize(word) for word in text]
    return " ".join(text)

def remove_stop_words(text):
    stop_words = set(stopwords.words("english"))
    text = [word for word in str(text).split() if word not in stop_words]
    return " ".join(text)

def removing_numbers(text):
    text = "".join([char for char in text if not char.isdigit()])
    return text

def lower_case(text):
    text = text.split()
    text = [word.lower() for word in text]
    return " ".join(text)

def removing_punctuations(text):
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)  
    text = re.sub('\s+', ' ', text).strip()  
    return text

def removing_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

def normalize_text(data):
    try:
        data['content'] = data['content'].apply(lower_case)
        data['content'] = data['content'].apply(remove_stop_words)
        data['content'] = data['content'].apply(removing_numbers)
        data['content'] = data['content'].apply(removing_punctuations)
        data['content'] = data['content'].apply(removing_urls)
        data['content'] = data['content'].apply(lemmatization)
        return data
    except Exception as e:
        print(f'Error during text normalization: {e}')
        raise

In [11]:
data = normalize_text(data)
data.head()

Unnamed: 0,target,content
0,Positive,im getting borderland murder
1,Positive,coming border kill all
2,Positive,im getting borderland kill all
3,Positive,im coming borderland murder all
4,Positive,im getting borderland murder all


In [12]:
x = data['target'].isin(['Positive','Negative','Neutral'])
data = data[x]

In [13]:
data['target'].value_counts()

target
Negative    21237
Positive    19138
Neutral     17110
Name: count, dtype: int64

In [14]:
import warnings
warnings.filterwarnings("ignore")
data['target'] = data['target'].replace({'Positive':1,'Negative':0,'Neutral':2})
data.head()

Unnamed: 0,target,content
0,1,im getting borderland murder
1,1,coming border kill all
2,1,im getting borderland kill all
3,1,im coming borderland murder all
4,1,im getting borderland murder all


In [15]:
vectorizer = CountVectorizer(max_features=200)
X = vectorizer.fit_transform(data['content'])
y = data['target']

In [16]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [17]:
import dagshub

mlflow.set_tracking_uri("https://dagshub.com/gauravbosamiya/mlops-sentiment-analysis-project.mlflow")
dagshub.init(repo_owner="gauravbosamiya",repo_name="mlops-sentiment-analysis-project",mlflow=True)

mlflow.set_experiment("Logistic Regression Baseline")

<Experiment: artifact_location='mlflow-artifacts:/42326d3995a6432a8f4d49ee898fe8c2', creation_time=1742994681036, experiment_id='0', last_update_time=1742994681036, lifecycle_stage='active', name='Logistic Regression Baseline', tags={'mlflow.sharedViewState.60b5838b3c07b10d688fd52b4dd6c37593b139dcfb12d21877e12fcb552682f6': 'deflate;eJxdUl1PwzAM/CsozxOC176VMT7EhlA3JqQJbVnirZbSpIqdsYL233HXQQePPt/5nHO+FIGOprxDxxBVptRAhWgh3jRP0EitmSOuEwNdEuvIS8YKelJORmUb7QgG6tifte1M5eOxkBxuwDTGwe/43DDuWr3VrAmYfjqL94GqggU3h0gYfK9w7qJInkRD4MAw2GFwqRIkW5yvt5qGFA2shHiOTtqh9B+97exXSmyjTB/ta+0tWJV9HTrkBb1v68WJ8YDWgu/rORKu0SE3E133so7Wmsrud4/FdLa8vloWr89T2WCH8DHRe6zwsx19yk1kYyT+MTihSLkxkrJkUcAx7TNJIriPIdVg59oloEc/LCV8iYRjkn6JFkZVzc1feNtqbuSsPjl3Kv89XScOBWwiUDnyeu3OTLcurLUbo4fj1GHwG9yKSu3zPVL3XYihPh7ew7QKgUsPJPZX/fHeWvIE5BSmk6jD4RvYX+ai'}>

In [18]:
import logging

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logging.info("Starting MLflow run..")

with mlflow.start_run():
    try:
        logging.info("logging preprocessing params")
        mlflow.log_param("vectorizer","Bag of Words")
        mlflow.log_param("num_features",200)
        mlflow.log_param("test_size",0.2)
        
        logging.info("Intializing Logistic Regression Model..")
        model = LogisticRegression(penalty=None,solver='saga',max_iter=1500, multi_class="multinomial")
        
        logging.info("fitting the model..")
        model.fit(X_train,y_train)
        logging.info("Model training complete..")
        
        logging.info("logging model params")
        mlflow.log_param("model","Logistic Regression")
        
        logging.info("Making predictions...")
        y_pred =  model.predict(X_test)
        
        logging.info("Evaluation Metrics..")
        accuracy = accuracy_score(y_test,y_pred)
        precision = precision_score(y_test,y_pred,average="macro")
        recall = recall_score(y_test,y_pred,average="macro")
        f1 = f1_score(y_test,y_pred,average="macro")
                
        logging.info("Logging Evaluation Metrics..")
        mlflow.log_metric("accuracy",accuracy)
        mlflow.log_metric("precision",precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)
        
        logging.info("Saving and logging the model...")
        mlflow.sklearn.log_model(model,"model")
        
        logging.info(f"Accuracy: {accuracy}")
        logging.info(f"Precision: {precision}")
        logging.info(f"Recall: {recall}")
        logging.info(f"F1 Score: {f1}")
        
    except Exception as e:
        logging.error(f"An error occurred: {e}", exc_info=True)
        

2025-03-27 17:28:40,860 - INFO - Starting MLflow run..
2025-03-27 17:28:41,577 - INFO - logging preprocessing params
2025-03-27 17:28:43,005 - INFO - Intializing Logistic Regression Model..
2025-03-27 17:28:43,005 - INFO - fitting the model..
2025-03-27 17:28:46,627 - INFO - Model training complete..
2025-03-27 17:28:46,627 - INFO - logging model params
2025-03-27 17:28:46,962 - INFO - Making predictions...
2025-03-27 17:28:46,962 - INFO - Evaluation Metrics..
2025-03-27 17:28:46,978 - INFO - Logging Evaluation Metrics..
2025-03-27 17:28:48,572 - INFO - Saving and logging the model...
2025-03-27 17:29:03,827 - INFO - Accuracy: 0.5984169783421762
2025-03-27 17:29:03,827 - INFO - Precision: 0.6053347766188678
2025-03-27 17:29:03,827 - INFO - Recall: 0.5894116518159972
2025-03-27 17:29:03,827 - INFO - F1 Score: 0.5884495586544781


🏃 View run rebellious-vole-20 at: https://dagshub.com/gauravbosamiya/mlops-sentiment-analysis-project.mlflow/#/experiments/0/runs/d8f665d21e3046c0aeb80bdf74dda8a0
🧪 View experiment at: https://dagshub.com/gauravbosamiya/mlops-sentiment-analysis-project.mlflow/#/experiments/0
