In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import warnings
import yaml
import os
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import json
import nltk
import string
from nltk.stem import SnowballStemmer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, roc_auc_score, classification_report, f1_score
import joblib

In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/campusx-official/jupyter-masterclass/main/tweet_emotions.csv')
df.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [4]:
def process_data(df: pd.DataFrame, test_size: float) -> tuple:
# delete tweet id
    df.drop(columns=['tweet_id'],inplace=True)

    final_df = df[df['sentiment'].isin(['happiness','sadness'])]

    final_df['sentiment'].replace({'happiness':1, 'sadness':0},inplace=True)
    
    train_data, test_data = train_test_split(final_df, test_size=test_size, random_state=42)

    data_path = os.path.join('data', 'raw')
    
    return data_path, train_data, test_data

In [5]:
_, train_data, test_data = process_data(df, test_size=0.2)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  final_df['sentiment'].replace({'happiness':1, 'sadness':0},inplace=True)
  final_df['sentiment'].replace({'happiness':1, 'sadness':0},inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['sentiment'].replace({'happiness':1, 'sadness':0},inplace=True)


In [6]:
train_data

Unnamed: 0,sentiment,content
23531,0,&quot;My problem isn't that I miss you... 'cau...
8051,0,That's it? It's done already? This is one proo...
11499,0,I am so hungry! And there is no food for me to...
31288,1,Feet hurt...finally in bed...will not forget t...
18561,0,really ill atm
...,...,...
21697,1,@chocolatesuze yes yes you should! Especially ...
19445,0,@kickzfadayz Our boy better get it in tonight!...
20216,1,tafe was actually quite good. for once
3258,0,10 minutes to boarding; 14 hours to home. no w...


In [7]:
def lemmatization(text):
    lemmatizer= WordNetLemmatizer()

    text = text.split()

    text=[lemmatizer.lemmatize(y) for y in text]

    return " " .join(text)

def remove_stop_words(text):
    stop_words = set(stopwords.words("english"))
    Text=[i for i in str(text).split() if i not in stop_words]
    return " ".join(Text)

def removing_numbers(text):
    text=''.join([i for i in text if not i.isdigit()])
    return text

def lower_case(text):

    text = text.split()

    text=[y.lower() for y in text]

    return " " .join(text)

def removing_punctuations(text):
    ## Remove punctuations
    text = re.sub('[%s]' % re.escape("""!"#$%&'()*+,،-./:;<=>؟?@[\\]^_`{|}~"""), ' ', text)
    text = text.replace('؛',"", )

    ## remove extra whitespace
    text = re.sub('\s+', ' ', text)
    text =  " ".join(text.split())
    return text.strip()

def removing_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

def remove_small_sentences(df):
    for i in range(len(df)):
        if len(df.text.iloc[i].split()) < 3:
            df.text.iloc[i] = np.nan

def normalize_text(df):
    df.content=df.content.apply(lambda content : lower_case(content))
    df.content=df.content.apply(lambda content : remove_stop_words(content))
    df.content=df.content.apply(lambda content : removing_numbers(content))
    df.content=df.content.apply(lambda content : removing_punctuations(content))
    df.content=df.content.apply(lambda content : removing_urls(content))
    df.content=df.content.apply(lambda content : lemmatization(content))
    return df


  text = re.sub('\s+', ' ', text)


In [8]:
train_data_processed = normalize_text(train_data)
test_data_processed = normalize_text(test_data)

In [9]:
train_data

Unnamed: 0,sentiment,content
23531,0,quot my problem miss you cause don t quot
8051,0,that s it done already one proof there s nothi...
11499,0,hungry food steal
31288,1,foot hurt finally bed will forget crunch over ...
18561,0,really ill atm
...,...,...
21697,1,chocolatesuze yes yes should especially wine m...
19445,0,kickzfadayz boy better get tonight
20216,1,tafe actually quite good
3258,0,minute boarding hour home window seat


In [10]:
test_data

Unnamed: 0,sentiment,content
11535,0,look like rained weekend climbing
32287,0,hi everyone miss much muahhhhhhhhhhhhhhhhhhhhh...
17534,0,rode moped mall fun stuff flippin gorgeous out...
4696,0,gutted vodafone wont repair faulty samsung omn...
23706,1,shadowowns aww lt thank youu
...,...,...
38000,1,russellburnham nice one
1540,0,tired climb bed fall asleep hope weekend fun c...
38031,1,jadeyyg http twitpic com wrxq whens little gin...
14702,0,leirastorm that s sucky miss on


In [11]:
train_preprocessed_df = train_data_processed
test_preprocessed_df = test_data_processed

X_train = train_preprocessed_df['content'].values
y_train = train_preprocessed_df['sentiment'].values

X_test = test_preprocessed_df['content'].values
y_test = test_preprocessed_df['sentiment'].values

        
# Apply Bag of Words (CountVectorizer)
vectorizer = CountVectorizer(max_features=1000)

# # Fit the vectorizer on the training data and transform it
X_train_bow = vectorizer.fit_transform(X_train)

# Transform the test data using the same vectorizer
X_test_bow = vectorizer.transform(X_test)

train_bow = pd.DataFrame(X_train_bow.toarray())
train_bow['label'] = y_train

test_bow = pd.DataFrame(X_test_bow.toarray())
test_bow['label'] = y_test

vectorized_df = pd.concat([train_bow, test_bow], axis=0)

X = vectorized_df.iloc[:, :-1]
y = vectorized_df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
import dagshub
import mlflow 
from mlflow.models.signature import infer_signature
from dotenv import load_dotenv

load_dotenv()

mlflow.set_tracking_uri(os.getenv('MLFLOW-TRACKING-URI'))

dagshub.init(repo_owner='faheem-afk', repo_name="mlops-mini-project", mlflow=True)

mlflow.set_experiment("Logistic Regression Baseline")
with mlflow.start_run():
    mlflow.log_param("vectorizer", 'bag of words')
    mlflow.log_param("num_features", 1000)
    mlflow.log_param("test_size", 0.2)
    
    model = LogisticRegression()
    model.fit(X_train, y_train)
    
    mlflow.log_param("model", "logistic Regression")
    
    y_pred = model.predict(X_test)
    
    accuracy_ = accuracy_score(y_test, y_pred)
    recall_ = recall_score(y_test, y_pred)
    precision_ = precision_score(y_test, y_pred)
    f1_score_ = f1_score(y_test, y_pred)
    
    mlflow.log_metric("accuracy", accuracy_)
    mlflow.log_metric("recall", recall_)
    mlflow.log_metric("precision", precision_)
    mlflow.log_metric("f1_score", f1_score_)
    
    input_example = X_train.iloc[:5, : ]
    
    signature = infer_signature(X_train, model.predict(X_train))
    
    mlflow.sklearn.log_model(model, "LogisticRegression", signature=signature, input_example=input_example)
    
    notebook_path = "experiment_baseline_model.ipynb"
    # os.system(f"jupyter nbconvert --to notebook --execute --inplace {notebook_path}")
    
    mlflow.log_artifact(notebook_path)



🏃 View run indecisive-fly-746 at: https://dagshub.com/faheem-afk/mlops-mini-project.mlflow/#/experiments/1/runs/ba125c98cdde4d0ca83f3ee06918c1e8
🧪 View experiment at: https://dagshub.com/faheem-afk/mlops-mini-project.mlflow/#/experiments/1
