# Lab

##### Objective : The main purpose behind this lab is to get familiar with NLP language models using Sklearn library.

## Part 2: Language Modeling / Classification

In [44]:
# import libraries
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Dataset used : https://www.kaggle.com/datasets/jp797498e/twitter-entity-sentiment-analysis

In [45]:
#import data
train_dataset = pd.read_csv('twitter_training.csv')

In [46]:
train_dataset.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [47]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74681 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column                                                 Non-Null Count  Dtype 
---  ------                                                 --------------  ----- 
 0   2401                                                   74681 non-null  int64 
 1   Borderlands                                            74681 non-null  object
 2   Positive                                               74681 non-null  object
 3   im getting on borderlands and i will murder you all ,  73995 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [48]:
train_dataset.describe()

Unnamed: 0,2401
count,74681.0
mean,6432.640149
std,3740.423819
min,1.0
25%,3195.0
50%,6422.0
75%,9601.0
max,13200.0


In [49]:
train_dataset.columns = ['Tweet ID', 'Entity','Sentiment','Tweet content']

In [50]:
train_dataset.head()

Unnamed: 0,Tweet ID,Entity,Sentiment,Tweet content
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [51]:
train_dataset.isnull().sum()

Tweet ID           0
Entity             0
Sentiment          0
Tweet content    686
dtype: int64

In [52]:
train_dataset.dropna(inplace=True)
train_dataset.isnull().sum()

Tweet ID         0
Entity           0
Sentiment        0
Tweet content    0
dtype: int64

In [53]:
train_dataset.duplicated().sum()

2340

In [54]:
train_dataset.drop_duplicates(inplace=True)
train_dataset.duplicated().sum()

0

In [55]:
# Remove Non-String
def filter_non_string(df, column):
    """
    Filter out rows with non-string values in the specified column.
    Convert non-string values to strings.
    """
    df = df.dropna(subset=[column])
    df[column] = df[column].astype(str)
    return df

In [56]:
# Convert In LowerCase
def normalize_text(text):
    """Convert text to lowercase to ensure consistency across the corpus."""
    return text.lower()

In [57]:
# Remove HTML Tags
def remove_html_tags(text):
    """Remove HTML tags from text."""
    return re.sub(r'<.*?>', '', text)

In [58]:
# Remove URL Or HyperLink
def remove_urls(text):
    """Remove URLs or hyperlinks from the text."""
    return re.sub(r'http\S+|www\S+', '', text)

In [59]:
# Remove Numeric Digit
def remove_numbers(text):
    """Exclude numerical digits from the text."""
    return re.sub(r'\d+', '', text)

In [60]:
# Remove Punctuation
def remove_punctuation(text):
    """Remove punctuation marks from the text."""
    return text.translate(str.maketrans('', '', string.punctuation))

In [61]:
# Initialize the stemmer
stemmer = PorterStemmer()

def stem_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    # Stemming
    stemmed = [stemmer.stem(word) for word in tokens]
    return " ".join(stemmed)

In [62]:
# Split Text In Token
def tokenize_text(text):
    """Split the text into individual words or tokens."""
    return word_tokenize(text)

In [63]:
# Eliminate Stopwords
def remove_stopwords(tokens):
    """Eliminate common stopwords from the tokenized text."""
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words]

In [64]:
# Remove Emojis
import re

def remove_emojis(text):
    """Remove emojis from the text."""
    if isinstance(text, str):
        emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   u"\U00002500-\U00002BEF"  # chinese char
                                   u"\U00002702-\U000027B0"
                                   u"\U00002702-\U000027B0"
                                   u"\U000024C2-\U0001F251"
                                   u"\U0001f926-\U0001f937"
                                   u"\U00010000-\U0010ffff"
                                   u"\u2640-\u2642"
                                   u"\u2600-\u2B55"
                                   u"\u200d"
                                   u"\u23cf"
                                   u"\u23e9"
                                   u"\u231a"
                                   u"\ufe0f"  # dingbats
                                   u"\u3030"
                                   "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r'', text)
    else:
        return text

In [65]:
def preprocess_text(df):
    df = filter_non_string(df, 'Tweet content')
    df['Tweet content'] = df['Tweet content'].apply(normalize_text)
    df['Tweet content'] = df['Tweet content'].apply(remove_html_tags)
    df['Tweet content'] = df['Tweet content'].apply(remove_urls)
    df['Tweet content'] = df['Tweet content'].apply(remove_numbers)
    df['Tweet content'] = df['Tweet content'].apply(remove_punctuation)
    df['Tweet content'] = df['Tweet content'].apply(stem_text)
    df['Tweet content'] = df['Tweet content'].apply(tokenize_text)
    df['Tweet content'] = df['Tweet content'].apply(remove_stopwords)
    df['Tweet content'] = df['Tweet content'].apply(remove_emojis)
    return df

# Usage:
data_processed = preprocess_text(train_dataset)

In [66]:
data_processed.head()

Unnamed: 0,Tweet ID,Entity,Sentiment,Tweet content
0,2401,Borderlands,Positive,"[come, border, kill]"
1,2401,Borderlands,Positive,"[im, get, borderland, kill]"
2,2401,Borderlands,Positive,"[im, come, borderland, murder]"
3,2401,Borderlands,Positive,"[im, get, borderland, murder]"
4,2401,Borderlands,Positive,"[im, get, borderland, murder]"


In [67]:
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
le = LabelEncoder()

# Encode the 'Entity' feature
data_processed['Entity'] = le.fit_transform(data_processed['Entity'])

# Encode the 'Sentiment' target variable
data_processed['Sentiment'] = le.fit_transform(data_processed['Sentiment'])

In [68]:
data_processed.head()

Unnamed: 0,Tweet ID,Entity,Sentiment,Tweet content
0,2401,4,3,"[come, border, kill]"
1,2401,4,3,"[im, get, borderland, kill]"
2,2401,4,3,"[im, come, borderland, murder]"
3,2401,4,3,"[im, get, borderland, murder]"
4,2401,4,3,"[im, get, borderland, murder]"


In [69]:
data_processed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 71655 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Tweet ID       71655 non-null  int64 
 1   Entity         71655 non-null  int32 
 2   Sentiment      71655 non-null  int32 
 3   Tweet content  71655 non-null  object
dtypes: int32(2), int64(1), object(1)
memory usage: 2.2+ MB


In [70]:
# For CBOW, set sg=0
model = Word2Vec(sentences=data_processed['Tweet content'], vector_size=100, window=5, min_count=1, workers=4, sg=0)

def document_vector(word2vec_model, doc):
    # Remove out-of-vocabulary words and get word vectors
    doc_vectors = [word2vec_model.wv[word] for word in doc if word in word2vec_model.wv]

    # Calculate the mean vector
    if doc_vectors:
        return np.mean(doc_vectors, axis=0)
    else:
        # Handle cases where no valid word vectors are found
        return np.zeros(word2vec_model.vector_size)

# Assuming 'df_processed' contains your preprocessed data
data_processed['vector'] = data_processed['Tweet content'].apply(lambda x: document_vector(model, x))

In [71]:
# Drop the 'Tweet content' column
data_processed.drop(columns=['Tweet content'], inplace=True)

data_processed.head()

Unnamed: 0,Tweet ID,Entity,Sentiment,vector
0,2401,4,3,"[-0.4013606, 0.28093326, 0.09325939, -0.227747..."
1,2401,4,3,"[-0.26614723, -0.3103593, 0.64801896, -1.22333..."
2,2401,4,3,"[-0.44388476, -0.6167488, 0.29682255, -0.98067..."
3,2401,4,3,"[-0.23910753, -0.5785432, 0.56877416, -1.14945..."
4,2401,4,3,"[-0.23910753, -0.5785432, 0.56877416, -1.14945..."


In [72]:
# Split the data into features (X) and target (y)
X = np.vstack(data_processed['vector'].values)
y = data_processed['Sentiment'].values

In [73]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [74]:
# Create a pipeline with only the Random Forest Classifier
model_lr = LogisticRegression(max_iter=1000)

# Fit the model on the training data
model_lr.fit(X_train, y_train)

# Make predictions on the testing set
y_pred_lr = model_lr.predict(X_test)

# Calculate the accuracy of the model
accuracy_lr = accuracy_score(y_test, y_pred_lr)

# Generate the classification report
report_lr = classification_report(y_test, y_pred_lr)

print(f"Accuracy: {accuracy_lr}")
print(f"Classification Report:\n{report_lr}")

Accuracy: 0.541064824506315
Classification Report:
              precision    recall  f1-score   support

           0       0.47      0.19      0.27      2455
           1       0.59      0.70      0.64      4433
           2       0.49      0.53      0.50      3532
           3       0.54      0.60      0.57      3911

    accuracy                           0.54     14331
   macro avg       0.52      0.50      0.50     14331
weighted avg       0.53      0.54      0.52     14331



In [35]:
# Create a pipeline with only the Random Forest Classifier
model_rf = RandomForestClassifier()

# Fit the model on the training data
model_rf.fit(X_train, y_train)

# Make predictions on the testing set
y_pred_rf = model_rf.predict(X_test)

# Calculate the accuracy of the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)

# Generate the classification report
report_rf = classification_report(y_test, y_pred_rf)

print(f"Accuracy: {accuracy_rf}")
print(f"Classification Report:\n{report_rf}")

Accuracy: 0.7383992743004675
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.54      0.65      2455
           1       0.75      0.82      0.78      4433
           2       0.74      0.70      0.72      3532
           3       0.70      0.80      0.75      3911

    accuracy                           0.74     14331
   macro avg       0.75      0.72      0.73     14331
weighted avg       0.74      0.74      0.73     14331



In [36]:
# Create a pipeline with only the Random Forest Classifier
model_ab = AdaBoostClassifier()

# Fit the model on the training data
model_ab.fit(X_train, y_train)

# Make predictions on the testing set
y_pred_ab = model_ab.predict(X_test)

# Calculate the accuracy of the model
accuracy_ab = accuracy_score(y_test, y_pred_ab)

# Generate the classification report
report_ab = classification_report(y_test, y_pred_ab)

print(f"Accuracy: {accuracy_ab}")
print(f"Classification Report:\n{report_ab}")

Accuracy: 0.5034540506594096
Classification Report:
              precision    recall  f1-score   support

           0       0.39      0.18      0.24      2455
           1       0.56      0.68      0.61      4433
           2       0.47      0.41      0.44      3532
           3       0.49      0.59      0.53      3911

    accuracy                           0.50     14331
   macro avg       0.48      0.46      0.46     14331
weighted avg       0.49      0.50      0.49     14331



We performed the embedding on our data using Word2Vec using the CBOW approach, and the RandomForest was the best model so far, we're going to try now another approach of embedding using TF-IDF on the RandomForest model and compare it with the performance of CBOW Word2Vec 

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

# Preprocess our data
data_processed = preprocess_text(train_dataset)

# Join the words in the 'Tweet content' column
data_processed['Tweet content'] = data_processed['Tweet content'].apply(' '.join)

# Split the data into features (X) and target (y)
X = data_processed['Tweet content']
y = data_processed['Sentiment']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with TF-IDF Vectorizer and Random Forest Classifier
pipeline_rf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', RandomForestClassifier())
])

# Fit the model on the training data
pipeline_rf.fit(X_train, y_train)

# Make predictions on the testing set
y_pred_rf = pipeline_rf.predict(X_test)

# Calculate the accuracy of the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)

# Generate the classification report
report_rf = classification_report(y_test, y_pred_rf)

print(f"Accuracy: {accuracy_rf}")
print(f"Classification Report:\n{report_rf}")

Accuracy: 0.9064964063917382
Classification Report:
              precision    recall  f1-score   support

  Irrelevant       0.96      0.84      0.90      2455
    Negative       0.93      0.93      0.93      4433
     Neutral       0.92      0.88      0.90      3532
    Positive       0.85      0.94      0.89      3911

    accuracy                           0.91     14331
   macro avg       0.91      0.90      0.90     14331
weighted avg       0.91      0.91      0.91     14331



In our classification task, we experimented with two different models: TF-IDF with Word Embedding and CBOW. The TF-IDF model achieved an impressive accuracy of approximately 90.65%, outperforming CBOW, which had an accuracy of around 73.84%. The precision, recall, and F1-score were consistently high for the TF-IDF model across all classes (Irrelevant, Negative, Neutral, and Positive). In contrast, CBOW struggled with class 0 (low recall) and class 2 (lower precision and recall). Further exploration and optimization may be needed to enhance CBOW’s performance.