In [7]:
import pandas as pd

# Read both CSV splits using pandas' default header handling.
# NOTE: the printed preview shows the first record of each file being used as
# the column labels, i.e. the files do not appear to carry a header row.
train_data = pd.read_csv('twitter/twitter_training.csv')
val_data = pd.read_csv('twitter/twitter_validation.csv')

# Preview the leading rows of each split to inspect the layout
for caption, frame in (("Training Data:\n", train_data), ("Validation Data:\n", val_data)):
    print(caption, frame.head())


Training Data:
    2401  Borderlands  Positive  \
0  2401  Borderlands  Positive   
1  2401  Borderlands  Positive   
2  2401  Borderlands  Positive   
3  2401  Borderlands  Positive   
4  2401  Borderlands  Positive   

  im getting on borderlands and i will murder you all ,  
0  I am coming to the borders and I will kill you...     
1  im getting on borderlands and i will kill you ...     
2  im coming on borderlands and i will murder you...     
3  im getting on borderlands 2 and i will murder ...     
4  im getting into borderlands and i can murder y...     
Validation Data:
    3364   Facebook Irrelevant  \
0   352     Amazon    Neutral   
1  8312  Microsoft   Negative   
2  4371      CS-GO   Negative   
3  4433     Google    Neutral   
4  6273       FIFA   Negative   

  I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks

In [11]:
# Re-read both splits, supplying the column labels explicitly so that the
# first record of each file is kept as data rather than used as the header.
column_names = ['id', 'source', 'sentiment', 'text']
train_data = pd.read_csv('twitter/twitter_training.csv', names=column_names)
val_data = pd.read_csv('twitter/twitter_validation.csv', names=column_names)

# Show the leading rows to verify the labels were applied
print("Training Data Sample:\n", train_data.head())
print("Validation Data Sample:\n", val_data.head())


Training Data Sample:
      id       source sentiment  \
0  2401  Borderlands  Positive   
1  2401  Borderlands  Positive   
2  2401  Borderlands  Positive   
3  2401  Borderlands  Positive   
4  2401  Borderlands  Positive   

                                                text  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  
Validation Data Sample:
      id     source   sentiment  \
0  3364   Facebook  Irrelevant   
1   352     Amazon     Neutral   
2  8312  Microsoft    Negative   
3  4371      CS-GO    Negative   
4  4433     Google     Neutral   

                                                text  
0  I mentioned on Facebook that I was struggling ...  
1  BBC News - Amazon boss Jeff Bezos rejects clai...  
2  @Microsoft Why do I pay for WORD when it fun

In [12]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch the NLTK resources requested by this notebook (stopword list, punkt data)
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# Module-level helpers shared by preprocess_text
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize one raw text entry into a space-joined string of stemmed tokens.

    Processing order:
      1. Drop every run that starts with ``http``, ``@`` or ``#`` and extends
         to the next whitespace (URLs, mentions, hashtags).
      2. Drop every character that is not an ASCII letter or whitespace.
      3. Lowercase the remainder and tokenize it with ``nltk.word_tokenize``.
      4. Discard tokens found in ``stop_words`` and stem the rest with ``stemmer``.

    Args:
        text: The raw entry. Anything that is not a ``str`` (presumably missing
            values loaded from the CSV) is treated as empty.

    Returns:
        The surviving stems joined by single spaces; ``""`` for non-string input.
    """
    # Guard clause: non-string entries cannot be cleaned, so map them to ""
    if not isinstance(text, str):
        return ""

    without_handles = re.sub(r"(http\S+|@\S+|#\S+)", "", text)
    letters_only = re.sub(r"[^a-zA-Z\s]", "", without_handles)

    stems = []
    for word in nltk.word_tokenize(letters_only.lower()):
        # Stopword filtering happens on the lowercased, unstemmed token
        if word not in stop_words:
            stems.append(stemmer.stem(word))

    return ' '.join(stems)

# Add a 'clean_text' column holding the normalized form of every entry in 'text'.
# Each split is processed exactly once: preprocess_text tokenizes and stems every
# row, so running the same assignment a second time would only repeat that work
# and produce the identical column.
train_data['clean_text'] = train_data['text'].apply(preprocess_text)
val_data['clean_text'] = val_data['text'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn a TF-IDF vocabulary (capped at 5000 features) from the cleaned training
# text and convert the resulting matrix to a dense array.
# NOTE(review): the dense conversion is kept because the prediction cell calls
# len(X_val); revisit both together if memory becomes a concern.
tfidf = TfidfVectorizer(max_features=5000)
train_matrix = tfidf.fit_transform(train_data['clean_text'])
X_train = train_matrix.toarray()

# Encode the validation text with the already-fitted vectorizer (no refitting)
val_matrix = tfidf.transform(val_data['clean_text'])
X_val = val_matrix.toarray()


In [14]:
from sklearn.preprocessing import LabelEncoder

# Learn the sentiment -> integer mapping from the training labels only, then
# encode both splits with that same mapping.
le = LabelEncoder().fit(train_data['sentiment'])
y_train = le.transform(train_data['sentiment'])
y_val = le.transform(val_data['sentiment'])


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from tqdm import tqdm
import numpy as np

# Initialize the logistic-regression classifier with max_iter=2000 (well above
# scikit-learn's default of 100) to give the solver room to converge.
model = LogisticRegression(max_iter=2000)

# Train the model
model.fit(X_train, y_train)

# Prediction with chunking and progress bar
chunk_size = 1000  # Customize this chunk size as needed
# Use shape[0] rather than len(): it is valid for dense arrays and for sparse
# matrices alike, so this cell keeps working if the features stay sparse.
n_val_rows = X_val.shape[0]
y_val_pred = []

for i in tqdm(range(0, n_val_rows, chunk_size), desc="Validation Prediction Progress"):
    end = min(i + chunk_size, n_val_rows)
    y_val_pred.extend(model.predict(X_val[i:end]))

y_val_pred = np.array(y_val_pred)

# Evaluate the model's performance on the validation set.
# Passing the encoded label ids explicitly keeps target_names aligned with the
# encoder's classes even when a class is missing from the validation data
# (without `labels`, a mismatch in class count raises a ValueError).
report = classification_report(
    y_val,
    y_val_pred,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
)

# Display the classification report
print("### Classification Report Overview ###")
print(report)

# Detailed explanation for each row and column:
# Class (Row): Each row represents a specific sentiment class:
# - Irrelevant: Instances classified as irrelevant.
# - Negative: Instances with a negative sentiment.
# - Neutral: Instances with a neutral sentiment.
# - Positive: Instances with a positive sentiment.

# Column descriptions:
# - Precision: Percentage of correctly predicted instances for each class out of all predicted as that class.
# - Recall: Percentage of correctly predicted instances for each class out of all actual instances of that class.
# - F1-Score: The harmonic mean of precision and recall, balancing these two metrics.
# - Support: The number of actual samples for each class in the dataset.

# Summary Rows:
# - Accuracy: The overall accuracy of the model across all classes.
# - Macro Avg: The unweighted average of precision, recall, and F1-score across all classes, treating each class equally.
# - Weighted Avg: The average of precision, recall, and F1-score, weighted by support (class frequency), which is useful in imbalanced datasets.


Validation Prediction Progress: 100%|██████████| 1/1 [00:00<00:00, 44.01it/s]

### Classification Report Overview ###
              precision    recall  f1-score   support

  Irrelevant       0.75      0.70      0.72       172
    Negative       0.74      0.85      0.79       266
     Neutral       0.86      0.73      0.79       285
    Positive       0.82      0.87      0.84       277

    accuracy                           0.79      1000
   macro avg       0.79      0.79      0.79      1000
weighted avg       0.80      0.79      0.79      1000




