<a href="https://colab.research.google.com/github/engige/git_practice3/blob/main/Project4b.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Load the dataset to inspect its structure
tweet_data = pd.read_csv('tweet_sentiments.csv', encoding='ISO-8859-1')

# Display the first few rows of the dataset
tweet_data.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [2]:
# Drop rows with missing tweet_text and drop the column 'emotion_in_tweet_is_directed_at' as it is not necessary for sentiment analysis
cleaned_tweet_data = tweet_data.dropna(subset=['tweet_text']).drop(columns=['emotion_in_tweet_is_directed_at'])

# Display the cleaned dataset for further inspection
cleaned_tweet_data.head()

Unnamed: 0,tweet_text,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion


In [3]:
# Import necessary libraries for text preprocessing
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Define stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function for text preprocessing
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenization
    words = word_tokenize(text)
    # Remove stop words and lemmatize
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Apply the preprocessing function to the 'tweet_text' column
cleaned_tweet_data['cleaned_text'] = cleaned_tweet_data['tweet_text'].apply(preprocess_text)

# Display the first few rows of the preprocessed data
cleaned_tweet_data[['tweet_text', 'cleaned_text']].head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Unnamed: 0,tweet_text,cleaned_text
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,wesley g iphone hr tweeting riseaustin dead ne...
1,@jessedee Know about @fludapp ? Awesome iPad/i...,jessedee know fludapp awesome ipadiphone app y...
2,@swonderlin Can not wait for #iPad 2 also. The...,swonderlin wait ipad also sale sxsw
3,@sxsw I hope this year's festival isn't as cra...,sxsw hope year festival isnt crashy year iphon...
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,sxtxstate great stuff fri sxsw marissa mayer g...


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Define the TF-IDF Vectorizer with a maximum of 5000 features and stop words removed
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the cleaned text data into TF-IDF features
X_tfidf = tfidf_vectorizer.fit_transform(cleaned_tweet_data['cleaned_text'])

# Display the shape of the resulting TF-IDF matrix
X_tfidf.shape

(9092, 5000)

In [5]:
from sklearn.model_selection import train_test_split

# Filter relevant rows and map sentiment labels to numerical categories
multiclass_data = cleaned_tweet_data[cleaned_tweet_data['is_there_an_emotion_directed_at_a_brand_or_product'].isin(['Positive emotion', 'Negative emotion', 'No emotion toward brand or product'])]

# Re-apply the TF-IDF transformation on the filtered data
X_tfidf_multiclass = tfidf_vectorizer.transform(multiclass_data['cleaned_text'])

# Define the target variable (sentiment)
y_multiclass = multiclass_data['is_there_an_emotion_directed_at_a_brand_or_product'].map({
    'Positive emotion': 1,
    'Negative emotion': 0,
    'No emotion toward brand or product': 2
})

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_multiclass, y_multiclass, test_size=0.2, random_state=42, stratify=y_multiclass)

# Display the shapes of the training and testing sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (7148, 5000) (7148,)
Testing set shape: (1788, 5000) (1788,)


In [6]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to balance the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [8]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, Dataset
import pandas as pd

# Define a custom dataset class
class TweetDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Prepare data
X_train, X_test, y_train, y_test = train_test_split(multiclass_data['cleaned_text'], y_multiclass, test_size=0.2, random_state=42)

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Tokenize the data
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=512)

# Convert data to PyTorch datasets
train_dataset = TweetDataset(train_encodings, y_train.tolist())
test_dataset = TweetDataset(test_encodings, y_test.tolist())

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Trainer object for fine-tuning
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.8109,0.741414
2,0.5848,0.775641
3,0.393,0.903158


{'eval_loss': 0.9031582474708557,
 'eval_runtime': 1.3471,
 'eval_samples_per_second': 1327.292,
 'eval_steps_per_second': 83.141,
 'epoch': 3.0}

In [10]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, Dataset
import pandas as pd

# Define a custom dataset class
class TweetDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Prepare data
X_train, X_test, y_train, y_test = train_test_split(multiclass_data['cleaned_text'], y_multiclass, test_size=0.2, random_state=42)

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Tokenize the data
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=512)

# Convert data to PyTorch datasets
train_dataset = TweetDataset(train_encodings, y_train.tolist())
test_dataset = TweetDataset(test_encodings, y_test.tolist())

# Training arguments with evaluation per epoch and logging
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=10,  # You can set higher epochs since early stopping will prevent overfitting
    weight_decay=0.01,
    logging_dir='./logs',
    save_total_limit=2,  # Only keep the last 2 checkpoints to save disk space
    load_best_model_at_end=True,  # Load the best model at the end of training
    save_strategy = "epoch" # Set save_strategy to epoch to match evaluation_strategy
)

# Trainer object for fine-tuning with early stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]  # Early stopping with patience of 1 epoch
)

# Train the model with early stopping
trainer.train()

# Evaluate the model
trainer.evaluate()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.8423,0.847652
2,0.7801,0.865968


{'eval_loss': 0.8476523160934448,
 'eval_runtime': 1.3689,
 'eval_samples_per_second': 1306.187,
 'eval_steps_per_second': 81.819,
 'epoch': 2.0}

In [12]:
# Get predictions from the BERT model
predictions = trainer.predict(test_dataset)

# Import numpy with the alias np
import numpy as np

# Extract predicted labels (use argmax for the highest probability in case of softmax output)
preds = np.argmax(predictions.predictions, axis=1)

# Evaluate the model using classification report and accuracy score
from sklearn.metrics import classification_report, accuracy_score

# Calculate accuracy
accuracy = accuracy_score(y_test, preds)

# Calculate classification metrics (precision, recall, F1-score)
classification_rep = classification_report(y_test, preds, target_names=['Negative', 'Positive', 'Neutral'])

# Display the results
print(f"BERT Model Accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)

BERT Model Accuracy: 0.535234899328859
Classification Report:
               precision    recall  f1-score   support

    Negative       0.00      0.00      0.00       126
    Positive       0.39      0.80      0.53       568
     Neutral       0.80      0.46      0.58      1094

    accuracy                           0.54      1788
   macro avg       0.40      0.42      0.37      1788
weighted avg       0.61      0.54      0.52      1788



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE  # Import SMOTE for oversampling
import torch
from torch.utils.data import DataLoader, Dataset
import pandas as pd

# Define a custom dataset class
class TweetDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Prepare data
X_train, X_test, y_train, y_test = train_test_split(multiclass_data['cleaned_text'], y_multiclass, test_size=0.2, random_state=42)

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the data (tokenize text into sequences of integers, preserving important features)
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=512)

# Convert the tokenized encodings into feature vectors for SMOTE
X_train_tokenized = train_encodings['input_ids'] # Access the 'input_ids' attribute directly

# Apply SMOTE to balance the classes in y_train
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_tokenized, y_train)

# Convert the oversampled data back into PyTorch-compatible datasets
train_encodings_smote = tokenizer(
    [' '.join([str(token_id) for token_id in token_list]) for token_list in X_train_smote],
    truncation=True,
    padding=True,
    max_length=512
)

# Create PyTorch datasets
train_dataset = TweetDataset(train_encodings_smote, y_train_smote.tolist())
test_dataset = TweetDataset(test_encodings, y_test.tolist())

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=10,  # You can set higher epochs since early stopping will prevent overfitting
    weight_decay=0.01,
    logging_dir='./logs',
    save_total_limit=2,  # Only keep the last 2 checkpoints to save disk space
    load_best_model_at_end=True,  # Load the best model at the end of training
    save_strategy="epoch" # Set save_strategy to epoch to match evaluation_strategy
)

# Trainer object for fine-tuning with early stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]  # Early stopping with patience of 1 epoch
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()



Epoch,Training Loss,Validation Loss
1,1.104,1.139257
2,1.1032,1.097265
3,1.1013,1.043647
4,1.0994,1.101048


{'eval_loss': 1.0436468124389648,
 'eval_runtime': 1.4008,
 'eval_samples_per_second': 1276.416,
 'eval_steps_per_second': 79.954,
 'epoch': 4.0}

In [15]:
# Get predictions from the BERT model
predictions = trainer.predict(test_dataset)

# Import numpy with the alias np
import numpy as np

# Extract predicted labels (use argmax for the highest probability in case of softmax output)
preds = np.argmax(predictions.predictions, axis=1)

# Evaluate the model using classification report and accuracy score
from sklearn.metrics import classification_report, accuracy_score

# Calculate accuracy
accuracy = accuracy_score(y_test, preds)

# Calculate classification metrics (precision, recall, F1-score)
classification_rep = classification_report(y_test, preds, target_names=['Negative', 'Positive', 'Neutral'])

# Display the results
print(f"BERT Model Accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)

BERT Model Accuracy: 0.6118568232662193
Classification Report:
               precision    recall  f1-score   support

    Negative       0.00      0.00      0.00       126
    Positive       0.00      0.00      0.00       568
     Neutral       0.61      1.00      0.76      1094

    accuracy                           0.61      1788
   macro avg       0.20      0.33      0.25      1788
weighted avg       0.37      0.61      0.46      1788



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
