In [1]:
from zipfile import ZipFile
import numpy as np
import pandas as pd
import nltk 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score , precision_score, recall_score, f1_score

import re
from sklearn.linear_model import LogisticRegression
import joblib
nltk.download('stopwords')
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning, module="pandas")



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AI-15\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


1) Importing Libraries and Downloading NLTK Data:

2) LOADING THE DATASET

3) SAMPLING THE DATASET TO REDUCE DATASET SIZE AND REDUCE PROCESSING TIME (SKIPPED STEP)

In [2]:
# Column names handed to read_csv; because names are supplied explicitly,
# the first row of the file is read as data rather than as a header.
labels = ['target', 'ids', 'date', 'flag', 'user', 'text']

# The path is relative to the notebook's working directory.
df1 = pd.read_csv(
    '../dataset/twitter/tweets.csv',
    names=labels,
    encoding='latin1',
)


FileNotFoundError: [Errno 2] No such file or directory: '../dataset/twitter/tweets.csv'

In [None]:
# Show the loaded DataFrame as this cell's output.
df1

In [None]:
# Column names, dtypes and non-null counts of the freshly loaded frame.
df1.info()

4) PRE-PROCESSING THE DATA

In [None]:
stop_words = stopwords.words('english')
# Frozen copy used for lookups: membership is tested once per token of every
# tweet, and a set lookup does not scan the whole stop-word list each time.
stop_word_set = frozenset(stop_words)
port_stem = PorterStemmer()


def pre_process(content):
    """Normalise one piece of text for vectorisation.

    Steps, in order: strip every character that is not an ASCII letter or
    whitespace, lower-case, split into tokens, drop English stop words,
    stem the remaining tokens with the Porter stemmer, and re-join them
    with single spaces.

    Args:
        content: The raw text (a ``str``).

    Returns:
        The cleaned text as a single space-separated string. It is empty
        when no token survives the filtering.
    """
    # Delete (replace with the empty string) everything except letters and whitespace.
    process_content = re.sub(r'[^a-zA-Z\s]', "", content)

    # Lower-case so that "Good" and "good" map to the same token.
    process_content = process_content.lower()

    # split() with no argument splits on any run of whitespace (spaces, tabs,
    # newlines) and never produces empty tokens. split(' ') left '' entries for
    # repeated spaces and kept tab/newline-joined words glued together, so
    # stop words inside them were not removed.
    tokens = process_content.split()

    # Remove stop words (and, the, in, ...) and reduce the rest to their stems.
    stems = [port_stem.stem(word) for word in tokens if word not in stop_word_set]

    # Re-assemble the tokens into one string.
    return " ".join(stems)

In [None]:
# Run pre_process on every tweet and store the result in a new column;
# the original 'text' column is left as it is.
df1['text_clean'] = df1['text'].apply(pre_process)

In [None]:
# Re-inspect the frame now that the 'text_clean' column has been added.
df1.info()

In [None]:
# Show the raw numeric sentiment column.
df1['target']

In [None]:
# Readable sentiment names: 0 -> 'Negative', 4 -> 'Positive',
# and every other value falls back to 'Neutral'.
sentiment_names = {0: 'Negative', 4: 'Positive'}
df1['target_text'] = df1['target'].map(sentiment_names).fillna('Neutral')

In [None]:
# Show the derived text labels.
df1['target_text']

In [None]:
# Features are the cleaned tweets; labels are the readable sentiment names.
x = df1['text_clean']
y = df1['target_text']

# 80/20 split that keeps the class proportions of y in both parts.
# random_state is fixed so the split, and therefore every metric reported
# below, is the same on each Restart & Run All (42 matches the seed used by
# the BERT split later in this notebook).
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42
)

Vectorize the data for model fitting using TfIdf Vectorizer.

vectorizer = TfidfVectorizer()

1. fit_transform on Training Data (X_train)
Fit: When you call fit_transform on X_train, the TfidfVectorizer learns the vocabulary of the entire corpus (X_train), computes the IDF (Inverse Document Frequency) values, and transforms the text into a feature matrix.

Reason:

Vocabulary Learning: The vectorizer builds a vocabulary from X_train, which includes all unique words (or tokens) present in the training text data.
Feature Extraction: It converts each text document in X_train into a numerical vector representation based on the learned vocabulary and TF-IDF weights.
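Example: fitting on the two documents "good movie" and "bad movie" yields the vocabulary {bad, good, movie}, so each document becomes a 3-dimensional TF-IDF vector.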


2. transform on Test Data (X_test)
Transform: After fitting the vectorizer on X_train, you use transform on X_test. This applies the vocabulary and IDF weights learned from X_train to transform X_test into the same feature space.

Reason:

Consistency: Ensures that both X_train and X_test are transformed using the same vocabulary and IDF values. This consistency is crucial because machine learning models expect input data to have the same dimensions and meaning.
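Avoiding Data Leakage: Calling fit_transform on X_test would re-learn the vocabulary and IDF weights from the test data itself. The resulting columns would no longer line up with the features the model was trained on, and statistics from the test set would leak into the evaluation, making the reported scores unreliable.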



In [None]:
# Create the vectorizer in a code cell: no code cell shown in this notebook
# assigns `vectorizer` (the assignment appears only in the explanatory text),
# so on a fresh kernel this cell would otherwise raise NameError.
vectorizer = TfidfVectorizer()

# Learn vocabulary and IDF weights from the training tweets only ...
X_train_vect = vectorizer.fit_transform(X_train)
# ... and reuse them unchanged for the test tweets.
X_test_vect = vectorizer.transform(X_test)

In [None]:
# Logistic-regression classifier on the TF-IDF features.
# max_iter=1800: the experiment notes in this notebook record that 200
# iterations were not enough and that 1800 gave the same scores as 3000 and 5000.
model=LogisticRegression(max_iter=1800)
# Left as the last expression so the fitted estimator is shown as cell output.
model.fit(X_train_vect,Y_train)

1. model.score(X_test_vect, Y_test)
This method computes the accuracy of the model on the test set (X_test_vect, Y_test). It internally predicts the labels using the model (model) and compares them with the true labels (Y_test).

2. accuracy_score(Y_train, pred)
This function calculates the accuracy of the model on the training set (X_train_vect, Y_train). Here, pred contains the predictions made by the model (model) on the training data (X_train_vect).

1) 0.7823
2) 0.8176
#1000 iters

1) 0.7826
2) 0.8174
#5000 iters

1)0.7826
2)0.8174
#500 iters

#200 iters
not enough iterations
 
#3000 iters
same as 5000 iters
same as 1800 iters


In [None]:

# Score the fitted classifier on the held-out test features.
Y_pred = model.predict(X_test_vect)

# Accuracy, plus precision / recall / F1 computed with average='weighted'.
weighted = {'average': 'weighted'}
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred, **weighted)
recall = recall_score(Y_test, Y_pred, **weighted)
f1 = f1_score(Y_test, Y_pred, **weighted)

# One metric per line, four decimals each.
print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1-score: {f1:.4f}",
    sep="\n",
)


In [None]:
    # Predict using the trained model for training data
Y_train_pred = model.predict(X_train_vect)

# Same four metrics as for the test split, this time on the data the model
# was fitted on, so the two sets of numbers can be compared.
train_accuracy = accuracy_score(Y_train, Y_train_pred)
train_precision, train_recall, train_f1 = (
    scorer(Y_train, Y_train_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

# One metric per line, four decimals each.
print(
    f"Training Accuracy: {train_accuracy:.4f}",
    f"Training Precision: {train_precision:.4f}",
    f"Training Recall: {train_recall:.4f}",
    f"Training F1-score: {train_f1:.4f}",
    sep="\n",
)

In [None]:
# Column summary of df1 before the BERT section reuses it.
df1.info()

 USING BERT INSTEAD OF TF-IDF

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import DataLoader, TensorDataset


In [None]:
# NOTE(review): BERT is fed the stemmed, stop-word-stripped 'text_clean'
# column here; its tokenizer is normally given raw text. Confirm this is intended.
X = df1['text_clean']
# Raw numeric targets (remapped to class ids in a later cell).
Y = df1['target']

# Split data into training and testing sets.
# stratify=Y keeps the class balance identical in both parts, consistent with
# the TF-IDF split earlier in this notebook.
# NOTE: this rebinds X_train / X_test / Y_train / Y_test, replacing the
# TF-IDF split; re-running the earlier model cells after this one will pick
# up these values instead.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)

In [None]:
# Check data types and values
print(f"Y_train type: {type(Y_train)}")
print(f"Y_test type: {type(Y_test)}")

# Print a sample of Y_train and Y_test.
# The cell rebinds both names to plain lists further down, so on a second run
# they no longer have .head(); fall back to slicing in that case instead of
# raising AttributeError.
print("Sample Y_train:")
print(Y_train.head() if hasattr(Y_train, "head") else Y_train[:5])
print("\nSample Y_test:")
print(Y_test.head() if hasattr(Y_test, "head") else Y_test[:5])

# Convert Y_train and Y_test to lists if they are not already
if not isinstance(Y_train, list):
    Y_train = Y_train.tolist()

if not isinstance(Y_test, list):
    Y_test = Y_test.tolist()

# Verify conversion
print(f"\nConverted Y_train type: {type(Y_train)}")
print(f"Converted Y_test type: {type(Y_test)}")

In [None]:


# Tokenizer that matches the 'bert-base-uncased' checkpoint used for the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Both splits are encoded with the same options; each call pads and truncates
# its own batch of texts.
encode_options = dict(truncation=True, padding=True)
train_encodings = tokenizer(X_train.tolist(), **encode_options)
test_encodings = tokenizer(X_test.tolist(), **encode_options)


In [None]:
# Collapse the raw targets to two class ids: 0 stays 0 and every other value
# becomes 1 (the labels are assumed to be only 0 and 4).
Y_train_adjusted = [int(label != 0) for label in Y_train]
Y_test_adjusted = [int(label != 0) for label in Y_test]


def _to_tensor_dataset(encodings, class_ids):
    """Bundle input ids, attention mask and class ids into one TensorDataset."""
    return TensorDataset(
        torch.tensor(encodings['input_ids']),
        torch.tensor(encodings['attention_mask']),
        torch.tensor(class_ids),  # adjusted labels, not the raw targets
    )


# Convert both splits into PyTorch datasets.
train_dataset = _to_tensor_dataset(train_encodings, Y_train_adjusted)
test_dataset = _to_tensor_dataset(test_encodings, Y_test_adjusted)



In [None]:
# Batches of 16 for both splits. Training batches are shuffled; the test
# loader keeps the dataset order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
# BERT sequence classifier with two output labels.
# NOTE: this rebinds `model`, which earlier held the logistic-regression classifier.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Optimizer over all model parameters.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# NOTE(review): criterion is created but never used below; the loss comes
# from the model output because labels are passed to the forward call.
criterion = torch.nn.CrossEntropyLoss()

# Fine-tune: switch to training mode once, then make three passes over the data.
model.train()
num_epochs = 3
for epoch in range(num_epochs):
    for input_ids, attention_mask, labels in train_loader:
        optimizer.zero_grad()  # clear gradients left over from the previous batch
        loss = model(input_ids, attention_mask=attention_mask, labels=labels).loss
        loss.backward()
        optimizer.step()


In [None]:
# Put the model in evaluation mode and collect predictions batch by batch.
model.eval()
predicted_labels, true_labels = [], []
with torch.no_grad():  # no gradients are needed for scoring
    for input_ids, attention_mask, labels in test_loader:
        logits = model(input_ids, attention_mask=attention_mask).logits
        # The predicted class is the index of the largest logit.
        predictions = logits.argmax(dim=-1)
        predicted_labels.extend(predictions.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Fraction of test tweets whose predicted class matches the adjusted label.
accuracy = accuracy_score(true_labels, predicted_labels)
print(f'Accuracy: {accuracy}')
