# Part 1:

First, we perform sentiment analysis on the Twitter data to categorise the tweets as positive, neutral, or negative.
We chose VADER (Valence Aware Dictionary and sEntiment Reasoner) because it is a rule-based sentiment analysis tool designed specifically for social media text.

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
import nltk
# Fetch the NLTK resources used by the preprocessing step below.
nltk.download('punkt')
# Recent NLTK releases (3.9+) make word_tokenize load 'punkt_tab' instead of
# 'punkt'; requesting both keeps the notebook working on old and new versions.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from autocorrect import Speller

In [None]:
# Load the raw tweets and discard every row that contains a missing value
df = pd.read_csv("Data/Twitter_Data.csv").dropna()
df.head()

In [None]:
# Strip leading/trailing whitespace from the column names so the exact-name
# column selection in the next cell matches.
df.columns = df.columns.str.strip()

In [None]:
# Keep only the tweet text and its sentiment label
df = df.loc[:, ['clean_text', 'category']]

In [None]:
df.shape

In [None]:
print(df.dtypes)

## Preprocessing

In [None]:
# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()
spell = Speller(lang='en')

# Load the stop-word list once. Calling stopwords.words('english') inside the
# token filter re-fetched the list for every single token; a frozenset also
# makes each membership test O(1).
STOP_WORDS = frozenset(stopwords.words('english'))


# Preprocessing function
def preprocess_text(text):
    """Normalise one tweet and return it as a space-joined string of lemmas.

    Steps, in order: lowercase; remove URLs, @mentions and #hashtags; drop
    every character that is not an ASCII letter or whitespace; tokenize;
    remove English stop words; lemmatize each remaining token.

    Args:
        text: The raw tweet (a ``str``).

    Returns:
        The cleaned tweet; an empty string if nothing survives the filters.
    """
    # Lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove user mentions (e.g., @username)
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags
    text = re.sub(r'#\w+', '', text)
    # Remove punctuation, numbers and special characters in one pass: keeping
    # only ASCII letters and whitespace subsumes the separate punctuation
    # ([^\w\s]) and digit (\d+) passes, so the result is unchanged.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    # NOTE(review): the NLTK English list appears to include negations such as
    # "not"/"no"; removing them may hurt sentiment scoring downstream - confirm.
    tokens = [word for word in tokens if word not in STOP_WORDS]
    # Lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Join tokens back to string
    return ' '.join(tokens)

In [None]:
df['clean_text'] = df['clean_text'].apply(preprocess_text)

In [None]:
#df.to_csv('Data/preprocessedtweets.csv', index=False)

In [None]:
#df = pd.read_csv("Data/preprocessedtweets.csv")

In [None]:
# Collapse the label to binary: 1 for strictly positive values, 0 for everything else
df['category'] = (df['category'] > 0).astype('int64')

In [None]:
df.head()

## Sentiment Analyser

In [None]:
# Create a SentimentIntensityAnalyzer object
analyzer = SentimentIntensityAnalyzer()

In [None]:
pd.options.mode.chained_assignment = None

# Function to assign sentiment
def assign_sentiment(tweet):
    """Return 1 if VADER's compound score for *tweet* is strictly positive, else 0."""
    compound = analyzer.polarity_scores(tweet)['compound']
    return 1 if compound > 0 else 0
    

In [None]:
# Score every preprocessed tweet in df with VADER (the full dataset, not a split)
pred_sentiment = df['clean_text'].map(assign_sentiment)
#temp_sentiment = df['Sentiment'].apply(lambda x: 1 if x > 0 else -1 if x < 0 else 0)


In [None]:
# Compare VADER's predictions against the binarised labels
true_labels = df['category']

print("Classification Report:")
print(classification_report(true_labels, pred_sentiment))

conf_matrix = confusion_matrix(true_labels, pred_sentiment)
print("Confusion Matrix:")
print(conf_matrix)


## BERT model

In [None]:
# NOTE: plain assignment - df_temp is another name for the same DataFrame
# object as df; no copy is made.
df_temp = df

In [None]:
# Split the dataset into training and testing sets: 20% of the rows are held
# out for testing, and random_state=42 fixes the shuffle so the split is
# reproducible between runs.
train_df, test_df = train_test_split(df_temp, test_size=0.2, random_state=42)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import classification_report
import pandas as pd
from tqdm import tqdm

# Load pre-trained model and tokenizer.
# The published checkpoint ships a 5-way classification head (the star-rating
# scale that the "+ 1" in the scoring helpers further down relies on), whereas
# the labels used for fine-tuning here are binary (0/1). Request a 2-way head
# so predictions can only be 0 or 1; ignore_mismatched_sizes lets the
# differently-shaped classifier layer be re-initialised instead of raising.
tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
model = AutoModelForSequenceClassification.from_pretrained(
    'nlptown/bert-base-multilingual-uncased-sentiment',
    num_labels=2,
    ignore_mismatched_sizes=True,
)

# Function to encode text and labels for training
def encode_data(texts, labels=None):
    """Tokenize *texts* into padded, truncated (max 512 tokens) PyTorch tensors.

    Returns the tokenizer output alone when *labels* is None; otherwise a
    ``(inputs, labels)`` pair with *labels* converted to a ``torch.Tensor``.
    """
    encoded = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors='pt')
    if labels is None:
        return encoded
    return encoded, torch.tensor(labels)

# Tokenize the whole training split once, up front
# (train_df and test_df must already exist)
train_texts = train_df['clean_text'].tolist()
train_targets = train_df['category'].tolist()
train_inputs, train_labels = encode_data(train_texts, train_targets)

# Hyper-parameters
epochs = 3
batch_size = 32
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Fine-tune: walk the encoded training set in consecutive slices of batch_size
model.train()
num_examples = len(train_inputs['input_ids'])
for epoch in range(epochs):
    epoch_loss = 0  # running sum of the per-batch losses for this epoch
    for start in tqdm(range(0, num_examples, batch_size)):
        window = slice(start, start + batch_size)
        batch_inputs = {name: tensor[window] for name, tensor in train_inputs.items()}
        batch_labels = train_labels[window]

        optimizer.zero_grad()
        # Passing labels makes the model return the loss alongside the logits
        outputs = model(**batch_inputs, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f"Epoch {epoch + 1}/{epochs} - Loss: {epoch_loss}")

# Encode test data
test_inputs, test_labels = encode_data(test_df['clean_text'].tolist(), test_df['category'].tolist())

# Evaluate the model using the test set.
# Run inference in mini-batches: pushing the entire test set through a single
# forward pass needs memory proportional to the whole split and can exhaust it.
model.eval()
batch_logits = []
with torch.no_grad():
    for start in range(0, len(test_labels), batch_size):
        batch_inputs = {key: val[start:start + batch_size] for key, val in test_inputs.items()}
        batch_logits.append(model(**batch_inputs).logits)

# Stitch the per-batch logits back together in the original row order
logits = torch.cat(batch_logits)
predicted_labels = torch.argmax(logits, dim=1)

print(classification_report(test_labels, predicted_labels))

# Optionally save the test results to a new CSV file
test_df['predicted_sentiment'] = predicted_labels.numpy()
test_df.to_csv('test_sentiment_analysis_results.csv', index=False)


In [None]:
# Confusion matrix of the fine-tuned model's predictions on the test split
conf_matrix = confusion_matrix(test_labels, predicted_labels)
print("Confusion Matrix:", conf_matrix, sep="\n")

In [None]:
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
#!pip install transformers requests beautifulsoup4

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from bs4 import BeautifulSoup
from tqdm import tqdm

In [None]:
# Load the tokenizer and model from the published checkpoint.
# NOTE(review): this rebinds `tokenizer` and `model`, so any weights fine-tuned
# earlier in the notebook are no longer referenced by `model` - confirm that
# scoring with the unmodified checkpoint is what is intended from here on.
tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

In [None]:
def sentiment_score(review):
    """Score a single text with the model and return its predicted class + 1.

    The input is truncated to 512 tokens (the same limit used by
    ``batch_sentiment_score``) so over-long texts no longer make the model
    raise, and inference runs under ``torch.no_grad()`` because no gradients
    are needed.

    Args:
        review: The text to score.

    Returns:
        ``int``: index of the highest logit, shifted by one so that the
        lowest class is reported as 1.
    """
    tokens = tokenizer.encode(review, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        result = model(tokens)
    return int(torch.argmax(result.logits)) + 1

In [None]:
sentiment_score(train_df['clean_text'].iloc[1])

In [None]:
train_df['clean_text'].iloc[9]

In [None]:
# Function to perform sentiment analysis on a batch
def batch_sentiment_score(reviews):
    """Score a list of texts in one forward pass.

    Returns a NumPy array holding, for each text, the index of its highest
    logit plus one.
    """
    encoded = tokenizer(reviews, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        logits = model(**encoded).logits
    predicted_index = torch.argmax(logits, dim=1)
    return predicted_index.cpu().numpy() + 1

# Apply sentiment analysis in batches
batch_size = 32
sentiments = []
# This loop only runs inference, so the model must be in evaluation mode:
# train() keeps dropout active, which makes the predictions noisy and
# non-reproducible.
model.eval()
for i in tqdm(range(0, len(train_df), batch_size)):
    # .iloc makes the positional slicing explicit
    batch_reviews = train_df['clean_text'].iloc[i:i+batch_size].tolist()
    batch_sentiments = batch_sentiment_score(batch_reviews)
    sentiments.extend(batch_sentiments)

# Attach the scores to the frame; without this, the CSV written in the next
# cell ('sentiment_analysis_results.csv') would not contain any of the
# sentiment results computed above.
train_df = train_df.assign(sentiment=sentiments)



In [None]:
train_df.to_csv('sentiment_analysis_results.csv', index=False)

In [None]:
train_df.head()

In [None]:
#train_df['sentiment'] = train_df['sentiment'].apply(lambda x: 1 if x > 3 else -1 if x < 3 else 0)

In [None]:
train_df.head()

In [None]:
# Score the test split batch by batch, with the model in evaluation mode
model.eval()
test_sentiments = []
test_texts = test_df['clean_text']
for start in tqdm(range(0, len(test_df), batch_size)):
    chunk = test_texts.iloc[start:start + batch_size].tolist()
    test_sentiments.extend(batch_sentiment_score(chunk))


In [None]:
# Evaluate the model using the test set
y_true = test_df['category'].astype(int)
# test_sentiments holds the values returned by batch_sentiment_score
# (argmax + 1, i.e. a rating scale starting at 1), while 'category' is binary
# (1 = positive, 0 = everything else). Map the ratings onto the same binary
# scale before comparing: ratings above the midpoint 3 count as positive.
y_pred = [1 if rating > 3 else 0 for rating in test_sentiments]
print(classification_report(y_true, y_pred))
print(confusion_matrix(y_true, y_pred))

In [None]:
# Keep only the tweets labelled positive (category == 1); rows whose category
# is 0 are dropped.
tweet_final = df[df['category'] == 1]

Train the model to find categories with product description data. Use the model to find categories of tweets. 

In [None]:
import pandas as pd
import glob

# List all CSV files in a directory. glob returns paths in arbitrary order, so
# sort them to make the row order of the combined frame reproducible.
csv_files = sorted(glob.glob('Data/Product_ratings/*.csv'))  # Update the path as per your file location
if not csv_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate"
    raise FileNotFoundError("No CSV files found under 'Data/Product_ratings/'")

# Read every CSV file into its own DataFrame. The comprehension variable stays
# local, so the tweet DataFrame bound to `df` is no longer overwritten here.
dfs = [pd.read_csv(csv_path) for csv_path in csv_files]

# Concatenate all DataFrames in the list into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)  # Set ignore_index=True to reindex rows

combined_df.head(5)

In [None]:
# Dropping the unnamed column: removes whichever column is currently last.
# NOTE(review): the column is chosen by position, so re-running this cell
# removes one more column each time - confirm the last column is the unnamed one.
combined_df = combined_df.drop(combined_df.columns[-1], axis=1)

In [None]:
combined_df.shape

In [None]:
# Distinct product categories. Missing values are removed first because the
# category names are later passed to a text vectorizer, which rejects NaN.
categories = combined_df['main_category'].dropna().unique()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Vectorization
# tweet_final only has the columns 'clean_text' and 'category', so the tweet
# text must be read from 'clean_text' (there is no 'tweet' column).
tweet_texts = tweet_final['clean_text']
vectorizer = TfidfVectorizer()
# The vocabulary is learned from the tweets; category names are projected onto it
text_vectors = vectorizer.fit_transform(tweet_texts)
category_vectors = vectorizer.transform(categories)

# Similarity measurement: one row per tweet, one column per category
similarity_matrix = cosine_similarity(text_vectors, category_vectors)

# Assign categories
threshold = 0.2  # Define a threshold for similarity
# Best-matching category and its score for every tweet, computed row-wise
best_match = similarity_matrix.argmax(axis=1)
best_score = similarity_matrix.max(axis=1)
# Tweets whose best score is below the threshold get no category (None)
assigned_categories = [
    categories[match] if score >= threshold else None
    for match, score in zip(best_match, best_score)
]

# Create a DataFrame for the results
results_df = pd.DataFrame({'Text': tweet_texts, 'Assigned Category': assigned_categories})
print(results_df)

In [None]:
# Keep only the tweets that were matched to a category
results_df = results_df[results_df['Assigned Category'].notna()]

In [None]:
results_df.head()

In [None]:
results_df['Assigned Category'].value_counts()