In [None]:
import pandas as pd
import requests
import os

import pandas as pd
from bs4 import BeautifulSoup
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# importing the needed libraries
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import re 
import string
import preprocessor as p
import emoji
from emot.emo_unicode import EMOTICONS_EMO
from flashtext import KeywordProcessor

import torch
from transformers import BertTokenizer, BertModel

# Data Collection

## Gathering the Data

For the data collection process, 

In [None]:

# Load the collected raw texts and preview the first rows
df = pd.read_csv("./data/texts.csv")
df.head()

### TODO: talk about the nature of the data here.

## Annotating the Dataset

This section describes the sentiment annotation process using the `twitter-roberta-base-sentiment` model from Hugging Face. This process is crucial for labeling the sentiment of the data and preparing it for further analysis.

### Load and Prepare Data

In [None]:
# Load our data from a CSV file; 'utf-8-sig' decodes UTF-8 and drops a leading byte-order mark if present
df = pd.read_csv("./data/1000texts.csv", encoding='utf-8-sig')

# Display the first three rows of the dataframe to inspect the data
df.head(3)

We start by loading the dataset, ensuring that UTF-8 encoding is used to handle any special characters in the text. 

The initial peek at the data with `df.head(3)` helps to confirm the structure and data types we are working with.

### Data Cleaning

In [None]:
# Drop rows with a missing value in any column (not only 'Content')
df = df.dropna()

# Convert the 'Content' column into a plain Python list of sentences
sentences = df['Content'].tolist()

Next, we clean the data by removing rows with missing values to maintain the quality and consistency of our dataset. 

We extract the tweet content into a list to facilitate the subsequent batch processing.

### Model Details

For our annotation, we will be using [`twitter-roberta-base-sentiment`](https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment) model on huggingface.

This model is a RoBERTa-based neural network trained on approximately 58 million tweets and fine-tuned for sentiment analysis, making it highly adept at understanding the nuances of language used in tweets.

**Labels Explained**
- 0: Negative
- 1: Neutral
- 2: Positive

These labels correspond to the sentiment expressed in each tweet.

### API Configuration

The API requires us to group the sentences into batches of 10.

In [None]:
# Slice the sentence list into consecutive batches of ten (the final batch may be shorter)
grouped_list = [sentences[start:start + 10] for start in range(0, len(sentences), 10)]

### Set Up API for Annotation

In [None]:
# API token and endpoint for the Hugging Face annotation model.
# The token is read from the HF_API_TOKEN environment variable so the real secret
# never has to be written into (or committed with) the notebook. The original
# "###" placeholder is kept as the fallback value.
API_TOKEN = os.environ.get("HF_API_TOKEN", "###")
API_URL = "https://api-inference.huggingface.co/models/cardiffnlp/twitter-roberta-base-sentiment"
headers = {"Authorization": f"Bearer {API_TOKEN}"}  # Authorization header for the API request

We configure the API with the required endpoint and authentication details, using the API token obtained from Hugging Face.

### Batch Processing Setup

In [None]:
# Group sentences into sub-lists of 10 for batch processing
# NOTE(review): this repeats the batching already done in the "API Configuration" cell;
# re-running it is harmless, but one of the two cells is redundant.
grouped_list = [sentences[n:n+10] for n in range(0, len(sentences), 10)]

Tweets are grouped in batches of ten to optimize the API calls.

### Annotation Execution

In [None]:
# Define a function to send data to the sentiment analysis API and get the response
def query(payload, timeout=60):
    """POST one batch to the sentiment API and return the decoded JSON response.

    Args:
        payload: JSON-serialisable request body (here: a list of up to ten texts).
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The parsed JSON body of the response.

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status. Without this
            check an error payload would be stored as if it were a prediction and
            only fail later, during label extraction, with a confusing message.
        requests.Timeout: if no response arrives within ``timeout`` seconds.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    response.raise_for_status()
    return response.json()

In [None]:
# Collect one API response per batch, in batch order
output = []

for batch in grouped_list:
    batch_response = query(batch)
    output.append(batch_response)


We define a function to send each batch to the API and store the responses. Each response includes sentiment scores and labels for the batch of tweets processed.

### Understanding the Output

In [None]:
output[:5]

The output from the API provides a score for each sentiment category per tweet, indicating the confidence level of each sentiment prediction. This allows us to determine the most likely sentiment expressed in each tweet.

### Label Extraction and Assignment

In [None]:
# For every tweet pick the candidate with the largest score and keep the part of its
# label after the underscore (the API labels look like "<prefix>_<id>")
highest_labels = [
    max(result, key=lambda candidate: candidate['score'])['label'].split('_')[1]
    for group in output
    for result in group
]

# Attach the winning label of each tweet to the dataframe
df['label'] = highest_labels


After processing, we extract the highest scoring label for each tweet and add this label back into our DataFrame. This step converts the raw output into a practical annotation of the dataset.

### Saving the Results

In [None]:
# Keep only the text and its assigned sentiment label for the saved file
df_final = df[['Content', 'label']]

# Define the file path for the new CSV (inside the existing 'data' folder)
file_path = os.path.join('data', 'labeled_texts_1000.csv')

# Save the dataframe to a CSV file without the index; 'utf-8-sig' writes UTF-8 with a byte-order mark
df_final.to_csv(file_path, index=False, encoding='utf-8-sig')

The fully annotated dataset is saved as a CSV file, preserving the original text alongside the newly assigned sentiment labels. This file can now be used for further analysis and training predictive models.

### Citation


Barbieri, F., Camacho-Collados, J., Espinosa Anke, L., & Neves, L. (2020). TweetEval: Unified Benchmark and Comparative Evaluation for Tweet Classification. In Findings of the Association for Computational Linguistics: EMNLP 2020 (pp. 1644–1650). Association for Computational Linguistics.



# Preprocessing

### Read Data

In [None]:
import pandas as pd
from langdetect import detect, LangDetectException

In [None]:
## language detection
#def detect_lang(text):
#    try:
#        return detect(text)
#    except LangDetectException:
#        return None
#
#df_texts_orig = pd.read_csv('data/labeled_texts_1000.csv', encoding='utf-8-sig')
#df_texts_orig.dropna(inplace=True)
#
## detect language and add a new column
#df_texts_orig['lang'] = df_texts_orig['Content'].apply(detect_lang)
#
## select only English texts
#df_eng = df_texts_orig[df_texts_orig['lang'] == 'en'].reset_index(drop=True)
#
#df_eng.to_csv('data/labeled_texts_eng.csv', index=False)

### Load Data

In [None]:
# Read the English-only labelled texts
df_eng_loaded = pd.read_csv('data/labeled_texts_eng.csv')

# Store the labels separately so later sections can load them without the texts
df_labels = df_eng_loaded['label']
df_labels.to_pickle('data/labels.pkl')

# Plain Python list of the raw texts, in file order
df_texts = df_eng_loaded['Content']
texts = df_texts.tolist()

print(texts)
len(texts)

### Text Preprocessing

In [None]:
#emoji and emoticons detection package for Python
!pip install emot

In [None]:
#emoji package for Python
!pip install emoji 

In [None]:
#tweet preprocessing package for Python
!pip install tweet-preprocessor

In [None]:
#replace keywords in sentences
!pip install flashtext

In [None]:
# necessary for BERT tokenizer
!pip install transformers

In [None]:
# importing the needed libraries
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import re 
import string
import preprocessor as p
import emoji
from emot.emo_unicode import EMOTICONS_EMO
from flashtext import KeywordProcessor
from transformers import BertTokenizer

In [None]:
def get_wordnet_tag(tag):
    """Translate a POS tag into the single-letter WordNet POS code.

    Tags starting with J, N, R or V map to 'a', 'n', 'r' and 'v' respectively;
    every other tag yields None.
    """
    prefix_to_wordnet = (('J', 'a'), ('N', 'n'), ('R', 'r'), ('V', 'v'))
    for prefix, wordnet_code in prefix_to_wordnet:
        if tag.startswith(prefix):
            return wordnet_code
    return None

In [None]:
_EMOTICON_PROCESSOR = None  # built on first use, then reused for every tweet


def _get_emoticon_processor():
    """Build (once) the KeywordProcessor that maps each emoticon to its description."""
    global _EMOTICON_PROCESSOR
    if _EMOTICON_PROCESSOR is None:
        processor = KeywordProcessor()
        for emoticon, description in EMOTICONS_EMO.items():
            # strip colons, turn underscores into spaces and trim the description text
            processor.add_keyword(emoticon, description.replace(":", "").replace("_", " ").strip())
        _EMOTICON_PROCESSOR = processor
    return _EMOTICON_PROCESSOR


def convert_emoticons(text):
    """Replace every emoticon listed in EMOTICONS_EMO with its textual description.

    The lookup structure is identical for every call, so it is created once and
    cached instead of being rebuilt for each tweet.

    Args:
        text: The text to process.

    Returns:
        The text with known emoticons replaced by their cleaned-up descriptions.
    """
    return _get_emoticon_processor().replace_keywords(text)

In [None]:
def normalize_text(text):
    """Expand a few abbreviations, drop credit parentheticals and tame letter runs.

    The patterns are lower-case only, so callers are expected to pass lower-cased text.

    Args:
        text: The text to normalise.

    Returns:
        The normalised text.
    """
    # handle abbreviations
    normalized_text = re.sub(r'\bfav\b', "favorite", text)
    normalized_text = re.sub(r'\btkt\b', "ticket", normalized_text)
    normalized_text = re.sub(r'\(gm\)', 'good morning', normalized_text)

    # remove credit parentheticals such as "(via someone)" or "(h/t someone)".
    # The word boundaries keep the pattern from firing on words that merely
    # contain "via" (e.g. "(trivial)"), which would delete real content.
    normalized_text = re.sub(r'\([^)]*\b(?:via|h/t)\b[^)]*\)', '', normalized_text)

    # reduce runs of three or more identical letters to two ("sooo" -> "soo")
    normalized_text = re.sub(r'([a-zA-Z])\1{2,}', r'\1\1', normalized_text)

    return normalized_text

In [None]:
def clean_characters(text):
    """Strip a text down to letters, digits, apostrophes and single spaces.

    Hyphens, underscores and colons become spaces first; every other character
    outside the allowed set is deleted; whitespace runs are then collapsed.
    """
    substitutions = (
        (r'[-_:]', ' '),             # separators -> space
        (r"[^a-zA-Z0-9 ']", ''),     # drop everything that is not allowed
        (r'\s+', ' '),               # squeeze whitespace runs
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

In [None]:
_CLEAN_TWEET_RESOURCES = {}  # heavyweight helpers, created on first use and then reused


def _get_resource(name, factory):
    """Return the cached resource called ``name``, creating it with ``factory`` if needed."""
    if name not in _CLEAN_TWEET_RESOURCES:
        _CLEAN_TWEET_RESOURCES[name] = factory()
    return _CLEAN_TWEET_RESOURCES[name]


def clean_tweet(tweet, mode='default'):
    """Clean one tweet and return its tokens.

    Args:
        tweet: Raw tweet text.
        mode: 'bert' returns the tokens of the `bert-base-uncased` tokenizer
            (special tokens included) for the normalised text. Any other value
            runs the classic pipeline: character cleaning, word tokenisation,
            POS tagging, stop-word removal and lemmatisation.

    Returns:
        A list of string tokens.
    """
    # remove URLs, mentions, reserved words (RT, FAV)
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED)
    cleaned_tweet = p.clean(tweet)

    # convert emojis to their textual names (they are not removed)
    cleaned_tweet = emoji.demojize(cleaned_tweet)

    # convert emoticons to words
    cleaned_tweet = convert_emoticons(cleaned_tweet)

    # handle abbreviations (the normalisation patterns are lower-case)
    normalized_text = normalize_text(cleaned_tweet.lower())

    # BERT Tokenizer
    if mode == 'bert':
        # Loading the tokenizer is expensive, so it is done once and not per tweet.
        tokenizer = _get_resource(
            'bert_tokenizer', lambda: BertTokenizer.from_pretrained('bert-base-uncased'))
        encoded_input = tokenizer.encode(normalized_text, add_special_tokens=True)
        return tokenizer.convert_ids_to_tokens(encoded_input)

    # clean characters
    sentence_cleaned = clean_characters(normalized_text)

    # Tokenize the tweet
    tokens = word_tokenize(sentence_cleaned)

    # POS tagging (done before stop-word removal so the tagger sees full sentences)
    pos_tagged = pos_tag(tokens)

    # Remove stopwords; the stop-word list is loaded once and kept as a set so the
    # membership test is O(1) instead of re-reading the list for every token
    stop_words = _get_resource('stop_words', lambda: frozenset(stopwords.words('english')))
    stop_words_removed = [(word, tag) for word, tag in pos_tagged if word not in stop_words]

    # Lemmatization; words whose tag has no WordNet equivalent are kept unchanged
    lemmatizer = _get_resource('lemmatizer', WordNetLemmatizer)
    lemmatized_sentence = []
    for word, tag in stop_words_removed:
        wordnet_tag = get_wordnet_tag(tag)
        lemmatized_sentence.append(
            word if wordnet_tag is None else lemmatizer.lemmatize(word, wordnet_tag))

    return lemmatized_sentence

In [None]:
def get_cleaned_tweets(tweets, mode='default'):
    """Run clean_tweet over every tweet and return the list of token lists.

    In 'bert' mode, token lists longer than 512 tokens are left out of the result.
    NOTE(review): when that happens the result is shorter than `tweets`, so it no
    longer lines up row-for-row with the labels — confirm callers account for it.
    """
    cleaned = [clean_tweet(tweet, mode) for tweet in tweets]
    if mode != 'bert':
        return cleaned
    return [tokens for tokens in cleaned if len(tokens) <= 512]

In [None]:
result = get_cleaned_tweets(texts)
result

In [None]:
result_bert = get_cleaned_tweets(texts, mode='bert')
result_bert


### Save Output

In [None]:
import pickle

In [None]:
# Save the list of token lists to a file using Pickle
with open('data/preprocessing_output.pkl', 'wb') as file:
    pickle.dump(result, file)

# Re-join each token list into one space-separated string and save text + label as a CSV file
preprocessed_data = [' '.join(document) for document in result]
df_data = pd.DataFrame(preprocessed_data, columns=['Content'])
# pandas aligns this assignment on the index; `result` and `df_labels` come from the same loaded rows
df_data['Label'] = df_labels
df_data.to_csv('data/preprocessed_data.csv', index=False, encoding='utf-8-sig')

### Save Output for BERT

In [None]:
# Save the list of BERT token lists to a file using Pickle
with open('data/preprocessing_output_bert.pkl', 'wb') as file:
    pickle.dump(result_bert, file)

# get_cleaned_tweets(mode='bert') leaves out tweets longer than 512 tokens. If that
# happened, the rows no longer correspond one-to-one to df_labels, and attaching the
# labels would silently pair texts with the wrong label. Fail loudly instead.
if len(result_bert) != len(df_labels):
    raise ValueError(
        f"{len(result_bert)} BERT token lists but {len(df_labels)} labels: "
        "some tweets were dropped, so labels can no longer be aligned by position"
    )

# save dataframe as a CSV file
preprocessed_data_bert = [' '.join(document) for document in result_bert]
df_data = pd.DataFrame(preprocessed_data_bert, columns=['Content'])
df_data['Label'] = df_labels.to_numpy()  # positional assignment; lengths verified above
df_data.to_csv('data/preprocessed_data_bert.csv', index=False, encoding='utf-8-sig')

### Load Output

In [None]:
# Read the pickled list of token lists back in
with open('data/preprocessing_output.pkl', 'rb') as file:
    loaded_list = pickle.load(file)
    
# Print the loaded list (one token list per text) and its length
print(loaded_list)
print(len(loaded_list))

# print the labels
df_labels_loaded = pd.read_pickle('data/labels.pkl')
print(df_labels_loaded)
print(len(df_labels_loaded))

# read the CSV file
df_data_loaded = pd.read_csv('data/preprocessed_data.csv', encoding='utf-8-sig')
print(df_data_loaded)

In [None]:
# read the CSV file
df_data_loaded_bert = pd.read_csv('data/preprocessed_data_bert.csv', encoding='utf-8-sig')
print(df_data_loaded_bert)

# Feature Extraction

## TFIDF

Read preprocessed data

In [None]:
import pickle
# Read the pickled list of token lists produced by the preprocessing section
with open('data/preprocessing_output.pkl', 'rb') as file:
    loaded_list = pickle.load(file)
    
# Print the loaded list (one token list per text)
print(loaded_list)

In [None]:
import pandas as pd

labels = pd.read_pickle('data/labels.pkl')
print(labels)

TF-IDF

In [None]:
print(len(loaded_list))
documents = [" ".join(doc) for doc in loaded_list]

split train and test dataset

In [None]:
from sklearn.model_selection import train_test_split 

X_train, X_test, y_train, y_test = train_test_split(documents, labels, test_size = 0.2, random_state = 42)

tf-idf matrix

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

# Fit the TF-IDF vectorizer on the training documents only, then reuse the fitted
# vectorizer to transform the test documents
vectorizer = TfidfVectorizer()
tfidf_matrix_train = vectorizer.fit_transform(X_train)
tfidf_matrix_test = vectorizer.transform(X_test)
print(tfidf_matrix_train)

In [None]:
labels.value_counts()

train data

In [None]:
model = SVC(kernel='linear')
model.fit(tfidf_matrix_train, y_train)

predict data

In [None]:
y_pred = model.predict(tfidf_matrix_test)

accuracy = (y_pred == y_test).mean()
print("Accuracy:", accuracy)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix of the predictions on the test split, drawn as an annotated heatmap
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion matrix')
plt.show()

In [None]:
from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)
print(report)

## Word Embedding

In [None]:
import pickle
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split

### Read preprocessed data

In [None]:
# The file was written with encoding='utf-8-sig', so read it back the same way
df_sentences = pd.read_csv('data/preprocessed_data.csv', encoding='utf-8-sig')

# A text that was reduced to an empty string during preprocessing is read back from
# the CSV as NaN (a float). Restore it to '' so the later `sentence.split()`
# tokenisation does not fail with an AttributeError.
df_sentences['Content'] = df_sentences['Content'].fillna('')
df_sentences

### Split data into train and test set

In [None]:
train, test = train_test_split(df_sentences, test_size=0.2, random_state=42)
train['Label'].value_counts()

### Split into tokens

In [None]:
# Whitespace-split every preprocessed sentence back into its list of tokens
train_tokenized = [sentence.split() for sentence in train['Content']]
test_tokenized = [sentence.split() for sentence in test['Content']]
train_tokenized

### Train Word2Vec model (skip-gram)

In [None]:
# Train a skip-gram (sg=1) Word2Vec model on the training sentences only
model_skipGram = Word2Vec(sentences=train_tokenized, vector_size=100, window=5,  min_count=1, workers=4, sg=1)
# save the trained model; create the target folder first, because nothing else in
# the notebook creates it and saving into a missing folder would fail
os.makedirs('model', exist_ok=True)
model_skipGram.save('model/modelSkipGram.bin')

In [None]:
# load the trained model
model_sg = Word2Vec.load('model/modelSkipGram.bin')
print(type(model_sg.wv['binance']))
model_sg.wv['binance']

### Convert all words to vectors

In [None]:
def convert_to_vector(model, tokenized_sentence):
    """Represent a tokenised sentence as the mean of its known word vectors.

    Tokens missing from ``model.wv`` are ignored. When no token is known (or the
    sentence is empty) a zero vector of length ``model.vector_size`` is returned.
    """
    known_vectors = [model.wv[token] for token in tokenized_sentence if token in model.wv]
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

train_vectors = np.array([convert_to_vector(model_sg, sentence) for sentence in train_tokenized])
test_vectors = np.array([convert_to_vector(model_sg, sentence) for sentence in test_tokenized])
train_vectors.shape

### Cross-validation using RandomForestClassifier

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=42)
cross_val_score(clf, train_vectors, train['Label'], cv=5)

### Train the classifier and get the performance on test set

In [None]:
from sklearn.metrics import accuracy_score

# train the model
clf.fit(train_vectors, train['Label'])

# predict the test set
y_pred = clf.predict(test_vectors)
accuracy_score(test['Label'], y_pred)

## Contextual Embeddings

# Contextual Embeddings Using BERT

This section provides a detailed guide to preprocessing text data and extracting contextual embeddings using the BERT model. This process enhances the representation of text for advanced NLP tasks.

## Import Dataset

In [None]:
# Read the labelled CSV and drop any rows with missing values immediately.
# The path uses the same ./data folder that the annotation section writes this file
# to and that the model-selection section reads it from (the previous '../data'
# pointed at a different directory than every other cell).
df = pd.read_csv('./data/labeled_texts_1000.csv')
df = df.dropna()
df

## Prepare Data

In [None]:
# Extract content and labels into separate variables
X = df['Content']
y = df['label']
X, y

The content of the tweets and their labels are separated to facilitate preprocessing and model training.

## Preprocess Text For BERT

### Text Cleaning and Normalization

Define Helper Functions

In [None]:
_EMOTICON_PROCESSOR = None  # built on first use, then reused for every tweet


def _get_emoticon_processor():
    """Build (once) the KeywordProcessor that maps each emoticon to its description."""
    global _EMOTICON_PROCESSOR
    if _EMOTICON_PROCESSOR is None:
        processor = KeywordProcessor()
        for emoticon, description in EMOTICONS_EMO.items():
            # Clean the description (the dictionary VALUE, not the key): remove
            # colons, turn underscores into spaces and trim surrounding spaces
            processor.add_keyword(emoticon, description.replace(":", "").replace("_", " ").strip())
        _EMOTICON_PROCESSOR = processor
    return _EMOTICON_PROCESSOR


# Function to convert emoticons to words
def convert_emoticons(text):
    """Replace every emoticon listed in EMOTICONS_EMO with its textual description.

    Only the emoticon dictionary is used here. The lookup structure is identical
    for every call, so it is created once and cached instead of being rebuilt for
    each tweet.

    Args:
        text: The text to process.

    Returns:
        The text with known emoticons replaced by their cleaned-up descriptions.
    """
    return _get_emoticon_processor().replace_keywords(text)

In [None]:
# Function to handle abbreviations and normalize text
def normalize_text(text):
    """Expand a few abbreviations, drop credit parentheticals and tame letter runs.

    The patterns are lower-case only, so callers are expected to pass lower-cased text.

    Args:
        text: The text to normalise.

    Returns:
        The normalised text.
    """
    # Replace 'fav' with 'favorite'
    text = re.sub(r'\bfav\b', "favorite", text)

    # Replace 'tkt' with 'ticket'
    text = re.sub(r'\btkt\b', "ticket", text)

    # Replace '(gm)' with 'good morning'
    text = re.sub(r'\(gm\)', 'good morning', text)

    # Replace '(r.i.p)' / '(r.i.p.)' with 'rest in peace'. The dots are escaped so
    # they match literal dots only; unescaped, '.' matched any character.
    text = re.sub(r'\(r\.i\.p\.?\)', 'rest in peace', text)

    # Remove parenthetical credits such as "(via someone)" or "(h/t someone)".
    # The word boundaries keep the pattern from firing on words that merely
    # contain "via" (e.g. "(trivial)"), which would delete real content.
    text = re.sub(r'\([^)]*\b(?:via|h/t)\b[^)]*\)', '', text)

    # Reduce runs of three or more identical letters to two
    text = re.sub(r'([a-zA-Z])\1{2,}', r'\1\1', text)

    return text


In [None]:
import re

def clean_characters(text):
    """Reduce a text to letters, digits, spaces, apostrophes and single full stops.

    Line breaks are turned into a full stop so that each line ends a sentence.
    Dots are collapsed only AFTER that step, so a line that already ended with
    "." does not end up with "..".

    Args:
        text: The text to clean.

    Returns:
        The cleaned text without leading/trailing whitespace.
    """
    # Replace special characters (hyphens, underscores, colons) with a space
    text = re.sub(r'[-_:]', ' ', text)

    # Normalize line endings, replacing carriage return + newline with just newline
    text = re.sub(r'\r\n', '\n', text)

    # Remove decimal points used in numbers ("3.14" -> "314")
    text = re.sub(r'(\d)\.(\d)', r'\1\2', text)

    # Remove characters that are not letters, digits, space, apostrophe, dot or newline
    # (this also removes question marks, so no separate '?' handling is needed)
    text = re.sub(r"[^a-zA-Z0-9 '.\n]", '', text)

    # Replace every run of newlines with a single full stop
    text = re.sub(r'\n+', '.', text)

    # Reduce multiple consecutive dots (including ones created above) to a single dot
    text = re.sub(r'\.{2,}', '.', text)

    # Merge a ". ." sequence into a single ". "
    text = re.sub(r'\. \.', '. ', text)

    # Reduce multiple spaces to a single space
    text = re.sub(r' {2,}', ' ', text)

    # Return the cleaned text, stripped of leading/trailing whitespace
    return text.strip()


In [None]:
def clean_tweet(tweet):
    """Clean one tweet for BERT and return it as a single string.

    Steps, in order: strip URLs / mentions / reserved words, turn emojis into
    text, turn emoticons into words, lower-case and normalise abbreviations,
    then clean up characters and punctuation.
    """
    # Tell the tweet preprocessor what to strip: URLs, mentions and reserved words (RT, FAV)
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED)

    # Emojis first, emoticons second, both applied to the stripped tweet
    text_with_words = convert_emoticons(emoji.demojize(p.clean(tweet)))

    # Normalisation works on lower-cased text; character cleaning comes last
    return clean_characters(normalize_text(text_with_words.lower()))

In [None]:
def get_cleaned_tweets(tweets):
    """Return the list of cleaned tweets, one clean_tweet result per input, in order."""
    cleaned = []
    for raw_tweet in tweets:
        cleaned.append(clean_tweet(raw_tweet))
    return cleaned

In [None]:
# Apply the cleaning process to all tweets in X and store results
result = get_cleaned_tweets(X)
result[:5]

## Add BERT Special Tokens

In [None]:
def add_special_tokens(sentence):
    """Wrap a cleaned text in BERT's special tokens.

    "[CLS] " is put in front, every full stop is replaced by " [SEP]", and the
    result is guaranteed to end with "[SEP]". Previously a text that did not end
    with a full stop (or contained none at all) got no closing [SEP], although
    BERT's input format ends every sequence with one.

    Args:
        sentence: Cleaned text in which "." marks sentence boundaries.

    Returns:
        The text with [CLS] / [SEP] markers inserted.
    """
    # Step 1: [CLS] at the beginning; Step 2: each full stop becomes a [SEP] marker
    marked = ("[CLS] " + sentence).replace(".", " [SEP]").rstrip()

    # Step 3: make sure the sequence is closed by a [SEP]
    if not marked.endswith("[SEP]"):
        marked += " [SEP]"

    return marked

BERT requires specific tokens to be added to the text. This function inserts the [CLS] token at the start and the [SEP] token at sentence boundaries.

In [None]:
# Applying the function to all sentences in the results list
processed_results = [add_special_tokens(sentence) for sentence in result]
processed_results[:5]

## Tokenization and Input Formatting

In [None]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize all the processed results (one token list per text)
tokenized_results = [tokenizer.tokenize(sentence) for sentence in processed_results]
# Bare expression in the middle of a cell: only the last expression of a cell is displayed
tokenized_results

# Convert tokens to their respective IDs in the BERT vocabulary
indexed_tokens_list = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_results]
indexed_tokens_list

Tokenize the preprocessed text and convert the tokens into indices that correspond to BERT's vocabulary.

## Prepare Model Inputs

### Create Segment IDs and Attention Masks

In [None]:
# Segment IDs: every token of a text belongs to the same (first) segment, hence all zeros
token_type_list = [[0] * len(token_ids) for token_ids in indexed_tokens_list]

# Attention masks before padding: every token is a real token, hence all ones
attention_mask_list = [[1] * len(token_ids) for token_ids in indexed_tokens_list]


Segment IDs indicate to the model different segments of the input, while attention masks allow the model to ignore padding during processing.

### Add Padding

In [None]:
# bert-base-uncased only has position embeddings for 512 tokens; a longer input
# makes the model fail, so sequences are cut to that length before padding.
MAX_BERT_LENGTH = 512

# Initialize padded lists
padded_input_ids = []
padded_attention_mask = []

# Common length of the batch: the longest sequence, capped at the model limit
max_length = min(max(len(tokens) for tokens in indexed_tokens_list), MAX_BERT_LENGTH)

for tokens in indexed_tokens_list:
    # Cut over-long sequences (no-op for everything that already fits)
    tokens = tokens[:max_length]

    # Calculate the number of padding tokens needed
    num_padding_tokens = max_length - len(tokens)

    # Pad the input IDs with zeros (0 is used as the padding token ID)
    padded_input_ids.append(tokens + [0] * num_padding_tokens)

    # Attention mask: 1 for actual tokens, 0 for padding
    padded_attention_mask.append([1] * len(tokens) + [0] * num_padding_tokens)


Uniform input length is crucial for batch processing in neural networks. This step pads shorter sequences with zeros.

### Extract Contextual Embeddings

This section describes loading the pre-trained BERT model, converting data into tensors, and running the model to extract contextual embeddings.

In [None]:
# Load the BERT model
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
model.eval()  # Set the model to evaluation mode

# Convert the lists of integers into tensors
input_ids_tensor = torch.tensor(padded_input_ids)
attention_mask_tensor = torch.tensor(padded_attention_mask)

# Run the model and get the outputs (no gradients are needed for feature extraction)
with torch.no_grad():
    outputs = model(input_ids_tensor, attention_mask=attention_mask_tensor)
    hidden_states = outputs[2]  # Hidden states from all BERT layers
    word_embeddings = outputs.last_hidden_state  # The last layer's output

# Persist the embeddings: the "Model for BERT" section starts by loading
# ./data/word_embeddings.npy, which no other cell writes. Without this step a
# fresh Restart & Run All stops there with a missing-file error.
np.save('./data/word_embeddings.npy', word_embeddings.cpu().numpy())
word_embeddings

# Model Selection

## Model for TFIDF

## Model for Word2Vec

## Model for BERT

In [None]:
# read the embeddings from a file
word_embeddings = np.load('./data/word_embeddings.npy')
word_embeddings

In [None]:
# Reload the labelled CSV, discard incomplete rows and keep the label column
import pandas as pd
df = pd.read_csv('./data/labeled_texts_1000.csv').dropna()
y = df['label']

## SVM

In [None]:
from sklearn.model_selection import train_test_split
# NOTE(review): this split of the raw (un-averaged) embeddings is overwritten by the
# split of X_avg two cells further down, before the SVM is trained — confirm it is still needed
X_train, X_test, y_train, y_test = train_test_split(word_embeddings, y, test_size=0.2, random_state=42)


In [None]:
# Average the word embeddings for each text sample
X_avg = word_embeddings.mean(axis=1)

In [None]:
# Now, you can split your data
X_train, X_test, y_train, y_test = train_test_split(X_avg, y, test_size=0.2, random_state=42)

In [None]:
# import svm

from sklearn.svm import SVC
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)
conf_matrix_display = ConfusionMatrixDisplay(conf_matrix, display_labels=clf.classes_)
conf_matrix_display.plot()
# Detailed classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)


### Hyper-Parameter Tuning and Cross Validation

In [None]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid. `gamma` only affects the RBF kernel, so it is searched for
# 'rbf' alone; crossing it with 'linear' merely refits identical linear models.
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10, 100]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001]},
]

# Grid search with 5-fold cross-validation (set cv=10 for 10-fold CV)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=2, cv=5)
grid.fit(X_train, y_train)

# Best parameters and best mean cross-validation score
print("Best parameters found:", grid.best_params_)
print("Best score:", grid.best_score_)


In [None]:
# show the confusion matrix
y_pred = grid.best_estimator_.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)
conf_matrix_display = ConfusionMatrixDisplay(conf_matrix, display_labels=clf.classes_)
conf_matrix_display.plot()
# Detailed classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

## LSTM

In [None]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the integer labels. The number of columns is fixed explicitly and
# shared by both splits: when it is inferred per call, a split that happens to lack
# the highest label would get fewer columns than the model's output layer.
label_count = int(max(y_train.max(), y_test.max())) + 1
y_train_encoded = to_categorical(y_train, num_classes=label_count)
y_test_encoded = to_categorical(y_test, num_classes=label_count)


In [None]:
from sklearn.model_selection import train_test_split
# Split the full (un-averaged) embeddings for the LSTM. Same y, test_size and random_state as
# the earlier X_avg split; the one-hot labels from the previous cell rely on the rows matching.
X_train, X_test, y_train, y_test = train_test_split(word_embeddings, y, test_size=0.2, random_state=42)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow import keras

num_classes = y_train_encoded.shape[1]  # Number of unique classes

model = Sequential([
    LSTM(100, input_shape=(word_embeddings.shape[1], word_embeddings.shape[2])),
    Dropout(0.2),
    Dense(num_classes, activation='softmax')  # Output layer for multi-class classification
])


callbacks = [
    keras.callbacks.ModelCheckpoint(
      filepath="./models/best-lstm.keras",
      save_best_only=True,
      monitor="val_loss")
]

model.compile(loss='categorical_crossentropy',  # Loss function for multi-class classification
              optimizer='adam',
              metrics=['accuracy'])

model.summary()


In [None]:
history = model.fit(X_train, y_train_encoded,  # Use one-hot encoded labels
                    epochs=4,  # May need adjustment
                    batch_size=32,  # May need adjustment
                    validation_split=0.2,  # Fraction of the training data to use as validation
                    callbacks=callbacks  # Checkpoint callback list defined alongside the model
                    )


In [None]:
# plot the training and validation accuracy
import matplotlib.pyplot as plt

plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

In [None]:
from keras.models import load_model

best_model = load_model('./models/best-lstm.keras')
best_model.summary()

In [None]:
test_loss, test_acc = best_model.evaluate(X_test, y_test_encoded)
print('Test Accuracy:', test_acc)


In [None]:
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout

import keras_tuner as kt
import tensorflow as tf

def build_model(hp):
    """Build and compile one candidate model for the hyperparameter search.

    Architecture: bidirectional LSTM (tuned size) -> dropout -> LSTM(50) -> dropout
    -> softmax output. Two hyperparameters are tuned: 'units' (50 to 200 in steps of
    50) for the bidirectional LSTM and 'learning_rate' (1e-2, 1e-3 or 1e-4) for Adam.

    Reads the notebook-level names `X_train` (for the input shape) and `num_classes`.

    Args:
        hp: Hyperparameter object supplied by the tuner.

    Returns:
        The compiled model.
    """
    model = Sequential()
    # return_sequences=True keeps the per-step outputs so the second LSTM receives a sequence
    model.add(Bidirectional(LSTM(units=hp.Int('units', min_value=50, max_value=200, step=50), return_sequences=True), input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(Dropout(0.5))
    model.add(LSTM(50))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))
    
    # Tuning the learning rate
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=hp_learning_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

# Hyperband search over the space defined in build_model, ranked by validation accuracy
tuner = kt.Hyperband(build_model,
                     objective='val_accuracy',
                     max_epochs=10,
                     factor=3,
                     directory='my_dir',
                     project_name='intro_to_kt')

# NOTE(review): Hyperband schedules the epochs of each trial itself (bounded by
# max_epochs above), so epochs=50 presumably has no effect here — confirm against
# the Keras Tuner documentation before relying on it.
tuner.search(X_train, y_train_encoded, epochs=50, validation_split=0.2)

# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# 'units' is the size of the bidirectional LSTM layer (the message used to call it
# a densely-connected layer, which this model does not tune)
print(f"""
The hyperparameter search is complete. The optimal number of units in the first
(bidirectional LSTM) layer is {best_hps.get('units')} and the optimal learning rate for the optimizer
is {best_hps.get('learning_rate')}.
""")


In [None]:
# Retrieve the best model
best_tuner_model = tuner.get_best_models(num_models=1)[0]
best_tuner_model.summary()  


In [None]:
# Making predictions
y_pred = best_tuner_model.predict(X_test)

# Since we're doing multi-class classification, 'y_pred' will contain probabilities for each class
# To convert these probabilities into class labels, you can use 'argmax' which returns the index of the maximum value

import numpy as np

# Convert probabilities to class labels
y_pred_labels = np.argmax(y_pred, axis=1)
y_pred_labels


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Assuming 'y_test' contains the actual labels
accuracy = accuracy_score(y_test, y_pred_labels)
print("Test Accuracy:", accuracy)

# Detailed classification report
print(classification_report(y_test, y_pred_labels))

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_labels)
print("Confusion Matrix:\n", conf_matrix)

conf_matrix_display = ConfusionMatrixDisplay(conf_matrix, display_labels=clf.classes_)
conf_matrix_display.plot()

# Deployment and Interface

In [None]:
pip install tk

In [None]:
pip install customtkinter

In [None]:
import tkinter as tk
import customtkinter

In [None]:
def button_callback():
    """Placeholder click handler: only prints a message, no prediction is made yet."""
    print("button clicked")


In [None]:
app = customtkinter.CTk()
app.title("Custom Tkinter")
app.geometry("820x480")

app.grid_columnconfigure((0), weight=1)
# app.grid_columnconfigure((1), weight=1)
app.grid_columnconfigure((2), weight=1)
app.grid_columnconfigure((3), weight=1)


In [None]:
# Input box in which the user types the text to analyse
textbox = customtkinter.CTkTextbox(app, wrap="word", corner_radius=15)

# Placeholder prompt (typo "you mind" corrected)
textbox.insert("0.0", "What is on your mind?")  # insert at line 0 character 0

# Snapshot of the current content, taken once while the UI is being built
text = textbox.get("0.0", "end")  # get text from line 0 character 0 till the end

# Stretch the box across all four columns of the first row
textbox.grid(row=0, column=0, columnspan=4, padx=50, sticky="nsew", pady=10)

In [None]:
button = customtkinter.CTkButton(app, text="PREDICT", width=200, height=30, command=button_callback, fg_color="#f40e7d", hover_color="#d4116f", corner_radius=15, font=("Arial", 12))
button.grid(row=1, column=2, pady=4)
    

In [None]:
sentiment = customtkinter.CTkTextbox(app, width=400, height=100, wrap="word", corner_radius=15)

sentiment.insert("0.0", "The emotion here is giving...")  # insert at line 0 character 0
text_sentiment = sentiment.get("0.0", "end")  # get text from line 0 character 0 till the end
# textbox.delete("0.0", "end")  # delete all text
sentiment.configure(state="disabled")  # configure textbox to be read-only

sentiment.grid(row=3, column=2, pady=10)

In [None]:
app.mainloop()