**Dependencies**

In [1]:
#%%capture
#!pip install pandas
#!pip install pyarrow
#!pip install numpy
#!pip install torch torchvision torchaudio
#!pip install transformers
#!pip install matplotlib
#!pip install nltk
#!pip install spacy
#!pip install scikit-learn

**Libraries**

In [2]:
# General
import re
import os
import json
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Tokenization
import spacy
import string
from nltk.tokenize import word_tokenize
from spacy.lang.en import English

# Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Models
# Base Model - Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

#Base Model - Other Models
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

# Transformers Model
import torch
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification,BertTokenizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# Evaluation Metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

**Import data**

- Categorization Data

In [3]:
twitter_train = pd.read_csv('./categorizacion/topic_train.csv')
twitter_test = pd.read_csv('./categorizacion/topic_valid.csv')
twitter = pd.concat([twitter_train,twitter_test],axis=0)

# with open('./categorizacion/labels.json', 'r') as json_file:
#     labels = pd.DataFrame(list(json.load(json_file).items()),columns=['Label','Description'])

In [4]:
twitter.head()

Unnamed: 0,text,label
0,Here are Thursday's biggest analyst calls: App...,0
1,Buy Las Vegas Sands as travel to Singapore bui...,0
2,"Piper Sandler downgrades DocuSign to sell, cit...",0
3,"Analysts react to Tesla's latest earnings, bre...",0
4,Netflix and its peers are set for a ‘return to...,0


- Sentiment Data

In [5]:
directory_sent='./sentimiento'
data_list = []

for filename in os.listdir(directory_sent):
    if filename.endswith(".txt"):
        file_path = os.path.join(directory_sent,filename)
        with open(file_path,'r',encoding='latin1') as file:
            lines = file.readlines()
            for line in lines:
                sentence, sentiment = line.rsplit('@',1)
                sentiment=sentiment.strip()
                data_list.append({'sentence':sentence,'sentiment': sentiment})

sent = pd.DataFrame(data_list)
sent = sent.drop_duplicates()


In [6]:
sent.head()

Unnamed: 0,sentence,sentiment
0,"According to Gran , the company has no plans t...",neutral
1,Technopolis plans to develop in stages an area...,neutral
2,With the new production plant the company woul...,positive
3,According to the company 's updated strategy f...,positive
4,"For the last quarter of 2010 , Componenta 's n...",positive


**Pre-processing functions**

In [7]:
def clean_stop_words(data):
    stop_words = spacy.lang.en.stop_words.STOP_WORDS
    lista = [palabra for palabra in data if palabra not in stop_words and palabra not in string.punctuation and palabra != '``']
    return lista

In [8]:
# Processing Text
sent_base=sent.copy()
sent_base['tokenized_sentence'] = sent_base['sentence'].apply(word_tokenize)
sent_base['cleaned_sentence']=sent_base['tokenized_sentence'].apply(clean_stop_words)
sent_base['processed_sentence'] = sent_base['cleaned_sentence'].apply(lambda x: ' '.join(x))

# Labeling "sentiment" (target variable)
value_mapping = {'negative': 1, 'neutral': 2, 'positive': 0}
sent_base.loc[:, 'sentiment_numeric'] = sent_base['sentiment'].map(value_mapping)

# Train-Test Split (80% - 20%)
sent_train, sent_test = train_test_split(sent_base, test_size=0.2, stratify=sent_base['sentiment'], random_state=42)

# Vectorizing
vectorizer = TfidfVectorizer(max_features=1000)

# train (sent_train_tfid -> features, y_sent_train -> target)
sent_train_tfid = vectorizer.fit_transform(sent_train['processed_sentence'])
y_sent_train=sent_train['sentiment_numeric']

# test (sent_test_tfid -> features, y_sent_test -> target)
sent_test_tfid = vectorizer.transform(sent_test['processed_sentence'])
y_sent_test=sent_test['sentiment_numeric']

**Base Line Model (Logistic Regression)**

In [9]:
# Logistic Regression Model
model = LogisticRegression()
model.fit(sent_train_tfid, y_sent_train)

predictions = model.predict(sent_test_tfid)

accuracy = accuracy_score(y_sent_test, predictions)
print(f'Accuracy: {accuracy:.2f}')
print("-----------------Logistic Regression-----------------")
print("Classification Report:")
print(classification_report(y_sent_test, predictions))

Accuracy: 0.73
-----------------Logistic Regression-----------------
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.52      0.60       272
           1       0.64      0.32      0.43       121
           2       0.74      0.91      0.82       575

    accuracy                           0.73       968
   macro avg       0.69      0.58      0.61       968
weighted avg       0.72      0.73      0.71       968



**Base Line Model (Other Models)**

In [10]:
models = [
    LogisticRegression(),
    RandomForestClassifier(),
    SVC(),
    MultinomialNB(),
    GradientBoostingClassifier(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    MLPClassifier()
]

for model in models:
    model.fit(sent_train_tfid, y_sent_train)
    
    y_train_pred = model.predict(sent_train_tfid)
    train_accuracy = accuracy_score(y_sent_train, y_train_pred)
    
    y_test_pred = model.predict(sent_test_tfid)
    test_accuracy = accuracy_score(y_sent_test, y_test_pred)
    
    print(f"{model.__class__.__name__}:")
    print(f"  Training Accuracy: {train_accuracy}")
    print(f"  Test Accuracy: {test_accuracy}")
    print("------")

LogisticRegression:
  Training Accuracy: 0.8083677685950413
  Test Accuracy: 0.7262396694214877
------
RandomForestClassifier:
  Training Accuracy: 0.9974173553719008
  Test Accuracy: 0.7117768595041323
------
SVC:
  Training Accuracy: 0.918904958677686
  Test Accuracy: 0.7200413223140496
------
MultinomialNB:
  Training Accuracy: 0.7551652892561983
  Test Accuracy: 0.6859504132231405
------
GradientBoostingClassifier:
  Training Accuracy: 0.8060433884297521
  Test Accuracy: 0.7324380165289256
------
KNeighborsClassifier:
  Training Accuracy: 0.7104855371900827
  Test Accuracy: 0.6394628099173554
------
DecisionTreeClassifier:
  Training Accuracy: 0.9974173553719008
  Test Accuracy: 0.6590909090909091
------
MLPClassifier:
  Training Accuracy: 0.9974173553719008
  Test Accuracy: 0.6766528925619835
------




### FINBERT MODEL 

**Initiate Model FinBert**

In [11]:
tokenizer = BertTokenizer.from_pretrained("ProsusAI/finbert")
model = BertForSequenceClassification.from_pretrained("ProsusAI/finbert")

**Calculate Sentiment (FinBert)**

In [12]:
def get_sentiment(txt):
    tokens = tokenizer.encode_plus(txt, add_special_tokens=False,return_tensors = 'pt')
    outputs = model(**tokens)
    probabilities = torch.nn.functional.softmax(outputs[0], dim=-1)
    category = torch.argmax(probabilities).item()
    return category

In [13]:
# Setting a copy and encoding categorical labels
sent_testing = sent.copy()
value_mapping = {'negative': 1, 'neutral': 2, 'positive': 0}
sent_testing.loc[:, 'sentiment_numeric'] = sent_testing['sentiment'].map(value_mapping)

# Generating a random sample 
sent_testing_sample = sent_testing.sample(n=4840, random_state=42)

In [None]:
# Using the model to assign sentiment
sent_testing_sample['prediction']=sent_testing_sample['sentence'].apply(get_sentiment)

In [None]:
# Evaluate accuracy
accuracy = accuracy_score(sent_testing_sample['sentiment_numeric'], sent_testing_sample['prediction'])
print(f'Accuracy: {accuracy:.2f}')

# Display classification report
print('Classification Report:')
print(classification_report(sent_testing_sample['sentiment_numeric'], sent_testing_sample['prediction']))

# Display confusion matrix
conf_matrix = confusion_matrix(sent_testing_sample['sentiment_numeric'], sent_testing_sample['prediction'])
print('Confusion Matrix:')
print(conf_matrix)

In [None]:
sent_testing_sample.head(5)

### Finbert (Twitter Dataset)

In [None]:
tokenizer = BertTokenizer.from_pretrained("ProsusAI/finbert")
model = BertForSequenceClassification.from_pretrained("ProsusAI/finbert")

# # Evaluate accuracy
# accuracy = accuracy_score(sent_testing_sample['sentiment_numeric'], sent_testing_sample['prediction'])
# print(f'Accuracy: {accuracy:.2f}')

# # Display classification report
# print('Classification Report:')
# print(classification_report(sent_testing_sample['sentiment_numeric'], sent_testing_sample['prediction']))

# # Display confusion matrix
# conf_matrix = confusion_matrix(sent_testing_sample['sentiment_numeric'], sent_testing_sample['prediction'])
# print('Confusion Matrix:')
# print(conf_matrix)


**Calculate Sentiment (FinBert)**

In [None]:
def get_sentiment(txt):
    tokens = tokenizer.encode_plus(txt, add_special_tokens=False,return_tensors = 'pt')
    outputs = model(**tokens)
    probabilities = torch.nn.functional.softmax(outputs[0], dim=-1)
    category = torch.argmax(probabilities).item()
    return category

In [None]:
# Setting a copy and encoding categorical labels
sent_testing = twitter.copy()
value_mapping = {'negative': 1, 'neutral': 2, 'positive': 0}
# sent_testing.loc[:, 'sentiment_numeric'] = sent_testing['sentiment'].map(value_mapping)

# Generating a random sample 
sent_testing_sample = sent_testing.sample(n=10, random_state=42)

# Using the model to assign sentiment
sent_testing_sample['prediction']=sent_testing_sample['text'].apply(get_sentiment)

In [None]:
sent_testing.head(5)

**Calculate Sentiment (Fine Tuning - FinBert)**

In [None]:
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Setting a copy and encoding categorical labels
sent_tuning = sent.copy()
value_mapping = {'negative': 1, 'neutral': 2, 'positive': 0}
sent_tuning.loc[:, 'sentiment_numeric'] = sent_tuning['sentiment'].map(value_mapping)

# Split the dataset into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(sent_tuning['sentence'], sent_tuning['sentiment_numeric'],stratify=sent_tuning['sentiment_numeric'], test_size=0.2,)

# Convert the texts and labels into tensors
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=128,return_tensors='pt')
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=128,return_tensors='pt')

# Create attention masks
train_masks = train_encodings['attention_mask']
val_masks = val_encodings['attention_mask']

train_inputs = train_encodings['input_ids']
val_inputs = val_encodings['input_ids']

train_labels = torch.tensor(train_labels.tolist())
val_labels = torch.tensor(val_labels.tolist())

# Create the DataLoader
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)



In [32]:
from transformers import BertForSequenceClassification, AdamW

# Load the pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)  # Assuming 3 classes: negative, neutral, positive

# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

# Move model to the device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
num_epochs = 3  # You can adjust the number of training epochs

for epoch in range(num_epochs):
    model.train()

    for batch in train_dataloader:
        inputs = {'input_ids': batch[0].to(device),
                  'attention_mask': batch[1].to(device),
                  'labels': batch[2].to(device)}

        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Save the fine-tuned model
#model.save_pretrained('fine_tuned_bert_model')

# Save the model
with open('fine_tuned_bert_model.pkl', 'wb') as f:
    pickle.dump(model, f)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
# Load the fine-tuned model
with open('fine_tuned_bert_model.pkl', 'rb') as f:
    model_fine_tuning = pickle.load(f)

In [34]:
def get_sentiment_tuned(txt):
    tokens = tokenizer.encode_plus(txt, add_special_tokens=False,return_tensors = 'pt')
    outputs = model_fine_tuning(**tokens)
    probabilities = torch.nn.functional.softmax(outputs[0], dim=-1)
    category = torch.argmax(probabilities).item()
    return category

In [35]:
df_test_tun = pd.DataFrame({'sentence': val_texts, 'sentiment_numeric': val_labels})

In [36]:
df_test_tun['prediction']=df_test_tun['sentence'].apply(get_sentiment_tuned)

In [37]:
# Evaluate accuracy
accuracy = accuracy_score(df_test_tun['sentiment_numeric'], df_test_tun['prediction'])
print(f'Accuracy: {accuracy:.2f}')

# Display classification report
print('Classification Report:')
print(classification_report(df_test_tun['sentiment_numeric'], df_test_tun['prediction']))

# Display confusion matrix
conf_matrix = confusion_matrix(df_test_tun['sentiment_numeric'], df_test_tun['prediction'])
print('Confusion Matrix:')
print(conf_matrix)

Accuracy: 0.72
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.39      0.53       272
           1       0.94      0.24      0.38       121
           2       0.70      0.98      0.82       575

    accuracy                           0.72       968
   macro avg       0.81      0.54      0.58       968
weighted avg       0.76      0.72      0.68       968

Confusion Matrix:
[[107   0 165]
 [ 18  29  74]
 [  8   2 565]]
