In [None]:
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn import set_config

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re #regular expression
import spacy
from matplotlib import pyplot as plt


from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import pipeline as tf_pipeline

import joblib

from custom_package.text_processing import normalize_text, tokenizer_func, remove_emojis
from custom_package.modeling import GensimLdaTransformer, get_topic_assignment
from custom_package.modeling import topic_mapping_sk_lda, topic_mapping_gensim_lda
from custom_package.database import get_raw_tweets, store_processed_tweets,get_training_raw_tweets






In [None]:
# Show scikit-learn estimators/pipelines as diagrams when displayed.
set_config(display="diagram")

Prepare dataset

In [None]:
# Limit handed to get_filtered_tweets / get_training_raw_tweets when
# pulling the raw training tweets.
query_limit = 484_000


In [None]:
def get_filtered_tweets(query_limit = 100):
    """Fetch raw training tweets and return them as a DataFrame.

    Parameters
    ----------
    query_limit : int, default 100
        Forwarded unchanged to ``get_training_raw_tweets``; presumably the
        maximum number of tweets to fetch (confirm against that helper).

    Returns
    -------
    pandas.DataFrame
        One row per tweet with the columns ``id``, ``text`` (passed through
        ``remove_emojis``), ``company_id`` and ``date``, in that order.
    """
    ids, texts, company_ids, dates = [], [], [], []
    for raw_tweet in get_training_raw_tweets(query_limit):
        ids.append(raw_tweet.id)
        texts.append(remove_emojis(raw_tweet.text))
        company_ids.append(raw_tweet.company_id)
        dates.append(raw_tweet.date)

    return pd.DataFrame({
        'id': ids,
        'text': texts,
        'company_id': company_ids,
        'date': dates,
    })

In [None]:
# Pull the raw training tweets (up to query_limit) into a DataFrame.
filtered_df = get_filtered_tweets(query_limit=query_limit)

In [None]:
filtered_df.head()

In [None]:
filtered_df.info()

In [None]:
filtered_df['company_id'].value_counts()

In [None]:
# Pre-trained tweet sentiment classifier; the tokenizer and the model are
# both loaded from the same checkpoint name.
model_name = "cardiffnlp/twitter-roberta-base-sentiment"

tokenizer = AutoTokenizer.from_pretrained(model_name)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [None]:
# Wrap the loaded model and tokenizer in a sentiment-analysis pipeline.
sentiment_pipeline = tf_pipeline(
    "sentiment-analysis",
    model=sentiment_model,
    tokenizer=tokenizer,
)

# Smoke test on a single example tweet.
tweet = "I love using BERT models for NLP tasks!"
result = sentiment_pipeline(tweet)
print(result)

In [None]:
# A small batch of example tweets. Note: this rebinds `tweet` (previously a
# single string) to a list, and later cells rely on it being this list.
tweet = [
    "I love using BERT models for NLP tasks!",
    "I'm not a fan of rainy days.",
    "Neutral statements are hard to classify.",
]

# Classify the whole batch in one call (rebinds `result` as well).
result = sentiment_pipeline(tweet)
print(result)

In [None]:
# Second pipeline over the same model that reports a score for every label
# instead of only the winning one.
# NOTE(review): newer transformers releases deprecate `return_all_scores` in
# favour of `top_k=None`, which orders results by score — confirm before
# switching, because the score helpers in this notebook index by position.
pos_sentiment_pipeline = tf_pipeline(
    "text-classification",
    model=sentiment_model,
    tokenizer=tokenizer,
    return_all_scores=True,
)

In [None]:
# Serialize the all-scores pipeline to a joblib file (relative path, so it
# lands in the current working directory).
# NOTE(review): the filename spells "analyis" (sic). Left unchanged because
# other code may load the artifact by this exact name — confirm before renaming.
joblib.dump(pos_sentiment_pipeline,'sentiment_analyis_pipeline.joblib')

In [None]:
# Descriptive names for the raw labels emitted by the sentiment model.
label_mapping = dict(
    LABEL_0='Negative',
    LABEL_1='Neutral',
    LABEL_2='Positive',
)


def get_mapped_sentiment(text):
    """Translate one pipeline prediction into a descriptive sentiment name.

    Parameters
    ----------
    text : mapping
        Despite the name, this is an already-computed prediction (not raw
        text): a mapping that has a ``'label'`` key.

    Returns
    -------
    str
        The ``label_mapping`` entry for the prediction's label, or
        ``"Unknown"`` when the label is not in ``label_mapping``.

    Raises
    ------
    KeyError
        If the prediction has no ``'label'`` key.
    """
    return label_mapping.get(text['label'], "Unknown")

In [None]:
# Collapse one input's per-label scores into a single positivity score.
def get_pos_sentiment_proba(text, neg_label='LABEL_0', pos_label='LABEL_2'):
    """Combine the positive and negative scores into one positivity value.

    The value is ``((positive - negative) + 1) / 2``: 0.5 when both scores
    are equal, higher when the positive score dominates.

    Parameters
    ----------
    text : sequence of mapping
        Despite the name, this is the all-scores pipeline output for ONE
        input (not raw text): a sequence of entries that may carry
        ``'label'`` and ``'score'`` keys.
    neg_label, pos_label : str
        Labels whose scores are treated as negative / positive.

    Returns
    -------
    float
        The positivity value; NaN when a needed entry has no ``'score'``.

    Raises
    ------
    IndexError
        Only on the positional fallback, when fewer than three entries
        are supplied.
    """
    # Look scores up by label so the answer does not depend on the order in
    # which the pipeline lists the labels (e.g. output sorted by score).
    score_by_label = {
        entry.get('label'): entry.get('score', np.nan) for entry in text
    }
    if neg_label in score_by_label and pos_label in score_by_label:
        pos_score = score_by_label[pos_label]
        neg_score = score_by_label[neg_label]
    else:
        # Fallback to the previous positional convention: last entry is
        # positive, third-from-last is negative.
        pos_score = text[-1].get('score', np.nan)
        neg_score = text[-3].get('score', np.nan)
    return ((pos_score - neg_score) + 1) / 2

In [None]:
# Print the descriptive sentiment for each prediction currently in `result`.
for prediction in result:
    print(get_mapped_sentiment(prediction))

In [None]:
# Score the example tweets with the all-scores pipeline; each element of
# `res` is the collection of per-label score entries for one tweet.
res = pos_sentiment_pipeline(tweet)

In [None]:
# Print the positivity score derived from each tweet's per-label scores.
for label_scores in res:
    print(get_pos_sentiment_proba(label_scores))

In [None]:
# Score the first 100 tweet texts with the all-scores pipeline
# (this overwrites the earlier `res`).
res = pos_sentiment_pipeline(filtered_df['text'].iloc[0:100].to_list())

In [None]:
# Run the same first 100 texts through the plain sentiment pipeline; these
# predictions feed get_mapped_sentiment in the next cell.
res_a = sentiment_pipeline(filtered_df['text'].iloc[0:100].to_list())

In [None]:
# Descriptive sentiment name for each prediction in res_a, in order.
sentiment_a = list(map(get_mapped_sentiment, res_a))

In [None]:
# Positivity score for each per-tweet score list in res, in order.
sentiment_data = list(map(get_pos_sentiment_proba, res))

In [None]:
# Independent copy of the same first 100 rows that were scored, so the
# column assignments in the following cells do not write into filtered_df.
selected_df = filtered_df.iloc[0:100].copy()

In [None]:
# Attach the positivity scores. sentiment_data is a plain list, so it is
# matched to rows by position and must have one entry per row of selected_df.
selected_df['sentiment']=sentiment_data

In [None]:
# Attach the descriptive sentiment names. sentiment_a is a plain list, so it
# is matched to rows by position and must have one entry per row.
selected_df['sentiment_map']= sentiment_a

In [None]:
res[10-1]

In [None]:
selected_df[['text','sentiment','sentiment_map']].to_dict(orient='records')

In [None]:
# Create the all-scores text-classification pipeline.
# Uses `tf_pipeline` (the alias imported at the top of the notebook) and
# `sentiment_model` (the model loaded above); the previous `hf_pipeline` /
# `model` names are not defined anywhere in this notebook and raised
# NameError on a fresh kernel.
pos_sentiment_pipeline = tf_pipeline("text-classification", model=sentiment_model,
                                     tokenizer=tokenizer, return_all_scores=True)

In [None]:
# Re-run the example tweets through the all-scores pipeline. This rebinds
# `result`, which earlier held the plain sentiment-pipeline predictions.
result = pos_sentiment_pipeline(tweet)
print(result)

In [None]:
result[0][-1]

In [None]:
# Score raw text directly: sum of the last two per-label scores.
def get_pos_sentiment_proba(text):
    """Run the all-scores pipeline on ``text`` and add its last two scores.

    NOTE: this rebinds ``get_pos_sentiment_proba``, which an earlier cell
    defines with a different contract (it takes precomputed scores and uses
    a different formula). After this cell runs, the earlier version is
    no longer reachable by name.

    Parameters
    ----------
    text : str
        Raw text passed to ``pos_sentiment_pipeline``.

    Returns
    -------
    float
        Sum of the ``'score'`` values of the last and second-to-last entries
        of the first result (NaN if either entry has no ``'score'``).
        Assuming the pipeline lists LABEL_0, LABEL_1, LABEL_2 in that order,
        this is the positive plus the neutral score — TODO confirm ordering.
    """
    label_scores = pos_sentiment_pipeline(text)[0]
    last_entry = label_scores[-1]
    second_last_entry = label_scores[-2]
    return last_entry.get('score', np.nan) + second_last_entry.get('score', np.nan)

In [None]:
# Print the score for each example tweet, computed from its raw text.
for example_text in tweet:
    print(get_pos_sentiment_proba(example_text))