In [None]:
from datetime import datetime, timedelta,timezone
import pandas as pd
import numpy  as np
from sklearn import set_config
import joblib

# from db import Model, Session, engine
# from models import Tweet, ProcessedTweet, Company



from custom_package.text_processing import normalize_text, tokenizer_func, remove_emojis
from custom_package.modeling import GensimLdaTransformer, get_topic_assignment, get_pos_sentiment_proba
from custom_package.modeling import topic_mapping_sk_lda, topic_mapping_gensim_lda, topic_mapping_sk_full_lda
from custom_package.database import get_raw_tweets, store_processed_tweets

In [None]:
# Ask scikit-learn to render estimators as diagrams in rich notebook output.
# NOTE(review): the cells below use print(), which shows the plain-text repr,
# so this setting only matters where an estimator is displayed as a bare
# last-line expression.
set_config(display='diagram')

In [None]:
def get_filtered_tweets(query_limit = 100):
    """Fetch raw tweets and return them as a DataFrame with emoji-stripped text.

    Parameters
    ----------
    query_limit : int, default 100
        Forwarded unchanged to ``get_raw_tweets``; presumably the maximum
        number of tweets to fetch (confirm against ``custom_package.database``).

    Returns
    -------
    pandas.DataFrame
        One row per tweet with the columns ``id``, ``text`` (after
        ``remove_emojis``), ``company_id`` and ``date``. When no tweets are
        returned the frame has these four columns and zero rows.
    """
    # Materialise the result once: it is walked four times below, and a lazy
    # or one-shot result (generator, DB cursor) would be exhausted by the
    # first pass, leaving the remaining columns empty and making
    # pd.DataFrame raise on the length mismatch.
    raw_tweets = list(get_raw_tweets(query_limit))
    data = {'id' : [tweet.id for tweet in raw_tweets],
        'text' : [remove_emojis(tweet.text) for tweet in raw_tweets],
        'company_id' : [tweet.company_id for tweet in raw_tweets],
        'date' : [tweet.date for tweet in raw_tweets]
        }
    filtered_df = pd.DataFrame(data)
    return filtered_df

In [None]:
# Pull one batch of tweets using the default query_limit of 100.
filtered_df = get_filtered_tweets()

In [None]:
# Preview the first rows to confirm the id / text / company_id / date columns.
filtered_df.head()

In [None]:
# Load the persisted pipelines. Paths are relative, so the notebook must run
# from the directory that contains the .joblib files.
# NOTE: joblib.load unpickles arbitrary Python objects; only load artefacts
# that come from a trusted source.
# NOTE(review): unpickling presumably needs the custom_package classes to be
# importable (e.g. GensimLdaTransformer, imported above) - confirm.
sk_lda_pipeline = joblib.load('sklearn_LDA_pipeline.joblib')

In [None]:
# Plain-text repr of the loaded object, to check its steps.
print(sk_lda_pipeline)

In [None]:
# Gensim-based LDA pipeline (per the file name).
gensim_lda_pipeline = joblib.load('gensim_LDA_pipeline.joblib')

In [None]:
print(gensim_lda_pipeline)

In [None]:
# "Full" scikit-learn LDA pipeline (per the variable name).
sk_full_lda_pipeline = joblib.load('full_lda_pipeline.joblib')

In [None]:
print(sk_full_lda_pipeline)

In [None]:
# Sentiment pipeline; unlike the LDA pipelines it is invoked as a callable
# further down rather than through .transform().
sentiment_analysis_pipeline = joblib.load('sentiment_analysis_pipeline.joblib')

In [None]:
print(sentiment_analysis_pipeline)

In [None]:
#filtered_df = pd.read_csv('gensim_topic.csv',usecols=['index','text','date','Sentiment'])

In [None]:
# Column dtypes and non-null counts for the fetched batch.
filtered_df.info()

In [None]:
# Take the first ten tweet texts as a small sample for eyeballing model output.
check_text = filtered_df['text'].iloc[0:10].to_list()

In [None]:
# Output of the "full" scikit-learn LDA pipeline for each sample, rounded to 3 decimals.
print(sk_full_lda_pipeline.transform(check_text).round(3))

In [None]:
# Same check for the other scikit-learn LDA pipeline.
print(sk_lda_pipeline.transform(check_text).round(3))

In [None]:
# Gensim pipeline output is printed without rounding.
# NOTE(review): presumably its return type has no .round() - confirm.
print(gensim_lda_pipeline.transform(check_text))

In [None]:
# Keep the sample outputs in variables so the cells below can map them to
# topic labels and sentiment values.
sk_result = sk_lda_pipeline.transform(check_text).round(3)
sk_full_result = sk_full_lda_pipeline.transform(check_text).round(3)
gensim_result = gensim_lda_pipeline.transform(check_text)
# The sentiment pipeline is called directly on the list of texts.
sentiment_result = sentiment_analysis_pipeline(check_text)

In [None]:
# Print the value get_pos_sentiment_proba derives from each sample's sentiment output.
for pipeline_output in sentiment_result:
    print(get_pos_sentiment_proba(pipeline_output))

In [None]:
# List the sample texts with 1-based numbering so they line up with the
# numbered outputs in the following cells.
for row_number, sample in enumerate(check_text, start=1):
    print(row_number)
    print(sample)

In [None]:
# Topic assignment from the "full" scikit-learn LDA output, numbered from 1.
for row_number, topic_weights in enumerate(sk_full_result, start=1):
    print(row_number)
    print(get_topic_assignment(topic_weights, topic_mapping_sk_full_lda))

In [None]:
# Topic assignment from the scikit-learn LDA output, numbered from 1.
for row_number, topic_weights in enumerate(sk_result, start=1):
    print(row_number)
    print(get_topic_assignment(topic_weights, topic_mapping_sk_lda))

In [None]:
# Topic assignment from the gensim LDA output, numbered from 1.
for row_number, topic_weights in enumerate(gensim_result, start=1):
    print(row_number)
    print(get_topic_assignment(topic_weights, topic_mapping_gensim_lda))

In [None]:
def apply_topic_to_df(filtered_df):
    """Add sentiment and topic columns to ``filtered_df`` and return it.

    The frame is modified in place (the same object is returned). Four columns
    are added, in this order: ``sentiment``, ``sk_full_topic``, ``sk_topic``
    and ``gensim_topic``.

    Parameters
    ----------
    filtered_df : pandas.DataFrame
        Must contain a ``text`` column holding the tweet texts.

    Returns
    -------
    pandas.DataFrame
        The same ``filtered_df`` object with the four extra columns.
    """
    tweets = filtered_df['text']

    # The sentiment pipeline is called directly and receives a plain list.
    raw_sentiment = sentiment_analysis_pipeline(tweets.to_list())
    filtered_df['sentiment'] = [
        get_pos_sentiment_proba(item) for item in raw_sentiment
    ]

    # Each LDA pipeline gets the text Series; every output row is mapped to a
    # topic via the mapping that belongs to that pipeline.
    topic_models = (
        ('sk_full_topic', sk_full_lda_pipeline, topic_mapping_sk_full_lda),
        ('sk_topic', sk_lda_pipeline, topic_mapping_sk_lda),
        ('gensim_topic', gensim_lda_pipeline, topic_mapping_gensim_lda),
    )
    for column, pipeline, mapping in topic_models:
        model_output = pipeline.transform(tweets)
        filtered_df[column] = [
            get_topic_assignment(row, mapping) for row in model_output
        ]

    return filtered_df

In [None]:
# Add the sentiment and topic columns to the batch loaded above.
# apply_topic_to_df mutates and returns the same frame, so re-running this
# cell just overwrites those columns.
filtered_df = apply_topic_to_df(filtered_df)

In [None]:
# Check the new sentiment / *_topic columns.
filtered_df.head()

In [None]:
# Persist the enriched batch.
store_processed_tweets(filtered_df)

In [None]:
# Same fetch -> enrich -> store flow as one expression, for a fresh batch of
# the default size (100).
store_processed_tweets(apply_topic_to_df(get_filtered_tweets()))

In [None]:
# Bulk run: repeat fetch -> enrich -> store for a fixed number of batches.
N_BATCHES = 185      # number of fetch/enrich/store rounds to attempt
BATCH_SIZE = 1000    # query_limit passed to get_filtered_tweets for each round

for batch_number in range(1, N_BATCHES + 1):
    try:
        batch_df = get_filtered_tweets(BATCH_SIZE)
        if batch_df.empty:
            # Nothing came back, so the remaining rounds would only repeat
            # the same empty work.
            # NOTE(review): assumes get_raw_tweets only returns tweets that
            # still need processing - confirm.
            print(f'batch {batch_number}/{N_BATCHES}: no tweets returned, stopping early')
            break
        store_processed_tweets(apply_topic_to_df(batch_df))
    except Exception as e:
        # Best effort by design: one failing batch must not abort the run,
        # but report which batch failed and with what kind of error.
        print(f'batch {batch_number}/{N_BATCHES} failed: {type(e).__name__}: {e}')

In [None]:
# Ad-hoc debugging of individual rows.
# Re-run the scikit-learn LDA pipeline over every text in filtered_df
# (this overwrites the 10-sample sk_result computed earlier).
sk_result = sk_lda_pipeline.transform(filtered_df['text'])

In [None]:
# Convert to a plain list of per-row outputs.
sk_result = list(sk_result)

In [None]:
# 40-1 == 39: the output for the 40th row (0-based position 39).
sk_result[40-1]

In [None]:
# Texts at 0-based positions 38 and 39, i.e. the 39th and 40th rows.
filtered_df['text'].iloc[38:40]

In [None]:
# Inspect how those two texts look after normalisation.
# NOTE(review): normalize_text receives a Series slice here - confirm it
# accepts a Series rather than a single string.
normalize_text(filtered_df['text'].iloc[38:40])