In [None]:
import os

import numpy as np
import pandas as pd
from bertopic import BERTopic
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

# Where the input CSV is read from and where results are written: 'local' or 'gcp'.
DATA_FLOW = 'local'
# Fraction of each country's tweets that is sampled before topic modelling.
DATA_SAMPLE = .1
# Multiplier applied to the HDBSCAN min_cluster_size to derive min_samples.
SAMPLE_OUTLIERS = 0.75

def preprocess(text):
    """Normalise a tweet's text before topic modelling.

    Newlines and non-breaking spaces become regular spaces, '@' characters
    are removed, and every space-separated token that starts with 'http' is
    replaced by an empty string (so the surrounding spaces are kept).

    Args:
        text: Raw tweet text as a string.

    Returns:
        The cleaned text.
    """
    substitutions = (('\n', ' '), (u'\xa0', u' '), ('@', ''))
    for old, new in substitutions:
        text = text.replace(old, new)
    # Blank out link tokens instead of dropping them, keeping token positions.
    return " ".join('' if token.startswith('http') else token for token in text.split(" "))

# Read news data
# DATA_FLOW selects the source: a local CSV or the same file in a GCS bucket.
if DATA_FLOW == 'local':
    news_tweets = pd.read_csv("./../data/processed/news_tweets_clean.csv")
elif DATA_FLOW == 'gcp':
    # Imported lazily so the local flow does not need google-cloud-storage.
    from google.cloud import storage
    from io import BytesIO
    client = storage.Client()
    bucket_name = "covid-news-response"
    file_name = "news_tweets_clean.csv"
    bucket = client.get_bucket(bucket_name)
    blob = bucket.get_blob(file_name)
    # get_blob() returns None for a missing object; fail with a clear message
    # instead of an AttributeError on the download call.
    if blob is None:
        raise FileNotFoundError(f"gs://{bucket_name}/{file_name} not found")
    content = blob.download_as_string()
    news_tweets = pd.read_csv(BytesIO(content))
else:
    # Without this branch news_tweets stays undefined and the notebook only
    # fails later with a NameError.
    raise ValueError(f"Unsupported DATA_FLOW {DATA_FLOW!r}; expected 'local' or 'gcp'")

# Country filtering
# One BERTopic model is fitted per country. The per-country (tweet_id, topic)
# frames are collected in a list and concatenated once after the loop rather
# than growing a DataFrame on every iteration.
COUNTRY_MODEL_DIR = "./models"
# Make sure the output folder exists up front, so topic_model.save() cannot
# fail on a missing directory after a long fit.
os.makedirs(COUNTRY_MODEL_DIR, exist_ok=True)

country_topic_frames = []
# To process every country in the data, loop over news_tweets.country.unique() instead.
for country in ['AU', 'NZ', 'MY']:
    # .copy() yields an independent frame; assigning columns to a filtered
    # slice of news_tweets triggers pandas' SettingWithCopyWarning.
    df = news_tweets[news_tweets.country == country].copy()

    # Clear news data
    df = df.drop_duplicates()
    df['created_at'] = pd.to_datetime(df.created_at)
    df['clean_text'] = df.text.apply(preprocess)
    # 'moy': first day of the month in which each tweet was created.
    df['moy'] = pd.to_datetime(
        df.created_at.dt.year.astype(str) + '-' + df.created_at.dt.month.astype(str) + '-1')
    df_sample = df.sample(frac=DATA_SAMPLE, random_state=42)
    docs = df_sample.clean_text.values
    timestamps = df_sample.moy.to_list()

    # Constants
    QUANTITY = len(docs)
    # HDBSCAN rejects min_cluster_size < 2 and UMAP rejects n_neighbors < 2.
    # For samples of 2000 documents or fewer the 0.05% rule yields 1 (or 0),
    # so clamp to 2; larger samples are unaffected.
    MIN_CLUSTER_SIZE = max(2, int(np.ceil(QUANTITY * 0.0005)))
    N_NEIGHBORS = int(np.ceil(MIN_CLUSTER_SIZE * 0.75))
    MIN_SAMPLES = int(np.ceil(MIN_CLUSTER_SIZE * SAMPLE_OUTLIERS))
    MIN_VECTORIZE = int(np.ceil(QUANTITY * 2e-05))

    # Topic model
    vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words="english", min_df=MIN_VECTORIZE)
    # NOTE(review): no random_state is passed to UMAP, so the resulting topics
    # may differ between runs; confirm whether reproducibility is required.
    umap_model = UMAP(n_neighbors=N_NEIGHBORS, metric='cosine', low_memory=True)
    hdbscan_model = HDBSCAN(min_cluster_size=MIN_CLUSTER_SIZE, min_samples=MIN_SAMPLES, metric='euclidean')
    topic_model = BERTopic(
        language="english", nr_topics="auto", calculate_probabilities=False, verbose=True, low_memory=True,
        vectorizer_model=vectorizer_model, umap_model=umap_model, hdbscan_model=hdbscan_model
    )
    topics, probs = topic_model.fit_transform(docs)
    df_sample['topic'] = topics
    country_topic_frames.append(df_sample[['tweet_id', 'topic']])

    # Saving
    topic_model.save(f"{COUNTRY_MODEL_DIR}/topic_model_{country}.sav")

# Topic assignments of all processed countries, in loop order.
tweet_topics = pd.concat(country_topic_frames)

### Check topics

In [None]:
# Summarise topic_model, i.e. the model fitted for the last country of the loop.
freq = topic_model.get_topic_info()
# Topic -1 holds the outlier documents. It is absent when nothing was flagged
# as an outlier, so fall back to 0 instead of raising an IndexError.
outlier_counts = freq.loc[freq.Topic == -1, 'Count']
outlier_count = outlier_counts.iloc[0] if not outlier_counts.empty else 0
print("The number of documents is:", QUANTITY)
# Note: len(freq) counts every row of the topic table, including the -1 row when present.
print("The number of identified topics is:", len(freq))
print("The percentage of outlier documents is:", np.round(outlier_count / QUANTITY, 2))
freq.head(5)

In [None]:
# Inspect topic 0 of topic_model (the model fitted for the last country in the loop).
topic_model.get_topic(0)

### Visualize topics

In [None]:
# Plot the topics of the current topic_model.
topic_model.visualize_topics()

In [None]:
# Plot the topic hierarchy of the current topic_model.
topic_model.visualize_hierarchy()

In [None]:
# Bar charts for the current topic_model, limited to 5 topics via top_n_topics.
topic_model.visualize_barchart(top_n_topics=5)

In [None]:
# Topic heatmap for the current topic_model, limited to 25 topics via top_n_topics.
topic_model.visualize_heatmap(top_n_topics=25)

In [None]:
# Term-rank plot for the current topic_model.
topic_model.visualize_term_rank()

In [None]:
# topics_over_time was referenced here without ever being computed, so the
# cell raised a NameError on a fresh kernel. Build it from the documents and
# monthly timestamps of the last fitted country (docs / timestamps from the loop).
# NOTE(review): this uses the current BERTopic signature (docs, timestamps);
# older releases expect (docs, topics, timestamps) — adjust if an old version is pinned.
topics_over_time = topic_model.topics_over_time(docs, timestamps)
topic_model.visualize_topics_over_time(topics_over_time, topics=[0,1,2,3,4])

### Saving

In [None]:

if DATA_FLOW == 'local':
    topic_model.save("./../models/topic_model_bertopic.sav")
    df_sample[['tweet_id', 'moy', 'topic']].to_csv('./../data/processed/df_topics.csv')
elif DATA_FLOW == 'gcp':
    topic_model.save("topic_model_bertopic.sav")
    df_sample[['tweet_id', 'moy', 'topic']].to_csv('df_topics.csv')
    !gsutil df_topics.csv gs://covid-news-response/
    !gsutil topic_model_bertopic.sav gs://covid-news-response/