In [None]:
# Imports

import torch
from pathlib import Path
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import Dataset
import numpy as np
import gc
import pickle
import plotly.express as px
import plotly.graph_objects as go
import six
import os
import re
from tqdm import tqdm
import warnings
import dill

import hdbscan
from umap import UMAP
from umap.parametric_umap import ParametricUMAP
from sklearn.feature_extraction.text import CountVectorizer
import nltk

In [None]:
# Language codes to keep. NOTE(review): "und" presumably marks tweets whose language
# is undetermined in the source data -- confirm against the dataset documentation.
desired_languages = ["und", "hr",  "cs",  "et",  "fi",  "fr",  "de",  "el",  "hu",  "it",  "lv",  "lt",  "pl",  "pt",  "ro",  "sk",  "sl",  "es"]

# Obtain dataframe: collect the path of every file below the dataset directory.
csv_collection = []
for dirname, _, filenames in os.walk('./Data/ukraine-russian-crisis-twitter-dataset-1-2-m-rows/'):
    for filename in filenames:
        csv_collection.append(os.path.join(dirname, filename))

kept_columns = ['tweetcreatedts', 'text', 'language']  # only keep important columns to lower memory consumption
frames = []
for v in tqdm(csv_collection):
    tmp = pd.read_csv(v, compression='gzip', index_col=0)[kept_columns]
    tmp = tmp.drop_duplicates(subset=['text'])  # remove duplicates as most are retweets
    frames.append(tmp[tmp['language'].isin(desired_languages)])

# Concatenate once (growing a frame inside the loop copies all earlier rows on every
# iteration) and give the result a clean 0..n-1 index; the previous bare
# `df.reset_index()` call discarded its result and therefore did nothing.
if frames:
    df = pd.concat(frames, axis=0, ignore_index=True)
else:
    df = pd.DataFrame(columns=kept_columns)
df.to_pickle('./Pickles/raw_df.pkl')

In [None]:
# Reload the cached raw tweets from disk.
raw_df = pd.read_pickle('./Pickles/raw_df.pkl')
raw_df.head()  # NOTE(review): not rendered -- a notebook cell only displays its last expression
raw_df['language'].unique(), len(raw_df)  # distinct language codes and total number of rows

**Randomly sampling Part of Dataset**

In [None]:
# Keep a random 75% of the rows (rounded up), drawn without replacement, and cache the result.
n_keep = int(np.ceil(0.75 * len(raw_df)))
idxs = np.random.choice(len(raw_df), n_keep, replace=False)
df_small = raw_df.iloc[idxs]
df_small.to_pickle('./Pickles/df_small.pkl')

In [None]:
# Reload the cached sample of the raw tweets and display it.
df_small = pd.read_pickle('./Pickles/df_small.pkl')
df_small

**Plotting Language Distribution (todo: plotly)**

In [None]:
# Bar chart of the number of tweets per language code, drawn on a logarithmic y-axis.
language_totals = df_small.language.value_counts()[:]
g = sns.barplot(x=language_totals.index, y=language_totals)
g.set_yscale("log")

**Plotting Daily Tweet Count**

In [None]:
# Truncate every timestamp to its calendar day, count tweets per day, and plot the counts in date order.
tweet_days = pd.to_datetime(df_small['tweetcreatedts']).dt.floor('d')
df_dailycounts = (
    tweet_days.value_counts()
    .rename_axis('date')
    .reset_index(name='count')
    .sort_values(by='date')
)
fig = px.line(df_dailycounts, x='date', y="count")
fig.show()

**Random Example of some unprocessed tweets**

In [None]:
# Print five randomly chosen raw tweets, each followed by a blank line.
sample_positions = np.random.choice(len(df_small), 5)
for i in df_small.iloc[sample_positions]['text']:
    print(f"{i}\n")

**Preprocessing Data and Translating**

In [None]:
def preprocess_tweet(examples):
    """Mask user mentions and links in a single example's ``text`` field.

    The text is coerced to ``str``, newlines become spaces, and the result is
    split on single spaces. A token that starts with ``@`` and is longer than
    one character becomes ``@user``; a token that starts with ``http``/``www``
    or ends with ``.com`` becomes ``http``. All other tokens are kept as-is.

    Returns a dict with the single key ``text`` holding the re-joined string.
    """
    def _mask(token):
        # Mentions first, then links -- same order as the checks are applied per token.
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        if token.startswith(('http', 'www')) or token.endswith('.com'):
            token = 'http'
        return token

    flattened = str(examples['text']).replace('\n', ' ')  # replace new lines with space
    return dict(text=" ".join(_mask(piece) for piece in flattened.split(" ")))

In [None]:
# Run preprocess_tweet over every row via a Hugging Face Dataset, then cache the result as a DataFrame.
dataset_small = Dataset.from_pandas(df_small)
df_proc = pd.DataFrame(dataset_small.map(preprocess_tweet))
df_proc.to_pickle('./Pickles/df_proc.pkl')
df_proc.head()

In [None]:
# Reload the cached preprocessed tweets and display them.
df_proc = pd.read_pickle('./Pickles/df_proc.pkl')
df_proc

In [None]:
# translating
from google.cloud import translate_v2 as translate

# Path to the Google Cloud API credentials JSON. It is read from the
# GOOGLE_APPLICATION_CREDENTIALS environment variable when that is set, so the
# notebook is not tied to one machine's home directory; the original absolute
# path is kept as the fallback.
path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS", "/home/elyx/ukraine-twitter-NLP/cred.json")
translate_client = translate.Client.from_service_account_json(path)

def translate_text(examples):
    """Translate a batch of texts to English with the module-level ``translate_client``.

    ``examples['text']`` is passed to the client in one call (the caller maps
    this function with ``batched=True``, so it is a list of strings there).

    Returns a dict of two parallel lists: ``text`` with each item's
    ``translatedText`` and ``language`` with each item's
    ``detectedSourceLanguage``.
    """
    response = translate_client.translate(examples['text'], target_language='EN')

    translated, detected = [], []
    for item in response:
        translated.append(item["translatedText"])
        detected.append(item['detectedSourceLanguage'])

    return dict(text=translated, language=detected)

In [None]:
# Translate in batches of 125 texts, then tidy the result.
translated_dataset = Dataset.from_pandas(df_proc).map(translate_text, batched=True, batch_size=125)
df_trans = pd.DataFrame(translated_dataset)

mask = df_trans['language'].isin(desired_languages)
df_trans = (
    df_trans[mask]  # removing non desired languages that have been detected
    .reset_index()
    .assign(tweetcreatedts=lambda frame: pd.to_datetime(frame['tweetcreatedts']).dt.floor('d'))  # truncate to the day
)

In [None]:
# removing html codes
def html_remover(examples):
    """Decode HTML character references in a single example's ``text`` field.

    The text is coerced to ``str`` and every named or numeric character
    reference is decoded in one pass (``&quot;`` -> ``"``, ``&#39;`` -> ``'``,
    ``&amp;`` -> ``&``, and also ``&lt;``, ``&gt;``, ``&#x27;`` ... which the
    previous hand-written substitutions left in the text).

    Returns a dict with the single key ``text`` holding the decoded string.
    """
    import html  # standard library; imported locally because only this function needs it

    return dict(text=html.unescape(str(examples['text'])))

# Apply html_remover to every translated tweet and cache the cleaned frame.
cleaned_dataset = Dataset.from_pandas(df_trans).map(html_remover)
df_trans = pd.DataFrame(cleaned_dataset)
df_trans.to_pickle('./Pickles/df_trans.pkl')

In [None]:
# Reload the cached translated tweets; the displayed tuple is the frame's shape and its column names.
df_trans = pd.read_pickle('./Pickles/df_trans.pkl')
df_trans.shape, df_trans.columns

**Plotting new language distribution now that undetermined languages have been identified**

In [None]:
# Tweets per detected language: raw totals, and the same totals divided by 277.
# NOTE(review): 277 is presumably the number of days the data covers -- confirm.
tweets_per_language = df_trans.groupby('language').size()
language_counts = (
    tweets_per_language
    .sort_values(ascending=False)
    .reset_index()
    .rename(columns={0:'count'})
)
language_counts_mean = (
    (tweets_per_language / 277)
    .sort_values(ascending=False)
    .reset_index()
    .rename(columns={0:'count'})
)

fig = px.bar(
    language_counts,
    x='language',
    y='count',
    hover_data=['language', 'count'],
    template='seaborn',
    log_y=True,
)

# One button per series; pressing it replaces the bar heights with that series' counts.
toggle_buttons = [
    dict(args=[{"y": [series['count']]}], label=label, method="update")
    for label, series in (("Total", language_counts), ("Mean", language_counts_mean))
]

fig.update_layout(
    updatemenus=[
        dict(
            type="buttons",
            direction="left",
            buttons=toggle_buttons,
            pad={"r": 10, "t": 10},
            showactive=True,
            x=0.0,
            xanchor="left",
            y=1.3,
            yanchor="top",
        ),
    ]
)

fig.show()
fig.write_html("./Plots/language_count.html")

In [None]:
**Removing languages that average fewer than 30 tweets per day (CLT)**

In [None]:
# Keep the first ten languages of language_counts_mean (sorted by descending count)
# and overwrite the cached df_trans with only those rows.
final_languages = language_counts_mean['language'].iloc[:10].tolist()
mask = df_trans['language'].isin(final_languages)
df_trans = df_trans.loc[mask]
df_trans.to_pickle('./Pickles/df_trans.pkl')

In [None]:
# Reload df_trans from its pickle cache; the displayed tuple is the frame's shape and its column names.
df_trans = pd.read_pickle('./Pickles/df_trans.pkl')
df_trans.shape, df_trans.columns

**Random Example of some processed tweets**

In [None]:
# Print five randomly chosen processed tweets, each followed by a blank line.
sample_positions = np.random.choice(len(df_trans), 5)
for i in df_trans.iloc[sample_positions]['text']:
    print(f"{i}\n")

**Plotting daily tweet count by language over time**

In [None]:
# One row per day and one column per language, holding the number of tweets (0 where none).
df_dailycounts = (
    df_trans.groupby(['language', 'tweetcreatedts'])
    .size()
    .unstack('language')
    .reset_index()
    .fillna(0)
)

# Dividing every daily count by the total number of tweets in that language;
# the first column (the date) is excluded from the division and re-attached afterwards.
per_language_counts = df_dailycounts.iloc[:, 1:]
df_dailycounts_norm = (per_language_counts / per_language_counts.sum()).assign(
    tweetcreatedts=df_dailycounts['tweetcreatedts']
)

In [None]:
warnings.filterwarnings('ignore')

# One line per language in final_languages, with month-spaced ticks on the date axis.
language_columns = df_dailycounts[final_languages].columns
fig = px.line(
    df_dailycounts,
    x='tweetcreatedts',
    y=language_columns,
    #hover_data={"tweetcreatedts": "|%B %d, %Y"},
    template='seaborn',
    title='Daily Tweets per language',
)
fig.update_xaxes(dtick="M1", tickformat="%b\n%Y")

fig.show()
fig.write_html("./Plots/daily_tweets_per_language.html") # add button to normalize

**Word Cloud**

In [None]:
# word cloud on all languages
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

stopwords_set = set(STOPWORDS)
stopwords_set.update(["user", "shows", "located", "dtype", "http", 'text']) # adding extra stop words

# Join every tweet into one corpus string. str(df_trans['text']) would only
# yield the Series' truncated printed representation (a handful of rows plus the
# "Name: text ... dtype: object" footer), not the tweets themselves.
corpus = ' '.join(df_trans['text'].astype(str))

wordcloud = WordCloud(background_color='white',
                      stopwords=stopwords_set,
                      max_words=300,
                      max_font_size=40,
                      scale=2,
                      random_state=42,
                      ).generate(corpus)

plt.imshow(wordcloud)
plt.axis('off')
# Save before showing: once show() has rendered the figure, a later savefig()
# writes a new, empty figure instead of the word cloud.
plt.savefig('./Plots/wordcloud.png')
plt.show()

**Sample subset of clean tweets for semantic processing**

In [None]:
sample_size = 5000
# Draw distinct rows: sampling with replacement (np.random.choice's default) can
# pick the same tweet several times. The size is capped at the number of
# available rows because sampling without replacement cannot exceed it.
subset_positions = np.random.choice(len(df_trans), min(sample_size, len(df_trans)), replace=False)
df_trans_subset = df_trans.iloc[subset_positions].reset_index(drop=True)
df_trans_subset.to_pickle('./Pickles/df_trans_subset.pkl')

In [None]:
# Reload the cached subset of tweets and display its shape.
df_trans_subset = pd.read_pickle('./Pickles/df_trans_subset.pkl')
df_trans_subset.shape

In [None]:
# Continue with the sampled subset under the name df_trans.
# NOTE(review): the next code cell reloads df_trans from './Pickles/df_trans.pkl', which
# replaces this binding on a top-to-bottom run -- confirm which frame is intended.
df_trans = df_trans_subset

**Extracting semantic tweet features using pretrained transformers**

In [None]:
# Reload df_trans from its pickle cache; the displayed tuple is the frame's shape and its column names.
# NOTE(review): this rebinds df_trans, so an earlier `df_trans = df_trans_subset` no longer
# applies after this cell -- confirm that is intended.
df_trans = pd.read_pickle('./Pickles/df_trans.pkl')
df_trans.shape, df_trans.columns

In [None]:
from sentence_transformers import SentenceTransformer

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on machines without CUDA.
# NOTE(review): the original trailing comment named "digio/Twitter4SSE",
# presumably an alternative model to try -- confirm.
model = SentenceTransformer('all-mpnet-base-v2', device='cuda' if torch.cuda.is_available() else 'cpu')

# get features in 2 batches because kernel keeps dying
def get_features(texts):
    """Encode ``texts`` with the module-level ``model`` and return its output.

    A progress bar is shown while encoding.
    """
    arr_features = model.encode(texts, show_progress_bar=True)
    return arr_features
# Split the tweets into two halves by position. The second half used to be
# obtained with a label-based drop(), which removes every row sharing an index
# label with the first half and is therefore only correct for a unique index;
# slicing by position is correct for any index.
length = len(df_trans.index)
half = length // 2
tweets_1 = df_trans['text'].iloc[:half]
tweets_2 = df_trans['text'].iloc[half:]

# Encode and save each half separately so the first result is already on disk
# before the second encoding starts.
arr_features_1 = get_features(list(tweets_1))
np.save('./Pickles/arr_features_1', arr_features_1)
print("arr 1 saved")
arr_features_2 = get_features(list(tweets_2))
np.save('./Pickles/arr_features_2', arr_features_2)

In [None]:
# Combine the two saved batches of features into one array (first batch on top) and save it.
arr_features_1 = np.load('./Pickles/arr_features_1.npy', allow_pickle=True)
print(arr_features_1.shape)
arr_features_2 = np.load('./Pickles/arr_features_2.npy', allow_pickle=True)
print(arr_features_2.shape)

arr_features = np.concatenate([arr_features_1, arr_features_2], axis=0)
np.save('./Pickles/arr_features2', arr_features)
print(arr_features.shape)

In [None]:
# Load the combined feature matrix. The combining step saves it with
# np.save('./Pickles/arr_features2', ...), i.e. to 'arr_features2.npy'; loading
# 'arr_features.npy' here would read a different (possibly stale or missing) file.
arr_features = np.load('./Pickles/arr_features2.npy', allow_pickle=True)
print(arr_features.shape)

In [None]:
# Keep only the first 5000 feature rows and display the resulting shape.
# NOTE(review): 5000 equals the sample_size used when sampling df_trans_subset --
# presumably the two are meant to line up; confirm.
arr_features = arr_features[:5000]
arr_features.shape

In [None]:
# Same truncation to the first 5000 rows; applying it to an array that already has
# at most 5000 rows leaves it unchanged.
arr_features = arr_features[:5000]
arr_features.shape

**Umap, clustering, and topic modelling**

**UMAP reduction to 20 dimensions for clustering**

In [None]:
# Pick 4500 distinct feature rows. Sampling with replacement (np.random.choice's
# default) would select some rows more than once, and those duplicate points
# would then be embedded and clustered as if they were separate tweets. The size
# is capped at the number of available rows.
index = np.random.choice(len(arr_features), min(4500, len(arr_features)), replace=False)
np.save('./Pickles/index', index)

In [None]:
warnings.filterwarnings('ignore')
index = np.load('./Pickles/index.npy', allow_pickle=True)

# Fit a 20-component ParametricUMAP on the selected feature rows, embed those
# same rows with the fitted model, and save the embedding.
training_points = arr_features[index]
highdim_settings = dict(
    n_neighbors=20,
    n_components=20,
    min_dist=0,
    metric='cosine',
    low_memory=True,
    verbose=True,
)
obj = ParametricUMAP(**highdim_settings)
obj.fit(training_points)
umap_highdim_embeddings = obj.transform(training_points)
np.save('./Pickles/umap_highdim_embeddings', umap_highdim_embeddings)
print(umap_highdim_embeddings.shape)

In [None]:
# Reload the saved 20-component UMAP embedding and display its shape.
umap_highdim_embeddings = np.load('./Pickles/umap_highdim_embeddings.npy', allow_pickle=True)
umap_highdim_embeddings.shape

In [None]:
**Umap to low dimensions for plotting**

In [None]:
warnings.filterwarnings('ignore')
index = np.load('./Pickles/index.npy', allow_pickle=True)

# Fit a 2-component ParametricUMAP on the high-dimensional UMAP embedding,
# embed the same points with the fitted model, and save the 2-D coordinates.
settings_2d = dict(
    n_neighbors=20,
    n_components=2,
    min_dist=0.5,
    metric='cosine',
    low_memory=True,
    verbose=True,
)
obj = ParametricUMAP(**settings_2d)
obj.fit(umap_highdim_embeddings)#index
umap_embeddings = obj.transform(umap_highdim_embeddings)

np.save('./Pickles/umap_embeddings', np.array(umap_embeddings))

In [None]:
warnings.filterwarnings('ignore')
index = np.load('./Pickles/index.npy', allow_pickle=True)

# Fit a 3-component ParametricUMAP on the high-dimensional UMAP embedding,
# embed the same points with the fitted model, and save the 3-D coordinates.
settings_3d = dict(
    n_neighbors=20,
    n_components=3,
    min_dist=0.5,
    metric='cosine',
    low_memory=True,
    verbose=True,
)
obj = ParametricUMAP(**settings_3d)
obj.fit(umap_highdim_embeddings)
umap_embeddings_3d = obj.transform(umap_highdim_embeddings)

np.save('./Pickles/umap_embeddings_3d', np.array(umap_embeddings_3d))

In [None]:
# Attach the 2-D and 3-D UMAP coordinates to the tweets that were embedded.
index = np.load('./Pickles/index.npy', allow_pickle=True)
umap_embeddings = np.load('./Pickles/umap_embeddings.npy', allow_pickle=True)
umap_embeddings_3d = np.load('./Pickles/umap_embeddings_3d.npy', allow_pickle=True)
print(df_trans.shape)

# Select the embedded rows by position WITHOUT rebinding df_trans. Overwriting
# df_trans with its own subset made this cell non-idempotent: running it a second
# time applied `index` to the already-reduced frame.
df_cluster = df_trans.iloc[index].assign(
    x=umap_embeddings[:, 0],
    y=umap_embeddings[:, 1],
    x3d=umap_embeddings_3d[:, 0],
    y3d=umap_embeddings_3d[:, 1],
    z3d=umap_embeddings_3d[:, 2],
)
df_cluster.to_pickle('./Pickles/df_cluster.pkl')
df_cluster

**Hdbscan clustering to find topics**

In [None]:
# Reload the inputs of the clustering step from their caches: the 20-component
# UMAP embedding and the tweet frame carrying the plot coordinates.
umap_highdim_embeddings = np.load('./Pickles/umap_highdim_embeddings.npy', allow_pickle=True)
df_cluster = pd.read_pickle('./Pickles/df_cluster.pkl')

In [None]:
warnings.filterwarnings('ignore')
# Cluster the high-dimensional UMAP embedding; every row gets a label in `topic`.
cluster = hdbscan.HDBSCAN(#min_cluster_size=15,
                          #min_samples=30,
                          metric='euclidean',
                          cluster_selection_method='eom',
                          #prediction_data=True,
                         ).fit(umap_highdim_embeddings)
df_cluster_topics = df_cluster.assign(topic=cluster.labels_)

# HDBSCAN labels noise points -1; that label is not a topic, so it is excluded
# from the reported count.
topic_labels = cluster.labels_[cluster.labels_ >= 0]
print(f"{len(np.unique(topic_labels))} topics found")

df_cluster_topics.to_pickle('./Pickles/df_cluster_topics.pkl')
#save model
#with open('./Pickles/hdbscan.pkl', 'wb') as inp:
#    dill.dump(cluster, inp)
df_cluster_topics

**TF-IDF to find important locally unique words per topic**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import STOPWORDS

# One row per topic, with all of that topic's tweets concatenated into a single document.
df_per_topic = df_cluster_topics.groupby(['topic'], as_index=False).agg({'text': ' '.join})

# The word-cloud stop words plus a few extra tokens to ignore.
extra_stopwords = ["user", "http", "located", "dtype", "actually", "quot", 'text', '39', 'according', 'got']
stopwords_set = list(set(STOPWORDS).union(extra_stopwords))

# Fit TF-IDF on the per-topic documents and keep the vocabulary for term lookup.
tfidf = TfidfVectorizer(stop_words=stopwords_set)
X = tfidf.fit_transform(df_per_topic['text'])
feature_names = tfidf.get_feature_names_out()

def get_top_tf_idf_words(response, top_n=2):
    """Return the ``top_n`` vocabulary terms with the largest weights in ``response``.

    ``response`` must expose ``.data`` (the stored weights) and ``.indices``
    (their column positions), as the sparse result of ``tfidf.transform`` does.
    Terms are looked up in the module-level ``feature_names`` array and returned
    from largest to smallest weight.
    """
    descending = np.argsort(response.data)[::-1]  # positions of the weights, largest first
    strongest = descending[:top_n]
    return feature_names[response.indices[strongest]]

# Top-15 TF-IDF terms of every topic document, in the row order of df_per_topic.
arr_tf_idf = []
for topic_text in df_per_topic['text']:
    responses = tfidf.transform([topic_text])
    arr_tf_idf.append(list(get_top_tf_idf_words(responses, 15)))

# Key the keyword lists by the actual topic labels. Numbering the rows
# -1, 0, 1, ... instead is only correct when a -1 label is present and the labels
# have no gaps; otherwise every topic receives another topic's keywords (or the
# lookup fails for the highest label).
topic_tf_idf = dict(zip(df_per_topic['topic'].tolist(), arr_tf_idf))

# Legend entry per topic: "<topic>_<first five keywords joined by '_'>".
legend = {topic: f"{topic}_{'_'.join(words[0:5])}" for topic, words in topic_tf_idf.items()}

df_cluster_topics = df_cluster_topics.sort_values(by='topic')
topic_order = np.array(df_cluster_topics['topic'])
df_cluster_topics['tf_idf'] = [topic_tf_idf[i] for i in topic_order]
df_cluster_topics['legend'] = [legend[i] for i in topic_order]

df_cluster_topics.to_pickle('./Pickles/df_cluster_topics.pkl')
df_cluster_topics

**Interactive Scatter Plot with Plotly**

In [None]:
# Reload the cached frame of tweets with their topic, keyword and legend columns.
df_cluster_topics = pd.read_pickle('./Pickles/df_cluster_topics.pkl')

In [None]:
# Frame used for plotting. The commented-out code would keep only rows whose topic is
# not -1 and then take a random subset of them; with it disabled, df_cluster_plotting
# is simply another name for df_cluster_topics, which is then cached.
#clusters_only = np.where(df_cluster_topics.topic != -1)
#rand_idxs = np.random.choice(len(df_cluster_topics.iloc[clusters_only]), 20000, replace=False) # randomly pick 80000 points for file size purposes
df_cluster_plotting = df_cluster_topics#.iloc[clusters_only].reset_index()#.iloc[rand_idxs]
df_cluster_plotting.to_pickle('./Pickles/df_cluster_plotting.pkl')

In [None]:
# Reload the cached plotting frame.
df_cluster_plotting = pd.read_pickle('./Pickles/df_cluster_plotting.pkl')

In [None]:
import plotly.express as px

# 2-D scatter of the tweets, coloured by their topic legend entry.
fig = px.scatter(
    df_cluster_plotting,
    x='x',
    y='y',
    custom_data=['text', 'tf_idf', 'topic'],
    color='legend',#[str(i) for i in df_cluster_no_outlier.topic],
    width=1600, height=1000,
    template='seaborn',
)

# Hover shows the tweet text, then its topic id and keywords.
hover_lines = [
    "%{customdata[0]}",
    "Topic %{customdata[2]} Keywords: %{customdata[1]}<extra></extra>",
]

# hover text style and title
fig.update_layout(
    hoverlabel=dict(
        bgcolor="white",
        font_size=11,
        font_family="Times New Roman"
    ),
    title={
        'text': "<b>Tweets by Topic 2D"},
)

# hover data and point size
fig.update_traces(
    hovertemplate="<br>".join(hover_lines),
    marker={'size': 2},
)

fig.show()
fig.write_html("./Plots/tweet_by_topic_map.html")


In [None]:
# 3d plot
import plotly.express as px

# 3-D scatter of the tweets, coloured by their topic legend entry.
fig = px.scatter_3d(
    df_cluster_plotting,
    x='x3d',
    y='y3d',
    z='z3d',
    custom_data=['text', 'tf_idf', 'topic'],
    color='legend',
    width=1600, height=1000,
    template='seaborn',
)

# Hover shows the tweet text, then its topic id and keywords.
hover_lines = [
    "%{customdata[0]}",
    "Topic %{customdata[2]} Keywords: %{customdata[1]}<extra></extra>",
]

# hover text style and title
fig.update_layout(
    hoverlabel=dict(
        bgcolor="white",
        font_size=11,
        font_family="Times New Roman"
    ),
    title={
        'text': "<b>Tweets by Topic 3D"},
)

# hover data and point size
fig.update_traces(
    hovertemplate="<br>".join(hover_lines),
    marker={'size': 2},
)

#fig.show();
fig.write_html("./Plots/tweet_by_topic_map_3d.html")


In [None]:
## TO DO
# Open work items, kept as a bare string literal (it has no effect on the analysis).
"""
Figure out how to best isolate pro ukraine and anti russia tweets.
Remove html codes from text

"""

**Once Topics are selected, map them over time per country**

In [None]:
# Topic ids selected for the per-language analysis.
topics = [79, 13]
languages = df_cluster_topics.language.unique()#['de', 'it', 'fr', 'es', 'pl', 'pt', 'el', 'fi', 'cs', 'ro']

mask = df_cluster_topics.topic.isin(topics)

# Count tweets per (day, topic) with one column per language, restricted to the
# selected topics. The mask used to be computed but never applied, so the table
# contained every topic.
df_tweet_bytopic_bylanguage = (
    df_cluster_topics[mask]
    .groupby(['language', 'tweetcreatedts', 'topic'])
    .size()
    .unstack('language')
    .reset_index()
    .fillna(0)
)


In [None]:
warnings.filterwarnings('ignore')
#https://plotly.com/python/facet-plots/
# Plot the selected topics over time, one facet row per topic and one line per
# language. This cell used to redraw the earlier "Daily Tweets per language"
# figure from df_dailycounts and write it to "./Plots/.html", a file with no name.
selected_topic_counts = df_tweet_bytopic_bylanguage[df_tweet_bytopic_bylanguage['topic'].isin(topics)]
language_columns = [
    column for column in selected_topic_counts.columns
    if column not in ('tweetcreatedts', 'topic')
]
fig = px.line(
    selected_topic_counts,
    x='tweetcreatedts',
    y=language_columns,
    facet_row='topic',
    template='seaborn',
    title='Daily Tweets per language by topic',
)
fig.update_xaxes(
    dtick="M1",
    tickformat="%b\n%Y")

fig.show()
fig.write_html("./Plots/daily_tweets_per_language_by_topic.html") # add button to normalize