In [None]:
import math
import spacy
import random
import pandas as pd
import numpy as np
from tqdm import tqdm 
from collections import Counter
from util import preprocess_w2e

# Load spaCy's small English pipeline with the textcat, parser and ner
# components disabled.
# NOTE(review): `nlp` is not referenced by any cell visible in this notebook —
# confirm it is still needed before keeping this load.
nlp = spacy.load('en_core_web_sm', disable=['textcat', 'parser', 'ner'])

In [None]:
# Load pre-defined word list
def _read_word_set(path):
    """Read a text file and return the set of its whitespace-stripped lines."""
    with open(path, 'r') as word_file:
        return {entry.strip() for entry in word_file}


D_common = _read_word_set('data/D_common.txt')
D_tech = _read_word_set('data/D_tech.txt')
D_whitelist = _read_word_set('data/D_whitelist.txt')

In [None]:
def count_tokens(token_list):
    """Tally tokens across a collection of token sequences.

    Args:
        token_list: iterable of iterables of hashable tokens.

    Returns:
        (counter, vocabulary): a Counter mapping each token to the number of
        times it occurs over all sequences, and the set of distinct tokens.
    """
    counter = Counter()
    for tokens in token_list:
        for token in tokens:
            counter[token] += 1

    return counter, set(counter)

def get_significat_words(word_list, n, current_c, z=1.645):
    """Keep the words whose relative frequency passes a z-score threshold.

    For every word, p_w = current_c[w] / n is compared with
    z * sqrt(p_w * (1 - p_w) / n); the word is kept when p_w is at least
    that large.

    Args:
        word_list: iterable of candidate words.
        n: sample size used as the denominator of p_w.
        current_c: mapping word -> frequency, indexed as current_c[w].
        z: critical value of the test; defaults to 1.645.

    Returns:
        list of the words that pass, in the iteration order of word_list.
        Empty when n is not positive.
    """
    if n <= 0:
        # p_w is undefined without a positive sample size; report nothing
        # instead of failing with ZeroDivisionError.
        return []

    significant = []
    for w in word_list:
        p_w = current_c[w] / n
        # A frequency larger than n gives p_w > 1 and a negative value under
        # the square root (math.sqrt would raise ValueError). Clamp the
        # variance term at zero so such a word is simply treated as passing.
        variance = max(p_w * (1 - p_w) / n, 0.0)
        if p_w >= z * math.sqrt(variance):
            significant.append(w)
    return significant

def get_new_words(C, K, n, current_c):
    """Return the significant words of K that are in neither C nor the word lists.

    Words of K that also occur in C, D_common or D_tech are discarded; the
    remainder is filtered through get_significat_words(…, n, current_c).
    The candidate count before and after that filter is printed.
    """
    known_vocab = D_common.union(D_tech)
    new_words = set(K - C - known_vocab)
    new_words_ = get_significat_words(new_words, n, current_c)
    print(f"new words before: {len(new_words)}; new words after: {len(new_words_)}; ")
    return new_words_

In [None]:
# Load data
tweet_df = (
    pd.read_json('../data/tweets_20220601_20220625.json')
    .drop_duplicates(subset=['text'])
)


def _tweets_in_window(df, start, end):
    """Return the rows of df with start < created_at <= end, re-indexed from 0."""
    created = df['created_at']
    in_window = (created > np.datetime64(start)) & (created <= np.datetime64(end))
    return df.loc[in_window].reset_index(drop=True)


# Three consecutive one-day windows of history (oldest first), then the
# one-day window that is treated as "current".
day_edges = ['2022-06-01', '2022-06-02', '2022-06-03', '2022-06-04']
tweets_past_list = [
    _tweets_in_window(tweet_df, window_start, window_end)
    for window_start, window_end in zip(day_edges[:-1], day_edges[1:])
]
tweets_cur = _tweets_in_window(tweet_df, '2022-06-04', '2022-06-05')

In [None]:
# Run preprocess_w2e over every tweet text; text_p holds the resulting list of tokens.
tweets_past_list_p = [day_df['text'].apply(preprocess_w2e) for day_df in tweets_past_list]
tweets_cur['text_p'] = tweets_cur['text'].apply(preprocess_w2e)

In [None]:
# New keywords
tokens_last = tweets_past_list_p[-1].tolist()
tokens_cur = tweets_cur.text_p.tolist()

# C: vocabulary of the last window (t-1 ~ t)
# K: vocabulary of the current window (t)
last_c, C = count_tokens(tokens_last)  # returns (Counter of token frequencies, set of tokens)
current_c, K = count_tokens(tokens_cur)
# NOTE(review): n is the number of token lists in the *last* window, while the
# frequencies being tested come from current_c — confirm this is the intended
# sample size.
n = len(tokens_last)

# get_new_words requires the current-window counts as its fourth argument;
# calling it without them raises a TypeError.
new_words = get_new_words(C, K, n, current_c)

In [None]:
# Token frequencies of every past window in long format: one row per
# (token, ts), where ts is the position of the window in tweets_past_list_p.
past_frames = []
for i, text_p in enumerate(tweets_past_list_p):
    past_c = count_tokens(text_p)[0]
    past_frames.append(
        pd.DataFrame({
            'token': list(past_c.keys()),
            'freq': list(past_c.values()),
            'ts': i,
        })
    )

# Concatenate once instead of growing the frame inside the loop, which copies
# all previous rows on every iteration. pd.concat rejects an empty list, so
# fall back to an empty frame with the expected columns.
if past_frames:
    past_all_df = pd.concat(past_frames)
else:
    past_all_df = pd.DataFrame(columns=['token', 'freq', 'ts'])

In [None]:
# Re-emerging keywords

smoothing_f = 0.4
reemerge_words = []
k = len(tweets_past_list_p) # 3
# Scaling factor lambda * (1 - (1 - lambda)^(2k)) / (2 - lambda) applied to the
# mean squared deviation below.
sf = (smoothing_f * (1 - (1 - smoothing_f)**(2 * k))) / (2 - smoothing_f)

# Candidates: last-window words that are in D_tech but not whitelisted, kept
# only if they pass the significance filter. get_significat_words needs the
# current-window counts as its third argument; omitting them raises TypeError.
C_R = C.intersection(D_tech - D_whitelist)
C_R_ = get_significat_words(C_R, n, current_c)

print(past_all_df.shape)
past_all_df = past_all_df[past_all_df['token'].isin(C_R_)]
print(past_all_df.shape)

for token, tmp_df in past_all_df.groupby('token'):
    # One frequency per past window, ordered by ts, with 0 for windows in which
    # the token did not occur. Reindexing over range(k) both fills the gaps and
    # guarantees chronological order, which the EWMA and the "latest value"
    # lookup below depend on. Assumes at most one row per (token, ts).
    freq = tmp_df.set_index('ts')['freq'].reindex(range(k), fill_value=0)
    ewma = freq.ewm(alpha=smoothing_f, adjust=False).mean()
    fw = current_c[token]
    fw_ = ewma.iloc[-1]
    sq_dev_sum = ((freq - ewma) ** 2).sum()
    # NOTE(review): 3.8 is presumably a chi-square style critical value — confirm.
    if (fw - fw_)**2 >= 3.8 * sq_dev_sum / k * sf:
        reemerge_words.append(token)

In [None]:
# Quick size check: number of re-emerging keywords and number of new keywords.
len(reemerge_words), len(new_words)

In [None]:
# Persist both keyword lists, one word per line.
for out_path, words in (('reemerge_words.txt', reemerge_words),
                        ('new_words.txt', new_words)):
    with open(out_path, 'w') as out_file:
        out_file.writelines(f"{word}\n" for word in words)

In [None]:
# Union of the re-emerging and the new keywords; used below to select the
# tweets that mention at least one of them.
keywords = set(reemerge_words).union(new_words)

In [None]:
# Union of the common-word list, the whitelist and the current-window vocabulary K.
# NOTE(review): filter_words is not used by any cell visible in this notebook.
filter_words = D_common.union(D_whitelist).union(K) 

In [None]:
# Keep only the current-window tweets whose tokens share at least one keyword.
has_keyword = tweets_cur['text_p'].apply(lambda tokens: not keywords.isdisjoint(tokens))
filtered_tweets = tweets_cur[has_keyword]

In [None]:
# Token lists of the keyword-bearing tweets, as a plain Python list.
fltr_tweets = filtered_tweets.text_p.tolist()

In [None]:
# Square matrix with one row/column per filtered tweet, initialised to zero;
# the pairwise-similarity loop below fills it in.
sim_matrix = np.zeros((len(fltr_tweets), len(fltr_tweets)))

In [None]:
# Pairwise Jaccard similarity between the token sets of the filtered tweets.
# Each token list is converted to a set once, outside the O(n^2) pair loop.
token_sets = [set(tokens) for tokens in fltr_tweets]
for i in tqdm(range(len(token_sets))):
    set_i = token_sets[i]
    for j in range(i, len(token_sets)):
        set_j = token_sets[j]
        shared = len(set_i & set_j)
        # |A ∪ B| = |A| + |B| - |A ∩ B|, which avoids building the union set.
        combined = len(set_i) + len(set_j) - shared
        # Two empty token sets are identical by convention (similarity 1)
        # rather than a division by zero.
        jaccard = shared / combined if combined else 1.0
        # The measure is symmetric, so fill both triangles at once.
        sim_matrix[i, j] = jaccard
        sim_matrix[j, i] = jaccard

## Clustering

In [None]:
# Hierarchical clustering
import inspect

from sklearn.cluster import AgglomerativeClustering

# A 'precomputed' matrix is interpreted as pairwise *distances*, whereas
# sim_matrix holds Jaccard similarities (1 = identical). Convert it so that
# similar tweets are close; otherwise the most dissimilar tweets get merged.
dist_matrix = 1.0 - sim_matrix

# scikit-learn 1.2 renamed the `affinity` parameter to `metric` and later
# removed `affinity`; pass whichever keyword the installed version accepts.
_metric_kw = (
    'metric'
    if 'metric' in inspect.signature(AgglomerativeClustering.__init__).parameters
    else 'affinity'
)
clustering = AgglomerativeClustering(
    linkage='complete', **{_metric_kw: 'precomputed'}
).fit(dist_matrix)

clustering.labels_


In [None]:
# Positions (rows of the clustered matrix) of the tweets assigned to cluster label 1.
sample_indices = list(np.where(clustering.labels_==1)[0])

In [None]:
# The cluster labels are positional over filtered_tweets (the rows the
# similarity matrix was built from), so the positions must be looked up there;
# applying them to tweets_cur would select unrelated tweets.
event1 = filtered_tweets.iloc[sample_indices]

In [None]:
# Number of clusters reported by the fitted model.
clustering.n_clusters_

In [None]:
# Cluster sizes: how many tweets received each cluster label.
Counter(clustering.labels_)

In [None]:
# Display the detected new keywords.
new_words

In [None]:
# Raw text of the tweets selected for cluster label 1.
event1['text'].tolist()