In [None]:
!pip install git+https://github.com/dtemkin/gsdmm.git
!pip install wordcloud
!pip install --upgrade gensim
!pip install --upgrade s3fs
!pip install --upgrade boto3

In [None]:
import string
import re
import html
import json
import gzip
import datetime
import itertools
import numpy as np
import pandas as pd

from wordcloud import WordCloud

import matplotlib.pyplot as plt

In [None]:
import gensim
from gsdmm import MovieGroupProcess

In [None]:
import nltk

# WordNet corpora downloaded for the WordNetLemmatizer imported here.
from nltk.stem import WordNetLemmatizer
nltk.download("wordnet")
nltk.download("omw-1.4")

# "punkt" is a tokenizer model (not a punctuation list); presumably it is what
# word_tokenize (imported below) relies on.
# NOTE(review): newer NLTK releases may look for "punkt_tab" instead - confirm
# against the installed NLTK version.
nltk.download("punkt")

# pos tagging: tagger model for nltk.pos_tag (called in get_words); 'tagsets'
# presumably only provides the tag documentation - confirm whether it is needed.
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')

from nltk.tokenize import word_tokenize

# English stop-word list, turned into english_stop_words further down.
from nltk.corpus import stopwords 
nltk.download('stopwords')

In [None]:
# s3 specific libraries
import boto3
# boto3 resource handle; read_gzip_from_s3 fetches objects through s3.Object(...)
s3 = boto3.resource("s3")

import s3fs
# NOTE: this rebinds the name `s3fs` from the module to an S3FileSystem
# instance. Later cells call s3fs.glob(...) on that instance, so the module
# itself is no longer reachable under this name until the import is re-run.
s3fs = s3fs.S3FileSystem(anon=False)

In [None]:
# Set (for fast membership tests) of NLTK's English stop words; get_words
# drops every lemma whose lower-cased form is in this set.
english_stop_words = set(stopwords.words('english'))

In [None]:
# List every object matching the cache file pattern and put the "s3://" scheme
# in front of each glob result, so each entry is a full S3 URL of the form
# read_text_data expects. The bare expression below displays how many matched.
data_files = [f"s3://{file_path}" for file_path in s3fs.glob("s3://mips-main-tests/chathura_tests/cache2/*cache*.json.gzip")]
len(data_files)

In [None]:
import gsdmm
gsdmm.__file__

In [None]:
data_files[0]#[len("s3://mips-main-tests/"):]

In [None]:
def read_gzip_from_s3(bucket_name, path_inside_bucket):
    """Fetch a gzip-compressed S3 object and return its decompressed bytes.

    Args:
        bucket_name: name of the S3 bucket holding the object.
        path_inside_bucket: object key inside that bucket.

    Returns:
        The fully decompressed payload as ``bytes``.
    """
    # The response body is handed to GzipFile as a file-like object.
    body_stream = s3.Object(bucket_name, path_inside_bucket).get()["Body"]
    with gzip.GzipFile(fileobj=body_stream) as compressed:
        return compressed.read()

def get_data_from_s3_file(bucket_name, path_inside_bucket):
    """Load a gzip-compressed, UTF-8 encoded JSON document from S3.

    Args:
        bucket_name: name of the S3 bucket holding the object.
        path_inside_bucket: object key inside that bucket.

    Returns:
        The parsed JSON value.
    """
    raw_bytes = read_gzip_from_s3(bucket_name, path_inside_bucket)
    return json.loads(raw_bytes.decode('utf-8'))

def read_text_data(full_s3_data_file_path):
    """Return the ``"data"`` entry of a gzip-compressed JSON file stored on S3.

    The bucket and key are parsed from the URL itself rather than sliced off
    with a hard-coded prefix length, so a path in another bucket (or one
    without the ``s3://`` scheme) no longer silently turns into a wrong key.

    Args:
        full_s3_data_file_path: full URL of the form ``s3://<bucket>/<key>``.

    Returns:
        The value stored under the top-level ``"data"`` key of the JSON file.

    Raises:
        ValueError: if the path is not of the form ``s3://<bucket>/<key>``.
    """
    scheme = "s3://"
    # str() also normalizes numpy string scalars (e.g. items of an ndarray).
    path = str(full_s3_data_file_path)
    if not path.startswith(scheme):
        raise ValueError(f"Expected a path starting with {scheme!r}, got {path!r}")
    bucket_name, _, path_inside_bucket = path[len(scheme):].partition("/")
    if not bucket_name or not path_inside_bucket:
        raise ValueError(f"Expected a path of the form s3://<bucket>/<key>, got {path!r}")
    data = get_data_from_s3_file(bucket_name, path_inside_bucket)
    return data["data"]

In [None]:
# Fixed seed so the same files are sampled on every run.
np.random.seed(123)
# Sample up to 100 files without replacement. The size is capped at the number
# of available files because np.random.choice raises a ValueError when asked
# for more unique items than exist.
sample_data_files = np.random.choice(data_files, size=min(100, len(data_files)), replace=False)

In [None]:
# Read each sampled file and flatten the per-file record lists into one list.
all_data = [
    record
    for data_file in sample_data_files
    for record in read_text_data(data_file)
]
len(all_data)

In [None]:
all_data[0]

In [None]:
# Union of the top-level keys seen on any record.
possible_fields = {field_name for tweet in all_data for field_name in tweet.keys()}
possible_fields

In [None]:
# Keys that are present on every record.
common_fields = {
    field_name
    for field_name in possible_fields
    if all(field_name in tweet for tweet in all_data)
}
common_fields

In [None]:
class Maybe:
    """Null-safe wrapper for walking nested JSON-like data.

    ``field`` and ``index`` return another ``Maybe``; when the requested
    step is not available they return ``Maybe(None)`` instead of raising,
    so lookups can be chained and resolved once with ``value()``.
    """

    def __init__(self, json_object):
        # The wrapped value: a dict, list, scalar or None.
        self.json_object = json_object

    def field(self, field):
        """Return ``Maybe(wrapped[field])`` if the wrapped value is a dict containing ``field``, else ``Maybe(None)``."""
        if isinstance(self.json_object, dict) and field in self.json_object:
            return Maybe(self.json_object[field])
        return Maybe(None)

    def index(self, index):
        """Return ``Maybe(wrapped[index])`` if the wrapped value is a list and ``index`` is in range, else ``Maybe(None)``.

        Negative indices count from the end; an out-of-range negative index
        yields ``Maybe(None)`` rather than raising ``IndexError``.
        """
        if isinstance(self.json_object, list):
            size = len(self.json_object)
            if -size <= index < size:
                return Maybe(self.json_object[index])
        return Maybe(None)

    def array(self, func=lambda m: m, as_type=list):
        """Map ``func`` over the wrapped list and convert the result with ``as_type``.

        When the wrapped value is not a list, the result is ``as_type([])``,
        so the return type is the same whether or not the data is present
        (e.g. ``"[]"`` for ``as_type=str``, ``[]`` for the default).
        """
        items = self.json_object if isinstance(self.json_object, list) else []
        return as_type([func(obj) for obj in items])

    def value(self):
        """Return the wrapped value (``None`` when a lookup along the chain failed)."""
        return self.json_object

# get_maybe(get_maybe(tweet_json, "referenced_tweets"), "quoted")

In [None]:
# Column labels for the tweet DataFrame, in the same order as the values
# returned by get_columns.
column_names = [
    # identifiers, timestamp and content
    "id",
    "conversation_id",
    "edit_history_tweet_ids",
    "author_id",
    "created_at",
    "text",
    # counters taken from public_metrics
    "impression_count",
    "like_count",
    "quote_count",
    "reply_count",
    "retweet_count",
    # ids of referenced tweets, grouped by reference type
    "quoted",
    "replied_to",
    "retweeted",
    # optional reply / mention information
    "in_reply_to_user_id",
    "mentions",
]
def get_columns(tweet_json):
    """Flatten one tweet JSON object into a row of column values.

    The values are returned in the same order as ``column_names``.

    The id lists (``quoted``, ``replied_to``, ``retweeted``, ``mentions``) are
    all stored as the string form of a Python list, ``"[]"`` when empty. The
    ``mentions`` value used to be a ``str`` when mentions were present but a
    ``list`` when they were absent; it is now always a ``str``.

    Args:
        tweet_json: dict for a single tweet. The keys read with plain
            indexing below must be present; the remaining ones are optional.

    Returns:
        A list with one value per entry of ``column_names``.

    Raises:
        KeyError: if a mandatory key is missing.
    """
    tweet = Maybe(tweet_json)

    # Ids of referenced tweets grouped by reference type; any other type is ignored.
    referenced_ids = {"quoted": [], "replied_to": [], "retweeted": []}
    for ref_tweet in tweet.field("referenced_tweets").array():
        same_type_ids = referenced_ids.get(ref_tweet["type"])
        if same_type_ids is not None:
            same_type_ids.append(ref_tweet["id"])

    # Build the plain list first and stringify it unconditionally, so the
    # column type does not depend on whether the tweet has any mentions.
    mention_ids = tweet.field("entities").field("mentions").array(lambda m: m["id"])

    columns_values = [
        # tweet always has following keys
        tweet_json["id"],
        tweet_json["conversation_id"],
        tweet_json["edit_history_tweet_ids"], # list of tweetIds
        tweet_json["author_id"],
        tweet_json["created_at"],
        tweet_json["text"],
        tweet_json["public_metrics"]["impression_count"],
        tweet_json["public_metrics"]["like_count"],
        tweet_json["public_metrics"]["quote_count"],
        tweet_json["public_metrics"]["reply_count"],
        tweet_json["public_metrics"]["retweet_count"],
        # optional tweet data fields
        str(referenced_ids["quoted"]),
        str(referenced_ids["replied_to"]),
        str(referenced_ids["retweeted"]),
        tweet.field("in_reply_to_user_id").value(),
        # tweet_json["geo"] # we dont take this field at the moment
        str(mention_ids),
        # Maybe(tweet_json).field("attachments") # we dont take this field at the moment
    ]
    return columns_values


In [None]:
# One row per tweet, columns labelled in column_names order.
df = pd.DataFrame(list(map(get_columns, all_data)), columns=column_names)
df

In [None]:
# df = pd.read_csv("s3://mips-main-tests/chathura_tests/data_minimal/all_data_minimal.csv")
# df

In [None]:
# df[df["created_at"] == '2022-04-01T04:58:38.000Z']

In [None]:
def clean_tweet_text(tweet_text):
    """Strip mentions, links and quote characters from one tweet.

    Differences from the earlier inline loop:
      * HTML entities are decoded first, so entity-encoded quotes are
        stripped like literal ones.
      * Line breaks become a single space instead of being deleted, so words
        on adjacent lines are no longer glued together ("foo\\nbar" used to
        become "foobar").

    Args:
        tweet_text: raw tweet text.

    Returns:
        The cleaned text.
    """
    cleaned = html.unescape(tweet_text)
    # Remove @mentions, http(s) links, apostrophes and typographic quotes.
    cleaned = re.sub(r"(@\S+|http\S+|\'|[“”’])", "", cleaned)
    # Keep a word boundary where the tweet had a line break.
    cleaned = re.sub(r"[\r\n]+", " ", cleaned)
    return cleaned


results = [clean_tweet_text(tweet_text) for tweet_text in df.text.values]

In [None]:
results

In [None]:
# Initialize wordnet lemmatizer
# (one shared instance; it is passed to get_words when the cleaned tweets are tokenized)
wnl = WordNetLemmatizer()

def postag_to_argument(postag):
    """Map a POS tag to the single-letter ``pos`` code passed to the lemmatizer.

    Only the first character of the tag is inspected: "V" -> "v", "N" -> "n",
    "J" -> "a", "R" -> "r". Every other tag falls back to "v".

    Args:
        postag: non-empty POS tag string (e.g. "VBD", "NN", "JJ", "RB").

    Returns:
        One of "v", "n", "a", "r".
    """
    first_letter_to_pos = {"V": "v", "N": "n", "J": "a", "R": "r"}
    return first_letter_to_pos.get(postag[0], "v")

def get_words(in_sentence, in_wnlemmatizer):
    """Turn a sentence into a list of lemmas with stop words removed.

    Steps: strip ASCII punctuation, tokenize, POS-tag, lemmatize each token
    with the ``pos`` argument derived from its tag, then drop lemmas whose
    lower-cased form is in ``english_stop_words``.

    Fixes over the previous version:
      * The lemmatizer passed as ``in_wnlemmatizer`` is actually used; the
        parameter used to be ignored in favour of the global ``wnl``.
      * Each token is lemmatized with the tag assigned at its own position.
        The tags used to be collapsed into a ``dict`` keyed by word, so a word
        occurring several times was always given the tag of its last
        occurrence.

    Args:
        in_sentence: text to process.
        in_wnlemmatizer: object exposing ``lemmatize(word, pos=...)``.

    Returns:
        List of lemma strings, in token order.
    """
    # Remove punctuation (string.punctuation only covers ASCII characters)
    sentence_no_punct = in_sentence.translate(str.maketrans("", "", string.punctuation))
    # separate tokens
    word_tokens = word_tokenize(sentence_no_punct)
    # tag Part of Speech; keep the (word, tag) pairs positional
    tagged_tokens = nltk.pos_tag(word_tokens)
    # Lemmatize
    lemmas = [
        in_wnlemmatizer.lemmatize(word, pos=postag_to_argument(tag))
        for word, tag in tagged_tokens
    ]
    # remove stop words
    filtered_lemmas = [w for w in lemmas if w.lower() not in english_stop_words]
    return filtered_lemmas

In [None]:
# Tokenize and lemmatize every cleaned tweet; one token list per document.
text_tokens = list(map(lambda cleaned_text: get_words(cleaned_text, wnl), results))
text_tokens

In [None]:
# create dictionary of all words in all documents
dictionary = gensim.corpora.Dictionary(text_tokens)

# filter extreme cases out of dictionary: keep tokens that occur in at least
# 15 documents and in no more than 50% of them (at most 100000 tokens)
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# create variable containing length of dictionary/vocab (after filtering)
vocab_length = len(dictionary)

# create BOW dictionary
bow_corpus = [dictionary.doc2bow(doc) for doc in text_tokens]

# Restrict every document to the filtered vocabulary. The model is given
# vocab_length as its vocabulary size, so the documents it is fitted on must
# only contain tokens from that vocabulary; the unfiltered token lists were
# passed before, so the declared size did not match the data.
# Documents keep their position, so y[i] still refers to text_tokens[i].
kept_tokens = set(dictionary.token2id)
gsdmm_docs = [[token for token in doc if token in kept_tokens] for doc in text_tokens]

# initialize GSDMM
# NOTE: this rebinds the name `gsdmm` (previously the imported module) to the
# model instance; the cells below rely on that name.
gsdmm = MovieGroupProcess(K=10, alpha=0.1, beta=0.3, n_iters=15)

# fit GSDMM model; y holds one cluster label per document
y = gsdmm.fit(gsdmm_docs, vocab_length)

In [None]:
# Number of documents assigned to each cluster, indexed by cluster number
doc_count = np.array(gsdmm.cluster_doc_count)
print('Number of documents per topic :', doc_count)

# Indices of (at most) the 15 largest clusters, biggest first
top_index = np.argsort(doc_count)[::-1][:15]
print('Most important clusters (by number of text_tokens inside):', top_index)

# Helper that prints the highest-count words of the selected clusters
def top_words(cluster_word_distribution, top_cluster, values):
    """Print the ``values`` most frequent words of each cluster in ``top_cluster``.

    Args:
        cluster_word_distribution: sequence (indexable by cluster id) of
            ``{word: count}`` mappings.
        top_cluster: iterable of cluster ids to report, in print order.
        values: number of (word, count) pairs to print per cluster.
    """
    for cluster_id in top_cluster:
        word_counts = list(cluster_word_distribution[cluster_id].items())
        # Stable descending sort by count; ties keep their original order.
        word_counts.sort(key=lambda pair: pair[1], reverse=True)
        print("\nCluster %s : %s" % (cluster_id, word_counts[:values]))

# get top words in topics: print the 20 highest-count words of every cluster
# listed in top_index
top_words(gsdmm.cluster_word_distribution, top_index, 20)

In [None]:
def draw_wordcloud(topic_number, values = 20):
    """Draw, save and show a word cloud of the top words of one cluster.

    Reads the word counts from the global ``gsdmm`` model, renders the
    ``values`` most frequent words, writes the image to
    ``./wc_<topic_number>.png`` and displays it.

    A cluster with no words (or ``values <= 0``) is reported and skipped,
    because ``WordCloud.generate_from_frequencies`` raises ``ValueError`` on
    an empty mapping, which would abort a loop over all clusters.

    Args:
        topic_number: index of the cluster in ``gsdmm.cluster_word_distribution``.
        values: number of top words to include in the cloud.
    """
    # Get topic word distributions from gsdmm model
    cluster_word_distribution = gsdmm.cluster_word_distribution

    # (word, count) pairs of the selected cluster, most frequent first
    topic_dict = sorted(cluster_word_distribution[topic_number].items(), key=lambda k: k[1], reverse=True)

    top_items = topic_dict[:values]
    if not top_items:
        print(topic_number, "has no words to draw; skipping word cloud")
        return

    # Generate a word cloud image
    wc = WordCloud(background_color='#fcf2ed',
                   width=1800,
                   height=700,
                   random_state = 123,
                   #font_path=path_to_font,
                   colormap='flag')

    wc.generate_from_frequencies(dict(top_items))

    print(topic_number, topic_dict[:5])

    # Print to screen, drawing on the axes created here rather than on
    # pyplot's implicit "current" axes
    fig, ax = plt.subplots(figsize=[10,10])
    ax.imshow(wc, interpolation='bilinear')
    ax.axis("off")

    # Save to disk
    wc.to_file(f"./wc_{topic_number}.png")

    plt.show()
    # Release the figure so repeated calls in a loop do not accumulate open figures
    plt.close(fig)

In [None]:
# Draw and save a 25-word cloud for every cluster in top_index
# (top_index is ordered from the largest cluster to the smallest).
for cluster in top_index:
    draw_wordcloud(cluster, 25)