http://www.nltk.org/nltk_data/

In [None]:
import numpy as np
import pandas as pd
import json
import logging
import os
# Notebook-wide logger for the tweet pipeline.
log = logging.getLogger("twitter")
# A logger without an explicit level inherits WARNING from the root logger, so
# DEBUG records would be discarded before they ever reach the handler below.
log.setLevel(logging.DEBUG)
hdl = logging.StreamHandler()
hdl.setLevel(logging.DEBUG)
# Re-running this cell must not stack a second handler (duplicated output).
if not log.handlers:
    log.addHandler(hdl)

In [None]:
# Raw tweets per sentiment class: each entry is one line of the sample file
# (still an undecoded JSON string); only the first 1000 lines are kept.
data = {}
sources = {
    "positive": "samples/positive_tweets.json",
    "negative": "samples/negative_tweets.json",
}

for label, path in sources.items():
    with open(path, "r") as handle:
        lines = handle.readlines()
    data[label] = lines[:1000]

In [None]:
import inflect
import regex as re
# Single module-level inflect engine; singularize() below calls its
# singular_noun() method for every token.
inflect_engine = inflect.engine()

def singularize(word):
    """Return the singular form of ``word``, or ``word`` itself if it has none.

    ``inflect_engine.singular_noun`` returns ``False`` when it does not
    treat ``word`` as a plural noun; in that case the input is handed back
    unchanged.

    Empty tokens are returned as-is without consulting inflect: they carry
    no word to singularize, and newer inflect releases reject empty strings
    with an exception instead of returning ``False``.
    """
    if not word:
        return word
    singular = inflect_engine.singular_noun(word)
    # Identity check: only the ``False`` sentinel means "no singular form".
    return word if singular is False else singular

def process_phrase(string: str):
    """Normalise the raw text of a tweet into lowercase, space-separated tokens.

    Steps, in order: tabs/newlines become spaces; HTML entities, HTML tags
    and parenthesised spans are removed; punctuation is stripped; ``/``
    becomes a space; digits, ``#hashtags`` and ``@mentions`` are removed; the
    text is lowercased; ``i'll`` is expanded to ``i will`` and the
    stand-alone tokens ``r``/``u`` are replaced by ``are``/``you``; finally
    every token is passed through ``singularize``.

    Parameters
    ----------
    string : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (``""`` when nothing is
        left after cleaning).
    """
    # Stand-alone chat abbreviations and their expansion.
    slang = {"r": "are", "u": "you"}

    # All patterns are raw strings so the backslashes reach the regex engine
    # untouched (no invalid-escape warnings from the Python parser).
    # Line breaks and tabs separate words, so they turn into a space rather
    # than being deleted (deleting them would glue neighbouring words).
    string = re.sub(r'[\t\n]+', ' ', string)
    string = re.sub(r'&[a-zA-Z0-9]+', '', string)  # HTML entities (&amp ...)
    string = re.sub(r'<[^<]+?>', '', string)  # HTML tags
    string = re.sub(r'\([^<]+?\)', '', string)  # parenthesised spans
    string = re.sub(r'[-.:",!()?;_]', '', string)  # punctuation
    string = re.sub(r'/', ' ', string)  # "a/b" -> "a b"
    string = re.sub(r'[0-9]+', '', string)  # numbers
    string = re.sub(r'#[a-zA-Z0-9]+', '', string)  # hashtags
    string = re.sub(r'@[_a-zA-Z0-9]+', '', string)  # mentions

    # Lowercase before expanding abbreviations so "U", "R" and "I'll" are
    # handled as well; the result is lowercase either way.
    string = string.lower()
    string = re.sub(r"\bi'll\b", 'i will', string)

    # split() without argument collapses every run of whitespace, so no empty
    # tokens are produced and no words are merged.
    tokens = [slang.get(token, token) for token in string.split()]
    return ' '.join(map(singularize, tokens))

def get_hastangs(tweet):
    """Return the ``text`` field of every hashtag entity of ``tweet``, in order."""
    hashtag_entities = tweet["entities"]["hashtags"]
    return [entity["text"] for entity in hashtag_entities]

In [None]:
def get_tweet_dfs(tweet_data, tweet_type="positive"):
    """Turn one raw tweet (a JSON string) into three single-tweet frames.

    Parameters
    ----------
    tweet_data : str
        JSON document of one tweet. The keys ``text``, ``id_str`` and
        ``entities -> hashtags -> text`` are read.
    tweet_type : str
        Label written to the ``kind`` column (default ``"positive"``).

    Returns
    -------
    tuple of (df, df_meta, df_files)
        ``df``       one column named ``id_<id_str>``, indexed by the words of
                     the processed text, holding their counts;
        ``df_meta``  one column with the same name, indexed by the tweet's
                     distinct hashtags, every value 1;
        ``df_files`` one row indexed by ``id_<id_str>`` with column ``kind``.
    """
    tweet = json.loads(tweet_data)
    text = process_phrase(tweet["text"])
    hastangs = get_hastangs(tweet)
    name = "id_" + tweet["id_str"]

    # split() (no argument) drops empty tokens so "" is never counted as a word.
    words, counts = np.unique(text.split(), return_counts=True)
    df = pd.Series(counts, index=words, name=name).to_frame()

    # A hashtag repeated inside one tweet would give a duplicated index, which
    # multiplies rows in the later outer joins; presence is all that matters.
    df_meta = pd.Series(1, index=sorted(set(hastangs)), name=name).to_frame()

    # Built directly: DataFrame.append was removed in pandas 2.0.
    df_files = pd.DataFrame({"kind": [tweet_type]}, index=[name])

    return df, df_meta, df_files

def append_callback(x):
    """Merge one ``get_tweet_dfs`` result into the module-level accumulators.

    Parameters
    ----------
    x : tuple
        ``(df_j, df_meta_j, df_files_j)`` as returned by ``get_tweet_dfs``.

    The globals ``df``, ``df_meta`` and ``df_files`` are rebound to the merged
    frames. If any merge step fails, the failure is logged and all three
    globals keep their previous value, so they never get out of sync.
    """
    global df
    global df_meta
    global df_files
    df_j, df_meta_j, df_files_j = x
    try:
        # Compute everything first; assign only once all three succeeded.
        merged_df = df.join(df_j, how="outer")
        merged_meta = df_meta.join(df_meta_j, how="outer")
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        merged_files = pd.concat([df_files, df_files_j])
    except Exception:
        # Best effort: skip this tweet, but leave a trace instead of hiding it.
        log.exception("could not merge tweet frames; tweet skipped")
        return
    df, df_meta, df_files = merged_df, merged_meta, merged_files

In [None]:
import multiprocessing as mp

In [None]:
pool = {}
work = {}

# Accumulators that append_callback rebinds as results come in.
df = pd.DataFrame()
df_meta = pd.DataFrame()
df_files = pd.DataFrame(columns=["kind"])

# One pool serves both classes. Callbacks run in the pool's result-handler
# thread, so two pools would run append_callback from two threads at once on
# the same globals, and concurrent read-modify-write would lose updates.
with mp.Pool(6) as shared_pool:
    for kind in ["positive", "negative"]:
        pool[kind] = shared_pool
        work[kind] = [
            shared_pool.apply_async(
                get_tweet_dfs,
                args=(d, kind),
                callback=append_callback,
                # Worker failures are real errors; at DEBUG they can be
                # filtered out before anyone sees them.
                error_callback=log.error,
            )
            for d in data[kind]
        ]

    # No more tasks will be submitted; wait until every callback has run.
    shared_pool.close()
    shared_pool.join()

In [None]:
# Filter the bag-of-words table (rows = words, columns = tweets) and bring the
# hashtag and label tables in line with it.

# Word/tweet pairs absent from the outer joins become 0 counts.
# NOTE(review): drop_duplicates compares row values only, so distinct words
# with identical counts in every tweet collapse to the first one - confirm
# this is intended.
df = df.fillna(0).astype(int).drop_duplicates()

'''
do reindex in two steps to avoid undefined behaviour
the sum is made on the new index
'''
# Keep words whose counts summed over all tweets are at least 5.
df = df.reindex(index = df.index[df.sum(1) >= 5])
# Keep tweets whose counts over the remaining words sum to more than 5.
df = df.reindex(columns = df.columns[df.sum(0) > 5])

df_meta = df_meta.fillna(0).astype(int).drop_duplicates()

# Same tweet columns, in the same order, as df.
df_meta = df_meta.reindex(columns=df.columns)
# Keep hashtags whose row sum over the retained tweets is greater than 1.
df_meta = df_meta.reindex(index=df_meta.index[df_meta.sum(1)>1])

# One label row per retained tweet, in df's column order.
df_files=df_files.reindex(index=df.columns)

In [None]:
# Smallest per-column (per-tweet) total count left in df.
df.sum(0).min()

In [None]:
# Print the (rows, columns) shape of the word, hashtag and label tables.
for frame in (df, df_meta, df_files):
    print(frame.shape)

In [None]:
# Row-wise sum of df_meta: one value per hashtag row.
df_meta.sum(1)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Rank/frequency curve: each column of df is divided by its own total, the
# resulting fractions are averaged per row and plotted in descending order.
fig, ax = plt.subplots()
column_totals = df.sum(0)
row_means = df.divide(column_totals, 1).mean(1)
row_means.sort_values(ascending=False).plot(ax=ax)

# Reference curve 0.2 / x on the same axes.
x = np.linspace(2, 3e2)
ax.plot(x, 2e-1 * x ** (-1))

# Both axes logarithmic.
ax.set_xscale("log")
ax.set_yscale("log")

# Make hSBM graph

In [None]:
import sys
sys.path.append("../hSBM_Topicmodel/")

In [None]:
import graph_tool.all as gt
from sbmtm import sbmtm

In [None]:
# NOTE(review): this rebinds the imported class name `sbmtm` to an instance,
# so running this cell a second time calls the instance instead of the class.
sbmtm = sbmtm()
# Hand the filtered word/tweet table to the model, then write the graph to the
# given file (presumably in the current working directory - confirm in sbmtm).
sbmtm.make_graph_from_BoW_df(df)
sbmtm.save_graph("twitter.xml.gz")

In [None]:
# Keep a handle on the model's graph attribute and show it as the cell output.
g = sbmtm.g
g

In [None]:
# Fit the hSBM model on the graph built above.
# NOTE(review): the meaning of n_init / B_min / parallel is defined by
# sbmtm.fit, which is not visible here - check its documentation.
sbmtm.fit(n_init=5, verbose=False, B_min=10, parallel=True)

In [None]:
# Write the label table to twitter/files.dat and the hSBM output to
# twitter/topsbm/. save_data()/save_graph() are called without a path, so the
# working directory is switched for them and always restored afterwards, even
# when a save step raises.
start_dir = os.getcwd()
# Creates both levels; no error if they already exist (like `mkdir -p`).
os.makedirs("twitter/topsbm", exist_ok=True)
try:
    os.chdir("twitter/")
    df_files.to_csv("files.dat")
    os.chdir("topsbm/")
    sbmtm.save_data()
    sbmtm.save_graph()
finally:
    os.chdir(start_dir)

In [None]:
# Draw the fitted state's hierarchy with graph-tool, passing the graph's "kind"
# vertex property as vertex_kind and requesting the bipartite layout.
gt.draw_hierarchy(sbmtm.state, layout="bipartite", hedge_pen_width=8, hvertex_size=25, vertex_kind=sbmtm.g.vertex_properties["kind"])

## triSBM

In [None]:
sys.path.append("../trisbm/")
from trisbm import trisbm

In [None]:
# NOTE(review): this rebinds the imported class name `trisbm` to an instance,
# so running this cell a second time calls the instance instead of the class.
trisbm = trisbm()
# Stack the word rows and the hashtag rows into one table. pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0. The callable returns 2
# for index labels found in df_meta (hashtags) and 1 for all others (words).
trisbm.make_graph(
    pd.concat([df, df_meta]),
    lambda word_keyword: 2 if word_keyword in df_meta.index else 1,
)

In [None]:
# Store the word + hashtag graph under the given file name.
trisbm.save_graph("twitter_keyword.xml.gz")

In [None]:
# Fit the triSBM model on the graph built above.
# NOTE(review): the meaning of n_init / B_min is defined by trisbm.fit, which
# is not visible here - check its documentation.
trisbm.fit(n_init=5, verbose=False, B_min=15)

In [None]:
import os

In [None]:
# Write the label table to twitter_key/files.dat and the triSBM output to
# twitter_key/trisbm/. save_data()/save_graph() are called without a path, so
# the working directory is switched for them and always restored afterwards,
# even when a save step raises.
start_dir = os.getcwd()
# Creates both levels; no error if they already exist (like `mkdir -p`).
os.makedirs("twitter_key/trisbm", exist_ok=True)
try:
    os.chdir("twitter_key/")
    df_files.to_csv("files.dat")
    os.chdir("trisbm/")
    trisbm.save_data()
    trisbm.save_graph()
finally:
    os.chdir(start_dir)

In [None]:
# Draw the fitted triSBM state's hierarchy with graph-tool, passing the graph's
# "kind" vertex property as vertex_kind. No layout/pos is given here, so
# graph-tool's default placement applies.
gt.draw_hierarchy(trisbm.state, 
                  #pos=gt.sfdp_layout(model.g),
                  hedge_pen_width=8, 
                  hvertex_size=25, 
                  vertex_kind=trisbm.g.vertex_properties["kind"]
                 )

## Benchmark

In [None]:
from topicpy.hsbmpy import get_scores, get_scores_shuffled, add_score_lines, normalise_score
import matplotlib.pyplot as plt
import pandas as pd
import os

In [None]:
# Collect the scores of both models against the "kind" label, plus a shuffled
# baseline, then normalise by that baseline.
# NOTE(review): the structure returned by get_scores is defined in
# topicpy.hsbmpy; the code below only relies on it being indexable by label
# and on each entry later exposing "norm_V" - confirm against the library.
labels = ["kind"]
# Scores read from the "twitter_key" directory (triSBM output).
scores = get_scores("twitter_key", labels, algorithm="trisbm", verbose=False, df_files=df_files)
# Store the entry for the label under the algorithm's name as well.
scores['trisbm'] = scores[labels[0]]
# Scores read from the "twitter" directory (hSBM output).
scores["hsbm"]=get_scores("twitter", labels, algorithm="topsbm", verbose=False, df_files=df_files)[labels[0]]
# Baseline entry used as the denominator below.
scores['shuffle'] = get_scores_shuffled("twitter_key", df_files, label=labels[0], algorithm='trisbm')
# The operation divides each score (x) by the baseline (y); presumably this is
# what fills "norm_V" - TODO confirm in normalise_score.
normalise_score(scores, base_algorithm="shuffle", operation=lambda x,y: x/y)

In [None]:
# Line plot of the normalised scores of hSBM, triSBM and the shuffled baseline.
fig=plt.figure(figsize=(18,15))
ax = fig.subplots(1)
add_score_lines(ax,scores,labels=["hsbm","trisbm", "shuffle"], V="norm_V", alpha=1)
ax.set_xscale('log')
# Upper y limit: 10% above the largest "norm_V" value over all score entries.
ax.set_ylim(0,max(map(lambda s: max(s["norm_V"]), scores.values()))*1.1)
# NOTE(review): the x axis is logarithmic, where a lower limit of 0 cannot be
# represented - a positive lower bound is probably intended; confirm.
ax.set_xlim(0,10)

# NOTE(review): with non-inline backends the figure may already be closed
# after plt.show(), so saving first would be safer - confirm the backend used.
plt.show()
fig.savefig("metric_scores.pdf")

In [None]:
import plotly.graph_objects as go

In [None]:
# Bar chart comparing the normalised scores of hSBM and triSBM.
fig = go.Figure()
fig.add_traces([
    go.Bar(y=scores["hsbm"]["norm_V"], name="hSBM"),
    go.Bar(y=scores["trisbm"]["norm_V"], name="triSBM"),
])

# Font settings shared by both axes.
titlefont = {
    "size": 30,
}

tickfont = {
    "size": 25,
}

# The axis title font is given as title.font: the flat `titlefont` axis
# attribute is deprecated in plotly in favour of the nested form.
layout = {
    "title": "Twitter dataset",
    "xaxis": {
        "title": {
            "text": "Resolution",
            "font": titlefont,
        },
        "tickfont": tickfont,
    },
    "yaxis": {
        "title": {
            "text": "NMI/NMI*",
            "font": titlefont,
        },
        "tickfont": tickfont,
    },
    "legend": {
        "font_size": 35,
    },
}

fig.update_layout(layout)
#fig.write_image("metric_scores_bar.pdf")

In [None]:
from topicpy.hsbmpy import clusteranalysis

In [None]:
# Run the cluster analysis against the "kind" label for both output folders,
# each with the algorithm whose results it holds.
for folder, algorithm_name in (("/twitter/", "topsbm"), ("/twitter_key/", "trisbm")):
    clusteranalysis(os.getcwd()+folder, ["kind"], algorithm=algorithm_name)