In [None]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.tokenize import sent_tokenize
from scipy.stats.stats import pearsonr 
import numpy as np
import math
import pickle

# Seed NumPy's global random number generator with a fixed value so that
# code drawing from that generator starts from the same state on every run.
np.random.seed(2018)

import nltk
#nltk.download('wordnet')

In [None]:
# Shared English Snowball stemmer; lemmatize_stemming applies it to every token.
stemmer = SnowballStemmer("english")

In [None]:
import glob
import random
import itertools
import os
from pprint import pprint
import time
import re
import numpy as np
import seaborn as sns

In [None]:
from matplotlib import pyplot as plt

In [None]:
# Load the full policy dataset from a pickle file.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so this path must only ever point at a trusted, locally produced pickle.
# Presumably the pickle holds a pandas DataFrame (later cells use .loc, .sample
# and column attribute access on it) -- confirm against the code that wrote it.
import pickle
with open("../data/deduped_policy_text_v11no_html_with_links_and_emails.pickle", "rb") as f:
    df_all = pickle.load(f)

In [None]:
# Subsets of the full dataset. They are built from df_all (not df) because df
# is only assigned in a later cell; using df here fails on a fresh kernel.

# Every row whose site_url has at least one row with alexa_rank <= 1000.
top_1k_sites = set(df_all.loc[df_all.alexa_rank <= 1000, "site_url"].unique())
df_1k = df_all[df_all.site_url.isin(top_1k_sites)]

# Random sample of 1000 rows from the full dataset.
df_sample = df_all.sample(1000)

In [None]:
# Working DataFrame used by all following cells: the full dataset.
df = df_all

In [None]:
# Show the column labels of the working DataFrame.
list(df.columns)

In [None]:
# Matches a run of one or more '#' characters. Written as a raw string so the
# backslash reaches the regex engine instead of being an invalid string escape
# (which Python reports as a DeprecationWarning/SyntaxWarning).
section_re = re.compile(r"\#+")
# Size of the n-grams produced by preprocess (1 = single tokens).
ngram_size = 1

# TextTiling tokenizer; load_and_preprocess uses it to split each policy text
# into segments.
ttt = nltk.tokenize.TextTilingTokenizer()

def lemmatize_stemming(text):
    """Reduce a single token to its stem.

    The token is first lemmatized as a verb with WordNet, and the resulting
    lemma is then passed through the module-level ``stemmer``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Turn raw text into a list of normalized n-gram strings.

    The text is tokenized with gensim's ``simple_preprocess``; gensim
    stopwords and tokens of three characters or fewer are dropped; each
    remaining token is normalized with ``lemmatize_stemming``. The result is
    the list of space-joined n-grams of size ``ngram_size`` over those tokens.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    kept = [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stopwords
    ]
    return [" ".join(gram) for gram in nltk.ngrams(kept, ngram_size)]

def load_and_preprocess(rowId):
    """Split one policy into segments and preprocess each segment.

    Parameters
    ----------
    rowId
        Index label of the row in ``df`` whose ``policy_text`` is processed.

    Returns
    -------
    list of list of str
        One preprocessed token list per TextTiling segment. When the text
        cannot be segmented, a single-element list holding the preprocessed
        whole text is returned instead.
    """
    text = df.loc[rowId].policy_text
    try:
        segments = ttt.tokenize(text)
    except Exception:
        # TextTiling fails on some inputs (e.g. NLTK raises ValueError when it
        # finds no paragraph breaks); treat the whole policy as one segment.
        # Exception (not a bare except) so Ctrl-C still interrupts a long run.
        segments = [text]
    return [preprocess(segment) for segment in segments]
        

In [None]:
rowIds = list(df.index)

# Preprocess every policy; each entry is that policy's list of segments.
start = time.time()
processed_docs_structured = [load_and_preprocess(row_id) for row_id in rowIds]
print("Elapsed: %f " % (time.time() - start))

# Flatten to one list of segments across all policies. chain.from_iterable is
# linear, whereas sum(lists, []) copies the growing list on every addition.
processed_docs = list(itertools.chain.from_iterable(processed_docs_structured))

# segment_map[k] = (start, stop): the half-open range of positions in
# processed_docs that belong to the policy at rowIds[k].
segment_map = []
prev_stop = 0
for doc_segments in processed_docs_structured:
    new_stop = prev_stop + len(doc_segments)
    segment_map.append((prev_stop, new_stop))
    prev_stop = new_stop

In [None]:
def get_original_text(i):
    """Return the raw text of segment ``i`` of ``processed_docs``.

    The owning policy is found through ``segment_map``; its text is then
    re-segmented exactly the way ``load_and_preprocess`` segments it
    (TextTiling, falling back to the whole text as a single segment), so the
    offset inside the policy lines up with the preprocessed segments.

    Parameters
    ----------
    i : int
        Position of the segment in ``processed_docs``.

    Returns
    -------
    str or None
        The original segment text, or None when ``i`` is not covered by any
        range in ``segment_map``.
    """
    for j, (start, stop) in enumerate(segment_map):
        if start <= i < stop:
            text = df.loc[rowIds[j]].policy_text
            try:
                sections = ttt.tokenize(text)
            except Exception:
                # Same fallback as load_and_preprocess: one segment per policy.
                sections = [text]
            return sections[i - start]
    return None

In [None]:
# Build the token <-> integer id mapping over all preprocessed segments.
dictionary = corpora.Dictionary(processed_docs)

In [None]:
# Prune the vocabulary in place. The thresholds are document frequencies, where
# a "document" is one preprocessed segment:
#   no_below=50   -> drop tokens that appear in fewer than 50 segments
#   no_above=0.5  -> drop tokens that appear in more than 50% of segments
#   keep_n=100000 -> after that, keep only the 100000 most frequent tokens
dictionary.filter_extremes(no_below=50, no_above=0.5, keep_n=100000) 

In [None]:
# Bag-of-words vector for every preprocessed segment.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

In [None]:
# Fit a TF-IDF model on the bag-of-words corpus ...
tfidf = models.TfidfModel(bow_corpus)
# ... and apply it to that same corpus to get TF-IDF-weighted vectors.
corpus_tfidf = tfidf[bow_corpus]

In [None]:
# Number of topics for the LDA model trained on raw bag-of-words counts.
BOW_TOPICS = 500
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=BOW_TOPICS,
    id2word=dictionary,
    passes=2,
    workers=20,
)

# Print every topic of the bag-of-words model.
for topic_id, topic_words in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(topic_id, topic_words))

In [None]:
# Number of topics for the LDA model trained on the TF-IDF-weighted corpus.
TFIDF_TOPICS = 100
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus_tfidf,
    num_topics=TFIDF_TOPICS,
    id2word=dictionary,
    passes=2,
    workers=20,
)

# Print every topic of the TF-IDF model.
for topic_id, topic_words in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(topic_id, topic_words))

In [None]:
# Show the raw text behind segment 38, the example segment inspected below.
get_original_text(38)

In [None]:
# TF-IDF vector of the example segment, then its topics under the TF-IDF
# model, strongest first.
segment_vec = corpus_tfidf[38]
print(segment_vec)
topic_scores = lda_model_tfidf[segment_vec]
for topic_id, weight in sorted(topic_scores, key=lambda pair: pair[1], reverse=True):
    print("\nScore: {}\t \nTopic: {}".format(weight, lda_model_tfidf.print_topic(topic_id, 10)))

In [None]:
# Topics of the example segment under the bag-of-words model, strongest first.
topic_scores = lda_model[bow_corpus[38]]
for topic_id, weight in sorted(topic_scores, key=lambda pair: pair[1], reverse=True):
    print("\nScore: {}\t \nTopic: {}".format(weight, lda_model.print_topic(topic_id, 10)))

In [None]:
# For each policy, record every topic that any of its segments hits.
# A policy's weight for a topic is the maximum weight over its segments.

def _keep_max(best_scores, topic_scores):
    """Fold (topic id, score) pairs into best_scores, keeping the larger score."""
    for topic_id, score in topic_scores:
        best_scores[topic_id] = max(best_scores.get(topic_id, 0), score)


doc_top_hits_tfidf = [{} for _ in rowIds]
doc_top_hits_bow = [{} for _ in rowIds]
for doc_pos in range(len(rowIds)):
    seg_start, seg_end = segment_map[doc_pos]
    for seg in range(seg_start, seg_end):
        _keep_max(doc_top_hits_tfidf[doc_pos], lda_model_tfidf[corpus_tfidf[seg]])
        _keep_max(doc_top_hits_bow[doc_pos], lda_model[bow_corpus[seg]])

In [None]:
def print_topics_for(doc_id):
    """Print all topics recorded for one policy, strongest first.

    ``doc_id`` is a position into ``doc_top_hits_bow`` / ``doc_top_hits_tfidf``.
    The bag-of-words topics are printed first, then the TF-IDF topics, each
    with its score and the model's ten-word topic description.
    """
    def _dump(header, hits_by_doc, model):
        print(header)
        ranked = sorted(hits_by_doc[doc_id].items(), key=lambda kv: kv[1], reverse=True)
        for topic_id, score in ranked:
            print("\t%d: %f; %s" % (topic_id, score, model.print_topic(topic_id, 10)))

    _dump("BOW:", doc_top_hits_bow, lda_model)
    _dump("TF-IDF:", doc_top_hits_tfidf, lda_model_tfidf)

In [None]:
def _total_scores(per_doc_hits):
    """Sum each topic's score over all per-policy {topic id: score} dicts."""
    totals = {}
    for hits in per_doc_hits:
        for topic_id, score in hits.items():
            totals[topic_id] = totals.get(topic_id, 0) + score
    return totals


# Corpus-wide total score per topic, for each of the two models.
top_sums_bow = _total_scores(doc_top_hits_bow)
top_sums_tfidf = _total_scores(doc_top_hits_tfidf)

In [None]:
# Ids of the 20 topics with the highest corpus-wide total score, per model.
bow_ranked = sorted(top_sums_bow.items(), key=lambda kv: kv[1], reverse=True)
bow_filtered = {topic_id for topic_id, _ in bow_ranked[:20]}

tfidf_ranked = sorted(top_sums_tfidf.items(), key=lambda kv: kv[1], reverse=True)
tfidf_filtered = {topic_id for topic_id, _ in tfidf_ranked[:20]}

In [None]:
def print_topics_for_filtered(doc_id):
    """Print one policy's topics, strongest first, skipping filtered topics.

    Works like ``print_topics_for`` but omits any topic whose id is in
    ``bow_filtered`` (bag-of-words section) or ``tfidf_filtered`` (TF-IDF
    section).
    """
    sections = (
        ("BOW:", doc_top_hits_bow, bow_filtered, lda_model),
        ("TF-IDF:", doc_top_hits_tfidf, tfidf_filtered, lda_model_tfidf),
    )
    for header, hits_by_doc, excluded, model in sections:
        print(header)
        ranked = sorted(hits_by_doc[doc_id].items(), key=lambda kv: kv[1], reverse=True)
        for topic_id, score in ranked:
            if topic_id not in excluded:
                print("\t%d: %f; %s" % (topic_id, score, model.print_topic(topic_id, 10)))

In [None]:
# Number of rows in df for each distinct year_season value.
ys_counts = {}
for ys in df.year_season.unique():
    ys_counts[ys] = len(df[df.year_season == ys])

In [None]:
def _sum_topic_scores_by_season(doc_top_hits):
    """Total each topic's score per year_season.

    Parameters
    ----------
    doc_top_hits : list of dict
        One {topic id: score} dict per policy, aligned with ``rowIds``.

    Returns
    -------
    dict
        {year_season: {topic id: summed score over that season's policies}}.
    """
    sums_by_season = {}
    for doc_pos, top_hits in enumerate(doc_top_hits):
        ys = df.loc[rowIds[doc_pos]].year_season
        season_sums = sums_by_season.setdefault(ys, {})
        # Distinct names for the policy position and the topic id: the
        # previous loops reused `i` for both.
        for topic_id, score in top_hits.items():
            season_sums[topic_id] = season_sums.get(topic_id, 0) + score
    return sums_by_season


top_sums_bow_ys = _sum_topic_scores_by_season(doc_top_hits_bow)
top_sums_tfidf_ys = _sum_topic_scores_by_season(doc_top_hits_tfidf)

In [None]:
def _print_top_by_season(title, sums_by_season, model):
    """Print, for each season in sorted order, its 20 highest-scoring topics."""
    print(title)
    for ys in sorted(sums_by_season):
        print("\t%s:" % ys)
        ranked = sorted(sums_by_season[ys].items(), key=lambda kv: kv[1], reverse=True)
        for topic_id, total in ranked[:20]:
            print("\t\t%d: %f (%s)" % (topic_id, total, model.print_topic(topic_id, 4)))


_print_top_by_season("BOW", top_sums_bow_ys, lda_model)
_print_top_by_season("TF-IDF", top_sums_tfidf_ys, lda_model_tfidf)

In [None]:
# All topic ids that occur in any season of the bag-of-words sums. These ids
# index top_sums_bow_ys when the timelines are built and are described with
# lda_model when plotted, so they must come from the BOW model rather than
# from the TF-IDF model, which has its own, separate topic numbering.
topicIds = set(itertools.chain.from_iterable(top_sums_bow_ys.values()))

In [None]:
# Season labels in sorted order; this is the x axis of every timeline.
ys_values = sorted(top_sums_bow_ys)

In [None]:
# Per topic: its bag-of-words score total in each season divided by the number
# of rows in that season, ordered like ys_values.
topics_timelines = {}
for topicId in topicIds:
    topics_timelines[topicId] = [
        top_sums_bow_ys[ys].get(topicId, 0) / ys_counts[ys] for ys in ys_values
    ]

In [None]:
def plot_top(scores_and_topics, n_highlight=10):
    """Plot topic timelines, highlighting the first ``n_highlight`` topics.

    Parameters
    ----------
    scores_and_topics : list of (score, topic id)
        Ranked topics, best first. Every topic id must be a key of
        ``topics_timelines``.
    n_highlight : int, optional
        How many leading topics are printed and drawn as labelled lines; the
        remaining topics are drawn in grey behind them. Defaults to 10.

    The highlighted topics are also printed with their score and the
    bag-of-words model's four-word description. Returns None.
    """
    highlighted = scores_and_topics[:n_highlight]
    background = scores_and_topics[n_highlight:]

    for score, topicId in highlighted:
        print("%d: %f (%s)" % (topicId, score, lda_model.print_topic(topicId, 4)))

    # Explicit axes instead of the implicit pyplot "current axes", so every
    # line is guaranteed to land on this figure.
    fig, ax = plt.subplots(figsize=(12, 5))

    # Grey background lines first so the highlighted lines are drawn on top.
    for _, topicId in background:
        sns.lineplot(x=ys_values, y=topics_timelines[topicId], color="grey", ax=ax)

    for _, topicId in highlighted:
        sns.lineplot(x=ys_values, y=topics_timelines[topicId], label=topicId, ax=ax)

    # Rotate the season labels; done on the axes, so it also works when no
    # highlighted line was drawn.
    ax.tick_params(axis="x", labelrotation=45)
        
    

## Max-min

In [None]:
# 5th percentile of all timeline values across all topics; get_dip treats
# timelines that fall to or below this level as having no dip.
all_timeline_values = list(itertools.chain.from_iterable(topics_timelines.values()))
cutoff = np.percentile(all_timeline_values, 5)

def get_dip(freqs, window=22, threshold=None):
    """Return the max-minus-min spread over the last ``window`` values.

    Parameters
    ----------
    freqs : sequence of numbers
        A topic timeline, oldest value first.
    window : int, optional
        Number of trailing values to look at (must be positive). Defaults
        to 22.
    threshold : number, optional
        When the minimum of the window is at or below this value the spread
        is reported as 0. Defaults to the module-level ``cutoff``.

    Returns
    -------
    number
        ``max - min`` of the window, or 0 when the minimum is at or below
        the threshold.
    """
    if threshold is None:
        threshold = cutoff
    recent = freqs[-window:]
    lowest = min(recent)
    if lowest <= threshold:
        return 0
    return max(recent) - lowest

# (spread, topic id) pairs for every topic, largest spread first.
topics_biggest_dip = [(get_dip(timeline), topic_id) for topic_id, timeline in topics_timelines.items()]
topics_biggest_dip.sort(reverse=True)

In [None]:
# Plot all timelines, highlighting the leading entries of topics_biggest_dip.
plot_top(topics_biggest_dip)

## Max

In [None]:
# 5th percentile of all timeline values across all topics.
# (get_max in this section does not read this value.)
all_timeline_values = list(itertools.chain.from_iterable(topics_timelines.values()))
cutoff = np.percentile(all_timeline_values, 5)

def get_max(freqs, window=22):
    """Return the largest of the last ``window`` values of a timeline.

    Parameters
    ----------
    freqs : sequence of numbers
        A topic timeline, oldest value first; must not be empty.
    window : int, optional
        Number of trailing values to look at (must be positive). Defaults
        to 22.
    """
    return max(freqs[-window:])

# (peak value, topic id) pairs for every topic, highest peak first.
topics_max = [(get_max(timeline), topic_id) for topic_id, timeline in topics_timelines.items()]
topics_max.sort(reverse=True)

In [None]:
# Plot all timelines, highlighting the leading entries of topics_max.
plot_top(topics_max)

## Drop over 22 intervals

In [None]:
# 5th percentile of all timeline values across all topics.
# (The get_dip defined in this section does not read this value.)
all_timeline_values = list(itertools.chain.from_iterable(topics_timelines.values()))
cutoff = np.percentile(all_timeline_values, 5)

def get_dip(freqs, window=22):
    """Return the change from the start to the end of the trailing window.

    This definition replaces the earlier ``get_dip`` (max minus min) from the
    point where this cell runs.

    Parameters
    ----------
    freqs : sequence of numbers
        A topic timeline, oldest value first.
    window : int, optional
        Number of trailing values spanned (must be positive). Defaults to 22.
        A timeline shorter than the window is measured from its first value
        instead of raising IndexError.

    Returns
    -------
    number
        Last value minus the first value of the window (positive when the
        timeline rose), or 0 for an empty timeline.
    """
    # NOTE(review): the result is sorted in descending order below, so the
    # largest *increases* rank first even though the section heading speaks of
    # a drop -- confirm whether the sign should be flipped.
    recent = freqs[-window:]
    if not recent:
        return 0
    return recent[-1] - recent[0]

# (end-minus-start change, topic id) pairs for every topic, largest value first.
topics_biggest_dip = [(get_dip(timeline), topic_id) for topic_id, timeline in topics_timelines.items()]
topics_biggest_dip.sort(reverse=True)

In [None]:
# Plot all timelines, highlighting the leading entries of topics_biggest_dip.
plot_top(topics_biggest_dip)