# Plot VNC (Variability-based Neighbour Clustering) for r/conspiracy and r/science

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import sys
import os
import json
import itertools
import sqlite3
from datetime import datetime

from sklearn.preprocessing import StandardScaler
from nltk import ngrams as make_ngrams
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist

sys.path.insert(1, "C:/Users/Eddie/Documents/language-change-methods")
sys.path.insert(1, "C:/Users/Eddie/Documents/language-change-application/flat-earth-forum/analysis")

from helpers import load_posts, load_toks, load_pos, get_top_n_toks
from vnc import VNC, plot_vnc
from utility_functions import get_data_windows, get_time_windows, basic_preprocessing
from features import get_tok_counts, function_words, combine_counts, make_feature_matrix
from features import get_ngram_lr_and_ll
from word_clouds import make_wordcloud

# SciPy's `cosine` returns a distance, so import it under a name that says so.
from scipy.spatial.distance import cosine as cosine_dist


def cosine_sim(x, y):
    """Return the cosine similarity of vectors `x` and `y`.

    Computed as one minus SciPy's cosine distance between the two vectors.
    """
    distance = cosine_dist(x, y)
    return 1 - distance

from sklearn.metrics import jaccard_score

def convert_to_date(x):
    """Parse a ``YYYY/MM/DD HH:MM:SS`` timestamp string into a `datetime`.

    Raises ValueError (from `datetime.strptime`) if `x` does not match the format.
    """
    timestamp_format = "%Y/%m/%d %H:%M:%S"
    return datetime.strptime(x, timestamp_format)

# Data locations. Each one can be overridden with an environment variable so the
# notebook can run on another machine without editing code; when a variable is
# not set, the original hard-coded path is used.
# Directory holding one sub-directory of SQLite databases per subreddit.
SUBREDDITS_FP = os.environ.get(
    "REDDIT_SUBREDDITS_DIR",
    "C:/Users/Eddie/Documents/Datasets/Reddit/Other",
)
# Directory holding the "<subreddit>_comments.json" tokenised comment files.
TOKENS_FP = os.environ.get(
    "REDDIT_TOKENS_DIR",
    "C:/Users/Eddie/Documents/Datasets/Reddit/Other/CHUNKS_COMMENTS_TOKENISED",
)
# Directory that saved figures are written to.
OUT_DIR = os.environ.get(
    "VNC_OUT_DIR",
    "C:/Users/Eddie/Documents/Datasets/Flat Earth Graphs/VNC",
)

ModuleNotFoundError: No module named 'utility_functions'

In [None]:
# Passed to get_data_windows as both its second and third argument wherever
# windows are built in this notebook. Presumably that is the number of comments
# per window and the step between windows (i.e. non-overlapping windows) --
# TODO confirm against utility_functions.get_data_windows.
window_size = 90000

In [None]:
def read_subreddit(db_fp):
    """Load comment ids and timestamps from a subreddit SQLite database.

    Parameters
    ----------
    db_fp : str
        Path to a SQLite database with a ``comments`` table that has ``uid``
        and ``time`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per comment, indexed by ``uid``, with a single ``time`` column
        parsed by ``convert_to_date`` and sorted in ascending time order.

    Raises
    ------
    Whatever ``pandas.read_sql_query`` or ``convert_to_date`` raise (e.g. when
    the table is missing or a timestamp is malformed); the connection is closed
    before the error propagates.
    """
    conn = sqlite3.connect(db_fp)
    try:
        comments = pd.read_sql_query("SELECT uid, time FROM comments", conn)
    finally:
        # Always release the database handle, even when the query fails;
        # previously a failing query skipped close() and left the file open.
        conn.close()

    # Build the result without in-place mutation so each step is explicit.
    comments = comments.set_index("uid")
    comments["time"] = comments["time"].apply(convert_to_date)
    return comments.sort_values("time")

In [None]:
def get_bow_counts(curr_toks, n_top_toks=1000):
    """Build per-item bag-of-words counts restricted to frequent content words.

    Parameters
    ----------
    curr_toks : mapping or pandas.Series
        Maps an identifier to that item's tokens; only ``.items()`` is used.
    n_top_toks : int, default 1000
        Vocabulary size passed to ``get_top_n_toks``. The default reproduces
        the value that used to be hard-coded.

    Returns
    -------
    pandas.Series
        Indexed like `curr_toks`; each value is a ``{token: count}`` dict that
        contains only tokens which are not in ``function_words`` and which are
        members of the result of ``get_top_n_toks``.
    """
    # Count tokens per item, dropping function words straight away.
    content_counts = {}
    for item_id, item_toks in curr_toks.items():
        tok_counts = get_tok_counts(item_toks)
        content_counts[item_id] = {tok: count for tok, count in tok_counts.items()
                                   if tok not in function_words}

    # Vocabulary is selected from the function-word-free counts
    # (presumably the most frequent tokens overall -- see helpers.get_top_n_toks).
    top_n = get_top_n_toks(content_counts.values(), n_top_toks)

    bow_counts = {item_id: {tok: count for tok, count in counts.items() if tok in top_n}
                  for item_id, counts in content_counts.items()}
    return pd.Series(bow_counts)

In [None]:
def merge_lists(x):
    """Flatten one level of nesting: concatenate the iterables in `x` into one list."""
    merged = []
    for chunk in x:
        merged.extend(chunk)
    return merged

In [None]:
def get_align_toks(tok_fp, posts):
    """Load tokens from `tok_fp` and align them row-for-row with `posts`.

    `load_toks(tok_fp)` is iterated and each item's element 0 is used as the key
    and element 1 as the value (presumably a comment uid and its token list).
    Entries whose key is not in `posts.index` are dropped, rows of `posts`
    without an entry are dropped, and the tokens are reordered to follow the
    remaining rows of `posts`.

    Returns a ``(toks, posts)`` tuple: a Series of tokens and the filtered frame.
    """
    tok_lookup = pd.Series({entry[0]: entry[1] for entry in load_toks(tok_fp)})

    # Keep only tokens that belong to one of the given posts.
    has_post = tok_lookup.index.isin(posts.index)
    tok_lookup = tok_lookup[has_post]

    # Keep only posts that actually have tokens.
    has_toks = posts.index.isin(tok_lookup.index)
    kept_posts = posts[has_toks]

    # Put the tokens in the same order as the posts.
    aligned_toks = tok_lookup.loc[kept_posts.index]
    return aligned_toks, kept_posts

In [None]:
def make_subreddit_vnc(comments, toks, curr_name, window_size):
    """Build a VNC for one subreddit, print its cophenetic correlation and plot it.

    Parameters
    ----------
    comments : pandas.DataFrame
        Comments to window; passed to ``get_data_windows``.
    toks : pandas.Series
        Tokens per comment, looked up by each window's index.
    curr_name : str
        Label printed above the results.
    window_size : int
        Passed to ``get_data_windows`` as both its second and third argument.

    Returns
    -------
    VNC
        The clustering built from the standardised bag-of-words features.
    """
    # Pool the tokens of every comment that falls in each window.
    pooled = dict()
    for win_date, win_comments in get_data_windows(comments, window_size, window_size):
        pooled[win_date] = merge_lists(toks.loc[win_comments.index])
    window_toks = pd.Series(pooled)

    # Bag-of-words counts per window.
    bow = get_bow_counts(window_toks)

    # Feature matrix, with each row divided by the total number of tokens in
    # the corresponding window (assumes the rows follow window order), then
    # standardised per feature.
    raw_feats, feat_names = make_feature_matrix(bow.to_dict(), False)
    window_lengths = window_toks.apply(len).values[:, None]
    scaled = StandardScaler().fit_transform(raw_feats / window_lengths)
    feats = pd.DataFrame(scaled, index=bow.index, columns=feat_names)

    rule = "-----------------------------------------------------------------"
    print(rule)
    print(curr_name)
    print(rule)

    vnc = VNC(feats, cosine_sim)
    # Only the correlation coefficient is reported; the cophenetic distances are unused.
    coph_corr, _ = cophenet(vnc.d_list, pdist(feats, metric="cosine"))
    print("Cophenetic Correlation Coefficient: {}".format(coph_corr))

    fig, ax = plt.subplots(figsize=(6, 6))
    vnc.draw_dendrogram(ax=ax, colour="blue")
    ax.grid()
    plt.show()
    return vnc

In [None]:
def plot_cluster_kw(clusters, window_posts, toks, clust_windows):
    """Plot one keyword word cloud per cluster and return the keywords.

    Parameters
    ----------
    clusters : array-like
        Cluster label of each window; must support element-wise ``==``.
    window_posts : mapping
        Maps a window key to the index labels of the posts in that window.
    toks : pandas.Series
        Tokens per post, indexed by post label.
    clust_windows : array-like
        Window keys in the same order as `clusters`; indexed with a boolean mask.

    Returns
    -------
    dict
        Maps each cluster label to the ``LR`` column of its top keywords (at
        most 100 rows with ``LR > 1`` and ``freq1 > 100``, highest ``LR`` first).
    """
    # Distinct labels in order of first appearance, computed once so that the
    # number of panels, their order and the keys of the result always agree.
    clust_labels = list(dict.fromkeys(clusters))
    num_clusts = len(clust_labels)

    # squeeze=False keeps `axes` two-dimensional, so a single cluster (where
    # plt.subplots would otherwise return a bare Axes) can be indexed as well.
    fig, axes = plt.subplots(1, num_clusts, figsize=(10*num_clusts, 8), squeeze=False)

    kw_dic = dict()
    for ax, clust in zip(axes[0], clust_labels):
        # Posts of every window assigned to this cluster versus all other posts.
        curr_clust_windows = clust_windows[clusters == clust]
        clust_indices = merge_lists([window_posts[w] for w in curr_clust_windows])
        clust_toks = toks.loc[clust_indices]
        non_clust_toks = toks[~toks.index.isin(clust_indices)]

        kw = get_ngram_lr_and_ll(clust_toks, non_clust_toks, 1, "_")
        kw = kw.query("LR > 1 and freq1 > 100").sort_values("LR", ascending=False).head(100)["LR"]
        kw_dic[clust] = kw

        cloud = make_wordcloud(kw)
        ax.imshow(cloud, aspect="auto")

        # The clouds are images, so axis ticks carry no information.
        ax.xaxis.set_ticks([])
        ax.yaxis.set_ticks([])
        ax.set_title(clust)

    plt.show()
    # Return the collected keywords; the function previously returned itself.
    return kw_dic

In [None]:
%%time
# Build a VNC for each subreddit sample and record, per subreddit, the cluster
# labels (clust_dic) and the VNC window index (window_dic) used by later cells.
# NOTE: vncs, comment_dic and tok_dic are created here but stay empty while the
# "Store in dictionary" lines at the bottom of the loop remain commented out.
vncs = dict()
comment_dic = dict()
tok_dic = dict()
kw_dic = dict()
clust_dic = dict()
window_dic = dict()

for curr_name in ["conspiracy", "science"]:         
    # The sampled database <name>/<name>_sample.db and its tokenised comments.
    curr_fp = os.path.join(SUBREDDITS_FP, "{0}/{0}_sample.db".format(curr_name))
    tok_fp = os.path.join(TOKENS_FP, f"{curr_name}_comments.json")

    # Wall-clock timer for this subreddit, reported at the end of the iteration.
    startTime = datetime.now()
    
    # Get current comments and tokens
    comments = read_subreddit(curr_fp)
    toks, comments = get_align_toks(tok_fp, comments)
    
    # Make the VNC
    curr_vnc = make_subreddit_vnc(comments, toks, curr_name, window_size)
    
    # Get keywords for each cluster
    kw_dic[curr_name] = dict()
    # NOTE(review): the meaning of the argument 1 is defined in vnc.VNC.get_clusters,
    # which is not visible from this notebook -- confirm.
    clusters = curr_vnc.get_clusters(1)
    clust_dic[curr_name] = clusters
    # window_posts is only consumed by the commented-out plot_cluster_kw call below.
    window_posts = {w: curr_posts.index for w, curr_posts in get_data_windows(comments, window_size, window_size)}
    vnc_windows = curr_vnc.matrix.index
    window_dic[curr_name] = vnc_windows
    
#     kw_dic[curr_name] = plot_cluster_kw(clusters, window_posts, toks, vnc_windows)
    
    
#     # Store in dictionary
#     vncs[curr_name] = curr_vnc
#     comment_dic[curr_name] = comments
#     tok_dic[curr_name] = toks
    
    print("Time taken: ", datetime.now() - startTime)

In [None]:
%%time
def ax_epochs_against_posts(times, counts, clust_starts, ax):
    """Plot `counts` against `times` on `ax` and mark every cluster start.

    Each value in `clust_starts` is drawn as a dashed vertical line. Tick labels
    on both axes are set to size 14 and the grid is switched on. Returns None.
    """
    ax.plot(times, counts, c="#7fbf7b")

    # One dashed marker per cluster start.
    for boundary in clust_starts:
        ax.axvline(boundary, c="#af8dc3", linestyle="--")

    # Enlarge the tick labels, x axis first and then y axis.
    for axis in (ax.xaxis, ax.yaxis):
        for label in axis.get_ticklabels():
            label.set_size(14)

    ax.grid()

In [None]:
%%time
# One stacked panel per subreddit: 90-day rolling comment count with the start
# of every VNC cluster marked, all panels sharing the time axis.
fig = plt.figure(figsize=(14, 5))
gs = fig.add_gridspec(len(clust_dic), hspace=0)
# squeeze=False always yields a 2-D array; taking its only column gives a 1-D
# array of axes even when there is just one subreddit to plot.
axes = gs.subplots(sharex=True, squeeze=False)[:, 0]


for i, name in enumerate(clust_dic):
    clusts = clust_dic[name]
    dates = window_dic[name]
    # NOTE(review): this reads <name>.db, whereas the VNC above was built from
    # <name>_sample.db -- presumably the full dataset is intended here; confirm.
    curr_fp = os.path.join(SUBREDDITS_FP, name, f"{name}.db")
    comments = read_subreddit(curr_fp)
    # Keep comments from the first VNC window onwards and add a constant column
    # to count. .assign returns a new frame, which avoids assigning a column on
    # the result of .query (chained assignment / SettingWithCopyWarning).
    comments = comments.query("time >= @dates[0]").assign(flag=True)
    
    # First window of each cluster marks where that cluster starts.
    clust_starts = [dates[clusts==c][0] for c in set(clusts)]
    rolling_counts = comments.rolling("90D", on="time").count()
    ax_epochs_against_posts(rolling_counts["time"], rolling_counts["flag"], clust_starts, axes[i]) 
    axes[i].set_ylabel(name, size=14)
    
axes[-1].set_xlabel("Time", size=14)
# Make sure the target directory exists so savefig does not fail on a fresh machine.
os.makedirs(OUT_DIR, exist_ok=True)
fig.savefig(os.path.join(OUT_DIR, "ot_reddit_comments_vnc_meta.pdf"))
plt.show()