In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import sys
import os
import json
import itertools

from sklearn.preprocessing import StandardScaler
from nltk import ngrams as make_ngrams
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist

sys.path.insert(1, "../")
sys.path.insert(1, "../utilities")

from helpers import load_posts, load_toks, load_pos, get_top_n_toks
from language_change_methods.vnc import VNC, plot_vnc
from language_change_methods.utility_functions import get_data_windows, get_time_windows, basic_preprocessing
from language_change_methods.features import get_tok_counts, function_words, combine_counts, make_feature_matrix

# This method calculates cosine distance between two vectors.
from scipy.spatial.distance import cosine as cosine_dist
# This method simply inverts it to get similarity.
cosine_sim = lambda x,y: 1 - cosine_dist(x,y)

from sklearn.metrics import jaccard_score

from gensim.models import Word2Vec

# suppress some deprecation warning..
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from settings import TFES_FP as DB_FP, TFES_TOK_FP

## Load Data

In [2]:
%%time
all_posts = load_posts(DB_FP)

from helpers import flat_earth_boards, off_topic_boards as other_boards

fe_posts = all_posts.query("board_id in @flat_earth_boards")

toks = {int(x[0]): x[1] for x in load_toks(TFES_TOK_FP)}
toks = pd.Series(toks)
toks = toks[toks.index.isin(fe_posts.index)]

fe_posts = fe_posts.loc[toks.index]
fe_posts.sort_values("time", ascending=True)
toks = toks.loc[fe_posts.index]

Wall time: 16.4 s


## Train Models Over Time

In this section we will train the models on our data. First we'll look at two time periods - the first and second half of our data.

In [4]:
first_half = toks.iloc[:int(len(fe_posts)/2)]
second_half = toks.iloc[int(len(fe_posts)/2):]

In [5]:
%%time
model_1 = Word2Vec(first_half, size=300)
model_2 = Word2Vec(second_half, size=300)

Wall time: 24.1 s


## Some Useful Functions

In [6]:
def get_most_changey_words_with_models(model1, model2, n=100, k=1000, top_n=None):
    nn_scores = []
    
    top_vocab = sorted(model1.wv.vocab.keys(), key=lambda x: model1.wv.vocab[x].count, reverse=True)[:top_n]
    
    vocab1 = model1.wv.vocab
    vocab2 = model2.wv.vocab
    # Loop through all the words in the vocab
    for w in vocab1:
        if (w not in function_words 
                and w in vocab1 
                and w in vocab2 
                and vocab1[w].count > n 
                and vocab2[w].count > n 
                and w in top_vocab):
            neighbours1 = set([x[0] for x in model1.wv.most_similar(w, topn=k)])
            neighbours2 = set([x[0] for x in model2.wv.most_similar(w, topn=k)])
            nn_scores.append((len(neighbours1.intersection(neighbours2)), w))
            
    nn_scores_sorted = sorted(nn_scores)
    return nn_scores_sorted



def neighbors(query : str,
              embs: np.ndarray,
              vocab: list,
              K : int = 3) -> list:
    sims = np.dot(embs[vocab.index(query),],embs.T)
    output = []
    for sim_idx in sims.argsort()[::-1][1:(1+K)]:
        if sims[sim_idx] > 0:
            output.append(vocab[sim_idx])
    return output



def get_most_changey_words_with_vectors(vocab1, vocab2, vectors1, vectors2, n=20, k=1000):
    nn_scores = []
    # Loop through all the words in the vocab
    for w in vocab1:
        if w not in function_words and w in vocab1 and w in vocab2:
            neighbours1 = set(neighbors(w, vectors1, vocab1, k))
            neighbours2 = set(neighbors(w, vectors2, vocab2, k))
            nn_scores.append((len(neighbours1.intersection(neighbours2)), w))
            
    nn_scores_sorted = sorted(nn_scores)
    return nn_scores_sorted

## Look for changiest words from first to second half of corpus

We have two subtley different methods for doing this.
The first compares the models directly, while the second looks only at the top 10,000 words in the vocabulary.
Using the entire models is more "correct", but with a small-ish corpus, the second method reduces the effects of low-occurance words and the output makes more sense.

### First by comparing the models

In [7]:
%%time
ranked_words_models = get_most_changey_words_with_models(model_1, model_2, n=10, k=1000)

Wall time: 28.6 s


In [8]:
ranked_words_models[:20]

[(24, 'retarded'),
 (32, 'fringe'),
 (34, 'flags'),
 (34, 'whirlpool'),
 (37, 'venture'),
 (38, 'corona'),
 (41, 'spoof'),
 (42, 'inflated'),
 (43, 'bone'),
 (43, 'conundrum'),
 (44, 'growth'),
 (44, 'raising'),
 (45, 'fold'),
 (45, 'joules'),
 (45, 'repetition'),
 (46, 'bing'),
 (46, 'exaggerated'),
 (46, 'headlight'),
 (46, 'mouse'),
 (48, 'cap')]

In [10]:
model_1.wv.vocab["retarded"].count

19

### Then by comparing the vectors

In [11]:
def get_top_vocab_and_vectors(model, n=10000):
    """
    Gets the top n words from the model's vocabulary and the vectors of these words.
    """
    top_vocab = sorted(model.wv.vocab.keys(), key=lambda x: model.wv.vocab[x].count, reverse=True)[:n]
    top_vectors = np.array([model.wv[t] for t in top_vocab])
    return top_vocab, top_vectors

In [12]:
%%time
vocab_1, vectors_1 = get_top_vocab_and_vectors(model_1)
vocab_2, vectors_2 = get_top_vocab_and_vectors(model_2)

Wall time: 58 ms


In [13]:
%%time
ranked_words_vectors = get_most_changey_words_with_vectors(vocab_1, vocab_2, vectors_1, vectors_2, k=1000)

Wall time: 59.9 s


In [14]:
ranked_words_vectors[:20]

[(157, 'whatâ\x80\x99s'),
 (164, 'bone'),
 (165, 'intuitively'),
 (173, 'spoof'),
 (177, 'retarded'),
 (184, 'altogether'),
 (191, 'sweeping'),
 (204, 'oddly'),
 (217, 'fringe'),
 (222, 'card'),
 (226, 'trend'),
 (227, 'raising'),
 (231, 'imho'),
 (234, '3d'),
 (234, 'unbelievable'),
 (237, 'continuation'),
 (237, 'individually'),
 (239, 'qed'),
 (242, 'cake'),
 (243, 'arises')]

In [15]:
neighbors("3d", vectors_1, vocab_1, 20)

['an',
 'google',
 '3d',
 'i',
 'scientific',
 'your',
 ':',
 'been',
 'video',
 'article',
 'youtube',
 'wiki',
 'posted',
 'zetetic',
 'link',
 'edit',
 'straight',
 'original',
 'rowbotham',
 'provided']

In [16]:
neighbors("3d", vectors_2, vocab_2, 20)

['sphere',
 'projection',
 'map',
 'd',
 'globe',
 'circle',
 'coordinate',
 'flat',
 'plane',
 'maps',
 'curved',
 'scale',
 'bing',
 'mercator',
 '3d',
 'radius',
 'onto',
 '2d',
 'spheroid',
 'lines']

## Look for more gradual change

In [17]:
%%time
time_models = dict()
# Train a language model for various different portions of the forum.
for w, w_posts in get_data_windows(fe_posts, 10000, 10000):
    time_models[w] = Word2Vec(toks.loc[w_posts.index], size=300)

Wall time: 23.8 s


In [18]:
def neighbours_over_time(search_term, time_models, top_n=10000):
    for window, curr_model in time_models.items():
        curr_vocab, curr_vectors = get_top_vocab_and_vectors(curr_model, top_n)
        print(window)
        if search_term in curr_vocab:
            print(neighbors(search_term, curr_vectors, curr_vocab, 12))
            
            
def neighbours_over_time_comma_delimited(query, time_models, top_n=10000):
    for window, curr_model in time_models.items():
        curr_vocab, curr_vectors = get_top_vocab_and_vectors(curr_model, top_n)
        if query in curr_vocab:
            print(window.strftime("%Y/%m/%d"), end=",")
            curr_neighbours = neighbors(query, curr_vectors, curr_vocab, 12)
            print(",".join(curr_neighbours[:6]))
            print("", end=",")
            print(",".join(curr_neighbours[6:]))
        else:
            print(window)

In [19]:
t100_fe_kw = pd.read_csv("../data/top-100-fe-keywords.csv")
t100_kw_list = list(t100_fe_kw["ngram"])

FileNotFoundError: [Errno 2] No such file or directory: '../data/top-100-fe-keywords.csv'

In [None]:
all_fe_kw = pd.read_csv("../data/all-fe-keywords.csv")
all_kw_list = list(all_fe_kw["ngram"])

### Look at some common FE related words

In [None]:
neighbours_over_time("flat", time_models)

In [None]:
neighbours_over_time_comma_delimited("flat", time_models)

In [None]:
neighbours_over_time("earth", time_models)

In [None]:
neighbours_over_time("globe", time_models)

In [None]:
neighbours_over_time("disc", time_models)

In [None]:
neighbours_over_time("ua", time_models)

In [None]:
neighbours_over_time("ice", time_models)

In [None]:
neighbours_over_time("wall", time_models)

### Look at some of the top keywords

In [None]:
for w in t100_kw_list[:10]:
    print(w)
    print("-----------------------------------")
    neighbours_over_time(w, time_models)
    print("-----------------------------------")

### Look for the changiest words

In [None]:
def get_changiest_words_per_window(time_models, top_n=10000):
    out_dic = dict()
    windows = list(time_models.keys())
    for i in range(1, len(windows)):
        model_1 = time_models[windows[i-1]]
        model_2 = time_models[windows[i]]

        vocab_1, vectors_1 = get_top_vocab_and_vectors(model_1, top_n)
        vocab_2, vectors_2 = get_top_vocab_and_vectors(model_2, top_n)

        out_dic[windows[i]] = get_most_changey_words_with_vectors(vocab_1, vocab_2, vectors_1, vectors_2, k=1000)

    return out_dic

In [None]:
%%time
changiest_words_per_window = get_changiest_words_per_window(time_models, 5000)

In [None]:
merge_lists = lambda x: list(itertools.chain.from_iterable(x))
all_words = set(merge_lists([[cw[1] for cw in cws] for cws in changiest_words_per_window.values()]))

In [None]:
def get_words_in_all_windows(changiest_words_per_window):
    words_in_each_window = [set([cw[1] for cw in cws]) for cws in changiest_words_per_window.values()]
    words_in_all_windows = words_in_each_window[0].intersection(*words_in_each_window[1:])
    return words_in_all_windows

In [None]:
words_in_all_windows = get_words_in_all_windows(changiest_words_per_window)

In [None]:
for window, changey_words in changiest_words_per_window.items():
    print(window)
    t20_words = [f"{w[1]} {w[0]}" for w in changey_words[:20]]
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[:5]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[5:10]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[10:15]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[15:20]))
    print("-----------------------------")

In [None]:
from word_vector_change import print_changiest_over_time

print_changiest_over_time(changiest_words_per_window, time_models, min_freq=30, remove_func=False)

In [None]:
neighbours_over_time("parallax", time_models)

In [None]:
neighbours_over_time("corrected", time_models)

In [None]:
neighbours_over_time("technically", time_models)

In [None]:
neighbours_over_time("particularly", time_models)

In [None]:
neighbours_over_time("infrared", time_models)

In [None]:
neighbours_over_time("standpoint", time_models)

### Looking only at words in all windows

In [None]:
for window, changey_words in changiest_words_per_window.items():
    print(window)
    t20_words = [f"{w[1]} {w[0]}" for w in [x for x in changey_words if x[1] in words_in_all_windows][:20]]
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[:5]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[5:10]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[10:15]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[15:20]))
    print("-----------------------------")

In [None]:
print_changiest_over_time(changiest_words_per_window, time_models, word_list=words_in_all_windows, min_freq=30, remove_func=False)

In [None]:
neighbours_over_time("particularly", time_models)

In [None]:
neighbours_over_time("leg", time_models)

In [None]:
neighbours_over_time("insane", time_models)

In [None]:
neighbours_over_time("unreasonable", time_models)

In [None]:
neighbours_over_time("idiots", time_models)

### Looking at changiest FE Keywords

In [None]:
for window, changey_words in changiest_words_per_window.items():
    print(window)
    t20_words = [f"{w[1]} {w[0]}" for w in [x for x in changey_words if x[1] in all_kw_list][:20]]
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[:5]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[5:10]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[10:15]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[15:20]))
    print("-----------------------------")

In [None]:
print_changiest_over_time(changiest_words_per_window, time_models, word_list=all_kw_list)

In [None]:
for window, changey_words in changiest_words_per_window.items():
    print(window)
    t20_words = [f"{w[1]} {w[0]}" for w in [x for x in changey_words if x[1] in t100_kw_list and x[1] in words_in_all_windows][:20]]
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[:5]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[5:10]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[10:15]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[15:20]))
    print("-----------------------------")

In [None]:
print_changiest_over_time(changiest_words_per_window, time_models, word_list=t100_kw_list, min_freq=50)

In [None]:
neighbours_over_time("gyroscope", time_models)

In [None]:
neighbours_over_time("infrared", time_models)

In [None]:
neighbours_over_time("cgi", time_models)

In [None]:
for i, t in toks.items():
    if "â«" in t:
        print(i)

# Using Pre-Trained Embeddings

I don't think anything I've got here will work, as I need full models rather than keyed vectors.

This means I'd probably need to train my own model.

In [None]:
# import gensim.downloader

# # Download the "glove-twitter-25" embeddings
# glove_vectors = gensim.downloader.load('glove-twitter-25')

In [None]:
# %%time
# time_models_retrained = dict()
# # Train a language model for various different portions of the forum.
# for w, w_posts in get_data_windows(fe_posts, 10000, 10000):
#     model = None # Need to load a model here
#     curr_toks = toks.loc[w_posts.index]
#     model.build_vocab(curr_toks, update=True)
#     model.train(curr_toks, total_examples=model.corpus_count, epochs=model.epochs)
    
#     time_models_retrained[w] = model