In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import sys
import os
import json
import itertools

from sklearn.preprocessing import StandardScaler
from nltk import ngrams as make_ngrams
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist

sys.path.insert(1, "C:/Users/Eddie/Documents/language_change_methods")
sys.path.insert(1, "C:/Users/Eddie/Documents/language-change-application/flat-earth-forum/analysis")

from helpers import load_posts, load_toks, load_pos, get_top_n_toks
from vnc import VNC, plot_vnc
from utility_functions import get_data_windows, get_time_windows, basic_preprocessing
from features import get_tok_counts, function_words, combine_counts, make_feature_matrix

# This method calculates cosine distance between two vectors.
from scipy.spatial.distance import cosine as cosine_dist
# This method simply inverts it to get similarity.
cosine_sim = lambda x,y: 1 - cosine_dist(x,y)

from sklearn.metrics import jaccard_score

from gensim.models import Word2Vec

# suppress some deprecation warning..
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

DB_FP = "C:/Users/Eddie/Documents/Datasets/Flat Earth/little_fe_forum.db"

## Load Data

In [2]:
%%time
all_posts = load_posts(DB_FP)

flat_earth_boards = [5, 7, 8, 19, 23]
other_boards = [9, 10, 11, 12]

fe_posts = all_posts.query("board_id in @flat_earth_boards")

toks = {int(x[0]): x[1] for x in load_toks("C:/Users/Eddie/Documents/Datasets/Flat Earth/TFES_CHUNKS_TOKENISED/tfes_posts.json")}
toks = pd.Series(toks)
toks = toks[toks.index.isin(fe_posts.index)]

fe_posts = fe_posts.loc[toks.index]
fe_posts.sort_values("time", ascending=True)
toks = toks.loc[fe_posts.index]

Wall time: 18.1 s


## Train Models Over Time

In this section we will train the models on our data. First we'll look at two time periods - the first and second half of our data.

In [3]:
first_half = toks.iloc[:int(len(fe_posts)/2)]
second_half = toks.iloc[int(len(fe_posts)/2):]

In [4]:
%%time
model_1 = Word2Vec(first_half, size=300)
model_2 = Word2Vec(second_half, size=300)

Wall time: 24.2 s


## Some Useful Functions

In [5]:
def get_most_changey_words_with_models(model1, model2, n=100, k=1000, top_n=None):
    nn_scores = []
    
    top_vocab = sorted(model1.wv.vocab.keys(), key=lambda x: model1.wv.vocab[x].count, reverse=True)[:top_n]
    
    vocab1 = model1.wv.vocab
    vocab2 = model2.wv.vocab
    # Loop through all the words in the vocab
    for w in vocab1:
        if (w not in function_words 
                and w in vocab1 
                and w in vocab2 
                and vocab1[w].count > n 
                and vocab2[w].count > n 
                and w in top_vocab):
            neighbours1 = set([x[0] for x in model1.wv.most_similar(w, topn=k)])
            neighbours2 = set([x[0] for x in model2.wv.most_similar(w, topn=k)])
            nn_scores.append((len(neighbours1.intersection(neighbours2)), w))
            
    nn_scores_sorted = sorted(nn_scores)
    return nn_scores_sorted



def neighbors(query : str,
              embs: np.ndarray,
              vocab: list,
              K : int = 3) -> list:
    sims = np.dot(embs[vocab.index(query),],embs.T)
    output = []
    for sim_idx in sims.argsort()[::-1][1:(1+K)]:
        if sims[sim_idx] > 0:
            output.append(vocab[sim_idx])
    return output



def get_most_changey_words_with_vectors(vocab1, vocab2, vectors1, vectors2, n=20, k=1000):
    nn_scores = []
    # Loop through all the words in the vocab
    for w in vocab1:
        if w not in function_words and w in vocab1 and w in vocab2:
            neighbours1 = set(neighbors(w, vectors1, vocab1, k))
            neighbours2 = set(neighbors(w, vectors2, vocab2, k))
            nn_scores.append((len(neighbours1.intersection(neighbours2)), w))
            
    nn_scores_sorted = sorted(nn_scores)
    return nn_scores_sorted

## Look for changiest words from first to second half of corpus

We have two subtley different methods for doing this.
The first compares the models directly, while the second looks only at the top 10,000 words in the vocabulary.
Using the entire models is more "correct", but with a small-ish corpus, the second method reduces the effects of low-occurance words and the output makes more sense.

### First by comparing the models

In [6]:
%%time
ranked_words_models = get_most_changey_words_with_models(model_1, model_2, n=10, k=1000)

Wall time: 27.7 s


In [7]:
ranked_words_models[:20]

[(22, 'retarded'),
 (36, 'conundrum'),
 (36, 'venture'),
 (39, 'exaggerated'),
 (39, 'spoof'),
 (39, 'whirlpool'),
 (40, 'fringe'),
 (41, 'sundial'),
 (42, 'corona'),
 (42, 'mouse'),
 (42, 'raising'),
 (43, 'joules'),
 (44, 'bing'),
 (46, 'expanded'),
 (46, 'flags'),
 (46, 'growth'),
 (46, 'repetition'),
 (46, 'sighting'),
 (46, 'trend'),
 (47, 'dying')]

In [8]:
model_1.wv.vocab["retarded"].count

19

### Then by comparing the vectors

In [9]:
def get_top_vocab_and_vectors(model, n=10000):
    """
    Gets the top n words from the model's vocabulary and the vectors of these words.
    """
    top_vocab = sorted(model.wv.vocab.keys(), key=lambda x: model.wv.vocab[x].count, reverse=True)[:n]
    top_vectors = np.array([model.wv[t] for t in top_vocab])
    return top_vocab, top_vectors

In [10]:
%%time
vocab_1, vectors_1 = get_top_vocab_and_vectors(model_1)
vocab_2, vectors_2 = get_top_vocab_and_vectors(model_2)

Wall time: 51 ms


In [11]:
%%time
ranked_words_vectors = get_most_changey_words_with_vectors(vocab_1, vocab_2, vectors_1, vectors_2, k=1000)

Wall time: 59.9 s


In [12]:
ranked_words_vectors[:20]

[(157, 'spoof'),
 (163, 'intuitively'),
 (177, 'retarded'),
 (177, 'whatâ\x80\x99s'),
 (208, 'individually'),
 (209, 'altogether'),
 (213, 'endlessly'),
 (225, 'conundrum'),
 (225, 'sweeping'),
 (233, 'beat'),
 (235, 'replacing'),
 (236, '3d'),
 (237, 'imho'),
 (237, 'unbelievable'),
 (238, 'qed'),
 (238, 'rethink'),
 (242, 'arises'),
 (242, 'oddly'),
 (242, 'raising'),
 (245, 'bone')]

In [13]:
neighbors("3d", vectors_1, vocab_1, 20)

['scientific',
 'my',
 'google',
 '3d',
 'been',
 'i',
 'video',
 ':',
 'article',
 'your',
 'zetetic',
 'wiki',
 'link',
 'posted',
 'rowbotham',
 'youtube',
 'straight',
 'new',
 'posting',
 'provided']

In [14]:
neighbors("3d", vectors_2, vocab_2, 20)

['sphere',
 'projection',
 'map',
 'globe',
 'd',
 'circle',
 'flat',
 'coordinate',
 'maps',
 'plane',
 'curved',
 'mercator',
 'scale',
 'bing',
 'onto',
 'radius',
 'lines',
 '3d',
 '2d',
 'spheroid']

## Look for more gradual change

In [15]:
%%time
time_models = dict()
# Train a language model for various different portions of the forum.
for w, w_posts in get_data_windows(fe_posts, 10000, 10000):
    time_models[w] = Word2Vec(toks.loc[w_posts.index], size=300)

Wall time: 24.4 s


In [64]:
def neighbours_over_time(search_term, time_models, top_n=10000):
    for window, curr_model in time_models.items():
        curr_vocab, curr_vectors = get_top_vocab_and_vectors(curr_model, top_n)
        print(window)
        if search_term in curr_vocab:
            print(neighbors(search_term, curr_vectors, curr_vocab, 12))
            
            
def neighbours_over_time_comma_delimited(query, time_models, top_n=10000):
    for window, curr_model in time_models.items():
        curr_vocab, curr_vectors = get_top_vocab_and_vectors(curr_model, top_n)
        if query in curr_vocab:
            print(window.strftime("%Y/%m/%d"), end=",")
            curr_neighbours = neighbors(query, curr_vectors, curr_vocab, 12)
            print(",".join(curr_neighbours[:6]))
            print("", end=",")
            print(",".join(curr_neighbours[6:]))
        else:
            print(window)

In [17]:
t100_fe_kw = pd.read_csv("C:/Users/Eddie/Documents/Datasets/Flat Earth/Keywords/top-100-fe-keywords.csv")
t100_kw_list = list(t100_fe_kw["ngram"])

In [18]:
all_fe_kw = pd.read_csv("C:/Users/Eddie/Documents/Datasets/Flat Earth/Keywords/all-fe-keywords.csv")
all_kw_list = list(all_fe_kw["ngram"])

### Look at some common FE related words

In [19]:
neighbours_over_time("flat", time_models)

2013-12-01 18:43:04
['round', 'globe', 'shape', 'evidence', 'this', 'theory', 'model', 'map', 'i', 'fe', 'believe', 'society']
2015-12-30 23:54:31
['round', 'globe', 'shape', 'fe', 'evidence', 'sphere', 'map', 'spherical', 'wrong', 'believe', 'true', 'google']
2017-01-16 01:34:55
['round', 'globe', 'fe', 'shape', 'map', 'evidence', 'model', 'believe', 'real', 'there', 'proof', 'true']
2017-11-09 16:57:19
['round', 'globe', 'fe', 'shape', 'believe', 'map', 'model', 'spherical', 'theory', 'sphere', 'conspiracy', 'true']
2018-04-20 09:03:11
['round', 'globe', 'fe', 'map', 'model', 'spherical', 'shape', 'sphere', 'moon', 'theory', 'believe', 'evidence']
2018-10-07 05:55:37
['round', 'globe', 'fe', 'spherical', 'shape', 'map', 'model', 'sphere', 'believe', 'i', 'moon', 'wrong']


In [65]:
neighbours_over_time_comma_delimited("flat", time_models)

2013/12/01,round,globe,shape,evidence,this,theory
,model,map,i,fe,believe,society
2015/12/30,round,globe,shape,fe,evidence,sphere
,map,spherical,wrong,believe,true,google
2017/01/16,round,globe,fe,shape,map,evidence
,model,believe,real,there,proof,true
2017/11/09,round,globe,fe,shape,believe,map
,model,spherical,theory,sphere,conspiracy,true
2018/04/20,round,globe,fe,map,model,spherical
,shape,sphere,moon,theory,believe,evidence
2018/10/07,round,globe,fe,spherical,shape,map
,model,sphere,believe,i,moon,wrong


In [21]:
neighbours_over_time("earth", time_models)

2013-12-01 18:43:04
['earthers', 'sun', 'horizon', 'model', 'world', 'map', 'surface', 'society', 'theory', 'pole', 'it', 'plane']
2015-12-30 23:54:31
['horizon', 'earthers', 'sun', 'moon', 'world', 'model', 'map', 'earther', 'surface', 'theory', 'globe', 'object']
2017-01-16 01:34:55
['world', 'earthers', 'map', 'horizon', 'sun', 'model', 'plane', 'moon', 'surface', 'earther', 'pole', 'theory']
2017-11-09 16:57:19
['earthers', 'world', 'sun', 'horizon', 'earther', 'model', 'moon', 'map', 'plane', 'surface', 'circle', 'theory']
2018-04-20 09:03:11
['sun', 'surface', 'moon', 'model', 'world', 'map', 'earthers', 'plane', 'horizon', 'light', 'sphere', 'earther']
2018-10-07 05:55:37
['surface', 'sun', 'plane', 'earthers', 'map', 'model', 'world', 'moon', 'horizon', 'earther', 'circle', 'maps']


In [22]:
neighbours_over_time("globe", time_models)

2013-12-01 18:43:04
['round', 'globe', 'shape', 'theory', 'model', 'evidence', 'map', 'sun', 'earth', 'surface', 'believe', 'curvature']
2015-12-30 23:54:31
['globe', 'round', 'map', 'shape', 'fe', 'model', 'sphere', 'earth', 'spherical', 'theory', 'evidence', 'moon']
2017-01-16 01:34:55
['round', 'globe', 'map', 'model', 'shape', 'fe', 'surface', 'earth', 'sphere', 'system', 'plane', 'based']
2017-11-09 16:57:19
['round', 'globe', 'shape', 'fe', 'model', 'map', 'sphere', 'earth', 'spherical', 'theory', 'plane', 'conspiracy']
2018-04-20 09:03:11
['round', 'globe', 'model', 'map', 'sphere', 'spherical', 'surface', 'fe', 'moon', 'shape', 'plane', 'theory']
2018-10-07 05:55:37
['globe', 'round', 'map', 'spherical', 'sphere', 'model', 'surface', 'plane', 'shape', 'fe', 'based', 'maps']


In [23]:
neighbours_over_time("disc", time_models)

2013-12-01 18:43:04
['sun', 'flat', 'model', 'round', 'surface', 'horizon', 'globe', 'map', 'observer', 'distance', 'shape', 'pole']
2015-12-30 23:54:31
['map', 'horizon', 'model', 'sun', 'earthers', 'globe', 'projection', 'surface', 'object', 'earther', 'circle', 'theory']
2017-01-16 01:34:55
['ice', 'map', 'model', 'surface', 'plane', 'globe', 'wall', 'horizon', 'object', 'force', 'flat', 'circle']
2017-11-09 16:57:19
['sun', 'horizon', 'surface', 'miles', 'moon', 'circle', 'equator', 'plane', 'object', 'degrees', 'pole', 'north']
2018-04-20 09:03:11
['surface', 'sun', 'moon', 'plane', 'sphere', 'light', '=', 'distance', 'model', 'miles', 'circle', 'north']
2018-10-07 05:55:37
['sun', 'north', 'plane', 'map', 'earth', 'pole', 'circle', 'miles', 'equator', 'model', 'south', 'moon']


In [24]:
neighbours_over_time("ua", time_models)

2013-12-01 18:43:04
['theory', 'sun', 'evidence', 'no', 'matter', 'acceleration', 'model', 'light', 'aether', 'earth', 'effect', 'exist']
2015-12-30 23:54:31
['theory', 'evidence', 'sun', 'flat', 'model', 'globe', 'earth', 'does', 'proof', 'fe', 'round', 'shape']
2017-01-16 01:34:55
['perspective', 'fe', 'acceleration', 'theory', 'evidence', 'model', 'fet', 'anything', 'an', 'flat', 'round', 'tom']
2017-11-09 16:57:19
['model', 'perspective', 'force', 'acceleration', 'gravitation', 'ua', 'exist', 'shape', 'map', 'theory', 'earth', 'round']
2018-04-20 09:03:11
['model', 'theory', 'fe', 'evidence', 'effect', 'gravity', 'round', 'perspective', 'coriolis', 'force', 'refraction', 'matter']
2018-10-07 05:55:37
['force', 'evidence', 'model', 'effect', 'ua', 'fe', 'flat', 'theory', 'acceleration', 'true', 'light', 'round']


In [25]:
neighbours_over_time("ice", time_models)

2013-12-01 18:43:04
['earth', 'light', 'observer', 'north', 'surface', 'gravity', 'distance', '=', 'object', 'pole', 'miles', 'moon']
2015-12-30 23:54:31
['wall', 'pole', 'north', 'horizon', 'object', 'map', '"', 'celestial', 'surface', 'projection', 'globe', 'lines']
2017-01-16 01:34:55
['wall', 'pole', 'north', 'force', 'south', 'object', 'acceleration', 'gravity', 'be', 'surface', 'map', 'plane']
2017-11-09 16:57:19
['wall', 'pole', 'south', 'north', 'object', 'force', 'equator', 'light', 'horizon', 'has', 'earth', 'gravity']
2018-04-20 09:03:11
['miles', 'flat', 'pole', 'ice', 'map', 'earth', 'round', 'north', 'surface', 'wall', 'globe', '100']
2018-10-07 05:55:37
['wall', "'", 'model', 'pole', 'earth', 'theory', 'evidence', 'body', '=', 'dome', 'north', 'has']


In [26]:
neighbours_over_time("wall", time_models)

2013-12-01 18:43:04
['light', 'earth', 'observer', 'distance', 'surface', 'north', 'pole', 'gravity', 'miles', 'horizon', 'moon', 'object']
2015-12-30 23:54:31
['ice', 'north', 'pole', 'horizon', 'map', '"', 'south', 'object', 'celestial', 'equator', 'projection', 'surface']
2017-01-16 01:34:55
['wall', 'pole', 'north', 'south', 'horizon', 'force', 'object', 'map', 'gravity', 'be', 'no', 'equator']
2017-11-09 16:57:19
['ice', 'pole', 'evidence', 'no', 'south', '"', 'been', 'object', 'force', 'north', 'gravity', 'shadow']
2018-04-20 09:03:11
['surface', "'", 'map', 'plane', 'pole', 'miles', 'model', 'globe', 'round', 'earth', 'wall', 'there']
2018-10-07 05:55:37
['ice', "'", 'pole', 'evidence', 'there', 'north', 'no', 'model', 'body', 'force', 'problem', 'effect']


### Look at some of the top keywords

In [27]:
for w in t100_kw_list[:10]:
    print(w)
    print("-----------------------------------")
    neighbours_over_time(w, time_models)
    print("-----------------------------------")

longitude
-----------------------------------
2013-12-01 18:43:04
['observer', '=', 'sun', '\x94', 'distance', 'north', 'light', 'degrees', 'horizon', 'during', 'south', 'sky']
2015-12-30 23:54:31
['north', 'km', 'south', '°', '=', 'degrees', 'miles', 'equator', 'east', 'pole', 'angle', '̂°']
2017-01-16 01:34:55
['line', 'straight', 'north', 'light', 'southern', 'distance', 'south', 'between', 'speed', 'pole', 'northern', 'east']
2017-11-09 16:57:19
['miles', 'km', 'equator', '=', 'distance', 'between', 'north', 'south', 'line', 'angle', 'lines', 'straight']
2018-04-20 09:03:11
['south', 'degrees', 'pole', 'between', '=', 'miles', 'distance', 'lines', 'equator', 'east', 'latitude', 'distances']
2018-10-07 05:55:37
['south', 'equator', 'between', 'latitude', 'degrees', 'line', 'km', 'pole', 'east', 'distance', 'longitude', 'distances']
-----------------------------------
circumference
-----------------------------------
2013-12-01 18:43:04
['observer', 'distance', 'miles', 'horizon', 'n

### Look for the changiest words

In [28]:
def get_changiest_words_per_window(time_models, top_n=10000):
    out_dic = dict()
    windows = list(time_models.keys())
    for i in range(1, len(windows)):
        model_1 = time_models[windows[i-1]]
        model_2 = time_models[windows[i]]

        vocab_1, vectors_1 = get_top_vocab_and_vectors(model_1, top_n)
        vocab_2, vectors_2 = get_top_vocab_and_vectors(model_2, top_n)

        out_dic[windows[i]] = get_most_changey_words_with_vectors(vocab_1, vocab_2, vectors_1, vectors_2, k=1000)

    return out_dic

In [29]:
%%time
changiest_words_per_window = get_changiest_words_per_window(time_models, 5000)

Wall time: 1min 53s


In [30]:
merge_lists = lambda x: list(itertools.chain.from_iterable(x))
all_words = set(merge_lists([[cw[1] for cw in cws] for cws in changiest_words_per_window.values()]))

In [31]:
def get_words_in_all_windows(changiest_words_per_window):
    words_in_each_window = [set([cw[1] for cw in cws]) for cws in changiest_words_per_window.values()]
    words_in_all_windows = words_in_each_window[0].intersection(*words_in_each_window[1:])
    return words_in_all_windows

In [32]:
words_in_all_windows = get_words_in_all_windows(changiest_words_per_window)

In [33]:
for window, changey_words in changiest_words_per_window.items():
    print(window)
    t20_words = [f"{w[1]} {w[0]}" for w in changey_words[:20]]
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[:5]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[5:10]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[10:15]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[15:20]))
    print("-----------------------------")

2015-12-30 23:54:31
corrected 254        cancer 267           respect 294          trade 328            witnessed 331       
universally 332      f 333                blah 334             alt 356              quiet 360           
invent 361           offers 363           spreading 364        slightest 368        freely 372          
> 373                google 377           ] 379                [ 382                closed 385          
-----------------------------
2017-01-16 01:34:55
particularly 203     functions 210        junk 221             3d 248               peoples 255         
young 257            ** 276               terrible 278         moderate 306         flatness 307        
super 309            directed 322         representative 324   plainly 325          notes 326           
heavily 328          paint 332            universally 336      sattelites 338       explaining 339      
-----------------------------
2017-11-09 16:57:19
parallax 224         compute 260        

In [68]:
from word_vector_change import print_changiest_over_time

print_changiest_over_time(changiest_words_per_window, time_models, min_freq=30, remove_func=False)

2015-12-30 23:54:31
corrected            cancer               respect              f                    alt                 
slightest            google               closed               expected             experts             
ultimately           particularly         fully                junker               watching            
direct               existed              new                  includes             non                 
-----------------------------
2017-01-16 01:34:55
particularly         3d                   terrible             super                explaining          
became               giving               fully                display              pilots              
sunsets              deeper               includes             becoming             necessarily         
consistently         precisely            confusion            similarly            non                 
-----------------------------
2017-11-09 16:57:19
parallax             compute            

In [100]:
neighbours_over_time("parallax", time_models)

2013-12-01 18:43:04
['sun', 'miles', 'light', 'observer', '\x94', 'north', 'distance', 'moon', 'degrees', 'eclipse', 'solar', 'during']
2015-12-30 23:54:31
['=', 'sun', 'miles', 'km', '°', 'pole', 'north', 'diameter', 'object', 'gravity', 'light', 'degrees']
2017-01-16 01:34:55
['miles', 'degrees', 'speed', 'light', 'due', 'sun', 'feet', 'km', 'west', 'm', 'between', 'away']
2017-11-09 16:57:19
['tom', 'evidence', 'rowbotham', 'been', 'i', 'anyone', 'thread', 'who', 'science', 'question', 'someone', 'me']
2018-04-20 09:03:11
['[', 'years', 'theory', ']', '|', 'm', '24', 'been', '10', 'days', 'miles', 'â\x80']
2018-10-07 05:55:37
['relativity', '=', 'acceleration', 'sagnac', 'gravity', 'force', 'system', 'coriolis', 'based', 'rowbotham', 'body', 'by']


In [101]:
neighbours_over_time("corrected", time_models)

2013-12-01 18:43:04
["'m", 'am', 'post', "'ll", 'thread', 'have', 'want', 'i', 'had', 'read', 'question', "'d"]
2015-12-30 23:54:31
['=', '°', 'â°', '2016', 'miles', 'â', 'years', '10', 'm', 'x', 'degrees', 'per']
2017-01-16 01:34:55
['m', 'miles', '1', '2', '/', 'x', 'km', '+', 'hours', '*', '%', 'years']
2017-11-09 16:57:19
['=', 'degrees', 'miles', 'years', 'km', '0.0', '+', 'south', '10', 'm', 'feet', 'north']
2018-04-20 09:03:11
['years', '10', 'thread', 'post', '[', '|', 'm', 'degrees', ']', '*', 'days', 'hours']
2018-10-07 05:55:37
['done', 'miles', 'years', 'm', 'he', '=', 'i', 'feet', 'km', 'someone', 'wrong', 'me']


In [37]:
neighbours_over_time("technically", time_models)

2013-12-01 18:43:04
["'m", "n't", 'sure', 'wrong', 'exist', 'talking', 'know', 'saying', ')', 'there', 'not', '\x94']
2015-12-30 23:54:31
['sun', 'appear', 'object', 'away', 'light', 'pole', 'moon', 'flat', 'distance', 'lines', 'miles', 'it']
2017-01-16 01:34:55
['lines', "'", 'gravity', 'light', 'ice', 'north', 'acceleration', 'straight', 'miles', 'line', 'pole', 'force']
2017-11-09 16:57:19
['horizon', 'gravity', 'sun', 'degrees', 'miles', '0.0', 'wrong', 'flat', 'object', 'gravitation', 'eye', 'mean']
2018-04-20 09:03:11
["'m", 'saying', 'well', 'been', "'re", 'are', 'he', 'wrong', 'said', 'good', 'sure', 'pretty']
2018-10-07 05:55:37
['m', 'miles', 'km', 'x', 'degrees', 'feet', '>', 'per', '+', 'north', '/', 'acceleration']


In [102]:
neighbours_over_time("particularly", time_models)

2013-12-01 18:43:04
['=', 'i', 'sure', '1', 'am', '[', "'re", ')', ']', '2', 'do', 'talking']
2015-12-30 23:54:31
['â°', '°', 'km', 'â', 'miles', '2016', 'x', '̂°', 'diameter', 'degrees', 'per', '10']
2017-01-16 01:34:55
["'ve", 'he', 'me', 'did', 'am', 'questions', 'answer', 'tom', 'please', 'does', 'question', 'i']
2017-11-09 16:57:19
['degrees', 'miles', 'years', 'km', '0.0', '+', 'm', '10', 'my', 'topic', 'minutes', '1']
2018-04-20 09:03:11
['tom', 'me', 'who', 'thread', 'evidence', 'wrong', 'trying', 'question', 'am', 'good', 'sure', 'talking']
2018-10-07 05:55:37
['science', 'me', 'who', 'relativity', 'my', 'evidence', '=', 'tom', 'wrong', 'm', 'scientific', 'sure']


In [39]:
neighbours_over_time("infrared", time_models)

2013-12-01 18:43:04
['sun', 'observer', 'miles', '\x94', 'light', 'north', 'distance', 'moon', '[', 'degrees', 'during', 'km']
2015-12-30 23:54:31
['°', '=', 'km', 'miles', 'degrees', '̂°', 'x', 'diameter', 'â', 'per', 'north', 'east']
2017-01-16 01:34:55
['flight', 'hours', 'years', 'm', 'miles', 'west', 'flights', 'moved', 'sydney', 'southern', 'times', 'x']
2017-11-09 16:57:19
['degrees', 'miles', 'km', '0.0', '+', 'm', 'hours', '10', 'feet', 'per', '15', 'minutes']
2018-04-20 09:03:11
['|', 'into', 'ago', 'back', '[', 'launch', 'own', 'â\x80', 'years', ']', 'up', 'try']
2018-10-07 05:55:37
['m', 'degrees', 'miles', 'feet', 'km', 'height', 'east', 'x', 'eye', 'west', 'object', 'observer']


In [40]:
neighbours_over_time("standpoint", time_models)

2013-12-01 18:43:04
['i', 'question', 'thread', 'make', 'claim', 'answer', 'he', 'please', 'own', 'find', "'m", 'forum']
2015-12-30 23:54:31
['=', 'â°', 'km', '2016', 'years', 'â', 'x', '10', '̂°', '̂', 'per', "'"]
2017-01-16 01:34:55
['been', '=', 'moved', 'ice', 'he', 'theory', 'be', 'earth', 'earthers', 'model', 'anyone', 'based']
2017-11-09 16:57:19
['miles', '=', 'km', 'south', 'north', 'years', 'away', 'pole', '0.0', 'distance', 'hours', '10']
2018-04-20 09:03:11
['miles', 'degrees', 'm', 'sagnac', 'effect', '10', '[', 'feet', '1', 'light', 'theory', '|']
2018-10-07 05:55:37
['object', 'eye', 'horizon', 'explanation', 'earth', 'map', 'question', 'm', 'observer', 'surface', 'wiki', 'point']


### Looking only at words in all windows

In [41]:
for window, changey_words in changiest_words_per_window.items():
    print(window)
    t20_words = [f"{w[1]} {w[0]}" for w in [x for x in changey_words if x[1] in words_in_all_windows][:20]]
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[:5]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[5:10]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[10:15]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[15:20]))
    print("-----------------------------")

2015-12-30 23:54:31
corrected 254        cancer 267           respect 294          witnessed 331        universally 332     
f 333                invent 361           slightest 368        freely 372           > 373               
google 377           ] 379                [ 382                closed 385           expected 389        
active 395           firstly 405          referred 406         seemingly 409        experts 412         
-----------------------------
2017-01-16 01:34:55
particularly 203     functions 210        3d 248               young 257            terrible 278        
flatness 307         super 309            directed 322         heavily 328          universally 336     
explaining 339       seemingly 346        solely 347           became 358           entry 362           
giving 365           readily 365          fully 375            greatly 378          display 379         
-----------------------------
2017-11-09 16:57:19
parallax 224         directed 273       

In [42]:
print_changiest_over_time(changiest_words_per_window, time_models, word_list=words_in_all_windows, min_freq=30, remove_func=False)

2015-12-30 23:54:31
corrected            cancer               respect              f                    slightest           
google               closed               expected             experts              ultimately          
particularly         fully                watching             direct               existed             
new                  includes             non                  folks                pm                  
-----------------------------
2017-01-16 01:34:55
particularly         3d                   terrible             super                explaining          
became               giving               fully                display              pilots              
sunsets              deeper               includes             becoming             necessarily         
consistently         precisely            confusion            similarly            non                 
-----------------------------
2017-11-09 16:57:19
parallax             super              

In [43]:
neighbours_over_time("particularly", time_models)

2013-12-01 18:43:04
['=', 'i', 'sure', '1', 'am', '[', "'re", ')', ']', '2', 'do', 'talking']
2015-12-30 23:54:31
['â°', '°', 'km', 'â', 'miles', '2016', 'x', '̂°', 'diameter', 'degrees', 'per', '10']
2017-01-16 01:34:55
["'ve", 'he', 'me', 'did', 'am', 'questions', 'answer', 'tom', 'please', 'does', 'question', 'i']
2017-11-09 16:57:19
['degrees', 'miles', 'years', 'km', '0.0', '+', 'm', '10', 'my', 'topic', 'minutes', '1']
2018-04-20 09:03:11
['tom', 'me', 'who', 'thread', 'evidence', 'wrong', 'trying', 'question', 'am', 'good', 'sure', 'talking']
2018-10-07 05:55:37
['science', 'me', 'who', 'relativity', 'my', 'evidence', '=', 'tom', 'wrong', 'm', 'scientific', 'sure']


In [44]:
neighbours_over_time("leg", time_models)

2013-12-01 18:43:04
['do', "n't", 'please', 'let', 'you', 'want', 'i', 'what', "'re", 'claim', 'why', 'how']
2015-12-30 23:54:31
['°', 'km', '=', 'miles', 'north', '̂°', 'east', 'degrees', 'south', 'â', 'x', 'diameter']
2017-01-16 01:34:55
['=', 'degrees', 'hours', 'north', 'west', 'feet', 'east', 'south', 'm', 'angle', 'southern', 'away']
2017-11-09 16:57:19
['able', 'ca', 'want', 'does', 'miles', '0.0', 'degrees', 'will', 'they', 'try', 'need', "'ll"]
2018-04-20 09:03:11
['[', '/', ']', '|', 'm', '10', 'degrees', '2', '(', 'sagnac', '*', '+']
2018-10-07 05:55:37
['who', 'evidence', 'based', 'claims', 'wiki', 'world', 'thread', 'claim', 'rowbotham', 'nasa', 'post', 'science']


In [45]:
neighbours_over_time("insane", time_models)

2013-12-01 18:43:04
['i', 'earthers', 'he', 'who', "'", 'people', 'been', 'flat', '=', 'earth', 'you', 'ghosts']
2015-12-30 23:54:31
['â°', '°', '=', '2016', 'â', 'miles', 'x', '10', '̂°', 'm', '̂', 'degrees']
2017-01-16 01:34:55
['he', 'tom', 'bishop', '1', 'm', 'evidence', 'science', 'gravity', 'acceleration', 'nasa', 'been', 'god']
2017-11-09 16:57:19
['flat', '0.0', 'evidence', 'questions', 'many', 'these', 'been', '+', 'round', "'ve", '!', "'re"]
2018-04-20 09:03:11
["'re", 'thread', 'who', "'m", 'people', 'want', 'please', 'own', 'post', 'questions', 'read', 'am']
2018-10-07 05:55:37
['mass', 'km', 'miles', 'degrees', 'm', 'force', 'acceleration', 'bodies', 'gravitational', 'feet', 'less', 'north']


In [46]:
neighbours_over_time("unreasonable", time_models)

2013-12-01 18:43:04
['he', 'i', 'am', 'claim', 'sure', 'there', 'question', 'me', 'my', "'ve", 'you', 'evidence']
2015-12-30 23:54:31
['am', '=', 'bishop', '[', '2016', 'my', 'let', "'ll", '°', 'please', ']', 'post']
2017-01-16 01:34:55
['he', 'been', 'tom', 'let', 'me', 'bishop', 'who', 'trying', 'please', 'i', 'sure', 'answer']
2017-11-09 16:57:19
["'m", 'people', 'tom', 'who', 'wrong', 'do', 'does', 'he', 'questions', 'me', 'you', 'proof']
2018-04-20 09:03:11
["'m", 'ca', 'do', 'did', '=', 'me', 'am', 'let', 'sure', 'tom', 'understand', 'seems']
2018-10-07 05:55:37
['m', "'", '=', 'degrees', 'feet', 'km', 'north', 'x', 'south', 'per', 'pole', 'hours']


In [47]:
neighbours_over_time("idiots", time_models)

2013-12-01 18:43:04
['[', "'", ']', '\x94', 'miles', 'space', 'observer', '/', '1', 'north', 'km', 'x']
2015-12-30 23:54:31
['they', 'you', 'me', "'m", 'earthers', 'those', 'do', 'want', 'are', 'these', "'re", 'who']
2017-01-16 01:34:55
["'re", 'are', 'who', 'years', 'been', "'m", 'questions', 'many', 'am', 'were', "'ve", 'topic']
2017-11-09 16:57:19
["'re", '=', 'trying', 'who', 'talking', 'going', 'wrong', 'people', 'did', 'saying', "'d", "'ll"]
2018-04-20 09:03:11
['their', 'who', 'years', 'space', 'these', 'those', 'many', 'they', 'did', 'ca', 'nasa', 'things']
2018-10-07 05:55:37
['who', 'these', 'many', 'nasa', 'talking', 'years', 'science', 'relativity', 'rowbotham', 'he', '=', 'wrong']


### Looking at changiest FE Keywords

In [48]:
for window, changey_words in changiest_words_per_window.items():
    print(window)
    t20_words = [f"{w[1]} {w[0]}" for w in [x for x in changey_words if x[1] in all_kw_list][:20]]
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[:5]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[5:10]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[10:15]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[15:20]))
    print("-----------------------------")

2015-12-30 23:54:31
corrected 254        witnessed 331        freely 372           firstly 405          pm 436              
sunsets 436          eers 446             surveyors 449        pilots 455           instruments 456     
mathematically 456   cavendish 457        compasses 457        distorted 464        faking 464          
assumes 466          trigonometry 466     amateur 468          eratosthenes 468     methodology 468     
-----------------------------
2017-01-16 01:34:55
flatness 307         explaining 339       pilots 379           blindly 381          sunsets 390         
deeper 392           barrier 407          foucault 409         illustrate 412       testable 418        
string 419           everyday 430         reproduce 434        sufficiently 438     demonstrating 443   
rendering 443        principle 450        lasers 452           scientifically 455   continually 456     
-----------------------------
2017-11-09 16:57:19
parallax 224         compute 260        

In [49]:
print_changiest_over_time(changiest_words_per_window, time_models, word_list=all_kw_list)

2015-12-30 23:54:31
corrected            witnessed            freely               firstly              pm                  
sunsets              eers                 surveyors            pilots               instruments         
mathematically       cavendish            compasses            distorted            faking              
assumes              trigonometry         amateur              eratosthenes         methodology         
-----------------------------
2017-01-16 01:34:55
flatness             explaining           pilots               blindly              sunsets             
deeper               barrier              foucault             illustrate           testable            
string               everyday             reproduce            sufficiently         demonstrating       
rendering            principle            lasers               scientifically       continually         
-----------------------------
2017-11-09 16:57:19
parallax             compute            

In [50]:
for window, changey_words in changiest_words_per_window.items():
    print(window)
    t20_words = [f"{w[1]} {w[0]}" for w in [x for x in changey_words if x[1] in t100_kw_list and x[1] in words_in_all_windows][:20]]
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[:5]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[5:10]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[10:15]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[15:20]))
    print("-----------------------------")

2015-12-30 23:54:31
cavendish 457        trigonometry 466     eratosthenes 468     magnification 496    curving 513         
enag 536             mercator 550         vanishing 550        bedford 556          gps 566             
auckland 570         santiago 577         everest 579          azimuthal 587        bi-polar 591        
johannesburg 592     balloons 597         diagrams 598         equidistant 600      accelerator 617     
-----------------------------
2017-01-16 01:34:55
trigonometry 485     gps 497              diagrams 504         eratosthenes 523     mercator 568        
accelerator 580      bedford 580          cavendish 589        enag 592             bi-polar 609        
auckland 610         balloons 623         rowbotham 628        everest 633          azimuthal 639       
coriolis 647         distances 647        magnification 653    santiago 654         refraction 659      
-----------------------------
2017-11-09 16:57:19
bedford 551          cavendish 567      

In [51]:
print_changiest_over_time(changiest_words_per_window, time_models, word_list=t100_kw_list, min_freq=50)

2015-12-30 23:54:31
surveyors            cavendish            trigonometry         geodetic             magnification       
gleason              mercator             vanishing            gps                  santiago            
azimuthal            johannesburg         equidistant          equatorial           converge            
longitude            ua                   coriolis             ret                  sunset              
-----------------------------
2017-01-16 01:34:55
gps                  diagrams             unipolar             qantas               lat                 
balloons             rowbotham            coriolis             distances            magnification       
santiago             refraction           vanishing            fe'ers               longitude           
ret                  equinox              ua                   hemiplane            latitudes           
-----------------------------
2017-11-09 16:57:19
trigonometry         gps                

In [52]:
neighbours_over_time("gyroscope", time_models)

2013-12-01 18:43:04
['miles', '\x94', 'observer', 'north', 'during', 'sun', 'distance', 'degrees', 'solar', 'km', 'light', 'between']
2015-12-30 23:54:31
['°', '=', 'km', '2016', '̂°', 'â', 'miles', 'x', '̂', 'diameter', '10', 'north']
2017-01-16 01:34:55
['round', "n't", 'has', 'make', 'explain', 'fe', 'not', 'theory', 'map', 'prove', 'globe', 'work']
2017-11-09 16:57:19
['=', 'sun', 'miles', 'north', 'km', 'light', 'horizon', 'south', '0.0', 'moon', 'speed', 'object']
2018-04-20 09:03:11
['light', 'moon', 'north', 'line', 'south', 'surface', 'earth', 'speed', 'observer', 'angle', 'horizon', 'straight']
2018-10-07 05:55:37
['m', 'x', 'degrees', 'years', 'per', 'miles', 'km', 'feet', '[', '+', '15', 'sagnac']


In [53]:
neighbours_over_time("infrared", time_models)

2013-12-01 18:43:04
['sun', 'observer', 'miles', '\x94', 'light', 'north', 'distance', 'moon', '[', 'degrees', 'during', 'km']
2015-12-30 23:54:31
['°', '=', 'km', 'miles', 'degrees', '̂°', 'x', 'diameter', 'â', 'per', 'north', 'east']
2017-01-16 01:34:55
['flight', 'hours', 'years', 'm', 'miles', 'west', 'flights', 'moved', 'sydney', 'southern', 'times', 'x']
2017-11-09 16:57:19
['degrees', 'miles', 'km', '0.0', '+', 'm', 'hours', '10', 'feet', 'per', '15', 'minutes']
2018-04-20 09:03:11
['|', 'into', 'ago', 'back', '[', 'launch', 'own', 'â\x80', 'years', ']', 'up', 'try']
2018-10-07 05:55:37
['m', 'degrees', 'miles', 'feet', 'km', 'height', 'east', 'x', 'eye', 'west', 'object', 'observer']


In [54]:
neighbours_over_time("cgi", time_models)

2013-12-01 18:43:04
['flat', 'claim', "n't", 'round', 'theory', 'there', 'exist', 'what', '"', 'why', 'wrong', 'anything']
2015-12-30 23:54:31
['there', 'who', 'nasa', 'they', 'evidence', 'science', 'he', 'me', 'sense', 'years', 'someone', 'things']
2017-01-16 01:34:55
['years', 'he', 'who', 'nasa', 'tom', 'people', 'many', '=', 'ago', 'questions', 'done', 'i']
2017-11-09 16:57:19
['flat', 'model', 'theory', 'globe', 'fe', 'round', 'wall', 'gravity', 'map', 'conspiracy', "'", 'force']
2018-04-20 09:03:11
['do', 'nasa', 'people', 'they', 'who', 'evidence', 'he', 'flat', 'exist', 'round', 'did', 'science']
2018-10-07 05:55:37
['nasa', 'space', 'he', 'science', 'who', 'people', 'fake', 'rowbotham', 'his', 'someone', 'many', 'wrong']


In [55]:
for i, t in toks.items():
    if "â«" in t:
        print(i)

85446
92661
92810
93585
93605
93666
94010
94604
94653
95157
96200
96615
96625
97000
97109
97522
97674
97686
97984
98280
98881
99385
99567
99770
100124
100812
101304
101337
102041
102042
102828
102882
102895
104249
104353
104355
104754
105211
105297
105560
105589
105790
105844
105871
105908
105947
106378
106443
106489
106519
106832
106833
106969
107040
107211
107396
107912
107923
108445
108661
108702
108914
109350
109683
109715
111060
111126
111367
111402
111408
111828
111895
138706
152083
152600
157455
157875
171462
195229


# Using Pre-Trained Embeddings

I don't think anything I've got here will work, as I need full models rather than keyed vectors.

This means I'd probably need to train my own model.

In [56]:
# import gensim.downloader

# # Download the "glove-twitter-25" embeddings
# glove_vectors = gensim.downloader.load('glove-twitter-25')

In [57]:
# %%time
# time_models_retrained = dict()
# # Train a language model for various different portions of the forum.
# for w, w_posts in get_data_windows(fe_posts, 10000, 10000):
#     model = None # Need to load a model here
#     curr_toks = toks.loc[w_posts.index]
#     model.build_vocab(curr_toks, update=True)
#     model.train(curr_toks, total_examples=model.corpus_count, epochs=model.epochs)
    
#     time_models_retrained[w] = model