In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import sys
import os
import json
import itertools

from sklearn.preprocessing import StandardScaler
from nltk import ngrams as make_ngrams
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist

sys.path.insert(1, "../")
sys.path.insert(1, "../utilities")

from helpers import load_posts, load_toks, load_pos, get_top_n_toks
from language_change_methods.vnc import VNC, plot_vnc
from language_change_methods.utility_functions import get_data_windows, get_time_windows, basic_preprocessing
from language_change_methods.features import get_tok_counts, function_words, combine_counts, make_feature_matrix

# This method calculates cosine distance between two vectors.
from scipy.spatial.distance import cosine as cosine_dist
# This method simply inverts it to get similarity.
cosine_sim = lambda x,y: 1 - cosine_dist(x,y)

from sklearn.metrics import jaccard_score

from gensim.models import Word2Vec

# suppress some deprecation warning..
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from settings import TFES_FP as DB_FP, TFES_TOK_FP

## Load Data

In [2]:
%%time
all_posts = load_posts(DB_FP)

from helpers import flat_earth_boards, off_topic_boards as other_boards

fe_posts = all_posts.query("board_id in @flat_earth_boards")

toks = {int(x[0]): x[1] for x in load_toks(TFES_TOK_FP)}
toks = pd.Series(toks)
toks = toks[toks.index.isin(fe_posts.index)]

fe_posts = fe_posts.loc[toks.index]
fe_posts.sort_values("time", ascending=True)
toks = toks.loc[fe_posts.index]

Wall time: 16.8 s


## Train Models Over Time

In this section we will train the models on our data. First we'll look at two time periods - the first and second half of our data.

In [3]:
first_half = toks.iloc[:int(len(fe_posts)/2)]
second_half = toks.iloc[int(len(fe_posts)/2):]

In [4]:
%%time
model_1 = Word2Vec(first_half, size=300)
model_2 = Word2Vec(second_half, size=300)

Wall time: 24.2 s


## Some Useful Functions

In [5]:
def get_most_changey_words_with_models(model1, model2, n=100, k=1000, top_n=None):
    nn_scores = []
    
    top_vocab = sorted(model1.wv.vocab.keys(), key=lambda x: model1.wv.vocab[x].count, reverse=True)[:top_n]
    
    vocab1 = model1.wv.vocab
    vocab2 = model2.wv.vocab
    # Loop through all the words in the vocab
    for w in vocab1:
        if (w not in function_words 
                and w in vocab1 
                and w in vocab2 
                and vocab1[w].count > n 
                and vocab2[w].count > n 
                and w in top_vocab):
            neighbours1 = set([x[0] for x in model1.wv.most_similar(w, topn=k)])
            neighbours2 = set([x[0] for x in model2.wv.most_similar(w, topn=k)])
            nn_scores.append((len(neighbours1.intersection(neighbours2)), w))
            
    nn_scores_sorted = sorted(nn_scores)
    return nn_scores_sorted



def neighbors(query : str,
              embs: np.ndarray,
              vocab: list,
              K : int = 3) -> list:
    sims = np.dot(embs[vocab.index(query),],embs.T)
    output = []
    for sim_idx in sims.argsort()[::-1][1:(1+K)]:
        if sims[sim_idx] > 0:
            output.append(vocab[sim_idx])
    return output



def get_most_changey_words_with_vectors(vocab1, vocab2, vectors1, vectors2, n=20, k=1000):
    nn_scores = []
    # Loop through all the words in the vocab
    for w in vocab1:
        if w not in function_words and w in vocab1 and w in vocab2:
            neighbours1 = set(neighbors(w, vectors1, vocab1, k))
            neighbours2 = set(neighbors(w, vectors2, vocab2, k))
            nn_scores.append((len(neighbours1.intersection(neighbours2)), w))
            
    nn_scores_sorted = sorted(nn_scores)
    return nn_scores_sorted

## Look for changiest words from first to second half of corpus

We have two subtley different methods for doing this.
The first compares the models directly, while the second looks only at the top 10,000 words in the vocabulary.
Using the entire models is more "correct", but with a small-ish corpus, the second method reduces the effects of low-occurance words and the output makes more sense.

### First by comparing the models

In [6]:
%%time
ranked_words_models = get_most_changey_words_with_models(model_1, model_2, n=10, k=1000)

Wall time: 28.8 s


In [7]:
ranked_words_models[:20]

[(21, 'retarded'),
 (34, 'conundrum'),
 (37, 'fringe'),
 (37, 'venture'),
 (39, 'joules'),
 (39, 'whirlpool'),
 (40, 'flags'),
 (40, 'inflated'),
 (40, 'ir'),
 (41, 'corona'),
 (41, 'growth'),
 (42, "'t"),
 (43, 'exaggerated'),
 (43, 'mouse'),
 (43, 'spoof'),
 (44, 'bing'),
 (44, 'expanded'),
 (45, 'trend'),
 (48, 'appropriately'),
 (49, '----')]

In [8]:
model_1.wv.vocab["retarded"].count

19

### Then by comparing the vectors

In [9]:
def get_top_vocab_and_vectors(model, n=10000):
    """
    Gets the top n words from the model's vocabulary and the vectors of these words.
    """
    top_vocab = sorted(model.wv.vocab.keys(), key=lambda x: model.wv.vocab[x].count, reverse=True)[:n]
    top_vectors = np.array([model.wv[t] for t in top_vocab])
    return top_vocab, top_vectors

In [10]:
%%time
vocab_1, vectors_1 = get_top_vocab_and_vectors(model_1)
vocab_2, vectors_2 = get_top_vocab_and_vectors(model_2)

Wall time: 49.5 ms


In [11]:
%%time
ranked_words_vectors = get_most_changey_words_with_vectors(vocab_1, vocab_2, vectors_1, vectors_2, k=1000)

Wall time: 57.3 s


In [12]:
ranked_words_vectors[:20]

[(145, 'intuitively'),
 (155, 'spoof'),
 (167, 'retarded'),
 (178, 'whatâ\x80\x99s'),
 (206, 'endlessly'),
 (208, 'altogether'),
 (214, 'conundrum'),
 (215, 'sweeping'),
 (216, 'individually'),
 (230, 'beat'),
 (232, 'growth'),
 (233, 'arises'),
 (234, 'uncertain'),
 (235, 'bone'),
 (237, 'imho'),
 (237, 'replacing'),
 (238, 'qed'),
 (238, 'unbelievable'),
 (242, 'amazingly'),
 (242, 'nailed')]

In [13]:
neighbors("3d", vectors_1, vocab_1, 20)

['scientific',
 'google',
 'my',
 '3d',
 'been',
 'i',
 'video',
 'article',
 'zetetic',
 'your',
 ':',
 'wiki',
 'posted',
 'link',
 'rowbotham',
 'provided',
 'youtube',
 'd',
 'original',
 'posting']

In [14]:
neighbors("3d", vectors_2, vocab_2, 20)

['sphere',
 'projection',
 'map',
 'globe',
 'd',
 'circle',
 'flat',
 'coordinate',
 'maps',
 'plane',
 'curved',
 'lines',
 'bing',
 'scale',
 'mercator',
 '3d',
 'onto',
 '2d',
 'spheroid',
 'radius']

## Look for more gradual change

In [15]:
%%time
time_models = dict()
# Train a language model for various different portions of the forum.
for w, w_posts in get_data_windows(fe_posts, 10000, 10000):
    time_models[w] = Word2Vec(toks.loc[w_posts.index], size=300)

Wall time: 24.3 s


In [16]:
def neighbours_over_time(search_term, time_models, top_n=10000):
    for window, curr_model in time_models.items():
        curr_vocab, curr_vectors = get_top_vocab_and_vectors(curr_model, top_n)
        print(window)
        if search_term in curr_vocab:
            print(neighbors(search_term, curr_vectors, curr_vocab, 12))
            
            
def neighbours_over_time_comma_delimited(query, time_models, top_n=10000):
    for window, curr_model in time_models.items():
        curr_vocab, curr_vectors = get_top_vocab_and_vectors(curr_model, top_n)
        if query in curr_vocab:
            print(window.strftime("%Y/%m/%d"), end=",")
            curr_neighbours = neighbors(query, curr_vectors, curr_vocab, 12)
            print(",".join(curr_neighbours[:6]))
            print("", end=",")
            print(",".join(curr_neighbours[6:]))
        else:
            print(window)

In [17]:
t100_fe_kw = pd.read_csv("../data/top-100-fe-keywords.csv")
t100_kw_list = list(t100_fe_kw["ngram"])

In [18]:
all_fe_kw = pd.read_csv("../data/all-fe-keywords.csv")
all_kw_list = list(all_fe_kw["ngram"])

### Look at some common FE related words

In [19]:
neighbours_over_time("flat", time_models)

2013-12-01 18:43:04
['round', 'globe', 'shape', 'evidence', 'this', 'theory', 'i', 'fe', 'believe', 'model', 'map', 'what']
2015-12-30 23:54:31
['round', 'globe', 'shape', 'fe', 'evidence', 'sphere', 'spherical', 'map', 'proof', 'believe', 'wrong', 'me']
2017-01-16 01:34:55
['round', 'globe', 'fe', 'shape', 'map', 'model', 'evidence', 'believe', 'real', 'there', 'proof', 'true']
2017-11-09 16:57:19
['round', 'globe', 'fe', 'shape', 'believe', 'map', 'spherical', 'model', 'evidence', 'conspiracy', 'real', 'theory']
2018-04-20 09:03:11
['round', 'globe', 'fe', 'model', 'map', 'spherical', 'shape', 'sphere', 'moon', 'theory', 'believe', 'evidence']
2018-10-07 05:55:37
['round', 'globe', 'fe', 'spherical', 'map', 'model', 'shape', 'sphere', 'believe', 'i', 'moon', 'wrong']


In [20]:
neighbours_over_time_comma_delimited("flat", time_models)

2013/12/01,round,globe,shape,evidence,this,theory
,i,fe,believe,model,map,what
2015/12/30,round,globe,shape,fe,evidence,sphere
,spherical,map,proof,believe,wrong,me
2017/01/16,round,globe,fe,shape,map,model
,evidence,believe,real,there,proof,true
2017/11/09,round,globe,fe,shape,believe,map
,spherical,model,evidence,conspiracy,real,theory
2018/04/20,round,globe,fe,model,map,spherical
,shape,sphere,moon,theory,believe,evidence
2018/10/07,round,globe,fe,spherical,map,model
,shape,sphere,believe,i,moon,wrong


In [21]:
neighbours_over_time("earth", time_models)

2013-12-01 18:43:04
['earthers', 'sun', 'horizon', 'model', 'world', 'map', 'surface', 'society', 'theory', 'pole', 'it', 'north']
2015-12-30 23:54:31
['earthers', 'horizon', 'sun', 'world', 'moon', 'map', 'earther', 'model', 'globe', 'surface', 'equator', 'pole']
2017-01-16 01:34:55
['world', 'earthers', 'horizon', 'map', 'sun', 'model', 'plane', 'moon', 'surface', 'earther', 'pole', 'theory']
2017-11-09 16:57:19
['earthers', 'world', 'sun', 'earther', 'horizon', 'moon', 'model', 'plane', 'map', 'surface', 'theory', 'fe']
2018-04-20 09:03:11
['sun', 'moon', 'surface', 'world', 'model', 'map', 'earthers', 'plane', 'horizon', 'light', 'sphere', 'earther']
2018-10-07 05:55:37
['surface', 'sun', 'plane', 'earthers', 'map', 'model', 'world', 'horizon', 'moon', 'earther', 'circle', 'maps']


In [22]:
neighbours_over_time("globe", time_models)

2013-12-01 18:43:04
['round', 'globe', 'shape', 'theory', 'model', 'evidence', 'map', 'sun', 'earth', 'surface', 'believe', 'curvature']
2015-12-30 23:54:31
['globe', 'round', 'map', 'shape', 'model', 'sphere', 'earth', 'evidence', 'fe', 'moon', 'proof', 'theory']
2017-01-16 01:34:55
['round', 'globe', 'map', 'model', 'shape', 'fe', 'surface', 'earth', 'sphere', 'system', 'plane', 'based']
2017-11-09 16:57:19
['round', 'globe', 'shape', 'fe', 'map', 'model', 'sphere', 'spherical', 'conspiracy', 'earth', 'believe', 'plane']
2018-04-20 09:03:11
['round', 'globe', 'model', 'map', 'spherical', 'sphere', 'surface', 'shape', 'moon', 'fe', 'plane', 'theory']
2018-10-07 05:55:37
['globe', 'round', 'map', 'spherical', 'model', 'sphere', 'surface', 'plane', 'shape', 'fe', 'based', 'projection']


In [23]:
neighbours_over_time("disc", time_models)

2013-12-01 18:43:04
['sun', 'flat', 'model', 'round', 'surface', 'horizon', 'globe', 'observer', 'map', 'distance', 'pole', 'north']
2015-12-30 23:54:31
['map', 'horizon', 'globe', 'sun', 'model', 'earthers', 'surface', 'earther', 'projection', 'object', 'moon', 'theory']
2017-01-16 01:34:55
['ice', 'map', 'model', 'surface', 'plane', 'wall', 'globe', 'horizon', 'object', 'force', 'flat', 'pole']
2017-11-09 16:57:19
['sun', 'miles', 'surface', 'horizon', 'equator', 'pole', 'north', 'moon', 'object', 'circle', 'plane', 'degrees']
2018-04-20 09:03:11
['surface', 'sun', 'plane', 'moon', 'sphere', 'light', 'distance', 'model', 'miles', '=', 'horizon', 'north']
2018-10-07 05:55:37
['sun', 'north', 'plane', 'pole', 'map', 'earth', 'circle', 'model', 'miles', 'equator', 'moon', 'south']


In [24]:
neighbours_over_time("ua", time_models)

2013-12-01 18:43:04
['theory', 'sun', 'evidence', 'no', 'matter', 'acceleration', 'earth', 'light', 'model', 'aether', 'effect', 'exist']
2015-12-30 23:54:31
['flat', 'theory', 'round', 'evidence', 'fe', 'globe', 'sun', 'model', 'gravitation', 'true', 'shape', 'science']
2017-01-16 01:34:55
['perspective', 'fe', 'acceleration', 'theory', 'evidence', 'model', 'fet', 'anything', 'an', 'flat', 'tom', 'round']
2017-11-09 16:57:19
['model', 'perspective', 'acceleration', 'flat', 'force', 'round', 'exist', 'theory', 'shape', 'ua', 'gravitation', 'map']
2018-04-20 09:03:11
['theory', 'flat', 'evidence', 'fe', 'effect', 'gravity', 'perspective', 'force', 'round', 'refraction', 'coriolis', 'matter']
2018-10-07 05:55:37
['force', 'evidence', 'model', 'effect', 'fe', 'ua', 'theory', 'flat', 'acceleration', 'true', 'light', 'what']


In [25]:
neighbours_over_time("ice", time_models)

2013-12-01 18:43:04
['earth', 'light', 'observer', 'north', 'surface', 'gravity', 'distance', 'moon', 'pole', 'object', '=', 'miles']
2015-12-30 23:54:31
['north', 'pole', 'wall', 'map', 'horizon', 'object', 'south', 'projection', 'earth', 'surface', 'celestial', 'equator']
2017-01-16 01:34:55
['wall', 'pole', 'north', 'south', 'force', 'be', 'map', 'acceleration', 'object', 'gravity', 'surface', 'plane']
2017-11-09 16:57:19
['wall', 'pole', 'has', 'south', 'object', 'there', 'north', 'force', 'light', 'shadow', 'gravity', '"']
2018-04-20 09:03:11
['flat', 'pole', 'miles', 'ice', 'map', 'round', 'earth', 'north', 'wall', 'surface', 'model', 'globe']
2018-10-07 05:55:37
['wall', "'", 'evidence', 'model', 'pole', 'theory', 'has', '=', 'body', 'problem', 'earth', 'no']


In [26]:
neighbours_over_time("wall", time_models)

2013-12-01 18:43:04
['light', 'earth', 'observer', 'distance', 'surface', 'north', 'gravity', 'pole', 'moon', 'miles', 'horizon', 'object']
2015-12-30 23:54:31
['map', 'wall', 'horizon', 'north', 'pole', 'globe', 'object', '"', 'projection', 'surface', 'distance', 'south']
2017-01-16 01:34:55
['wall', 'pole', 'north', 'south', 'horizon', 'force', 'map', 'object', 'gravity', 'be', 'no', 'plane']
2017-11-09 16:57:19
['ice', 'pole', 'no', '"', 'object', 'gravity', 'evidence', 'force', 'south', 'been', 'shadow', 'there']
2018-04-20 09:03:11
['surface', 'map', "'", 'plane', 'model', 'pole', 'globe', 'round', 'miles', 'earth', 'wall', 'sphere']
2018-10-07 05:55:37
['ice', "'", 'pole', 'evidence', 'there', 'no', 'has', 'model', 'flat', 'force', 'theory', 'effect']


### Look at some of the top keywords

In [27]:
for w in t100_kw_list[:10]:
    print(w)
    print("-----------------------------------")
    neighbours_over_time(w, time_models)
    print("-----------------------------------")

longitude
-----------------------------------
2013-12-01 18:43:04
['observer', '=', 'sun', '\x94', 'distance', 'north', 'light', 'horizon', 'degrees', 'during', 'south', 'sky']
2015-12-30 23:54:31
['north', 'km', 'south', '°', 'degrees', 'miles', 'pole', '=', 'east', 'equator', 'angle', '̂°']
2017-01-16 01:34:55
['line', 'straight', 'north', 'light', 'southern', 'distance', 'south', 'pole', 'northern', 'speed', 'between', 'east']
2017-11-09 16:57:19
['miles', 'km', '=', 'north', 'equator', 'south', 'between', 'line', 'distance', 'lines', 'angle', 'straight']
2018-04-20 09:03:11
['north', 'pole', 'degrees', 'between', '=', 'miles', 'distance', 'lines', 'equator', 'distances', 'east', 'latitude']
2018-10-07 05:55:37
['south', 'equator', 'between', 'latitude', 'pole', 'degrees', 'line', 'km', 'east', 'distances', 'distance', 'longitude']
-----------------------------------
circumference
-----------------------------------
2013-12-01 18:43:04
['observer', 'distance', 'miles', 'horizon', 'n

2018-10-07 05:55:37
['earth', 'map', 'plane', 'system', 'model', 'distances', 'surface', 'earthers', 'distance', 'circle', 'based', 'models']
-----------------------------------


### Look for the changiest words

In [28]:
def get_changiest_words_per_window(time_models, top_n=10000):
    out_dic = dict()
    windows = list(time_models.keys())
    for i in range(1, len(windows)):
        model_1 = time_models[windows[i-1]]
        model_2 = time_models[windows[i]]

        vocab_1, vectors_1 = get_top_vocab_and_vectors(model_1, top_n)
        vocab_2, vectors_2 = get_top_vocab_and_vectors(model_2, top_n)

        out_dic[windows[i]] = get_most_changey_words_with_vectors(vocab_1, vocab_2, vectors_1, vectors_2, k=1000)

    return out_dic

In [29]:
%%time
changiest_words_per_window = get_changiest_words_per_window(time_models, 5000)

Wall time: 1min 58s


In [30]:
merge_lists = lambda x: list(itertools.chain.from_iterable(x))
all_words = set(merge_lists([[cw[1] for cw in cws] for cws in changiest_words_per_window.values()]))

In [31]:
def get_words_in_all_windows(changiest_words_per_window):
    words_in_each_window = [set([cw[1] for cw in cws]) for cws in changiest_words_per_window.values()]
    words_in_all_windows = words_in_each_window[0].intersection(*words_in_each_window[1:])
    return words_in_all_windows

In [32]:
words_in_all_windows = get_words_in_all_windows(changiest_words_per_window)

In [33]:
for window, changey_words in changiest_words_per_window.items():
    print(window)
    t20_words = [f"{w[1]} {w[0]}" for w in changey_words[:20]]
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[:5]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[5:10]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[10:15]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[15:20]))
    print("-----------------------------")

2015-12-30 23:54:31
corrected 266        cancer 272           respect 294          inclined 305         universally 311     
f 326                philosophical 332    alt 336              peoples 343          pedantic 346        
eric 351             invent 351           qualified 356        settled 356          explore 357         
slightest 358        offers 359           eratosthenes 360     ludicrous 360        google 362          
-----------------------------
2017-01-16 01:34:55
notes 218            particularly 219     young 238            peoples 241          functions 261       
flatness 271         discern 281          ** 282               sattelites 296       junk 303            
3d 310               explaining 318       universally 322      seemingly 325        super 328           
representative 329   powers 331           dangerous 336        moderate 341         ham 342             
-----------------------------
2017-11-09 16:57:19
parallax 227         compute 279        

In [34]:
from language_change_methods.word_vector_change import print_changiest_over_time

print_changiest_over_time(changiest_words_per_window, time_models, min_freq=30, remove_func=False)

2015-12-30 23:54:31
corrected            cancer               respect              f                    alt                 
slightest            eratosthenes         google               expected             fully               
closed               arbitrary            experts              critical             special             
ultimately           sunsets              chinese              particularly         popular             
-----------------------------
2017-01-16 01:34:55
particularly         3d                   explaining           super                dangerous           
display              fully                giving               became               sunsets             
becoming             precisely            deeper               pilots               latter              
terrible             similarly            eratosthenes         agreed               heck                
-----------------------------
2017-11-09 16:57:19
parallax             compute            

In [35]:
neighbours_over_time("parallax", time_models)

2013-12-01 18:43:04
['sun', 'light', 'miles', 'observer', '\x94', 'north', 'distance', 'moon', 'degrees', 'solar', 'horizon', 'eclipse']
2015-12-30 23:54:31
['km', 'sun', '=', 'north', '°', 'miles', 'light', 'south', 'object', 'degrees', 'pole', 'hemisphere']
2017-01-16 01:34:55
['miles', 'light', 'speed', 'degrees', 'sun', 'due', 'feet', 'km', 'm', 'away', 'east', 'west']
2017-11-09 16:57:19
['tom', 'who', 'anyone', 'evidence', 'i', 'me', 'someone', 'rowbotham', 'been', 'thread', 'anything', 'what']
2018-04-20 09:03:11
['[', 'years', 'theory', ']', 'm', '|', '24', '10', 'been', 'days', 'miles', 'â\x80']
2018-10-07 05:55:37
['relativity', '=', 'acceleration', 'sagnac', 'force', 'gravity', 'body', 'coriolis', 'system', 'caused', 'three', 'based']


In [36]:
neighbours_over_time("corrected", time_models)

2013-12-01 18:43:04
["'m", 'am', 'post', 'thread', "'ll", 'have', 'want', 'had', 'read', 'i', 'question', 'here']
2015-12-30 23:54:31
['°', 'â°', '=', 'miles', '2016', 'degrees', 'â', 'years', '10', 'm', 'x', 'north']
2017-01-16 01:34:55
['m', 'miles', '1', '2', '/', 'x', 'km', '+', 'years', '%', 'hours', '*']
2017-11-09 16:57:19
['=', 'degrees', 'miles', 'years', 'km', '0.0', '+', 'south', 'feet', 'north', '10', 'm']
2018-04-20 09:03:11
['post', 'm', 'thread', 'years', '10', '[', '|', '*', ']', 'degrees', 'said', 'my']
2018-10-07 05:55:37
['been', 'miles', 'he', 'years', 'm', 'i', 'eye', 'wrong', 'able', 'sure', 'me', 'said']


In [37]:
neighbours_over_time("technically", time_models)

2013-12-01 18:43:04
["n't", "'m", 'sure', 'wrong', 'exist', 'talking', 'there', 'know', 'saying', ')', 'not', 'sun']
2015-12-30 23:54:31
['sun', 'appear', 'light', 'away', 'object', 'moon', 'distance', 'see', 'be', 'we', 'far', 'how']
2017-01-16 01:34:55
['lines', "'", 'light', 'north', 'gravity', 'ice', 'miles', 'acceleration', 'pole', 'straight', 'line', 'm']
2017-11-09 16:57:19
["'m", '0.0', 'does', '2', 'am', 'degrees', '+', 'been', 'miles', 'km', 'question', '1']
2018-04-20 09:03:11
["'m", 'are', "'re", 'been', 'saying', 'well', 'said', 'good', '!', 'he', 'was', 'sure']
2018-10-07 05:55:37
['m', 'miles', 'km', 'degrees', 'x', 'feet', 'pole', 'north', '>', 'eye', 'surface', 'south']


In [38]:
neighbours_over_time("particularly", time_models)

2013-12-01 18:43:04
['=', 'i', 'sure', 'am', "'re", '1', '[', 'do', ')', 'talking', ']', '2']
2015-12-30 23:54:31
['°', '=', 'km', 'â', '2016', 'south', 'north', '̂°', 'x', 'east', 'miles', 'size']
2017-01-16 01:34:55
["'ve", 'he', 'me', 'i', 'did', 'questions', 'answer', 'am', 'tom', 'please', 'question', 'post']
2017-11-09 16:57:19
['degrees', 'miles', 'km', '0.0', 'years', '+', 'm', 'my', 'thread', 'minutes', '10', 'topic']
2018-04-20 09:03:11
['who', "'m", 'me', 'thread', 'trying', 'wrong', 'evidence', 'nothing', 'question', 'sure', 'am', 'good']
2018-10-07 05:55:37
['me', 'science', 'my', 'sure', 'evidence', 'rowbotham', 'wrong', 'your', 'tom', 'who', 'i', 'does']


In [39]:
neighbours_over_time("infrared", time_models)

2013-12-01 18:43:04
['sun', 'miles', 'observer', '\x94', 'light', 'north', 'distance', '[', 'moon', 'degrees', 'during', 'km']
2015-12-30 23:54:31
['â°', 'km', '=', 'miles', 'degrees', '̂°', 'north', 'x', 'â', 'east', 'south', 'per']
2017-01-16 01:34:55
['flight', 'miles', 'hours', 'm', 'west', 'years', 'southern', 'flights', 'east', 'sydney', 'moved', 'times']
2017-11-09 16:57:19
['degrees', 'miles', 'km', '0.0', '+', 'm', 'per', '15', 'hours', 'less', 'minutes', 'feet']
2018-04-20 09:03:11
['|', 'into', 'back', 'ago', 'up', 'try', 'own', '[', 'let', '+', "'ve", 'launch']
2018-10-07 05:55:37
['m', 'miles', 'degrees', 'feet', 'km', 'east', 'west', 'eye', 'height', 'observer', 'x', 'object']


In [40]:
neighbours_over_time("standpoint", time_models)

2013-12-01 18:43:04
['i', 'question', 'make', 'thread', 'answer', 'claim', 'he', 'please', 'find', 'own', "'m", "'ve"]
2015-12-30 23:54:31
['â°', '=', 'km', 'miles', '̂°', 'â', 'x', '̂', 'degrees', 'm', '2016', '10']
2017-01-16 01:34:55
['=', 'been', 'moved', 'be', 'ice', 'theory', 'he', 'model', 'earth', 'earthers', 'flat', 'map']
2017-11-09 16:57:19
['miles', 'years', 'km', 'away', 'south', 'north', '=', 'been', 'hours', 'minutes', 'distance', 'pole']
2018-04-20 09:03:11
['miles', 'effect', 'degrees', 'sagnac', 'light', 'm', 'theory', 'horizon', 'earth', 'distance', 'model', 'eye']
2018-10-07 05:55:37
['map', 'earth', 'explanation', 'object', 'surface', 'theory', 'evidence', 'wiki', 'horizon', 'eye', 'question', 'm']


### Looking only at words in all windows

In [41]:
for window, changey_words in changiest_words_per_window.items():
    print(window)
    t20_words = [f"{w[1]} {w[0]}" for w in [x for x in changey_words if x[1] in words_in_all_windows][:20]]
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[:5]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[5:10]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[10:15]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[15:20]))
    print("-----------------------------")

2015-12-30 23:54:31
corrected 266        cancer 272           respect 294          inclined 305         universally 311     
f 326                invent 351           explore 357          slightest 358        eratosthenes 360    
google 362           active 368           > 372                expected 372         witnessed 373       
fully 379            freely 381           ] 383                seemingly 391        closed 392          
-----------------------------
2017-01-16 01:34:55
particularly 219     young 238            functions 261        flatness 271         3d 310              
explaining 318       universally 322      seemingly 325        super 328            powers 331          
dangerous 336        display 356          fully 356            endless 357          giving 357          
philosophy 357       became 361           active 362           directed 364         sunsets 369         
-----------------------------
2017-11-09 16:57:19
parallax 227         dropped 302        

In [42]:
print_changiest_over_time(changiest_words_per_window, time_models, word_list=words_in_all_windows, min_freq=30, remove_func=False)

2015-12-30 23:54:31
corrected            cancer               respect              f                    slightest           
eratosthenes         google               expected             fully                closed              
arbitrary            experts              critical             special              ultimately          
sunsets              chinese              particularly         popular              r                   
-----------------------------
2017-01-16 01:34:55
particularly         3d                   explaining           super                dangerous           
display              fully                giving               became               sunsets             
becoming             precisely            deeper               pilots               latter              
terrible             similarly            eratosthenes         agreed               standards           
-----------------------------
2017-11-09 16:57:19
parallax             powers             

In [43]:
neighbours_over_time("particularly", time_models)

2013-12-01 18:43:04
['=', 'i', 'sure', 'am', "'re", '1', '[', 'do', ')', 'talking', ']', '2']
2015-12-30 23:54:31
['°', '=', 'km', 'â', '2016', 'south', 'north', '̂°', 'x', 'east', 'miles', 'size']
2017-01-16 01:34:55
["'ve", 'he', 'me', 'i', 'did', 'questions', 'answer', 'am', 'tom', 'please', 'question', 'post']
2017-11-09 16:57:19
['degrees', 'miles', 'km', '0.0', 'years', '+', 'm', 'my', 'thread', 'minutes', '10', 'topic']
2018-04-20 09:03:11
['who', "'m", 'me', 'thread', 'trying', 'wrong', 'evidence', 'nothing', 'question', 'sure', 'am', 'good']
2018-10-07 05:55:37
['me', 'science', 'my', 'sure', 'evidence', 'rowbotham', 'wrong', 'your', 'tom', 'who', 'i', 'does']


In [44]:
neighbours_over_time("leg", time_models)

2013-12-01 18:43:04
['do', 'let', "n't", 'please', 'you', 'want', 'i', 'what', 'why', 'how', "'re", 'did']
2015-12-30 23:54:31
['â°', '°', 'miles', 'north', 'degrees', 'south', '=', 'pole', 'equator', 'east', 'horizon', '̂°']
2017-01-16 01:34:55
['=', 'degrees', 'hours', 'north', 'west', 'feet', 'east', 'south', 'm', 'angle', 'southern', 'away']
2017-11-09 16:57:19
['0.0', "'ll", 'did', 'moved', "'m", 'does', 'his', 'will', 'want', 'topic', "'d", 'my']
2018-04-20 09:03:11
['|', '[', '/', ']', 'm', '10', 'degrees', '*', '+', 'sagnac', '2', '(']
2018-10-07 05:55:37
['based', 'who', 'world', 'evidence', 'other', 'thread', 'post', 'been', 'claims', 'topic', 'wiki', 'rowbotham']


In [45]:
neighbours_over_time("insane", time_models)

2013-12-01 18:43:04
['i', 'he', 'earthers', '=', 'who', 'been', "'", 'people', '\x94', 'ghosts', 'earth', 'you']
2015-12-30 23:54:31
['your', 'his', 'an', 'science', 'me', 'their', 'scientific', 'evidence', 'someone', '2016', 'he', '°']
2017-01-16 01:34:55
['he', 'm', '1', 'tom', 'bishop', 'gravity', 'evidence', 'science', 'acceleration', 'nasa', 'x', 'been']
2017-11-09 16:57:19
['who', 'flat', 'believe', 'these', 'wrong', 'tom', 'know', 'proof', 'nasa', 'science', 'what', 'exist']
2018-04-20 09:03:11
["'re", "'m", 'thread', 'people', 'who', 'want', 'post', 'questions', 'own', 'please', 'topic', 'am']
2018-10-07 05:55:37
['mass', 'force', 'degrees', 'm', 'km', 'bodies', 'acceleration', 'miles', 'effect', 'body', 'north', 'has']


In [46]:
neighbours_over_time("unreasonable", time_models)

2013-12-01 18:43:04
['am', 'i', 'he', 'claim', 'sure', 'question', 'there', "'ve", 'me', 'my', 'evidence', 'answer']
2015-12-30 23:54:31
['°', '2016', 'km', 'â°', '[', 'â', 'x', ']', '%', '/', '1', 'years']
2017-01-16 01:34:55
['been', 'he', 'let', 'tom', 'me', 'bishop', 'who', 'i', 'please', 'trying', 'sure', 'answer']
2017-11-09 16:57:19
["'m", 'tom', 'evidence', 'you', 'i', 'wrong', 'questions', 'me', 'who', 'trying', 'they', 'please']
2018-04-20 09:03:11
["'m", '=', 'ca', 'do', 'did', 'am', 'me', 'sure', 'let', 'seems', 'tom', 'wo']
2018-10-07 05:55:37
['miles', 'm', '=', 'degrees', 'km', 'feet', 'x', 'per', '2', '+', 'hours', 'north']


In [47]:
neighbours_over_time("idiots", time_models)

2013-12-01 18:43:04
['[', ']', '\x94', "'", 'space', 'miles', '/', 'observer', '1', 'x', 'during', 'years']
2015-12-30 23:54:31
['they', 'those', 'these', 'earthers', 'who', "'m", 'you', 'their', 'many', 'scientists', 'nasa', 'science']
2017-01-16 01:34:55
["'re", 'who', 'are', 'years', 'been', "'m", 'questions', 'many', 'am', 'were', "'ve", 'topic']
2017-11-09 16:57:19
["'re", 'who', '=', 'wrong', 'am', "'", '0.0', 'trying', 'fake', 'saying', "'d", 'talking']
2018-04-20 09:03:11
['their', 'who', 'these', 'space', 'they', 'those', 'many', 'years', 'things', 'nasa', 'please', 'trying']
2018-10-07 05:55:37
['people', 'these', '=', 'are', 'years', 'who', 'talking', 'maps', '>', 'nasa', 'space', 'relativity']


### Looking at changiest FE Keywords

In [48]:
for window, changey_words in changiest_words_per_window.items():
    print(window)
    t20_words = [f"{w[1]} {w[0]}" for w in [x for x in changey_words if x[1] in all_kw_list][:20]]
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[:5]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[5:10]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[10:15]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[15:20]))
    print("-----------------------------")

2015-12-30 23:54:31
corrected 266        eric 351             eratosthenes 360     witnessed 373        freely 381          
sunsets 413          firstly 420          eers 423             r 423                physicists 425      
assumes 431          presenting 431       surveyors 432        pm 436               galileo 438         
confirmation 440     instruments 445      relativistic 445     cavendish 452        inconsistent 457    
-----------------------------
2017-01-16 01:34:55
flatness 271         explaining 318       sunsets 369          deeper 387           pilots 391          
blindly 393          rendering 409        foucault 413         barrier 419          demonstrating 424   
eratosthenes 428     continually 432      weigh 437            demonstrates 450     reproduce 450       
sufficiently 450     insufficient 451     observational 451    testable 451         â« 455             
-----------------------------
2017-11-09 16:57:19
parallax 227         compute 279        

In [49]:
print_changiest_over_time(changiest_words_per_window, time_models, word_list=all_kw_list)

2015-12-30 23:54:31
corrected            eric                 eratosthenes         witnessed            freely              
sunsets              firstly              eers                 r                    physicists          
assumes              presenting           surveyors            pm                   galileo             
confirmation         instruments          relativistic         cavendish            inconsistent        
-----------------------------
2017-01-16 01:34:55
flatness             explaining           sunsets              deeper               pilots              
blindly              rendering            foucault             barrier              demonstrating       
eratosthenes         continually          weigh                demonstrates         reproduce           
sufficiently         insufficient         observational        testable             â«                 
-----------------------------
2017-11-09 16:57:19
parallax             compute            

In [50]:
for window, changey_words in changiest_words_per_window.items():
    print(window)
    t20_words = [f"{w[1]} {w[0]}" for w in [x for x in changey_words if x[1] in t100_kw_list and x[1] in words_in_all_windows][:20]]
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[:5]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[5:10]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[10:15]))
    print("{:20} {:20} {:20} {:20} {:20}".format(*t20_words[15:20]))
    print("-----------------------------")

2015-12-30 23:54:31
eratosthenes 360     cavendish 452        magnification 489    curving 529          bedford 542         
vanishing 544        trigonometry 548     mercator 559         bi-polar 563         auckland 565        
diagrams 578         enag 578             azimuthal 580        gps 580              johannesburg 586    
equidistant 591      everest 593          santiago 593         ua 595               coriolis 596        
-----------------------------
2017-01-16 01:34:55
eratosthenes 428     diagrams 474         trigonometry 515     bedford 529          everest 538         
gps 550              mercator 573         cavendish 579        accelerator 580      balloons 605        
auckland 613         enag 616             bi-polar 620         rowbotham 625        coriolis 626        
refraction 641       magnification 644    johannesburg 651     ret 656              distances 658       
-----------------------------
2017-11-09 16:57:19
eratosthenes 572     trigonometry 590   

In [51]:
print_changiest_over_time(changiest_words_per_window, time_models, word_list=t100_kw_list, min_freq=50)

2015-12-30 23:54:31
surveyors            cavendish            geodetic             magnification        vanishing           
trigonometry         gleason              mercator             azimuthal            gps                 
johannesburg         equidistant          santiago             ua                   coriolis            
equatorial           ret                  longitude            sunset               rowbotham           
-----------------------------
2017-01-16 01:34:55
diagrams             gps                  qantas               unipolar             balloons            
lat                  rowbotham            coriolis             refraction           magnification       
ret                  distances            vanishing            santiago             longitude           
ua                   fe'ers               antarctic            acceleration         equinox             
-----------------------------
2017-11-09 16:57:19
trigonometry         refraction         

In [52]:
neighbours_over_time("gyroscope", time_models)

2013-12-01 18:43:04
['miles', '\x94', 'observer', 'north', 'during', 'sun', 'distance', 'degrees', 'solar', 'light', 'km', 'between']
2015-12-30 23:54:31
['°', 'â°', 'km', '2016', 'â', 'x', '̂°', 'miles', 'm', '[', '̂', '/']
2017-01-16 01:34:55
['round', 'has', "n't", 'fe', 'make', 'theory', 'explain', 'globe', 'map', 'not', 'system', 'based']
2017-11-09 16:57:19
['degrees', 'horizon', '=', 'light', 'object', 'moon', 'eye', 'miles', 'north', 'speed', 'km', 'observer']
2018-04-20 09:03:11
['light', 'moon', 'line', 'north', 'angle', 'south', 'surface', 'speed', 'horizon', 'observer', 'earth', 'straight']
2018-10-07 05:55:37
['m', 'years', '[', 'x', 'â\x80', 'per', 'he', ']', 'rowbotham', 'sagnac', '+', ':']


In [53]:
neighbours_over_time("infrared", time_models)

2013-12-01 18:43:04
['sun', 'miles', 'observer', '\x94', 'light', 'north', 'distance', '[', 'moon', 'degrees', 'during', 'km']
2015-12-30 23:54:31
['â°', 'km', '=', 'miles', 'degrees', '̂°', 'north', 'x', 'â', 'east', 'south', 'per']
2017-01-16 01:34:55
['flight', 'miles', 'hours', 'm', 'west', 'years', 'southern', 'flights', 'east', 'sydney', 'moved', 'times']
2017-11-09 16:57:19
['degrees', 'miles', 'km', '0.0', '+', 'm', 'per', '15', 'hours', 'less', 'minutes', 'feet']
2018-04-20 09:03:11
['|', 'into', 'back', 'ago', 'up', 'try', 'own', '[', 'let', '+', "'ve", 'launch']
2018-10-07 05:55:37
['m', 'miles', 'degrees', 'feet', 'km', 'east', 'west', 'eye', 'height', 'observer', 'x', 'object']


In [54]:
neighbours_over_time("cgi", time_models)

2013-12-01 18:43:04
['flat', 'claim', 'round', "n't", 'there', '"', 'theory', 'exist', 'what', 'anything', 'wrong', 'why']
2015-12-30 23:54:31
['they', 'who', 'nasa', 'flat', 'evidence', 'there', 'fake', 'many', 'believe', 'me', 'someone', 'science']
2017-01-16 01:34:55
['years', 'he', 'who', 'nasa', 'people', 'tom', 'many', 'ago', '=', 'questions', 'done', 'i']
2017-11-09 16:57:19
['flat', 'theory', 'model', 'gravity', 'globe', 'round', 'conspiracy', 'gravitation', 'force', 'map', "'", 'fe']
2018-04-20 09:03:11
['do', 'nasa', 'who', 'evidence', 'people', 'he', 'they', 'science', 'flat', 'sense', 'round', 'did']
2018-10-07 05:55:37
['space', 'nasa', 'he', 'who', 'fake', 'science', 'people', 'did', 'wrong', 'his', 'rowbotham', 'done']


In [55]:
for i, t in toks.items():
    if "â«" in t:
        print(i)

42961
47076
47192
47608
47627
47673
47876
48290
48330
48717
49640
49970
49971
50233
50300
50611
50711
50722
50923
51170
51600
51937
52084
52246
52461
52952
53345
53378
53922
53923
54411
54440
54449
55377
55461
55463
55777
56094
56173
56394
56423
56608
56640
56666
56695
56722
57078
57142
57174
57194
57439
57440
57549
57604
57743
57852
58171
58181
58574
58736
58763
58917
59145
59404
59428
60319
60358
60479
60506
60512
60860
60920
79514
88968
89245
92392
92682
100342
112372
