In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os
from datetime import datetime
import numpy as np
import json

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

In [2]:
import language_change_methods
from language_change_methods.features import function_words
from language_change_methods.utility_functions import get_data_windows, get_time_windows

In [46]:
import sys
sys.path.insert(1, "../")
sys.path.insert(1, "../utilities")

from settings import DB_FP

out_dir = "./Graphs"

In [4]:
dates_fp = "../resources/key-dates.csv"

key_dates = pd.read_csv(dates_fp, delimiter="\t")

convert_to_date = lambda x: datetime.strptime(x, "%d-%m-%Y")
key_dates["date"] = key_dates["date"].apply(convert_to_date)
key_dates.set_index("date", inplace=True)
key_dates = key_dates.sort_index(ascending=True)

In [5]:
sql_get_all_posts ="""
SELECT c.uid, m.name, m.PimsId, p.party, d.date, c.body, c.topic, c.section, s.tmay_deal, s.benn_act, s.ref_stance, s.constituency_leave, c.usas_file
FROM contributions as c
INNER JOIN members as m
ON m.PimsId = c.member
INNER JOIN debates as d
ON d.uid = c.debate
INNER JOIN member_party as p
ON p.PimsId = m.PimsId
INNER JOIN member_stances as s
ON s.PimsId = m.PimsId
WHERE (d.date BETWEEN date("2015-05-01") AND date("2019-09-10"))
AND (((d.date BETWEEN p.start AND p.end) AND NOT (p.end IS NULL))
OR ((d.date >= p.start) AND (p.end IS NULL)));""".strip()

# regex for identifying EU/brexit mentions
eu_regex = r'\b(EU|[Ee]uropean [Uu]nion|[Bb]rexit)\b'

In [6]:
%%time
import sqlite3

conn = sqlite3.connect(DB_FP)
curs = conn.cursor()

# Gets all the contributions and creates a nice dataframe
all_contributions = pd.read_sql_query(sql_get_all_posts, conn)
all_contributions.columns = ['uid', 'name', 'PimsId', 'party', 'date', 'text', 'topic', 'section', 'tmay_deal', 'benn_act', 'ref_stance', 'constituency_leave', 'usas_file']
all_contributions.set_index("uid", inplace=True)
convert_to_date = lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S")
all_contributions['date'] = all_contributions['date'].apply(convert_to_date)
all_contributions.sort_values("date", inplace=True)

Wall time: 14.4 s


In [8]:
%%time
from language_change_methods.utility_functions import clean_text, spacy_tokenise# spacy_pos
# from text_processing import ucrel_tokenise
import nltk
import regex as re    
import spacy

nlp = spacy.load('en_core_web_sm', parser=False, entity=False, matcher=False, add_vectors=False)

def tokenise(text):
    cleaned = clean_text(text)
    cleaned = re.sub(r"(\p{P})\p{P}*", r"\1 ", cleaned)
    tokens = spacy_tokenise(cleaned)
    return tokens

all_toks =  all_contributions["text"].apply(tokenise)

Wall time: 3min 1s


In [9]:
def get_top_vocab_and_vectors(model, n=10000):
    """
    Gets the top n words from the model's vocabulary and the vectors of these words.
    """
    top_vocab = sorted(model.wv.vocab.keys(), key=lambda x: model.wv.vocab[x].count, reverse=True)[:n]
    top_vectors = np.array([model.wv[t] for t in top_vocab])
    return top_vocab, top_vectors

In [10]:
def save_word_vectors(model, voc_fp, vec_fp):
    vocs = sorted(model.wv.vocab.keys(), key=lambda x: model.wv.vocab[x].count, reverse=True)[:10000]
    vecs = np.array([model.wv[t] for t in vocs])
    
    with open(voc_fp, 'w') as voc_file:
        json.dump(vocs, voc_file)
        
    np.save(vec_fp, vecs)

In [11]:
def check_dir(dir_name):
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)

In [12]:
from gensim.models import Word2Vec

# suppress some deprecation warning..
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [13]:
EMB_DIR = "./Models"
LOAD = False
# LOAD = os.path.exists('word_vectors/static/word_vectors.npy')

# Model on all contributions

In [14]:
%%time
curr_dir = os.path.join(EMB_DIR, "static")
check_dir(curr_dir)

if LOAD:
    print("Loading Model")
#     model = Word2Vec.load('word_vectors/w2v_all_contributions_static.npy')
    with open(os.path.join(curr_dir, 'vocab_all.json')) as voc_file:
        vocs = json.load(voc_file)
    vecs = np.load(os.path.join(curr_dir, 'word_vectors_all.npy'))
else:
    # training the model
    print("Training model")
    model = Word2Vec(all_toks, size=300)
    # model.save('word_vectors/w2v_all_contributions_static.bin')
    
    save_word_vectors(model, 
                      os.path.join(curr_dir, 'vocab_all.json'), 
                      os.path.join(curr_dir, 'word_vectors_all.npy'))

Training model
Wall time: 2min 13s


In [15]:
t10000 = sorted(model.wv.vocab.keys(), key=lambda x: model.wv.vocab[x].count, reverse=True)[:10000]
top_vectors = np.array([model.wv[t] for t in t10000])

In [16]:
print("Vocab size: {}".format(len(model.wv.vocab)))

Vocab size: 38465


In [17]:
def show_top_token_freq(model,topn):
    for w, v in sorted(list(model.wv.vocab.items()), key=lambda x:x[1], reverse=True)[:topn]:
        if topn<=20:
            print(f"{w:>10s} {v.count:5d}")
        else:
            print(f"{w}({v.count}), ", end="")
            
show_top_token_freq(model, 10)

           4796229
       the 2803248
         , 1860753
         . 1805540
        to 1448366
        of 1145774
      that 1130001
       and 1095262
        in 839869
         a 724597


In [18]:
def get_word_similarity(model, word, topn=10):
    if model.wv.__contains__(word):
        vecs = [f"{w}({v:.5f})" for w, v in model.wv.most_similar(word)]
        print(f"{word}:\n{vecs}")
    else:
        print(f"'{word}' not found in this model")
    print("--"*10)

words = ['brexit', 'eu', 'union', 'european', 'europe', 'remainers', 'trade']

for word in words:
     get_word_similarity(model,word)

brexit:
['mortem(0.49617)', 'mortems(0.48390)', 'exit(0.47262)', 'referendum(0.45965)', 'renegotiation(0.44157)', 'brexiteers(0.41249)', 'ttip(0.41093)', 'pre(0.39478)', 'negotiation(0.39092)', 'deal(0.38097)']
--------------------
eu:
['eea(0.71330)', 'euratom(0.68788)', 'euNUMBER(0.68492)', 'european(0.67392)', 'eurozone(0.59399)', 'cfp(0.59027)', 'euro(0.56000)', 'europe(0.55436)', 'echr(0.55253)', 'efta(0.52765)']
--------------------
union:
['superstate(0.57532)', 'pan-(0.55770)', 'union-(0.52502)', 'eu(0.50887)', 'convention(0.45355)', 'euratom(0.44337)', 'unionists(0.43717)', 'championship(0.43316)', 'jewry(0.42936)', 'unions(0.42411)']
--------------------
european:
['eu(0.67392)', 'soviet(0.59859)', 'customs(0.55448)', 'cinematograph(0.54138)', 'subscriptions(0.53913)', 'euratom(0.50186)', 'eea(0.49706)', 'reps(0.49241)', 'euNUMBER(0.48421)', 'europe(0.48292)']
--------------------
europe:
['world(0.60350)', 'continent(0.58500)', 'eu(0.55436)', 'balkans(0.54690)', 'globe(0.545

In [19]:
result = model.wv.most_similar(positive=['remain', 'brexiteer'], negative=['leave'])
for word, score in result:
    print(f"{word:>15s}: {score:.4f}")

       remainer: 0.4696
parliamentarian: 0.4682
  devolutionist: 0.4459
       advocate: 0.4218
           scot: 0.4196
      energetic: 0.4196
        staunch: 0.4147
     politician: 0.4118
     campaigner: 0.4091
    eurosceptic: 0.4071


# Methods for comparing models

In [27]:
def neighbors(query : str,
              embs: np.ndarray,
              vocab: list,
              K : int = 3) -> list:
    sims = np.dot(embs[vocab.index(query),],embs.T)
    output = []
    for sim_idx in sims.argsort()[::-1][1:(1+K)]:
        if sims[sim_idx] > 0:
            output.append(vocab[sim_idx])
    return output

In [28]:
def get_most_changey_words_with_models(model1, model2, n=100, k=1000, top_n=None):
    nn_scores = []
    
    top_vocab = sorted(model1.wv.vocab.keys(), key=lambda x: model1.wv.vocab[x].count, reverse=True)[:top_n]
    
    vocab1 = model1.wv.vocab
    vocab2 = model2.wv.vocab
    # Loop through all the words in the vocab
    for w in vocab1:
        if (w not in function_words 
                and w in vocab1 
                and w in vocab2 
                and vocab1[w].count > n 
                and vocab2[w].count > n 
                and w in top_vocab):
            neighbours1 = set([x[0] for x in model1.wv.most_similar(w, topn=k)])
            neighbours2 = set([x[0] for x in model2.wv.most_similar(w, topn=k)])
            nn_scores.append((len(neighbours1.intersection(neighbours2)), w))
            
    nn_scores_sorted = sorted(nn_scores)
    return nn_scores_sorted

In [29]:
def get_most_changey_words_with_vectors(vocab1, vocab2, vectors1, vectors2, n=20, k=1000):
    nn_scores = []
    # Loop through all the words in the vocab
    for w in vocab1:
        if w not in function_words and w in vocab1 and w in vocab2:
            neighbours1 = set(neighbors(w, vectors1, vocab1, k))
            neighbours2 = set(neighbors(w, vectors2, vocab2, k))
            nn_scores.append((len(neighbours1.intersection(neighbours2)), w))
            
    nn_scores_sorted = sorted(nn_scores)
    return nn_scores_sorted

### Get neighbours of keywords for chapter

In [30]:
queries = ["referendum", "brexit", "immigration", "leave"]
for query in queries:
#     print(query)
    print(",".join(neighbors(query, top_vectors, t10000, 10)))
#     print("")

election,vote,elections,brexit,leave,voted,votes,article,parliament,debate
referendum,exit,deal,vote,eu,austerity,negotiations,outcome,what,union
migration,asylum,fisheries,welfare,border,detention,sanctions,migrants,trade,borders
stay,remain,referendum,lose,leaving,left,exit,leaves,vote,go


# Labour vs Conservative

In [31]:
conservatives = all_contributions.query("party == 'Conservative'")
labour = all_contributions.query("party == 'Labour'")

### Compare with models

In [32]:
%%time

# Conservative model
con_model = Word2Vec(all_toks.loc[conservatives.index], size=300)

# Labour model
lab_model = Word2Vec(all_toks.loc[labour.index], size=300)

Wall time: 1min 50s


In [33]:
%%time
ranked_words_models = get_most_changey_words_with_models(con_model, lab_model, n=10, k=1000)

Wall time: 1min 30s


In [34]:
ranked_words_models[:20]

[(15, 'instantly'),
 (17, 'cogent'),
 (17, 'nudge'),
 (18, 'faraday'),
 (19, 'cse'),
 (21, 'bundle'),
 (21, 'ni'),
 (22, 'scales'),
 (23, 'abysmal'),
 (23, 'afloat'),
 (24, 'honours'),
 (25, 'lab'),
 (26, 'mustard'),
 (27, 'forensically'),
 (27, 'macroeconomic'),
 (27, 'thereafter'),
 (27, 'unskilled'),
 (29, 'costings'),
 (30, 'caseload'),
 (30, 'handover')]

### Compare with vectors

In [35]:
%%time
vocab_con, vectors_con = get_top_vocab_and_vectors(con_model)
vocab_lab, vectors_lab = get_top_vocab_and_vectors(lab_model)

Wall time: 63.5 ms


In [36]:
%%time
ranked_words_vectors = get_most_changey_words_with_vectors(vocab_con, vocab_lab, vectors_con, vectors_lab, k=1000)

Wall time: 1min 6s


In [37]:
ranked_words_vectors[:20]

[(152, 'mirrors'),
 (213, 'honours'),
 (221, 'presiding'),
 (254, 'inadvertently'),
 (256, 'thereafter'),
 (257, 'seemingly'),
 (260, 'harlow'),
 (265, 'redditch'),
 (267, 'moray'),
 (285, 'ideally'),
 (287, 'bypass'),
 (301, 'continually'),
 (305, 'promptly'),
 (311, 'deane'),
 (315, 'naturally'),
 (317, 'beforehand'),
 (321, 'speedily'),
 (323, 'alternatively'),
 (324, 'manual'),
 (329, 'sensibly')]

### Some Examples with Neighbours

In [38]:
min_freq = 50
check_freq = lambda w, m: m.wv.vocab[w].count > min_freq
queries = [w[1] for w in ranked_words_vectors if check_freq(w[1],con_model) and check_freq(w[1],lab_model)]
queries = queries[:10]
queries

['mirrors',
 'presiding',
 'inadvertently',
 'thereafter',
 'seemingly',
 'harlow',
 'ideally',
 'continually',
 'naturally',
 'alternatively']

In [39]:
for query in queries:
    print(query)
    print("Con:", neighbors(query, vectors_con, vocab_con, 8))
    print("Lab:", neighbors(query, vectors_lab, vocab_lab, 8))
    print("")

mirrors
Con: ['includes', 'under', 'allows', 'anti-', 'page', 'contains', 'covers', 'modern']
Lab: ['tory', 'words', 'side', 'lines', 'tories', 'parties', 'benches', 'rhetoric']

presiding
Con: ['police', 'senior', 'officer', 'presiding', 'director', 'crown', 'prison', 'assembly']
Lab: ['period', 'weekend', 'austerity', 'decades', 'presided', 'decade', 'presiding', 'half']

inadvertently
Con: ['harm', 'inadvertently', 'otherwise', 'damage', 'any', 'anything', 'unnecessary', 'anybody']
Lab: ['commons', 'remind', 'statement', 'assure', 'sir', 'update', 'lords', 'sides']

thereafter
Con: ['after', 'soon', 'recess', 'until', 'period', 'ahead', 'next', 'shortly']
Lab: ['after', 'post-', 'member', 'before', 'low-', 'allowance', 'result', 'during']

seemingly
Con: ['person', 'there', 'slightly', 'an', 'someone', 'woman', 'somebody', 'perfectly']
Lab: ['failure', 'uncertainty', 'brexit', 'financial', 'budget', 'scale', 'rhetoric', 'caused']

harlow
Con: ['st', 'town', 'college', 'constituency'

In [40]:
min_freq = 50
check_freq = lambda w, m: m.wv.vocab[w].count > min_freq
queries = [w[1] for w in ranked_words_models if check_freq(w[1],con_model) and check_freq(w[1],lab_model)]
queries = queries[:10]
queries

['thereafter',
 'mirrors',
 'presiding',
 'nationwide',
 'bogus',
 'naturally',
 'super-',
 'winners',
 'individually',
 'latter']

In [41]:
for query in queries:
    print(query)
    print("Con:", [x[0] for x in con_model.wv.most_similar(query, topn=8)])
    print("Lab:", [x[0] for x in lab_model.wv.most_similar(query, topn=8)])
    print("")

thereafter
Con: ['afterwards', 'recess', 'subsequent', 'commence', 'before', 'easter', 'purdah', 'sometime']
Lab: ['regularise', 'wanstead', 'ewell', 'aunts', 'unpicked', 'widowers', 'alabed', 'miserly']

mirrors
Con: ['criminalises', 'replicates', 'streamlines', 'amends', 'repeals', 'simplifies', 'codifies', 'devolves']
Lab: ['smoke', 'poisonous', 'ugly', 'vicious', 'insidious', 'nationalism', 'shirkers', 'clinging']

presiding
Con: ['commanding', 'warranted', 'certification', 'chief', 'pcc', 'pored', 'police', 'presided']
Lab: ['presided', 'preside', 'glossed', 'roughshod', 'presides', 'hangs', 'pored', 'hanging']

nationwide
Con: ['comprising', 'segment', 'vertical', 'hyper-', 'rNUMBER', 'satellite', 'specification', 'geographic']
Lab: ['unfilled', 'homicides', 'rotherham', 'comprises', 'apartment', 'haringey', 'micrograms', 'house-']

bogus
Con: ['unregistered', 'crowdfunding', 'converter', 'overcomplicated', 'sexualised', 'abusive', 'biosimilars', 'exhausting']
Lab: ['congratulato

# EU vs Non-EU

In [54]:
from importlib import reload
import helper_functions
reload(helper_functions)

<module 'helper_functions' from '../utilities\\helper_functions.py'>

In [56]:
%%time
from helper_functions import split_corpus

eu_mentions, non_eu_mentions = split_corpus(all_contributions, "eu")

Wall time: 27 s


In [57]:
%%time

# EU model
eu_model = Word2Vec(all_toks.loc[eu_mentions.index], size=300)

# Non-EU model
neu_model = Word2Vec(all_toks.loc[non_eu_mentions.index], size=300)

Wall time: 2min 5s


### With Models

In [58]:
%%time
eu_ranked_words_models = get_most_changey_words_with_models(eu_model, neu_model, n=10, k=1000)

Wall time: 1min 19s


In [59]:
eu_ranked_words_models[:20]

[(4, 'sterling'),
 (7, 'erect'),
 (8, 'ord'),
 (10, 'suing'),
 (11, 'ceta'),
 (11, 'rigidity'),
 (12, 'sifting'),
 (13, 'otiose'),
 (13, 'patriot'),
 (13, 'tra'),
 (14, 'entrenchment'),
 (14, 'exits'),
 (15, 'quangos'),
 (15, 'renegotiating'),
 (16, 'decree'),
 (16, 'exiting'),
 (16, 'lent'),
 (16, 'modifying'),
 (17, 'genetically'),
 (17, 'gras')]

### With Vectors

In [60]:
%%time
vocab_eu, vectors_eu = get_top_vocab_and_vectors(eu_model)
vocab_neu, vectors_neu = get_top_vocab_and_vectors(neu_model)

Wall time: 57.5 ms


In [61]:
%%time
eu_ranked_words_vectors = get_most_changey_words_with_vectors(vocab_eu, vocab_neu, vectors_eu, vectors_neu, k=1000)

Wall time: 1min 4s


In [62]:
eu_ranked_words_vectors[:20]

[(97, 'ii'),
 (140, 'master'),
 (149, 'seemingly'),
 (165, 'sterling'),
 (185, 'honours'),
 (194, 'prisoner'),
 (203, 'correcting'),
 (204, 'terminal'),
 (214, 'contracting'),
 (219, 'remotely'),
 (229, 'conversion'),
 (229, 'gb'),
 (234, 'staying'),
 (235, 'alternatively'),
 (235, 'bypass'),
 (240, 'hunting'),
 (240, 'leaving'),
 (242, 'leavers'),
 (244, 'feeding'),
 (244, 'revolutionary')]

### Some Examples

In [63]:
min_freq = 50
check_freq = lambda w, m: m.wv.vocab[w].count > min_freq
queries = [w[1] for w in eu_ranked_words_vectors if check_freq(w[1],eu_model) and check_freq(w[1],neu_model)]
queries = queries[:10]
queries

['ii',
 'seemingly',
 'sterling',
 'honours',
 'prisoner',
 'correcting',
 'contracting',
 'remotely',
 'conversion',
 'staying']

In [64]:
for query in queries:
    print(query)
    print("EU :", neighbors(query, vectors_eu, vocab_eu, 8))
    print("NEU:", neighbors(query, vectors_neu, vocab_neu, 8))
    print("")

ii
EU : ['arrest', 'access', '(', 'criminal', 'regulations', 'regulatory', 'court', 'regulation']
NEU: ['st', 'c', 'george', 'elizabeth', 'james', 'b', 'insert', 'king']

seemingly
EU : ['(', 'an', 'green', 'no-', 'digital', 'page', 'climate', 'sea']
NEU: ['completely', 'totally', 'caused', 'wholly', 'austerity', 'somewhat', 'causing', 'delay']

sterling
EU : ['NUMBER%', 'increase', 'pound', 'unemployment', 'reduction', 'exports', 'growth', 'poverty']
NEU: ['sir', 'tribute', 'fantastic', 'tireless', 'pensions', 'predecessor', 'excellent', 'hard']

honours
EU : ['manifesto', 'delivers', 'protects', 'gave', 'deliver', 'result', 'honour', 'allows']
NEU: ['NUMBERth', 'smith', 'justice', 'royal', 'statute', 'member', 'pension', 'lord']

prisoner
EU : ['treaty', 'directive', 'data', 'withdrawal', 'regulations', 'treaties', 'reciprocal', 'existing']
NEU: ['officer', 'prison', 'someone', 'woman', 'child', 'prisoners', 'prisoner', 'sentence']

correcting
EU : ['delegated', 'clause', 'giving', '

In [65]:
min_freq = 50
check_freq = lambda w, m: m.wv.vocab[w].count > min_freq
queries = [w[1] for w in eu_ranked_words_models if check_freq(w[1],eu_model) and check_freq(w[1],neu_model)]
queries = queries[:10]
queries

['sterling',
 'honours',
 'bypass',
 'seemingly',
 'contracting',
 'prisoner',
 'persuading',
 'chip',
 'super-',
 'temporarily']

In [66]:
for query in queries:
    print(query)
    print("Con:", [x[0] for x in eu_model.wv.most_similar(query, topn=8)])
    print("Lab:", [x[0] for x in neu_model.wv.most_similar(query, topn=8)])
    print("")

sterling
Con: ['depreciation', 'pound', 'inflation', 'devaluation', 'decline', 'decrease', 'projected', 'earnings']
Lab: ['tireless', 'heroic', 'jamieson', 'ethic', 'steen', 'tremendous', 'natzler', 'preparatory']

honours
Con: ['honouring', 'protects', 'delivers', 'maintains', 'secures', 'fulfils', 'retains', 'undermines']
Lab: ['medal', 'wilfred', 'magna', 'nobel', 'stipulation', 'carta', 'yellow', 'featuring']

bypass
Con: ['initiate', 'block', 'organise', 'facilitate', 'undergo', 'assist', 'accommodate', 'allocate']
Lab: ['aNUMBER', 'redevelopment', 'tunnel', 'mNUMBER', 'middlewich', 'dualling', 'parkway', 'ely']

seemingly
Con: ['commercialisation', 'lasers', 'steels', 'strewn', 'collars', 'caving', 'mossack', 'deteriorating']
Lab: ['utterly', 'patently', 'inherently', 'misplaced', 'somewhat', 'bleeding', 'wholly', 'discredited']

contracting
Con: ['third-', 'transcends', 'communist', 'republican', 'socialist', 'sdlp', 'unionist', 'governing']
Lab: ['bail-', 'phasing', 'clapped-',

# Remainers vs Leavers

In [67]:
remain = all_contributions.query("ref_stance == 'remain'")
leave = all_contributions.query("ref_stance == 'leave'")

In [68]:
%%time

# remain model
rem_model = Word2Vec(all_toks.loc[remain.index], size=300)

# leave model
lea_model = Word2Vec(all_toks.loc[leave.index], size=300)

Wall time: 1min 57s


In [69]:
%%time
vocab_rem, vectors_rem = get_top_vocab_and_vectors(rem_model)
vocab_lea, vectors_lea = get_top_vocab_and_vectors(lea_model)

Wall time: 58 ms


In [70]:
%%time
rem_lea_ranked_words_vectors = get_most_changey_words_with_vectors(vocab_rem, vocab_lea, vectors_rem, vectors_lea, k=1000)

Wall time: 1min 6s


In [71]:
rem_lea_ranked_words_vectors[:20]

[(163, 'duck'),
 (216, 'gamble'),
 (219, 'rotten'),
 (226, 'shareholder'),
 (239, 'chaotic'),
 (252, 'mirrors'),
 (260, 'listing'),
 (261, 'dodgy'),
 (268, 'wash'),
 (274, 'dysfunctional'),
 (275, 'ii'),
 (279, 'tip'),
 (285, 'anymore'),
 (290, 'reversing'),
 (296, 'hypocrisy'),
 (298, 'nowadays'),
 (299, 'supposedly'),
 (302, 'bogus'),
 (302, 'comprehensively'),
 (302, 'scores')]

In [72]:
min_freq = 50
check_freq = lambda w, m: m.wv.vocab[w].count > min_freq
queries = [w[1] for w in rem_lea_ranked_words_vectors if check_freq(w[1],rem_model) and check_freq(w[1],lea_model)]
queries = queries[:10]
queries

['ii',
 'supposedly',
 'seemingly',
 'similarly',
 'hemel',
 'eastleigh',
 'fareham',
 'hyde',
 'plots',
 'unequivocally']

In [73]:
for query in queries:
    print(query)
    print("Remain:", [x[0] for x in eu_model.wv.most_similar(query, topn=8)])
    print("Leaver:", [x[0] for x in neu_model.wv.most_similar(query, topn=8)])
    print("")

ii
Remain: ['ecris', 'database', 'prüm', 'ec', 'europol', 'eurojust', 'records', 'warrant']
Leaver: ['vi', 'g', 'afriyie', 'k', 'f', 'v', 'borwick', 'h']

supposedly
Remain: ['roundly', 'allegedly', 'murderous', 'implying', 'imprisoned', 'overboard', 'impermissible', 'dismantled']
Leaver: ['blatantly', 'abusing', 'sued', 'traded', 'inherently', 'corrupt', 'blockaded', 'essentially']

seemingly
Remain: ['commercialisation', 'lasers', 'steels', 'strewn', 'collars', 'caving', 'mossack', 'deteriorating']
Leaver: ['utterly', 'patently', 'inherently', 'misplaced', 'somewhat', 'bleeding', 'wholly', 'discredited']

similarly
Remain: ['fourthly', 'furthermore', 'moreover', 'thirdly', 'secondly', 'lastly', 'meanwhile', 'crucially']
Leaver: ['furthermore', 'moreover', 'secondly', 'likewise', 'thirdly', 'fourthly', 'lastly', 'consequently']

hemel
Remain: ['hempstead', 'warley', 'frome', 'wyre', 'chorley', 'harrogate', 'maldon', 'wentworth']
Leaver: ['hempstead', 'rutland', 'barking', 'brentwood'

# Over Time

In [74]:
%%time
time_models = dict()
# Train a language model for various different portions of the forum.
for w, w_posts in get_time_windows(all_contributions, 365, 365, time_column="date"):
    time_models[w] = Word2Vec(all_toks.loc[w_posts.index], size=300)

Wall time: 2min 7s


In [75]:
def neighbours_over_time(search_term, time_models, top_n=10000):
    for window, curr_model in time_models.items():
        curr_vocab, curr_vectors = get_top_vocab_and_vectors(curr_model, top_n)
        print(window)
        if search_term in curr_vocab:
            print(neighbors(search_term, curr_vectors, curr_vocab, 12))

In [76]:
neighbours_over_time("leave", time_models)

2015-05-18 00:00:00
['stay', 'vote', 'go', 'lose', 'get', 'come', 'take', 'be', 'remain', 'tell', 'move', 'give']
2016-05-17 00:00:00
['referendum', 'remain', 'brexit', 'leaving', 'stay', 'left', 'lose', 'get', 'go', 'exit', 'voted', 'vote']
2017-05-17 00:00:00
['referendum', 'exit', 'leaving', 'vote', 'left', 'remain', 'stay', 'lose', 'voted', 'go', 'get', 'look']
2018-05-17 00:00:00
['referendum', 'remain', 'leaving', 'vote', 'stay', 'get', 'left', 'exit', 'go', 'deal', 'lose', 'come']


In [77]:
neighbours_over_time("single", time_models)

2015-05-18 00:00:00
['every', 'one', 'union', 'third', 'largest', 'two-', 'free', 'an', 'another', 'parent', 'each', 'any']
2016-05-17 00:00:00
['eu', 'union', 'european', 'free', 'common', 'labour', 'market', 'customs', 'one', 'leave', 'every', 'europe']
2017-05-17 00:00:00
['customs', 'union', 'labour', 'free', 'eu', 'euratom', 'common', 'eea', 'leave', 'one', 'internal', 'every']
2018-05-17 00:00:00
['common', 'customs', 'every', 'union', 'eu', 'labour', 'one', 'european', 'free', 'each', 'biggest', 'any']


### Get examples for chapter

In [78]:
def neighbours_over_time_comma_delimited(query, time_models, top_n=10000):
    for window, curr_model in time_models.items():
        curr_vocab, curr_vectors = get_top_vocab_and_vectors(curr_model, top_n)
        if query in curr_vocab:
            print(window.strftime("%y/%m/%d"), end=",")
            print(",".join(neighbors(query, curr_vectors, curr_vocab, 6)))
        else:
            print(window)

In [79]:
for query in ["brexit", "referendum", "immigration", "single"]:
    print(query)
    neighbours_over_time_comma_delimited(query, time_models)
    print()

brexit
15/05/18,election,leave,vote,eu,trade,european
16/05/17,referendum,eu,negotiations,leave,vote,trade
17/05/17,exit,eu,referendum,trade,deal,state
18/05/17,deal,referendum,vote,backstop,union,state

referendum
15/05/18,election,vote,debate,elections,parliament,consultation
16/05/17,election,vote,leave,brexit,debate,voted
17/05/17,election,vote,leave,brexit,voted,negotiations
18/05/17,vote,election,leave,brexit,voted,article

immigration
15/05/18,criminal,welfare,asylum,migration,legal,justice
16/05/17,foreign,eu,brexit,prime,tax,migration
17/05/17,trade,tax,eu,regulatory,legal,current
18/05/17,trade,justice,fisheries,migration,system,eu

single
15/05/18,every,one,union,third,largest,two-
16/05/17,eu,union,european,free,common,labour
17/05/17,customs,union,labour,free,eu,euratom
18/05/17,common,customs,every,union,eu,labour



### Changiest words

In [80]:
def get_changiest_words_per_window(time_models, top_n=10000, k=1000):
    out_dic = dict()
    windows = list(time_models.keys())
    for i in range(1, len(windows)):
        model_1 = time_models[windows[i-1]]
        model_2 = time_models[windows[i]]

        vocab_1, vectors_1 = get_top_vocab_and_vectors(model_1, top_n)
        vocab_2, vectors_2 = get_top_vocab_and_vectors(model_2, top_n)

        out_dic[windows[i]] = get_most_changey_words_with_vectors(vocab_1, vocab_2, vectors_1, vectors_2, k=k)

    return out_dic

In [81]:
%%time
changiest_words_per_window = get_changiest_words_per_window(time_models, 5000)

Wall time: 1min 19s


In [82]:
def print_changiest_over_time(changiest_words_per_window, min_freq=0):
    for window, changey_words in changiest_words_per_window.items():
        check_freq = lambda w, m: m.wv.vocab[w].count > min_freq
        queries = [w[1] for w in changey_words if check_freq(w[1],model)]
        queries = queries[:20]

        print(window)
#         t20_words = [f"{w[1]} {w[0]}" for w in changey_words[:20]]
        print("{:20} {:20} {:20} {:20} {:20}".format(*queries[:5]))
        print("{:20} {:20} {:20} {:20} {:20}".format(*queries[5:10]))
        print("{:20} {:20} {:20} {:20} {:20}".format(*queries[10:15]))
        print("{:20} {:20} {:20} {:20} {:20}".format(*queries[15:20]))
        print("-----------------------------")

In [83]:
print_changiest_over_time(changiest_words_per_window, 100)

2016-05-17 00:00:00
google               customs              dog                  strikes              bomb                
smith                managing             brexit               e-                   wilson              
haven                plain                exit                 style                initially           
bombing              exclusively          wing                 rbs                  sunday              
-----------------------------
2017-05-17 00:00:00
retained             selection            osborne              salisbury            tower               
no-                  bbc                  basically            noise                principal           
latter               wear                 super-               shipley              donations           
text                 leigh                namely               junior               similarly           
-----------------------------
2018-05-17 00:00:00
offensive            overnight          

In [84]:
neighbours_over_time("customs", time_models)

2015-05-18 00:00:00
['tax', 'customs', 'bank', 'revenue', 'trade', 'allowance', 'government', 'border', 'gas', '(', 'european', 'pensions']
2016-05-17 00:00:00
['customs', 'trade', 'eu', 'credit', 'europe', 'movement', 'market', 'national', 'single', 'brexit', 'membership', 'common']
2017-05-17 00:00:00
['european', 'trade', 'eu', 'euratom', 'market', 'agreement', 'border', 'brexit', 'eea', 'single', 'regulatory', 'withdrawal']
2018-05-17 00:00:00
['european', 'backstop', 'trade', 'deal', 'eu', 'agreement', 'border', 'market', 'arrangement', 'relationship', 'regulatory', 'treaty']


In [85]:
neighbours_over_time("brexit", time_models)

2015-05-18 00:00:00
['election', 'leave', 'vote', 'eu', 'trade', 'european', 'union', 'paris', 'stay', 'he', 'bad', 'membership']
2016-05-17 00:00:00
['referendum', 'eu', 'negotiations', 'leave', 'vote', 'trade', 'exit', 'uncertainty', 'article', 'european', 'what', 'state']
2017-05-17 00:00:00
['exit', 'eu', 'referendum', 'trade', 'deal', 'state', 'economic', 'customs', 'negotiations', 'union', 'border', 'transition']
2018-05-17 00:00:00
['deal', 'referendum', 'vote', 'backstop', 'union', 'state', 'trade', 'austerity', 'what', 'leave', 'exit', 'prime']


In [86]:
neighbours_over_time("zero", time_models)

2015-05-18 00:00:00
['NUMBER%', 'rate', 'emissions', 'lower', 'gdp', 'zero', 'low', 'deficit', 'per', 'higher', 'reduction', 'price']
2016-05-17 00:00:00
['rate', 'lower', 'income', 'low', '£', 'higher', 'average', 'rates', 'prices', 'increased', 'gdp', 'increase']
2017-05-17 00:00:00
['homes', 'average', 'rate', '£', 'zero', 'lower', 'tariffs', 'per', 'income', 'rates', 'gdp', 'growth']
2018-05-17 00:00:00
['carbon', 'emissions', 'NUMBER%', 'net', '£', 'tariffs', 'reduce', 'rate', 'gdp', 'tariff', 'reduction', 'growth']


In [87]:
neighbours_over_time("tower", time_models)

2015-05-18 00:00:00
['st', 'constituency', 'city', 'station', 'street', 'college', 'manchester', 'cities', 'borough', 'county', 'university', 'towns']
2016-05-17 00:00:00
['st', 'hospital', 'town', 'borough', 'constituency', 'royal', 'county', 'park', 'college', 'city', 'east', 'street']
2017-05-17 00:00:00
['grenfell', 'fire', 'blocks', 'homes', 'cladding', 'accommodation', 'residents', 'tragedy', 'hospital', 'buildings', 'happened', 'living']
2018-05-17 00:00:00
['tower', 'killed', 'fire', 'london', 'visited', 'constituency', 'station', 'died', 'grenfell', 'st', 'street', 'streets']


In [88]:
neighbours_over_time("exit", time_models)

2015-05-18 00:00:00
['benefit', 'pension', 'income', 'impact', 'payment', 'overseas', 'insurance', 'licence', 'compensation', 'eu', 'annual', 'exemption']
2016-05-17 00:00:00
['brexit', 'negotiations', 'article', 'membership', 'referendum', 'leave', 'eu', 'agreement', 'withdrawal', 'trade', 'leaving', 'departure']
2017-05-17 00:00:00
['withdrawal', 'brexit', 'eu', 'law', 'leave', 'negotiations', 'agreement', 'implementation', 'referendum', 'statute', 'legislation', 'transition']
2018-05-17 00:00:00
['article', 'leave', 'withdrawal', 'extension', 'eu', 'leaving', 'implementation', 'brexit', 'backstop', 'departure', 'negotiations', 'deal']


In [89]:
neighbours_over_time("no-", time_models)

2015-05-18 00:00:00
['to-', 'nuclear', 'isil', 'co-', 'high-', 'non-', 'no-', 'anti-', 'assad', 'two-', 'air', 'without']
2016-05-17 00:00:00
['nuclear', 'property', 'non-', 'border', 'car', 'power', 'offence', 'zone', 'or', 'global', 'on-', 'solution']
2017-05-17 00:00:00
['with', 'bad', 'great', 'brexit', 'transitional', 'transition', 'no', 'trade', 'customs', 'cliff', 'without', 'post-']
2018-05-17 00:00:00
['no', 'without', 'scenario', 'negotiated', 'vote', 'bad', 'exit', 'trade', 'post-', 'great', 'brexit', 'any']


In [90]:
neighbours_over_time("permanent", time_models)

2015-05-18 00:00:00
['state', 'home', 'permanent', 'cabinet', 'chief', 'defence', 'financial', 'health', 'private', 'justice', 'pensions', 'former']
2016-05-17 00:00:00
['home', 'state', 'permanent', 'cabinet', 'defence', 'chief', 'justice', 'financial', 'un', 'states', 'private', 'former']
2017-05-17 00:00:00
['home', 'state', 'foreign', 'cabinet', 'environment', 'former', 'homes', 'accommodation', 'financial', 'private', 'housing', 'temporary']
2018-05-17 00:00:00
['permanent', 'foreign', 'home', 'customs', 'cabinet', 'general', 'transport', 'chief', 'defence', 'trade', 'environment', 'brexit']


In [91]:
for query in ["brexit", "customs", "strike", "tower", "salisbury", "no-"]:
    print(query)
    neighbours_over_time_comma_delimited(query, time_models)
    print()

brexit
15/05/18,election,leave,vote,eu,trade,european
16/05/17,referendum,eu,negotiations,leave,vote,trade
17/05/17,exit,eu,referendum,trade,deal,state
18/05/17,deal,referendum,vote,backstop,union,state

customs
15/05/18,tax,customs,bank,revenue,trade,allowance
16/05/17,customs,trade,eu,credit,europe,movement
17/05/17,european,trade,eu,euratom,market,agreement
18/05/17,european,backstop,trade,deal,eu,agreement

strike
15/05/18,take,taken,industrial,buy,action,military
16/05/17,strike,get,carry,thing,war,taken
17/05/17,get,bring,strike,carry,put,give
18/05/17,reach,get,negotiate,be,move,strike

tower
15/05/18,st,constituency,city,station,street,college
16/05/17,st,hospital,town,borough,constituency,royal
17/05/17,grenfell,fire,blocks,homes,cladding,accommodation
18/05/17,tower,killed,fire,london,visited,constituency

salisbury
15/05/18,st,royal,john,tribute,sir,james
16/05/17,hon,south,states,east,north,west
17/05/17,attack,syria,yemen,terrorist,war,grenfell
18/05/17,killed,war,attacks,

### Changiest Words Conservative vs Labour

In [92]:
%%time
con_time_models = dict()
lab_time_models = dict()
# Train a language model for various different portions of the forum.
for w, w_posts in get_time_windows(all_contributions, 365, 365, time_column="date"):
    curr_con = w_posts[w_posts.index.isin(conservatives.index)].index
    curr_lab = w_posts[w_posts.index.isin(labour.index)].index
    
    con_time_models[w] = Word2Vec(all_toks.loc[curr_con], size=300)
    lab_time_models[w] = Word2Vec(all_toks.loc[curr_lab], size=300)

Wall time: 1min 54s


In [93]:
%%time
con_changiest_words_per_window = get_changiest_words_per_window(con_time_models, 5000)

Wall time: 1min 19s


In [94]:
print_changiest_over_time(con_changiest_words_per_window, 100)

2016-05-17 00:00:00
dual                 customs              google               trusted              reflecting          
selection            english              blue                 red                  naturally           
strikes              exit                 settled              cheap                precious            
campaigners          similarly            pause                closures             indian              
-----------------------------
2017-05-17 00:00:00
selection            thereafter           retained             henry                style               
text                 depth                similarly            shock                hopes               
naturally            semitism             scenario             salisbury            banning             
advertising          apparent             radio                largely              permanent           
-----------------------------
2018-05-17 00:00:00
e-                   calm               

In [95]:
def compare_neighours(query):
    print("Conservative")
    neighbours_over_time(query, con_time_models)
    print("\nLabour")
    neighbours_over_time(query, lab_time_models)
    
def compare_neighours_comma_delimited(query):
    print("Conservative")
    neighbours_over_time_comma_delimited(query, con_time_models)
    print("\nLabour")
    neighbours_over_time_comma_delimited(query, lab_time_models)

In [96]:
compare_neighours_comma_delimited("chaos")

Conservative
15/05/18,daesh,syria,isil,conflict,labour,threat
16/05/17,war,person,problem,(,party,insert
17/05/17,court,legislation,agreement,arrangements,circumstances,weapons
18/05/17,border,brexit,risk,backstop,referendum,union

Labour
15/05/18,review,spending,budget,poverty,past,recent
16/05/17,crisis,year,world,investment,economy,nhs
17/05/17,NUMBER%,problem,country,law,government,legislation
18/05/17,austerity,deal,crisis,brexit,years,credit


In [97]:
compare_neighours_comma_delimited("no-")

Conservative
15/05/18,isil,long-,of-,to-,illegal,into
16/05/17,&,between,car,mobile,nuclear,parking
17/05/17,without,no,with,customs,free,trade
18/05/17,no,without,brexit,negotiated,eu,scenario

Labour
15/05/18,into,high-,to-,between,or,against
16/05/17,long-,year-,free,&,between,low-
17/05/17,long-,with,no,transitional,deal,between
18/05/17,no,brexit,trade,customs,without,any


In [98]:
compare_neighours_comma_delimited("customs")

Conservative
15/05/18,s,trade,taxpayers,european,billion,gas
16/05/17,eu,customs,trade,europe,international,market
17/05/17,european,eu,trade,agreement,euratom,market
18/05/17,european,eu,backstop,trade,agreement,deal

Labour
15/05/18,’,energy,majesty,chief,allowance,department
16/05/17,eu,trade,customs,union,market,single
17/05/17,european,trade,eu,agreement,market,single
18/05/17,european,trade,eu,deal,agreement,market


In [99]:
compare_neighours_comma_delimited("exit")

Conservative
15/05/18,£,income,rate,NUMBER%,exit,insurance
16/05/17,eu,negotiations,european,trade,brexit,leave
17/05/17,eu,withdrawal,leave,law,negotiations,legislation
18/05/17,eu,leave,period,leaving,agreement,deal

Labour
15/05/18,income,impact,increase,eu,tax,NUMBER%
16/05/17,european,trade,union,brexit,market,agreement
17/05/17,eu,agreement,period,brexit,european,leave
18/05/17,eu,withdrawal,article,trade,customs,exit


In [100]:
%%time
lab_changiest_words_per_window = get_changiest_words_per_window(lab_time_models, 5000)

Wall time: 1min 17s


In [101]:
print_changiest_over_time(lab_changiest_words_per_window, 100)

2016-05-17 00:00:00
mixed                right-               bold                 bearing              bombing             
empty                promising            scenario             irresponsible        firmly              
presumably           tremendous           airstrikes           actively             managing            
ill-                 fashion              map                  facility             court               
-----------------------------
2017-05-17 00:00:00
selection            basically            bbc                  electorate           channels            
lacking              fashion              empty                title                respects            
thoroughly           dreadful             backwards            presumably           content             
tremendous           bearing              precious             presenting           chemical            
-----------------------------
2018-05-17 00:00:00
politically          fashion            

In [102]:
compare_neighours_comma_delimited("sovereign")

Conservative
15/05/18,sovereign,european,rights,united,protect,own
16/05/17,united,leader,sovereign,democratic,law,pension
17/05/17,nuclear,global,united,european,nation,british
18/05/17,sovereign,customs,law,nation,democratic,british

Labour
15/05/18,member,united,affairs,(,office,leader
16/05/17,united,vote,members,leader,gentleman,welsh
17/05/17,must,security,our,across,international,vital
18/05/17,vote,(,parliament,united,rights,country


In [103]:
compare_neighours_comma_delimited("sovereignty")

Conservative
15/05/18,sovereignty,membership,security,law,human,nation
16/05/17,parliament,party,vote,leader,sovereignty,members
17/05/17,parliament,democracy,law,role,control,scrutiny
18/05/17,interests,democracy,referendum,vote,decision,voted

Labour
15/05/18,union,role,membership,eu,european,security
16/05/17,debate,party,leader,parliament,house,statement
17/05/17,party,parliament,members,democracy,committee,debate
18/05/17,party,deal,economy,country,democracy,vote


In [104]:
compare_neighours_comma_delimited("brexit")

Conservative
15/05/18,couple,little,page,”,insert,short
16/05/17,eu,referendum,negotiations,trade,union,european
17/05/17,eu,agreement,trade,period,negotiations,union
18/05/17,deal,referendum,vote,union,party,agreement

Labour
15/05/18,member,party,election,leader,european,united
16/05/17,eu,referendum,vote,what,us,european
17/05/17,trade,foreign,state,eu,customs,deal
18/05/17,deal,state,trade,vote,prime,eu


### Changiest Words Remain vs Leave

In [105]:
%%time
rem_time_models = dict()
lea_time_models = dict()
# Train a language model for various different portions of the forum.
for w, w_posts in get_time_windows(all_contributions, 365, 365, time_column="date"):
    curr_rem = w_posts[w_posts.index.isin(remain.index)].index
    curr_lea = w_posts[w_posts.index.isin(leave.index)].index
    
    rem_time_models[w] = Word2Vec(all_toks.loc[curr_rem], size=300)
    lea_time_models[w] = Word2Vec(all_toks.loc[curr_lea], size=300)

Wall time: 2min


In [106]:
def compare_neighours(query):
    print("Remain")
    neighbours_over_time(query, rem_time_models)
    print("\nLeave")
    neighbours_over_time(query, lea_time_models)
    
def compare_neighours_comma_delimited(query):
    print("Remain")
    neighbours_over_time_comma_delimited(query, rem_time_models)
    print("\nLeave")
    neighbours_over_time_comma_delimited(query, lea_time_models)

In [107]:
%%time
rem_changiest_words_per_window = get_changiest_words_per_window(rem_time_models, 5000, k=100)

Wall time: 18 s


In [108]:
print_changiest_over_time(rem_changiest_words_per_window, 100)

2016-05-17 00:00:00
cell                 facility             customs              dog                  exit                
tv                   managing             bomb                 e-                   for-                
precious             pubs                 currency             mature               autonomy            
routinely            search               brexit               gift                 google              
-----------------------------
2017-05-17 00:00:00
retained             sterling             salisbury            settled              radio               
wear                 principal            collectively         solely               NUMBERg             
precious             games                privatisation        block                c                   
fresh                text                 title                nationally           natural             
-----------------------------
2018-05-17 00:00:00
continually          basically          

In [109]:
%%time
lea_changiest_words_per_window = get_changiest_words_per_window(lea_time_models, 5000, k=100)

Wall time: 17.4 s


In [110]:
print_changiest_over_time(lea_changiest_words_per_window, 100)

2016-05-17 00:00:00
barnett              dates                explaining           lock                 re-                 
command              elect                orders               selection            passing             
definitely           discover             english              readily              signs               
standing             undoubtedly          books                conviction           divorce             
-----------------------------
2017-05-17 00:00:00
selection            restored             stance               tribunal             precise             
tower                partisan             dates                calls                complaints          
count                radio                secondary            content              explaining          
intent               requests             substance            worthy               appropriately       
-----------------------------
2018-05-17 00:00:00
sound                temporary          

In [111]:
compare_neighours_comma_delimited("sovereign")

Remain
15/05/18,member,secretary,devolved,democratic,european,sovereign
16/05/17,united,sovereign,democracy,vote,voted,british
17/05/17,united,democratic,nuclear,nation,member,british
18/05/17,british,united,democracy,nation,independent,democratic

Leave
15/05/18,european,eu,nation,security,protect,rights
16/05/17,european,british,referendum,leader,eu,kingdom
17/05/17,united,member,european,trade,customs,uk
18/05/17,customs,eu,united,law,uk,our


In [112]:
compare_neighours_comma_delimited("mess")

Remain
15/05/18,deficit,problem,election,situation,point,budget
16/05/17,problem,country,point,thing,put,position
17/05/17,position,country,problem,point,speech,situation
18/05/17,country,deal,situation,point,house,position

Leave
15/05/18,set,year,NUMBER%,made,carried,million
16/05/17,set,point,pointed,were,had,carried
17/05/17,have,get,eu,go,take,let
18/05/17,“,being,were,£,),own


In [113]:
compare_neighours_comma_delimited("progressive")

Remain
15/05/18,political,welfare,democratic,strong,cross-,effective
16/05/17,effective,global,our,sustainable,important,positive
17/05/17,low,strong,different,long-,positive,competitive
18/05/17,sustainable,strong,cross-,global,framework,based

Leave
15/05/18,million,compared,households,homes,£,billion
16/05/17,its,all-,our,sector,long-,year
17/05/17,s,am,withdrawal,customs,trade,national
18/05/17,(,new,withdrawal,trading,partnership,long-
