In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import json
from datetime import datetime
import numpy as np
import seaborn as sns
import sqlite3
import sys
sys.path.insert(1, "../")
sys.path.insert(1, "../utilities")

from language_change_methods.utility_functions import tokenise, get_time_windows, get_data_windows, count_tokens
from language_change_methods.cross_entropy import single_CE_run

from helpers import load_posts, load_toks, load_pos, load_ent

# Directory for saved figures, relative to the notebook's working directory.
# NOTE(review): not referenced by any of the cells visible here — presumably used by later plotting cells.
GRAPH_DIR = "./Graphs"

In [2]:
%load_ext autoreload
%autoreload 2

# Loading in the data

In [3]:
def read_db(db_fp, query):
    """Run ``query`` against the SQLite database at ``db_fp`` and return the rows as a DataFrame.

    Parameters
    ----------
    db_fp : str or path-like
        Location of the SQLite database file.
    query : str
        SQL to execute. The result must contain a ``uid`` column (used as the
        index) and a ``time`` column formatted as ``%Y/%m/%d %H:%M:%S``.

    Returns
    -------
    pandas.DataFrame
        Rows indexed by ``uid`` (cast to ``str``) and sorted ascending by ``time``.
    """
    conn = sqlite3.connect(db_fp)
    try:
        comments = pd.read_sql_query(
            query,
            conn,
            index_col="uid",
            parse_dates={"time": "%Y/%m/%d %H:%M:%S"},
        )
    finally:
        # Always release the database handle, even if the query fails.
        # (Using the connection as a context manager would only manage the
        # transaction, not close the connection, hence the explicit close.)
        conn.close()

    # String ids so the index can be matched against ids coming from other sources.
    comments.index = comments.index.astype(str)
    return comments.sort_values("time")


def get_align_toks(tok_fp, posts):
    """Load tokens from ``tok_fp`` and line them up with ``posts``.

    Each item yielded by ``load_toks(tok_fp)`` is read as ``(key, tokens)``:
    element 0 becomes the index label and element 1 the value.

    Returns ``(toks, posts)`` where ``posts`` keeps only rows whose index has
    tokens, and ``toks`` is a Series holding exactly those entries in the same
    order as the filtered ``posts``.
    """
    # Index the tokens by their key so they can be matched against the posts' index.
    tok_series = pd.Series({entry[0]: entry[1] for entry in load_toks(tok_fp)})

    # Keep only tokens that belong to one of the given posts ...
    has_post = tok_series.index.isin(posts.index)
    tok_series = tok_series[has_post]

    # ... and only posts for which tokens exist.
    has_toks = posts.index.isin(tok_series.index)
    kept_posts = posts[has_toks]

    # Reorder the tokens to follow the posts' ordering.
    return tok_series.loc[kept_posts.index], kept_posts

In [4]:
from settings import TFES_FP as FORUM_DB_FP, TFES_TOK_FP as FORUM_TOKS_FP
from helpers import flat_earth_boards, off_topic_boards as other_boards

# Query returning one row per post: its uid, its time, its user (as poster_id) and the uid of the
# board it sits in (as board_id), resolved by joining posts -> topics -> boards.
# .strip() removes the newline and indentation that precede SELECT inside the literal.
sql_get_forum ="""
                SELECT p.uid AS uid, p.time AS time, p.user AS poster_id, b.uid AS board_id
                FROM posts as p
                INNER JOIN topics as t
                ON t.uid = p.topic
                INNER JOIN boards as b
                ON b.uid = t.board;""".strip()

In [5]:
%%time
# Load every forum post (all boards), sorted by time, indexed by post uid.
forum_posts = read_db(FORUM_DB_FP, sql_get_forum)

# Keep only posts that have tokens (and tokens that have a post), in matching order.
forum_toks, forum_posts = get_align_toks(FORUM_TOKS_FP, forum_posts)

# Split by board: the flat-earth boards vs. the off-topic boards (imported as `other_boards`).
fe_posts = forum_posts.query("board_id in @flat_earth_boards")
ot_posts = forum_posts.query("board_id in @other_boards")

Wall time: 15.1 s


In [6]:
%%time
from gensim.models import Word2Vec
from settings import SCIENCE_W2V_FP
# Saved Word2Vec model read from the path in settings.SCIENCE_W2V_FP.
# The training cells below deep-copy it as their starting point, so `model` itself is never trained further here.
model = Word2Vec.load(SCIENCE_W2V_FP)

Wall time: 555 ms


# Training the models

In [7]:
from copy import deepcopy

In [8]:
%%time
# Snapshots of one continually-updated model, keyed by window.
time_models = {}

# Start from a copy of the base model; this copy keeps accumulating training
# across windows, so each snapshot also reflects every earlier window.
updated_model = deepcopy(model)

# Step through the flat-earth posts one data window at a time.
for w, w_posts in get_data_windows(fe_posts, 10000, 10000, time_column="time"):
    # Tokens for the posts of this window, looked up once and reused.
    w_toks = forum_toks.loc[w_posts.index]
    n_window_posts = len(w_posts)

    updated_model.build_vocab(w_toks, update=True)
    updated_model.train(w_toks, total_examples=n_window_posts, epochs=5)

    # Store a frozen copy: the running model is modified again on the next iteration.
    time_models[w] = deepcopy(updated_model)

Wall time: 23.3 s


In [9]:
%%time
# One model per window, each trained independently from a fresh copy of the base
# model (no training is carried over from earlier windows).
time_models_not_gradual = dict()
for w, w_posts in get_data_windows(fe_posts, 10000, 10000, time_column="time"):
    # Look up this window's tokens once instead of once per gensim call.
    w_toks = forum_toks.loc[w_posts.index]

    curr_model = deepcopy(model)
    curr_model.build_vocab(w_toks, update=True)
    curr_model.train(w_toks, total_examples=len(w_posts), epochs=5)

    # curr_model is already a private copy created in this iteration and is not
    # touched again, so store it directly rather than deep-copying the whole
    # model a second time.
    time_models_not_gradual[w] = curr_model

Wall time: 25.8 s


# Print some examples

In [10]:
from word_vector_change import neighbours_over_time

In [11]:
# Neighbours of "flat" in each snapshot of the continually-updated model.
# The output is one window timestamp followed by that window's list of neighbouring words.
neighbours_over_time("flat", time_models)

2013-12-01 18:43:04
['round', 'rule', 'globe', 'conspiracy', 'pointing', 'consensus', 'religious', 'liberal', 'conservative', 'map', 'conservatives', 'christian']
2015-12-30 23:54:31
['round', 'globe', 'fe', 'sphere', 'bot', 'repost', 'rule', 'shape', 'spherical', 'reasonable', 'map', 'conspiracy']
2017-01-16 01:34:55
['round', 'globe', 'fe', 'sphere', 'map', 'spherical', 'bot', 'shape', 'infinite', 'basis', 'curvature', 'rule']
2017-11-09 16:57:19
['round', 'globe', 'fe', 'spherical', 'shape', 'sphere', 'conspiracy', 'curvature', 'infinite', 'burden', 'consensus', 'male']
2018-04-20 09:03:11
['round', 'globe', 'fe', 'spherical', 'sphere', 'curved', 'shape', 'bot', 'fet', 'infinite', 'basis', 'map']
2018-10-07 05:55:37
['round', 'globe', 'fe', 'spherical', 'sphere', 'curved', 'infinite', 'shape', 'fet', 'bot', 'curvature', 'male']


In [12]:
# Same query for "flat", but against the models trained independently per window
# (each started from the base model rather than from the previous window's model).
neighbours_over_time("flat", time_models_not_gradual)

2013-12-01 18:43:04
['round', 'rule', 'globe', 'conspiracy', 'pointing', 'consensus', 'religious', 'liberal', 'conservative', 'scientific', 'map', 'climate']
2015-12-30 23:54:31
['round', 'globe', 'conspiracy', 'rule', 'fe', 'false', 'repost', 'sphere', 'map', 'wrong', 'reasonable', 'true']
2017-01-16 01:34:55
['round', 'globe', 'fe', 'map', 'rule', 'conspiracy', 'hypothesis', 'consensus', 'true', 'scientific', 'warming', 'sphere']
2017-11-09 16:57:19
['round', 'globe', 'fe', 'rule', 'conspiracy', 'scientific', 'climate', 'map', 'hypothesis', 'there', 'pointing', 'sphere']
2018-04-20 09:03:11
['round', 'globe', 'rule', 'map', 'fe', 'conspiracy', 'valid', 'scientific', 'warming', 'false', 'true', 'spherical']
2018-10-07 05:55:37
['globe', 'round', 'fe', 'sphere', 'map', 'rule', 'model', 'spherical', 'infinite', 'shape', 'conspiracy', 'climate']


# Find Changiest Words per window

In [13]:
%%time
from word_vector_change import get_changiest_words_per_window

# Slow step (recorded wall time: 2min 19s), run on the continually-updated models.
# NOTE(review): the meaning of the second argument (5000) is not visible here — presumably a
# vocabulary / word-count limit; confirm against word_vector_change.
changiest_words_per_window = get_changiest_words_per_window(time_models, 5000)

Wall time: 2min 19s


In [14]:
from word_vector_change import print_changiest_over_time

In [15]:
# Print the changiest words for each window of the continually-updated models.
# NOTE(review): 50 is presumably the number of top words considered per window, and
# remove_punc / remove_func presumably drop punctuation and function words — confirm against word_vector_change.
print_changiest_over_time(changiest_words_per_window, time_models, 50, remove_punc=True, remove_func=True)

2015-12-30 23:54:31
fe                   even                 also                 globe                actually            
earth                flat                 only                 re                   moon                
n't                  not                  just                 very                 map                 
why                  exactly              how                  love                 wiki                
-----------------------------
2017-01-16 01:34:55
also                 actually             still                even                 tom                 
how                  only                 n't                  fe                   flat                
not                  fet                  now                  perspective          just                
why                  here                 flight               very                 earth               
-----------------------------
2017-11-09 16:57:19
also                 even               

In [16]:
%%time
from word_vector_change import get_changiest_words_per_window

# Same computation as for `changiest_words_per_window`, but on the independently-trained
# per-window models. Slow step (recorded wall time: 2min 19s).
# NOTE(review): the meaning of the second argument (5000) is not visible here — confirm against word_vector_change.
changiest_words_per_window_2 = get_changiest_words_per_window(time_models_not_gradual, 5000)

Wall time: 2min 19s


In [17]:
# `changiest_words_per_window_2` was computed from `time_models_not_gradual`, so pass that same
# set of models here. Passing `time_models` (the continually-updated series) would pair these
# words with models they were not derived from.
# NOTE(review): 50 / remove_punc / remove_func semantics live in word_vector_change — confirm there.
print_changiest_over_time(changiest_words_per_window_2, time_models_not_gradual, 50, remove_punc=True, remove_func=True)

2015-12-30 23:54:31
fe                   actually             also                 only                 globe               
even                 re                   flat                 n't                  moon                
still                quite                map                  exactly              now                 
almost               tom                  not                  why                  love                
-----------------------------
2017-01-16 01:34:55
also                 actually             fe                   tom                  even                
only                 n't                  very                 flat                 now                 
well                 still                flight               perhaps              quite               
globe                gps                  almost               re                   why                 
-----------------------------
2017-11-09 16:57:19
also                 even               

In [18]:
# Neighbours of "fet" in each of the independently-trained per-window models.
# The output is one window timestamp followed by that window's list of neighbouring words.
neighbours_over_time("fet", time_models_not_gradual)

2013-12-01 18:43:04
['model', 'evolution', 'hypothesis', 'fet', 'climate', 'theory', 'quantum', 'theories', 'science', 'submission', 'religion', 'discussion']
2015-12-30 23:54:31
['hypothesis', 'model', 'article', 'theories', 'scientific', 'science', 'relativity', 'evolution', 'predictions', 'arguments', 'gravity', 'logic']
2017-01-16 01:34:55
['fet', 'evolution', 'universe', 'theory', 'physics', 'theories', 'relativity', 'model', 'fe', 'climate', 'hypothesis', 'perspective']
2017-11-09 16:57:19
['universe', 'gravity', 'theory', 'model', 'evolution', 'particle', 'fe', 'relativity', 'principle', 'equation', 'physics', 'explanation']
2018-04-20 09:03:11
['fe', 'evolution', 'theory', 'scientific', 'flat', 'model', 'conspiracy', 'warming', 'belief', 'universe', 'science', 'hypothesis']
2018-10-07 05:55:37
['fe', 'model', 'quantum', 'theory', 'universe', 'evolution', 'science', 'scientific', 'fda', 'relativity', 'mechanism', 'explanation']
