In [1]:
import json
import pandas as pd
from datetime import datetime
import sqlite3
import sys
sys.path.insert(1, "../")
sys.path.insert(1, "../utilities")
from helper_functions import get_static_kws
from settings import DB_FP

In [2]:
# Query returning one row per contribution, joined to its speaker (members),
# its debate (debates), the speaker's party membership (member_party) and the
# speaker's stance columns (member_stances: tmay_deal, benn_act, ref_stance,
# constituency_leave).
# Filters applied in the WHERE clause:
#   * only debates dated between 2015-05-01 and 2019-09-10 are kept;
#   * a party row only matches when the debate date lies inside that
#     membership's start/end range, or on/after its start when end is NULL.
# All joins are INNER, so contributions lacking a row in any joined table are dropped.
sql_get_all_posts ="""
SELECT c.uid, m.name, p.party, d.date, c.body, c.topic, c.section, s.tmay_deal, s.benn_act, s.ref_stance, s.constituency_leave
FROM contributions as c
INNER JOIN members as m
ON m.PimsId = c.member
INNER JOIN debates as d
ON d.uid = c.debate
INNER JOIN member_party as p
ON p.PimsId = m.PimsId
INNER JOIN member_stances as s
ON s.PimsId = m.PimsId
WHERE (d.date BETWEEN date("2015-05-01") AND date("2019-09-10"))
AND (((d.date BETWEEN p.start AND p.end) AND NOT (p.end IS NULL))
OR ((d.date >= p.start) AND (p.end IS NULL)));""".strip()

# Regex for identifying EU/Brexit mentions: matches "EU", "European Union" or
# "Brexit" as whole words (the initial letters of "European", "Union" and
# "Brexit" may be upper or lower case; "EU" must be upper case).
# NOTE(review): not referenced by any of the cells visible in this notebook.
eu_regex = r'\b(EU|[Ee]uropean [Uu]nion|[Bb]rexit)\b'

In [3]:
# Open the SQLite database located at DB_FP (imported from settings).
conn = sqlite3.connect(DB_FP)
# NOTE(review): this cursor is not used by any cell visible here; the
# contributions query is executed through `conn` directly.
curs = conn.cursor()

In [4]:
%%time
# Load every contribution returned by the query into a DataFrame indexed by uid.
all_contributions = pd.read_sql_query(sql_get_all_posts, conn)
# Columns are renamed positionally: the query's `body` column becomes `text`,
# every other column keeps the name it has in the SELECT list.
all_contributions.columns = ['uid', 'name', 'party', 'date', 'text', 'topic', 'section', 'tmay_deal', 'benn_act', 'ref_stance', 'constituency_leave']
# Re-bind instead of mutating in place, so the cell does not depend on
# in-place side effects when it is re-run.
all_contributions = all_contributions.set_index("uid")
# Parse the timestamp strings in a single vectorised call rather than calling
# datetime.strptime once per row; the explicit format keeps parsing strict.
all_contributions['date'] = pd.to_datetime(all_contributions['date'], format="%Y-%m-%d %H:%M:%S")

Wall time: 7.28 s


In [5]:
%%time
from language_change_methods.utility_functions import clean_text, spacy_tokenise
# from text_processing import ucrel_tokenise
import nltk
import regex as re    
import spacy

# Load the small English spaCy pipeline without the dependency parser and the
# named-entity recogniser. Components are switched off via `disable`; the
# earlier boolean flags (parser=False, entity=False, matcher=False,
# add_vectors=False) are the spaCy 1.x calling convention and are not how
# later releases disable components.
# NOTE(review): `nlp` is not referenced by the cells visible here — confirm it is still needed.
nlp = spacy.load('en_core_web_sm', disable=["parser", "ner"])

def tokenise(text):
    """Clean a piece of text and split it into tokens.

    The text is passed through `clean_text`; then every punctuation
    character, together with any punctuation characters immediately
    following it, is replaced by that first character plus a space; the
    result is handed to `spacy_tokenise`, whose return value is returned.
    """
    # `re` is the third-party `regex` module here, which is what provides the
    # \p{P} (Unicode punctuation) character class.
    punctuation_run = r"(\p{P})\p{P}*"
    normalised = re.sub(punctuation_run, r"\1 ", clean_text(text))
    return spacy_tokenise(normalised)

# Tokenise the text of every contribution; the result is a Series aligned with
# all_contributions' index. This is the slow step of the notebook (the recorded
# run took about three minutes of wall time).
all_toks =  all_contributions["text"].apply(tokenise)

Wall time: 2min 53s


In [10]:
from language_change_methods.utility_functions import merge_lists
from collections import Counter

In [12]:
%%time
# Build a frequency table over the tokenised contributions.
# presumably merge_lists flattens the per-contribution token lists into one
# sequence, making this a corpus-wide token count — TODO confirm against
# language_change_methods.utility_functions.
tok_counter = Counter(merge_lists(all_toks))

Wall time: 4.99 s


In [15]:
# Development aid: reload helper_functions so that edits made to the module on
# disk are picked up without restarting the kernel, then re-bind
# get_static_kws to the freshly loaded definition.
# The order matters: the reload has to run before the name is imported again.
from importlib import reload
import helper_functions
reload(helper_functions)
from helper_functions import get_static_kws

In [16]:
%%time
# Compute the keywords for each group of contributions. `group_type` selects
# the grouping; judging by the group names logged during the recorded run
# (con-leave-mp-leave, ...), this one presumably combines constituency and MP
# referendum stance — confirm in helper_functions.get_static_kws.
# The result is indexed by group name, and each entry exposes .to_dict()
# (that is how the export cell consumes it).
kws = get_static_kws(all_contributions, tok_counter, all_toks, group_type="brexit_stance_mp_and_constituency")

FINDING KEYWORDS
STARTED PROCESSING GROUP con-leave-mp-leave
STARTED PROCESSING GROUP con-leave-mp-remain
STARTED PROCESSING GROUP con-remain-mp-leave
STARTED PROCESSING GROUP con-remain-mp-remain
Wall time: 31 s


In [22]:
# Persist the per-group keywords as JSON: one top-level key per group name,
# each holding the .to_dict() form of that group's keywords.
with open("../resources/group-kw-static.json", "w", encoding="utf-8") as out_file:
    serialisable_kws = {gname: kws[gname].to_dict() for gname in kws}
    json.dump(serialisable_kws, out_file)