# Data Processing

In [None]:
import psycopg2
from psycopg2 import sql
from collections import OrderedDict

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.language import Language
from spacy.tokens import Token

from tqdm import tqdm

In [None]:
import multiprocessing
# Mac OS does not default to forking processes, so request 'fork' explicitly.
# set_start_method() raises RuntimeError once a start method is already fixed,
# so only call it when 'fork' is not yet selected (and force it), which keeps
# this cell safe to re-run.
if multiprocessing.get_start_method(allow_none=True) != 'fork':
    multiprocessing.set_start_method('fork', force=True)

## Tokenize NYT articles

In [None]:
# Fetch every article that has text, as (article, id) rows.
# The try/finally guarantees the connection is closed even if the query fails.
conn = psycopg2.connect(host = 'localhost', database = 'nytpopular')
try:
    with conn.cursor() as cursor:
        cursor.execute('''SELECT article, id
                        FROM articles
                        WHERE article IS NOT NULL;''')
        article_table = cursor.fetchall()
finally:
    conn.close()

In [None]:
# Coarse part-of-speech tags (compared against `token.pos_`) whose tokens are
# left out of the bag of words.
POS_blacklist = [
    'ADP',
    'ADV',
    'AUX',
    'CONJ',
    'CCONJ',
    'DET',
    'INTJ',  # interjection; previously misspelled 'INJ', which matches no tag
    'PART',
    'PRON',
    'PUNCT',
    'SCONJ',
]

# Stop-word set: spaCy's English stop words plus a few extra lowercase forms
# (possessive marker, honorifics, reporting verbs) that are also excluded.
stops = set(STOP_WORDS).union(
    ("'s", "mr.", "mrs.", "ms.", "said", "according")
)

@Language.component("lowercase_lemmas")
def lowercase_lemmas(doc : spacy.tokens.doc.Doc) -> spacy.tokens.doc.Doc:
    """Pipeline component that lowercases the lemma of every token in place.

    Args:
        doc: The document being processed by the pipeline.

    Returns:
        The same ``doc`` object, with each ``token.lemma_`` lowercased.
    """
    for tok in doc:
        lowered = tok.lemma_.lower()
        tok.lemma_ = lowered
    return doc
    
def get_is_excluded(token):
    """Return True when ``token`` should be left out of the bag of words.

    A token is excluded if its part-of-speech tag is listed in
    ``POS_blacklist`` or its lemma is in the ``stops`` set.
    """
    if token.pos_ in POS_blacklist:
        return True
    return token.lemma_ in stops

# Register the `token._.is_excluded` getter unless it is already registered
# (e.g. when this cell is run a second time).
if not Token.has_extension('is_excluded'):
    Token.set_extension('is_excluded', getter=get_is_excluded)

# Load the en_core_web_sm pipeline with the 'ner' component disabled, then
# append the lemma-lowercasing component as the final pipeline step.
nlp = spacy.load('en_core_web_sm', disable = ['ner'])
nlp.add_pipe('lowercase_lemmas', last = True)

In [None]:
# article_table holds (article, id) rows; with as_tuples=True the pipe takes
# (text, context) pairs and yields (doc, context) pairs, so `context` is the
# article id. Processing is spread over 4 worker processes.
docs_generator = nlp.pipe(article_table, n_process=4, as_tuples=True)

# One entry per article: (id, lemmas of the non-excluded tokens, sentence texts).
# Appending row by row keeps the partial results if the run is interrupted.
docs = []
for doc, context in tqdm(docs_generator, total = len(article_table)):
    docs.append((context, [token.lemma_ for token in doc if not token._.is_excluded], [sentence.text for sentence in doc.sents]))

In [None]:
# Open an autocommit connection so each statement is committed immediately.
conn = psycopg2.connect(host = 'localhost', database = 'nytpopular')
conn.autocommit = True
cursor = conn.cursor()

# Create the wordbags table: one row per article id, holding its filtered
# lemmas (bag) and its sentence texts. IF NOT EXISTS keeps this cell
# re-runnable, in line with the ON CONFLICT DO NOTHING inserts.
cursor.execute('''CREATE TABLE IF NOT EXISTS wordbags (
	id BIGINT PRIMARY KEY,
    bag TEXT[],
    sentences TEXT[]
);''')

def insertWordBagintoDB(bag, cursor):
    """Insert one word-bag row into the ``wordbags`` table.

    Args:
        bag: Sequence whose first three items are the row's ``id``, ``bag``
            and ``sentences`` values, in that order.
        cursor: Open database cursor used to execute the INSERT.

    A row whose id already exists is left untouched (ON CONFLICT DO NOTHING).
    """
    column_names = ('id', 'bag', 'sentences')
    row_values = (bag[0], bag[1], bag[2])
    column_list = sql.SQL(',').join(sql.Identifier(name) for name in column_names)
    statement = sql.SQL(
        'INSERT INTO wordbags ({}) VALUES {} ON CONFLICT (id) DO NOTHING;'
    ).format(column_list, sql.Literal(row_values))
    cursor.execute(statement)

# Write every processed article to the wordbags table. The try/finally makes
# sure the cursor and connection are released even if an insert fails.
try:
    for bag in docs:
        insertWordBagintoDB(bag, cursor)
finally:
    cursor.close()
    conn.close()

# The Train-Validation-Test Split

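I might add new articles to the dataset in the future, so I need a robust and consistent way to make the train-validation-test split: each item is assigned to a split based on a hash of its ID, so existing assignments never change as the dataset grows.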

In [None]:
from zlib import crc32

def twitter_id_hash(id : int) -> int:
    """Return the CRC-32 checksum of the low 32 bits of ``id``.

    Only the lowest 32 bits of the id take part in the hash; they are
    serialized as 4 big-endian bytes before being checksummed.
    """
    low_32_mask = (1 << 32) - 1
    payload = (id & low_32_mask).to_bytes(4, byteorder = 'big')
    return crc32(payload)

def train_or_test(hash_ : int) -> str:
    """Map a 32-bit hash value to a dataset split name.

    Hashes below 60% of the 32-bit range map to 'train', those below 80% map
    to 'valid', and everything else maps to 'test'.
    """
    hash_space = 2 ** 32
    # Upper bounds are checked in ascending order; the first match wins.
    for upper_fraction, split_name in ((0.6, 'train'), (0.8, 'valid')):
        if hash_ < upper_fraction * hash_space:
            return split_name
    return 'test'

In [None]:
# Open an autocommit connection so each statement is committed immediately.
conn = psycopg2.connect(host = 'localhost', database = 'nytpopular')
conn.autocommit = True
cursor = conn.cursor()

# Collect the id of every row in the tweets table.
cursor.execute('''SELECT id
                    FROM tweets;''')
ids = [item[0] for item in cursor.fetchall()]

# Create the traintest table: one row per id with its hash and assigned split.
# IF NOT EXISTS keeps this cell re-runnable, in line with the
# ON CONFLICT DO NOTHING inserts.
cursor.execute('''CREATE TABLE IF NOT EXISTS traintest (
	id BIGINT PRIMARY KEY,
    hash BIGINT,
    split VARCHAR(5)
);''')

def insertTrainTestintoDB(id, cursor):
    """Insert the split assignment for one id into the ``traintest`` table.

    Args:
        id: Identifier to hash and assign to a split.
        cursor: Open database cursor used to execute the INSERT.

    The row stores the id, its ``twitter_id_hash`` value and the split name
    returned by ``train_or_test``. A row whose id already exists is left
    untouched (ON CONFLICT DO NOTHING).
    """
    hash_value = twitter_id_hash(id)
    row_values = (id, hash_value, train_or_test(hash_value))
    column_list = sql.SQL(',').join(
        sql.Identifier(name) for name in ('id', 'hash', 'split')
    )
    statement = sql.SQL(
        'INSERT INTO traintest ({}) VALUES {} ON CONFLICT (id) DO NOTHING;'
    ).format(column_list, sql.Literal(row_values))
    cursor.execute(statement)

# Store the split assignment for every id. The loop variable is deliberately
# not called `id`, so the builtin id() is not shadowed for the rest of the
# notebook. The try/finally makes sure the cursor and connection are released
# even if an insert fails.
try:
    for tweet_id in ids:
        insertTrainTestintoDB(tweet_id, cursor)
finally:
    cursor.close()
    conn.close()