# Data Processing

In [None]:
import psycopg2
from psycopg2 import sql
from collections import OrderedDict

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.language import Language
from spacy.tokens import Token

from tqdm import tqdm

import re
import datetime

import pytz

# Maybe this should be tested, but there are probably too few holidays to justify adding it as a feature.
# from pandas.tseries.holiday import USFederalHolidayCalendar
# import holidays

In [None]:
import multiprocessing
# macOS does not default to forking worker processes, so request 'fork' explicitly.
# force=True lets this cell be re-run in the same kernel: without it, a second call
# raises "RuntimeError: context has already been set".
multiprocessing.set_start_method('fork', force=True)

In [None]:
def insertFieldsintoDB(fields, table_name, cursor):
    """Insert a single row into ``table_name``, skipping it on an ``id`` conflict.

    Args:
        fields: Mapping of column name -> value. Its iteration order fixes the
            order of both the column list and the value tuple.
        table_name: Name of the target table (quoted as an SQL identifier).
        cursor: Object whose ``execute`` method receives the composed statement.
    """
    columns, values = zip(*fields.items())
    statement = sql.SQL(
        'INSERT INTO {} ({}) VALUES {} ON CONFLICT (id) DO NOTHING;'
    ).format(
        sql.Identifier(table_name),
        sql.SQL(',').join(sql.Identifier(column) for column in columns),
        # The whole value tuple is rendered as one literal, i.e. "(v1, v2, ...)".
        sql.Literal(values),
    )
    cursor.execute(statement)

## Tokenize NYT articles

In [None]:
# Load every non-null article body together with its id as (article, id) rows.
conn = psycopg2.connect(host = 'localhost', database = 'nytpopular')
try:
    with conn.cursor() as cursor:
        cursor.execute('''SELECT article, id
                        FROM articles
                        WHERE article IS NOT NULL;''')
        article_table = cursor.fetchall()
finally:
    # Close the connection even when the query fails, so it is not leaked.
    conn.close()

In [None]:
# Coarse part-of-speech tags whose tokens are dropped from the bag of words
# (compared against ``token.pos_`` in ``get_is_excluded``).
POS_blacklist = [
    'ADP',
    'ADV',
    'AUX',
    'CONJ',
    'CCONJ',
    'DET',
    'INTJ',  # Interjection; the previous spelling 'INJ' is not a valid tag and never matched.
    'PART',
    'PRON',
    'PUNCT',
    'SCONJ',
]

# spaCy's English stop words plus a few extra entries, all written in lowercase
# because they are compared against the lower-cased lemmas.
stops = set(STOP_WORDS) | {"'s", "mr.", "mrs.", "ms.", "said", "according"}

@Language.component("lowercase_lemmas")
def lowercase_lemmas(doc : spacy.tokens.doc.Doc) -> spacy.tokens.doc.Doc:
    """Pipeline component that lower-cases the lemma of every token in place.

    Returns the same ``doc`` object it received.
    """
    for tok in doc:
        lowered = tok.lemma_.lower()
        tok.lemma_ = lowered
    return doc
    
def get_is_excluded(token):
    """Return True when ``token`` should be left out of the bag of words.

    A token is excluded if its part-of-speech tag is in ``POS_blacklist`` or its
    lemma is in ``stops``.
    """
    if token.pos_ in POS_blacklist:
        return True
    return token.lemma_ in stops

# Register the custom `is_excluded` token attribute only when it is not registered
# yet (presumably so that this cell can be re-run within the same session).
if not Token.has_extension('is_excluded'):
    Token.set_extension('is_excluded', getter=get_is_excluded)

# Load the small English pipeline with the 'ner' component disabled, then append the
# lemma-lowercasing component as the last step of the pipeline.
nlp = spacy.load('en_core_web_sm', disable = ['ner'])
nlp.add_pipe('lowercase_lemmas', last = True)

In [None]:
# article_table rows are (article, id); with as_tuples=True the pipeline yields
# (Doc, id) pairs, using 4 worker processes.
docs_generator = nlp.pipe(article_table, n_process=4, as_tuples=True)

# Each entry is (article id, kept lemmas, sentence texts).
docs = []
for parsed, article_id in tqdm(docs_generator, total = len(article_table)):
    kept_lemmas = [token.lemma_ for token in parsed if not token._.is_excluded]
    sentence_texts = [sentence.text for sentence in parsed.sents]
    docs.append((article_id, kept_lemmas, sentence_texts))

In [None]:
conn = psycopg2.connect(host = 'localhost', database = 'nytpopular')
conn.autocommit = True
cursor = conn.cursor()

# IF NOT EXISTS keeps this cell re-runnable: the inserts below already skip rows
# that exist (ON CONFLICT DO NOTHING), so table creation should not fail either.
cursor.execute('''CREATE TABLE IF NOT EXISTS wordbags (
    id BIGINT PRIMARY KEY,
    bag TEXT[],
    sentences TEXT[]
);''')

def insertWordBagintoDB(bag, cursor):
    """Insert one processed article into the ``wordbags`` table.

    Args:
        bag: Sequence of ``(id, lemmas, sentences)``, as built in ``docs``.
        cursor: Database cursor used to execute the insert.

    Rows whose ``id`` already exists are skipped (see ``insertFieldsintoDB``).
    """
    fields = OrderedDict()
    fields['id'] = bag[0]
    fields['bag'] = bag[1]
    fields['sentences'] = bag[2]
    # Reuse the shared helper instead of duplicating the INSERT-building code.
    insertFieldsintoDB(fields, 'wordbags', cursor)

try:
    for bag in docs:
        insertWordBagintoDB(bag, cursor)
finally:
    # Release the cursor and connection even if one of the inserts fails.
    cursor.close()
    conn.close()

# The Train-Validation-Test Split

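I might add new articles to the dataset in the future, so I need a robust, deterministic way to assign each article to the training, validation, or test set. Hashing the tweet ID achieves this: an article's assignment depends only on its own ID, so it never changes when new articles are added.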

In [None]:
from zlib import crc32

def twitter_id_hash(id : int) -> int:
    """Return the CRC-32 checksum of the low 32 bits of ``id``.

    The low 32 bits are serialized as 4 big-endian bytes before hashing, so two
    ids that differ only above bit 31 produce the same hash.
    """
    low_32_bits = id & 0xFFFFFFFF
    payload = low_32_bits.to_bytes(4, byteorder = 'big')
    return crc32(payload)

def train_or_test(hash_ : int) -> str:
    """Map a hash value to a dataset split.

    Values below 60% of 2**32 go to 'train', values below 80% go to 'valid',
    and everything else goes to 'test'.
    """
    TWO_POW_32 = 4294967296
    # Upper bounds are checked in ascending order; the first one that holds wins.
    for upper_fraction, split_name in ((0.6, 'train'), (0.8, 'valid')):
        if hash_ < upper_fraction * TWO_POW_32:
            return split_name
    return 'test'

In [None]:
conn = psycopg2.connect(host = 'localhost', database = 'nytpopular')
conn.autocommit = True
cursor = conn.cursor()

# Collect the id of every tweet; each one is assigned to a split below.
cursor.execute('''SELECT id
                    FROM tweets;''')
ids = [item[0] for item in cursor.fetchall()]

# IF NOT EXISTS keeps this cell re-runnable: the inserts below already skip rows
# that exist (ON CONFLICT DO NOTHING), so table creation should not fail either.
cursor.execute('''CREATE TABLE IF NOT EXISTS traintest (
    id BIGINT PRIMARY KEY,
    hash BIGINT,
    split VARCHAR(5)
);''')

def insertTrainTestintoDB(id, cursor):
    """Insert the hash and split assignment for one tweet id into ``traintest``.

    Args:
        id: Tweet id; hashed with ``twitter_id_hash`` and mapped to a split with
            ``train_or_test``.
        cursor: Database cursor used to execute the insert.

    Rows whose ``id`` already exists are skipped (see ``insertFieldsintoDB``).
    """
    fields = OrderedDict()
    fields['id'] = id
    fields['hash'] = twitter_id_hash(id)
    fields['split'] = train_or_test(fields['hash'])
    # Reuse the shared helper instead of duplicating the INSERT-building code.
    insertFieldsintoDB(fields, 'traintest', cursor)

try:
    for id in ids:
        insertTrainTestintoDB(id, cursor)
finally:
    # Release the cursor and connection even if one of the inserts fails.
    cursor.close()
    conn.close()

## Text lengths

In [None]:
conn = psycopg2.connect(host = 'localhost', database = 'nytpopular')
conn.autocommit = True
cursor = conn.cursor()

# LEFT JOIN keeps every tweet; the article columns are NULL (None) for tweets
# that have no matching article row.
cursor.execute('''SELECT tweets.id, tweets.text, articles.title, articles.summary, articles.article
                    FROM tweets
                    LEFT JOIN articles
                    ON tweets.id = articles.id;''')
texts = cursor.fetchall()

# IF NOT EXISTS keeps this cell re-runnable: the inserts below already skip rows
# that exist (ON CONFLICT DO NOTHING), so table creation should not fail either.
cursor.execute('''CREATE TABLE IF NOT EXISTS textlengths (
    id BIGINT PRIMARY KEY,
    tweetlength INT,
    titlelength INT,
    summarylength INT,
    articlelength INT
);''')

def text_len(text):
    """Return the number of whitespace-separated words in ``text``.

    ``None`` is passed through unchanged so that missing texts stay missing.
    """
    # split() without arguments already ignores leading and trailing whitespace.
    return None if text is None else len(text.split())

def insertLengthsintoDB(row, cursor):
    """Insert the word counts of one tweet/article row into ``textlengths``.

    Args:
        row: Sequence of ``(id, tweet text, title, summary, article)``.
        cursor: Database cursor used to execute the insert.
    """
    fields = OrderedDict(id=row[0])
    # Positions 1..4 of the row hold the four texts, in this order.
    for position, prefix in enumerate(('tweet', 'title', 'summary', 'article'), start=1):
        fields[prefix + 'length'] = text_len(row[position])
    insertFieldsintoDB(fields, 'textlengths', cursor)

try:
    for row in texts:
        insertLengthsintoDB(row, cursor)
finally:
    # Release the cursor and connection even if one of the inserts fails.
    cursor.close()
    conn.close()

## News section

In [None]:
# Load the (id, url) pair of every article.
conn = psycopg2.connect(host = 'localhost', database = 'nytpopular')
try:
    with conn.cursor() as cursor:
        cursor.execute('''SELECT id, url 
                        FROM articles;''')
        urls = cursor.fetchall()
finally:
    # Close the connection even when the query fails, so it is not leaked.
    conn.close()

In [None]:
# Pattern for article URLs on www.nytimes.com or cooking.nytimes.com.
# Group 1 captures a run of non-digit characters (it may itself contain slashes)
# that is followed by '/', an optional all-digit path segment, and finally the last,
# slash-free path segment.
# NOTE(review): for dated article URLs this looks like it yields the section path
# between the date and the slug (e.g. 'us/politics') - confirm against real URLs.
regex = re.compile(r'^https://(?:www|cooking)\.nytimes\.com/.*?/?(\D*)/(?:\d*/)?[^/]*$')

In [None]:
def section_process(text):
    """Normalize a raw section path extracted from an article URL.

    Any path containing 'opinion', 'science', 'fashion' or 'technology' collapses
    to that single keyword. Otherwise a few sub-paths are rewritten and the rest
    of the path is returned as is.
    """
    # The first matching keyword wins, so the order of this tuple matters.
    for umbrella in ('opinion', 'science', 'fashion', 'technology'):
        if umbrella in text:
            return umbrella
    # Should 'well' also be consolidated?
    rewrites = (
        ('video/', ''),
        ('sports/ncaa', 'sports/'),
        ('us/elections', 'us/politics'),
    )
    # Rewrites are applied one after another, each on the result of the previous.
    for old, new in rewrites:
        text = text.replace(old, new)
    return text

def url_match(url):
    """Return the normalized section for an article URL, or None if unrecognized.

    URLs containing 'wirecutter' map to 'wirecutter'. Otherwise the section path
    captured by ``regex`` is normalized with ``section_process``.
    """
    if 'wirecutter' in url:
        return 'wirecutter'
    found = regex.match(url)
    return None if found is None else section_process(found.group(1))

In [None]:
conn = psycopg2.connect(host = 'localhost', database = 'nytpopular')
conn.autocommit = True
cursor = conn.cursor()

# IF NOT EXISTS keeps this cell re-runnable: the inserts below already skip rows
# that exist (ON CONFLICT DO NOTHING), so table creation should not fail either.
cursor.execute('''CREATE TABLE IF NOT EXISTS sections (
    id BIGINT PRIMARY KEY,
    section VARCHAR(255)
);''')

def insertSectionintoDB(id, url, cursor):
    """Insert the section derived from ``url`` for article ``id`` into ``sections``.

    The section is whatever ``url_match`` returns, which may be None.
    """
    fields = OrderedDict([('id', id), ('section', url_match(url))])
    insertFieldsintoDB(fields, 'sections', cursor)

try:
    for id, url in urls:
        insertSectionintoDB(id, url, cursor)
finally:
    # Release the cursor and connection even if one of the inserts fails.
    cursor.close()
    conn.close()

## Tweet date & time

In [None]:
# Load the (id, date) pair of every tweet.
conn = psycopg2.connect(host = 'localhost', database = 'nytpopular')
try:
    with conn.cursor() as cursor:
        cursor.execute('''SELECT id, date 
                        FROM tweets;''')
        dates = cursor.fetchall()
finally:
    # Close the connection even when the query fails, so it is not leaked.
    conn.close()

How should time be encoded?  Here are some ways of doing it:
- Divide the day into sections, and then one-hot encode the sections. Dividing the day into hours is an obvious choice, but there are many other ways to divide the day (e.g. morning, afternoon, and night). The division of the day doesn't even need to be a partition - a timestamp could belong to multiple sections.
    - Pros:
        - Easy to do
    - Cons:
        - Discrete
        - Ignores the cyclic nature of time
- Divide the day into sections, and integer encode the sections.
    - Pros:
        - Easy to do
        - Unlike one-hot encoding, only increases the dimensionality of the features by 1
        - Some (but not all) implementations of decision-tree-based algorithms know how to handle integer-encoded categorical variables
    - Cons:
        - Discrete
        - Ignores the cyclic nature of time
        - Imposes false ordering and arithmetic relationships between the sections
- Transform the time with sine and cosine.
    - Pros:
        - Continuous
        - Incorporates the cyclic nature of time (e.g. if partitioning by hour, 23:59 is closer to 00:01 than to 23:00)
    - Cons:
        - Tree-based algorithms split on a single feature, but the time is encoded by two features
- Transform the time with radial basis functions with periodic boundary conditions (i.e. the basis functions are periodic modulo one day).
    - Pros:
        - Continuous
        - Incorporates the cyclic nature of time
    - Cons:
        - Need to decide the number of basis functions, their widths, and their locations. This means more hyperparameters to tune...

For now, I will just precompute the number of seconds since Eastern-time midnight (along with the month and the day of the week) and store this data in an SQL database.

In [None]:
# The zone object is the same for every row, so look it up once.
eastern = pytz.timezone('US/Eastern')

# Parallel lists: entry i of each list describes the same tweet.
ids, seconds, months, dayofweek = [], [], [], []
for id, dt in dates:
    ids.append(id)
    local_dt = dt.astimezone(eastern) # Ensure it is Eastern time
    local_midnight = local_dt.replace(hour = 0, minute = 0, second = 0, microsecond = 0)
    # Time elapsed since the start of the Eastern-time day, in seconds (a float).
    seconds.append((local_dt - local_midnight).total_seconds())
    months.append(local_dt.month)
    dayofweek.append(local_dt.weekday())

In [None]:
conn = psycopg2.connect(host = 'localhost', database = 'nytpopular')
conn.autocommit = True
cursor = conn.cursor()

# IF NOT EXISTS keeps this cell re-runnable: the inserts below already skip rows
# that exist (ON CONFLICT DO NOTHING), so table creation should not fail either.
cursor.execute('''CREATE TABLE IF NOT EXISTS timeinfo (
    id BIGINT PRIMARY KEY,
    seconds INT,
    month INT,
    dayofweek INT
);''')

def insertTimeintoDB(id, sec, mon, dow, cursor):
    """Insert the time features of one tweet into the ``timeinfo`` table.

    Args:
        id: Tweet id.
        sec: Value for the ``seconds`` column.
        mon: Value for the ``month`` column.
        dow: Value for the ``dayofweek`` column.
        cursor: Database cursor used to execute the insert.
    """
    fields = OrderedDict(
        [('id', id), ('seconds', sec), ('month', mon), ('dayofweek', dow)]
    )
    insertFieldsintoDB(fields, 'timeinfo', cursor)

try:
    for args in zip(ids, seconds, months, dayofweek):
        insertTimeintoDB(*args, cursor)
finally:
    # Release the cursor and connection even if one of the inserts fails.
    cursor.close()
    conn.close()