# Preparing data

First, we load the data from the original files, knitting together variables and essays.

We:
* replace the `cntrl_a11_social_class` values with more interpretable names
* replace the `cntrl_a11_gender` values with `M` and `F`
* replace broken newline characters to identify separate paragraphs


## Steps

1. We then lightly process the essays, identifying sentences and tokens. We use a small English spaCy model to do this.
1. Since misspellings are really important in the dataset, we try to repair them. We use two strategies: the _enchant_ spell-checker to detect misspellings and suggest alternatives (choosing the first suggestion), and character statistics to replace asterisks that indicate transcription errors (statistics learned from https://github.com/dwyl/english-words). On manual inspection, the character-context replacements were not really very useful.
1. Extract the features, including:
  * basic count features over tokens, sentences, asterisk tokens, spelling-replaced tokens
  * readability metrics from https://pypi.python.org/pypi/readability
  * part-of-speech probabilities
  * LIWC counts and probabilities from the lists here http://clpsych.org/shared-task-2018/384-2/

In [1]:
from collections import Counter
import csv
import gzip
import logging
import os
import pickle
import pprint
import re
import time

import enchant
import spacy
from spacy.tokens import Doc, Token
import readability
import ujson as json

from data import read_data, write_data
from liwc import CLPSYCH_LIWC
from vis import write

# Log line layout: timestamp (left-justified, minimum width 15), a tab, then the message.
FORMAT = '%(asctime)-15s\t%(message)s'
logging.basicConfig(format=FORMAT, level=logging.INFO)

# Root logger, used by every function and cell below.
log = logging.getLogger()

In [5]:
def make_exp(patterns):
    """Compile a case-insensitive regex matching any of ``patterns`` as whole words.

    Each pattern is escaped (so it is matched literally), wrapped in word
    boundaries and placed in its own capturing group; the groups are joined
    into a single alternation.
    """
    alternatives = []
    for term in patterns:
        alternatives.append(r'(\b{}\b)'.format(re.escape(term)))
    return re.compile('|'.join(alternatives), flags=re.IGNORECASE)

# Category name -> compiled regex of expert terms.
# Each row of expert.csv is read as: category in the first cell, its terms
# (lower-cased here) in the remaining cells.
EXPERT_EXPS = {}
with open('expert.csv') as f:
    for row in csv.reader(f):
        cat = row[0]
        patterns = [r.lower() for r in row[1:]]
        EXPERT_EXPS[cat] = make_exp(patterns)
# LIWC category name -> compiled regex over that category's terms.
LIWC_EXPS = {cat: make_exp(terms) 
             for cat, terms in CLPSYCH_LIWC.items()}

# Document id -> embedding vector (values rounded to 3 decimals).
# Left empty when fastText.jsonl has not been generated; each line of that
# file is parsed as a JSON object with 'id' and 'vector' keys.
EMBEDDINGS = {}
if os.path.exists('fastText.jsonl'):
    with open('fastText.jsonl') as f:
        for l in f:
            i = json.loads(l.strip())
            EMBEDDINGS[i['id']] = [round(v, 3) for v in i['vector']]

# British-English enchant dictionary used for spell checking and suggestions.
engb = enchant.Dict("en_GB")

# Token texts that are never sent to the spell checker.
IGNORE = {
    'I',
    "NT",
    "nt",
    "alot",
    "oclock",
    "etc",
    "T.V.",
    "ve",
}
# Manually-scanned replacements.
# Maps a misspelt token text to its hand-picked correction; looked up by
# Spelling.correct_token for tokens that fail the dictionary check.
REPLS = {
    "n't": "'nt",
    "Iam": "I'm",
    "thay": "they",
    "wen": "when",
    "wud": "would",
    "hav": "have",
    'moter': 'motor',
    "vist": "visit",
    "wat": "what",
    "haf": "have",
    "ther": "there",
    "worke": "work",   
}
# Matches any single ASCII digit; tokens containing one are not spell-checked.
DIGIT = re.compile('[0-9]')


class Spelling(object):
    """Spell-corrects spaCy tokens using the ``engb`` enchant dictionary.

    Decisions are memoised in ``self.cache`` (token text -> correction). The
    cache can be persisted to, and restored from, the pickle file
    ``spelling.cache`` in the current working directory.

    Note: cached decisions are consulted before ``REPLS``, ``IGNORE`` and the
    dictionary, so delete ``spelling.cache`` after editing any of those.
    """

    def __init__(self, cache=True, correct=True):
        """Create a corrector.

        cache: when true, pre-load decisions from ``spelling.cache`` if it exists.
        correct: when false, ``correct_doc`` returns the text unchanged.
        """
        self.load(cache)
        self.correct = correct

    def load(self, cache):
        """Reset ``self.cache`` and, if ``cache`` is true, refill it from disk.

        Entries whose key or value is empty are dropped while loading.
        """
        self.cache = {}
        if not (cache and os.path.exists('spelling.cache')):
            return
        # NOTE(review): unpickling runs arbitrary code; only load a cache file
        # that this notebook wrote itself.
        with open('spelling.cache', 'rb') as f:
            stored = pickle.load(f)
        for clean, corrected in stored.items():
            if clean and corrected:
                self.cache[clean] = corrected

    def save(self):
        """Pickle ``self.cache`` to ``spelling.cache``, overwriting any existing file."""
        with open('spelling.cache', 'wb') as f:
            pickle.dump(self.cache, f)

    def correct_doc(self, doc: 'Doc'):
        """Apply token-level corrections to ``doc``.

        Returns a dict with:
          * ``'text'``: the document text with every corrected token replaced;
          * ``'replacements'``: ``(token index, original text)`` pairs for the
            replaced tokens, in descending token order.

        When ``self.correct`` is false the text is returned untouched and the
        replacement list is empty.
        """
        if not self.correct:
            return {
                'text': str(doc),
                'replacements': []
            }
        repls = []
        for token in doc:
            clean, corrected = self.correct_token(token)
            if clean != corrected:
                # Replace the span the token occupies in the text; ``clean``
                # may be shorter than the token if asterisks were stripped.
                repls.append((token.i, token.idx, token.idx + len(token.orth_),
                              clean, corrected))
        corrected_text = doc.text
        replaced_tokens = []
        # Work from the end of the text backwards so that earlier character
        # offsets stay valid while later spans change length.
        for index, char_start, char_end, original, correction in sorted(repls, reverse=True):
            corrected_text = corrected_text[:char_start] + correction + corrected_text[char_end:]
            replaced_tokens.append((index, original))
        return {
            'text': corrected_text,
            'replacements': replaced_tokens,
        }

    def correct_token(self, token: 'Token'):
        """Return ``(clean, corrected)`` for a single token.

        ``clean`` is the stripped token text, with asterisks removed unless the
        token is a lone ``*``. ``corrected`` equals ``clean`` when no
        correction applies. A token is only checked when it is alphabetic, not
        a currency symbol, not an ``ANON_`` placeholder, contains no digit,
        does not start with an apostrophe, is not in ``IGNORE`` and is rejected
        by the dictionary. For such tokens a manual ``REPLS`` entry wins;
        otherwise the first dictionary suggestion is used.
        """
        clean = token.orth_.strip()
        corrected = self.cache.get(clean)
        if corrected is not None:
            return clean, corrected

        had_asterisk = '*' in clean
        if had_asterisk and clean != '*':
            clean = clean.replace('*', '')

        corrected = clean
        if clean \
           and token.is_alpha \
           and not token.is_currency \
           and not clean.startswith('ANON_') \
           and not DIGIT.search(clean) \
           and not clean.startswith("'") \
           and clean not in IGNORE \
           and not engb.check(clean):
            manual = REPLS.get(clean)
            if manual is not None:
                # Hand-picked replacements take priority over suggestions.
                corrected = manual
            else:
                suggestions = engb.suggest(clean)
                if suggestions:
                    corrected = suggestions[0]

        # The decision for an asterisk-bearing token depends on the token's own
        # attributes, not only on the stripped text, so it must not be cached
        # under the stripped key where a plain token would later pick it up.
        if not had_asterisk:
            self.cache[clean] = corrected
        return clean, corrected


def run_shallow_nlp(data: dict):
    """Tokenise and sentence-split every raw essay.

    ``data`` maps keys to records holding ``'essay'`` and ``'id'``; the result
    maps each record's id to its spaCy doc. The tagger, parser and NER
    components are disabled for this pass.
    """
    pipeline = spacy.load('en')
    sentencizer = pipeline.create_pipe('sentencizer')
    pipeline.add_pipe(sentencizer)
    # 'xxxx' in the essays is swapped for the pound sign before parsing.
    inputs = [(record['essay'].replace('xxxx', '£'), record['id'])
              for record in data.values()]
    began = time.time()
    log.info('Parsing docs')
    stream = pipeline.pipe(inputs, as_tuples=True,
                           disable=['tagger', 'parser', 'ner'],
                           batch_size=50)
    docs = {doc_id: doc for doc, doc_id in stream}
    log.info(f'Parsed {len(docs)} docs in {time.time() - began:.1f}s')
    return docs


def build_corrections(docs, correct=True):
    """Spell-correct every doc and persist the spelling cache.

    Returns a dict mapping doc id to the result of ``Spelling.correct_doc``.
    ``correct`` is forwarded to ``Spelling``.
    """
    began = time.time()
    speller = Spelling(correct=correct)
    log.info('Correcting spelling')
    corrections = {doc_id: speller.correct_doc(doc)
                   for doc_id, doc in docs.items()}
    log.info(f'Corrected {len(corrections)} docs in {time.time() - began:.1f}s')
    speller.save()
    return corrections


def run_nlp(corrections):
    """Parse the corrected texts with the spaCy pipeline (parser disabled).

    ``corrections`` maps doc id to a dict with a ``'text'`` entry; the result
    maps doc id to the spaCy doc built from that text.
    """
    parsed = {}
    pipeline = spacy.load('en')
    sentencizer = pipeline.create_pipe('sentencizer')
    pipeline.add_pipe(sentencizer)
    inputs = []
    for doc_id, corrected in corrections.items():
        inputs.append((corrected['text'], doc_id))
    began = time.time()
    log.info('Parsing docs')
    stream = pipeline.pipe(inputs, as_tuples=True,
                           disable=["parser"],
                           batch_size=50)
    for doc, doc_id in stream:
        parsed[doc_id] = doc
    log.info(f'Parsed {len(corrections)} docs in {time.time() - began:.1f}s')
    return parsed


def _ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0 when the denominator is zero."""
    return numerator / denominator if denominator else 0


def extract_doc_features(doc_id, doc, replacements):
    """Build the feature dict for one parsed document.

    doc_id: key used to look up the document's vector in ``EMBEDDINGS``.
    doc: parsed spaCy doc of the (corrected) essay.
    replacements: spelling replacements made in this document; only its
        length is used.

    Returns a dict of feature name -> value covering:
      * ``stat_*``: token, type and sentence counts and ratios;
      * ``noise_*``: share of tokens containing ``*`` or ``[``, and share of
        spelling-replaced tokens;
      * ``syn_*``: part-of-speech proportions and the ADJ/NOUN ratio
        (0 when the document has no nouns);
      * ``read_*``: every measure returned by ``readability.getmeasures``;
      * ``LIWC_*`` / ``EXPERT_*``: per-category match rate and a zero-match flag;
      * ``ents_*``: entity proportions, overall and per entity label;
      * ``emb_*``: the document's embedding components, when available.

    Ratios over an empty document are reported as 0.
    """
    d = {}
    text = doc.text
    n_tokens = len(doc)
    sentences = list(doc.sents)
    n_sents = len(sentences)
    n_types = len({t.orth_ for t in doc})
    d.update({
        'stat_n_tokens': n_tokens,
        'stat_n_types': n_types,
        'stat_p_type': _ratio(n_types, n_tokens),
        'stat_n_sentences': n_sents,
        'stat_mean_sentence': _ratio(n_tokens, n_sents),
    })

    # Spelling, anonymisation features.
    ast = sum('*' in t.orth_ for t in doc)
    lbr = sum('[' in t.orth_ for t in doc)
    d.update({
        'noise_p_asttoks': _ratio(ast, n_tokens),
        'noise_p_replacement_tokens': len(replacements) / n_tokens if replacements else 0,
        'noise_p_left_bracket': lbr / n_tokens if lbr else 0,
    })

    # Part-of-speech features.
    pos_counts = Counter(t.pos_ for t in doc)
    for pos, count in pos_counts.items():
        d[f'syn_p_pos-{pos}'] = count / n_tokens
    # Guard on the noun count itself: adjectives without any noun would
    # otherwise divide by zero.
    d['syn_r_ADJ_NOUN'] = pos_counts['ADJ'] / pos_counts['NOUN'] \
                          if pos_counts['NOUN'] else 0

    # Readability features.
    # Skipped for an empty document: there is no text to measure.
    if n_tokens:
        read = readability.getmeasures(
            [' '.join(t.orth_ for t in s) + '\n' for s in sentences], lang='en')
        for cat, metrics in read.items():
            for k, v in metrics.items():
                d[f'read_{cat.replace(" ", "-")}_{k.replace(" ", "-")}'] = v

    # LIWC and expert features.
    for supertype, exps in (('LIWC', LIWC_EXPS), ('EXPERT', EXPERT_EXPS)):
        for cat, exp in exps.items():
            c = len(exp.findall(text))
            d[f'{supertype}_zero_{cat}'] = int(c == 0)
            d[f'{supertype}_p_{cat}'] = _ratio(c, n_tokens)

    # Entity features.
    named_ents = Counter()
    typed_ents = Counter()
    for e in doc.ents:
        named_ents[str(e)] += 1
        typed_ents[e.label_] += 1
    # Note: the numerator is the number of distinct entity strings.
    d['ents_p'] = len(named_ents) / n_tokens if named_ents else 0
    for t, c in typed_ents.items():
        d[f'ents_p_{t}'] = c / n_tokens if c else 0

    # Embedding features.
    for i, v in enumerate(EMBEDDINGS.get(doc_id, [])):
        d[f'emb_{i}'] = v
    return d


def add_features(dataset, corrected_docs, corrections):
    """Merge extracted features into every record of ``dataset`` in place.

    For each doc id the features come from the corrected doc and that doc's
    list of spelling replacements. ``cntrl_gender`` is then recoded to 1 for
    ``'F'`` and 0 for anything else.
    """
    log.info('Adding features')
    began = time.time()
    for doc_id, record in dataset.items():
        # Basic features.
        record.update(
            extract_doc_features(
                doc_id,
                corrected_docs[doc_id],
                corrections[doc_id]['replacements'],
            )
        )
        # Make gender numeric.
        record['cntrl_gender'] = 1 if record['cntrl_gender'] == 'F' else 0
    log.info(f'Added features in {time.time() - began:.1f}s')
    
        
def dump_csv(data, fname):
    """Write the records in ``data`` to ``fname`` as CSV, leaving out the essay text.

    data: mapping whose values are dicts of column name -> value.
    fname: path of the CSV file to create or overwrite.

    The header is the sorted union of all keys except ``'essay'``. A record
    lacking a column gets an empty cell for it.
    """
    # Get cols.
    cols = set()
    for row in data.values():
        cols.update(row)
    # discard, not remove: records without an essay must not raise KeyError.
    cols.discard('essay')
    cols = sorted(cols)
    # newline='' lets the csv module control line endings itself; without it
    # platforms that translate '\n' produce blank lines between rows.
    with open(fname, 'w', newline='') as f:
        w = csv.DictWriter(f, fieldnames=cols)
        w.writeheader()
        for row in data.values():
            w.writerow({k: v for k, v in row.items() if k != 'essay'})
    

def dump_text_for_vectors(docs, fname):
    """Write one JSON object per line to ``fname``, holding each doc's id and text.

    ``docs`` maps doc id to an object whose ``str()`` is the document text.
    """
    with open(fname, 'w') as out:
        out.writelines(
            json.dumps({'id': doc_id, 'text': str(doc)}) + '\n'
            for doc_id, doc in docs.items()
        )

        
def preprocess(data, correct=True):
    """Run the whole pipeline over ``data``: shallow parse, spelling, full parse, features.

    Features are added to ``data`` in place. Returns the tuple
    ``(data, docs from the shallow parse, docs from the corrected text)``.
    ``correct`` is passed to the spelling step.
    """
    raw_docs = run_shallow_nlp(data)
    spelling_fixes = build_corrections(raw_docs, correct=correct)
    parsed_docs = run_nlp(spelling_fixes)
    add_features(data, parsed_docs, spelling_fixes)
    return data, raw_docs, parsed_docs

In [3]:
# Training split: load, preprocess with spelling correction, then export.
train = read_data('../data/clpsych_2018_training_data/clp18_st_train')
# Uncomment to run on the first 20 items only.
# train = dict(list(train.items())[:20])
log.info(f'Found {len(train)} items\t{len(train)} train')
# preprocess adds the features to `train` in place, so its first return value is ignored.
_, docs, corrected_docs = preprocess(train)
# NOTE(review): presumably renders the original and corrected docs for inspection — confirm in vis.write.
write(train, docs, corrected_docs, '../vis/CLPsych18-train')
write_data(train, 'train.jsonl')
dump_csv(train, 'train.csv')
# Corrected text, one JSON object per line, for computing document vectors.
dump_text_for_vectors(corrected_docs, 'train_for_vectors.jsonl')

2018-04-08 23:47:32,055	Found 9217 items	9217 train
2018-04-08 23:47:33,419	Parsing docs
2018-04-08 23:49:33,072	Parsed 9217 docs in 119.7s
2018-04-08 23:49:33,219	Correcting spelling
2018-04-08 23:49:47,881	Corrected 9217 docs in 14.8s
2018-04-08 23:49:49,369	Parsing docs
2018-04-08 23:57:02,896	Parsed 9217 docs in 433.5s
2018-04-08 23:57:02,918	Adding features
2018-04-09 00:04:30,872	Added features in 448.0s


In [4]:
# Test split: same pipeline as the training split, without the visualisation step.
test = read_data('../data/clpsych_2018_test_data/clp18_st_test')
log.info(f'Found {len(test)} items\t{len(test)} test')
# preprocess adds the features to `test` in place, so its first return value is ignored.
_, docs, corrected_docs = preprocess(test)
write_data(test, 'test.jsonl')
dump_csv(test, 'test.csv')
# Corrected text, one JSON object per line, for computing document vectors.
dump_text_for_vectors(corrected_docs, 'test_for_vectors.jsonl')

2018-04-09 00:05:26,439	Found 1000 items	1000 test
2018-04-09 00:05:28,058	Parsing docs
2018-04-09 00:05:45,518	Parsed 1000 docs in 17.5s
2018-04-09 00:05:45,669	Correcting spelling
2018-04-09 00:05:47,942	Corrected 1000 docs in 2.4s
2018-04-09 00:05:49,571	Parsing docs
2018-04-09 00:06:55,116	Parsed 1000 docs in 65.5s
2018-04-09 00:06:55,153	Adding features
2018-04-09 00:07:57,015	Added features in 61.9s


In [6]:
# Training split again, this time with spelling correction switched off.
train = read_data('../data/clpsych_2018_training_data/clp18_st_train')
# Uncomment to run on the first 20 items only.
# train = dict(list(train.items())[:20])
log.info(f'Found {len(train)} items\t{len(train)} train')
# correct=False: features are extracted from the uncorrected essay text.
_, docs, corrected_docs = preprocess(train, correct=False)
# Same output path as the corrected training run above.
write(train, docs, corrected_docs, '../vis/CLPsych18-train')
write_data(train, 'train.raw.jsonl')

2018-04-13 22:07:09,050	Found 9217 items	9217 train
2018-04-13 22:07:11,819	Parsing docs
2018-04-13 22:08:26,741	Parsed 9217 docs in 74.9s
2018-04-13 22:08:26,858	Correcting spelling
2018-04-13 22:08:29,171	Corrected 9217 docs in 2.4s
2018-04-13 22:08:30,113	Parsing docs
2018-04-13 22:15:59,546	Parsed 9217 docs in 449.4s
2018-04-13 22:15:59,595	Adding features
2018-04-13 22:29:44,885	Added features in 825.3s
