## TODO

* -Upsample positives-
* -TfidfVectorizer-
* -Optimise precision-
* -Feature selection using positive-class precision?-
* -Try ngrams-
* -Try SelectFromModel-
* -Replace words with WordNet synset frequencies-
* -Try against a binary depressed (>=4) or not label-
* Average across multiple trials with bootstrap resampling?

In [None]:
from fastText import train_supervised, train_unsupervised, load_model
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_regression, mutual_info_regression, chi2
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale
from sklearn.svm import LinearSVC, SVR
import numpy as np
import pandas as pd
import spacy
import ujson as json

In [None]:
def load_data(fname):
    """Load a JSON Lines file into a list of decoded records.

    Args:
        fname: Path to a text file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.
    """
    with open(fname, encoding='utf-8') as f:
        # Blank lines (for example a trailing newline at the end of the file)
        # carry no record and would make the JSON parser raise, so skip them.
        return [json.loads(line) for line in map(str.strip, f) if line]

train = load_data('train.jsonl')

# Add the 'sentencizer' pipe so that doc.sents can be iterated even though the
# parser is disabled for the calls below.
nlp = spacy.load('en')
nlp.add_pipe(nlp.create_pipe('sentencizer'))
print(nlp.pipe_names)

# Annotate every record in place with simple surface statistics of its essay.
for i, d in enumerate(train):
    doc = nlp(d['essay'], disable=['parser', 'ner'])
    # Compute each count once; several of the statistics below reuse them.
    n_tokens = len(doc)
    n_types = len({t.orth_ for t in doc})
    n_asterisk = sum(1 for t in doc if '*' in t.orth_)
    d.update({
        'tokens': n_tokens,
        'types': n_types,
        # An essay that yields no tokens would otherwise divide by zero.
        'type_token_pct': n_types / n_tokens if n_tokens else 0.0,
        'sentences': len(list(doc.sents)),
        'asterisk_words_pct': n_asterisk / n_tokens if n_tokens else 0.0,
        'asterisk_words_count': n_asterisk,
    })
    # Progress report for the first record and every 500th one after it.
    if i % 500 == 0:
        print(f'Processed {i + 1}')
print('Finished')

In [None]:
def text_make_xy(data, label_name):
    """Collect essay texts and their raw label values.

    Args:
        data: Iterable of record dicts with an 'essay' key and a
            ``label_name`` key.
        label_name: Key of the label to extract from each record.

    Returns:
        A tuple ``(X, y)`` of parallel lists: the essays and the label values
        of every record whose label is not the empty string.
    """
    labelled = [
        (record['essay'], record[label_name])
        for record in data
        if record[label_name] != ''
    ]
    X = [essay for essay, _ in labelled]
    y = [value for _, value in labelled]
    return X, y


def text_experiment(data, label_name, scale_y=False):
    """Cross-validate an n-gram regression pipeline for one label and report it.

    The pipeline counts 1- to 3-grams, keeps the 100 features ranked highest
    by ``f_regression`` and fits an RBF-kernel SVR. The mean and twice the
    standard deviation of the 10-fold negative mean squared error are printed,
    followed by the ranked coefficients when the final estimator has any.

    Args:
        data: Iterable of record dicts with an 'essay' key and ``label_name``.
        label_name: Key of the numeric target in each record.
        scale_y: When true, standardise the targets with
            ``sklearn.preprocessing.scale`` before fitting.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    print(f'Experiment\t{label_name}')
    X, y = text_make_xy(data, label_name)

    pipeline = make_pipeline(
        CountVectorizer(ngram_range=(1,3)),
        SelectKBest(f_regression, k=100),
        #SGDRegressor(max_iter=1000, tol=1e3),
        SVR(kernel='rbf', C=1e3, gamma=0.1),
    )
    # The parameter grid is empty, so the search only performs the 10-fold CV.
    grid = GridSearchCV(
        pipeline,
        param_grid={},
        scoring='neg_mean_squared_error',
        n_jobs=4,
        verbose=0,
        cv=10,
    )
    if scale_y:
        y = scale(y)
    clf = grid.fit(X, y)
    m = clf.cv_results_['mean_test_score'][clf.best_index_]
    s = clf.cv_results_['std_test_score'][clf.best_index_]
    print(f'\nResults:\n{m:.3f}±{2*s:.3f}\t{clf.best_params_}')

    vectorizer = clf.best_estimator_.steps[0][1]
    selector = clf.best_estimator_.steps[1][1]
    model = clf.best_estimator_.steps[-1][1]

    # Newer scikit-learn releases replace get_feature_names with
    # get_feature_names_out; use whichever this installation provides.
    get_names = (getattr(vectorizer, 'get_feature_names_out', None)
                 or vectorizer.get_feature_names)
    features = get_names()  # feature names
    mask = selector.get_support()  # list of booleans

    # Only linear models expose coef_; the RBF-kernel SVR raises AttributeError
    # on access, which previously aborted the function after fitting.
    weights = getattr(model, 'coef_', None)
    if weights is None:
        print('\nRanked coefficients: unavailable (final estimator exposes no coef_)')
    else:
        # The model was trained on the selected columns only, so its weights
        # line up with the selected feature names in order.
        selected = [name for keep, name in zip(mask, features) if keep]
        weighted = list(zip(selected, weights))

        print('\nRanked coefficients:')
        for k, v in sorted(weighted, key=lambda i: i[1], reverse=True):
            print(f'\t{v:.3f}\t{k}')

    print('\n')
    return clf

In [None]:
# Run the regression experiment for each distress label in turn; `clf` ends up
# holding the search fitted for the last label.
for _label in ('a23_pdistress', 'a33_pdistress', 'a42_pdistress'):
    clf = text_experiment(train, _label, scale_y=False)

In [None]:
from collections import Counter

# Tally how often each 'a23_pdistress' value occurs across the training
# records (records with an empty label are counted under '').
Counter(d['a23_pdistress'] for d in train)

### Binarise to distressed/not

It seems that the few essays with very high corresponding distress scores heavily influence feature selection under regression. The data manual says that "a threshold value of 4+ on the 9-item \[Malaise Inventory\] scale is generally used to indicate depression".

In [None]:
def text_make_binary_xy(data, label_name):
    """Collect essays with a boolean "score is at least 4" target.

    Args:
        data: Iterable of record dicts with an 'essay' key and ``label_name``.
        label_name: Key of the numeric score in each record.

    Returns:
        A tuple ``(X, y)`` of parallel lists: the essays and ``score >= 4``
        for every record whose score is not the empty string.
    """
    X, y = [], []
    for record in data:
        score = record[label_name]
        if score != '':
            X.append(record['essay'])
            y.append(score >= 4)
    return X, y


def text_binary_experiment(data, label_name, scale_y=False):
    """Cross-validate a TF-IDF classifier for one binarised label and report it.

    The mean and twice the standard deviation of the 10-fold F1 score are
    printed, followed by the 100 largest coefficients when the final estimator
    has any.

    Args:
        data: Iterable of record dicts with an 'essay' key and ``label_name``.
        label_name: Key of the numeric score that is binarised at ``>= 4``.
        scale_y: When true, pass the targets through
            ``sklearn.preprocessing.scale`` before fitting.
            NOTE(review): scaling boolean class labels produces continuous
            targets, which a classifier will likely reject - confirm whether
            this switch is still wanted here.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    print(f'Experiment\t{label_name}')
    X, y = text_make_binary_xy(data, label_name)

    #pipeline = make_pipeline(
    #    CountVectorizer(), #CountVectorizer(ngram_range=(1,2)),
    #    SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3)),
    #    LinearSVC(penalty="l2"),
    #)
    pipeline = make_pipeline(
        TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english'),
        #MultinomialNB(alpha=.01),
        KNeighborsClassifier()
    )
    # The parameter grid is empty, so the search only performs the 10-fold CV.
    grid = GridSearchCV(
        pipeline,
        param_grid={},
        scoring='f1',
        n_jobs=4,
        verbose=0,
        cv=10,
    )
    if scale_y:
        y = scale(y)
    clf = grid.fit(X, y)
    m = clf.cv_results_['mean_test_score'][clf.best_index_]
    s = clf.cv_results_['std_test_score'][clf.best_index_]
    print(f'\nResults:\n{m:.3f}±{2*s:.3f}\t{clf.best_params_}')

    vectorizer = clf.best_estimator_.steps[0][1]
    model = clf.best_estimator_.steps[-1][1]

    # Newer scikit-learn releases replace get_feature_names with
    # get_feature_names_out; use whichever this installation provides.
    get_names = (getattr(vectorizer, 'get_feature_names_out', None)
                 or vectorizer.get_feature_names)
    features = get_names()  # feature names

    # KNeighborsClassifier (like any non-linear model) has no coef_, which
    # previously aborted the function with AttributeError after fitting.
    # If a feature-selection step is put back between the vectoriser and the
    # classifier, the names must first be filtered with its get_support() mask.
    weights = getattr(model, 'coef_', None)
    if weights is None:
        print('\nRanked coefficients: unavailable (final estimator exposes no coef_)')
    else:
        # The first row of coef_ is read as the weight vector.
        ranked = sorted(zip(features, weights[0]), key=lambda i: i[1], reverse=True)

        print('\nRanked coefficients:')
        for k, v in ranked[:100]:
            print(f'\t{v:.3f}\t{k}')

    print('\n')
    return clf

In [None]:
# Run the binary classification experiment for each distress label in turn;
# `clf` ends up holding the search fitted for the last label.
for _label in ('a23_pdistress', 'a33_pdistress', 'a42_pdistress'):
    clf = text_binary_experiment(train, _label, scale_y=False)

## Feature selection without classification

In [None]:
def text_make_binary_xy(data, label_name):
    """Pair each labelled essay with whether its score reaches 4.

    Args:
        data: Iterable of record dicts with an 'essay' key and ``label_name``.
        label_name: Key of the numeric score in each record.

    Returns:
        A tuple ``(X, y)`` of parallel lists; records whose score is the empty
        string are left out.
    """
    pairs = [
        (record['essay'], record[label_name] >= 4)
        for record in data
        if record[label_name] != ''
    ]
    X = [essay for essay, _ in pairs]
    y = [flag for _, flag in pairs]
    return X, y


def text_binary_experiment(data, label_name):
    """Rank the 100 words most associated with a binarised label by chi2.

    No classifier is trained: the pipeline only counts words and scores them
    against the ``>= 4`` target with the chi-squared statistic.

    Args:
        data: Iterable of record dicts with an 'essay' key and ``label_name``.
        label_name: Key of the numeric score that is binarised at ``>= 4``.

    Returns:
        The fitted vectoriser + ``SelectKBest`` pipeline.
    """
    print(f'Experiment\t{label_name}')
    X, y = text_make_binary_xy(data, label_name)

    pipeline = make_pipeline(
        #CountVectorizer(),
        #CountVectorizer(ngram_range=(1,2)),
        CountVectorizer(max_df=0.5, min_df=5, stop_words='english'),
        #TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df=10, stop_words='english'),
        SelectKBest(chi2, k=100),
    )
    clf = pipeline.fit(X, y)
    print('\nResults:')

    vectorizer, selector = clf.steps[0][1], clf.steps[1][1]

    # Newer scikit-learn releases replace get_feature_names with
    # get_feature_names_out; use whichever this installation provides.
    get_names = (getattr(vectorizer, 'get_feature_names_out', None)
                 or vectorizer.get_feature_names)
    features = get_names()  # feature names
    mask = selector.get_support()  # list of booleans
    weights = selector.scores_

    # scores_ holds one value per *input* feature, so it has to be filtered
    # with the same mask as the names. Indexing it by the running count of
    # selected features paired each word with some other word's score.
    weighted = [
        (feature_name, score)
        for feature_name, score, selected in zip(features, weights, mask)
        if selected
    ]

    print('\nRanked coefficients:')
    for k, v in sorted(weighted, key=lambda i: abs(i[1]), reverse=True):
        print(f'\t{v:.3f}\t{k}')

    print('\n')
    return clf

In [None]:
# Run the chi2 feature ranking for each distress label in turn; `clf` ends up
# holding the pipeline fitted for the last label.
for _label in ('a23_pdistress', 'a33_pdistress', 'a42_pdistress'):
    clf = text_binary_experiment(train, _label)

### Only subjects with consistent distress

In [None]:
from collections import Counter

# The three distress labels that are examined jointly from here on.
label_names = [
    'a23_pdistress',
    'a33_pdistress',
    'a42_pdistress',
]
# Count how many records have all three labels present and >= 4 (True) versus
# every other record (False, which includes records with a missing label).
Counter(all(d[v] != '' and d[v] >= 4 for v in label_names) for d in train)

In [None]:
def text_make_binary_xy(data, label_names):
    """Collect essays with a boolean "every score is at least 4" target.

    Args:
        data: Iterable of record dicts with an 'essay' key and all of
            ``label_names``.
        label_names: Keys of the numeric scores to combine.

    Returns:
        A tuple ``(X, y)`` of parallel lists. Records in which any of the
        labels is the empty string are left out.
    """
    X, y = [], []
    for record in data:
        has_gap = any(record[name] == '' for name in label_names)
        if not has_gap:
            X.append(record['essay'])
            y.append(all(record[name] >= 4 for name in label_names))
    return X, y


def text_binary_experiment(data, label_names):
    """Rank words associated with consistently high scores by chi2.

    The target is true only when every label in ``label_names`` is >= 4. No
    classifier is trained; of the 100 words selected, those whose chi2 score
    exceeds 2.71 are printed.

    Args:
        data: Iterable of record dicts with an 'essay' key and all of
            ``label_names``.
        label_names: Keys of the numeric scores to combine.

    Returns:
        The fitted vectoriser + ``SelectKBest`` pipeline.
    """
    X, y = text_make_binary_xy(data, label_names)

    pipeline = make_pipeline(
        #CountVectorizer(),
        #CountVectorizer(ngram_range=(1,2)),
        CountVectorizer(max_df=0.5, min_df=5, stop_words='english'),
        #CountVectorizer(max_df=0.5, min_df=5, stop_words='english', binary=True),
        #CountVectorizer(max_df=0.5, min_df=5, stop_words='english', ngram_range=(1,2)),
        #TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df=5, stop_words='english'),
        #TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df=5, stop_words='english', ngram_range=(1,2)),
        SelectKBest(chi2, k=100),
    )
    clf = pipeline.fit(X, y)

    vectorizer, selector = clf.steps[0][1], clf.steps[1][1]

    # Newer scikit-learn releases replace get_feature_names with
    # get_feature_names_out; use whichever this installation provides.
    get_names = (getattr(vectorizer, 'get_feature_names_out', None)
                 or vectorizer.get_feature_names)
    features = get_names()  # feature names
    mask = selector.get_support()  # list of booleans
    weights = selector.scores_

    # scores_ holds one value per *input* feature, so it has to be filtered
    # with the same mask as the names. Indexing it by the running count of
    # selected features paired each word with some other word's score.
    weighted = [
        (feature_name, score)
        for feature_name, score, selected in zip(features, weights, mask)
        if selected
    ]

    print('\nRanked coefficients:')
    for k, v in sorted(weighted, key=lambda i: abs(i[1]), reverse=True):
        if v > 2.71:
            # independence rejected at 90% confidence with chi2
            # https://nlp.stanford.edu/IR-book/html/htmledition/feature-selectionchi2-feature-selection-1.html
            print(f'\t{v:.3f}\t{k}')

    print('\n')
    return clf

In [None]:
# Rank features for the joint target built from all labels in `label_names`.
clf = text_binary_experiment(train, label_names)

### fastText classification

In [None]:
def text_make_binary_xy(data, label_names, train_split=0.75):
    """Select the clearly high-scoring and clearly low-scoring essays.

    Args:
        data: Iterable of record dicts with an 'essay' key and all of
            ``label_names``.
        label_names: Keys of the numeric scores to combine.
        train_split: Accepted but not used by this function.

    Returns:
        A tuple ``(X, y)`` of parallel lists. An essay is added with ``True``
        when every label is >= 3 and with ``False`` when every label is <= 0.
        Records with any empty-string label, and records matching neither
        condition, are left out.
    """
    X, y = [], []
    for record in data:
        missing = any(record[name] == '' for name in label_names)
        if missing:
            continue
        all_high = all(record[name] >= 3 for name in label_names)
        all_low = all(record[name] <= 0 for name in label_names)
        # The two checks are independent of each other, as in an if/if pair.
        if all_high:
            X.append(record['essay'])
            y.append(True)
        if all_low:
            X.append(record['essay'])
            y.append(False)
    return X, y

# Reload the records from disk (this rebinds `train`, replacing the list that
# earlier cells annotated) and build the high/low subset used for fastText.
train = load_data('train.jsonl')
X, y = text_make_binary_xy(train, label_names)
# Sanity check: both lists have the same length; then show the class balance.
print(len(X), len(y))
print(Counter(y))

In [None]:
# Dump every positively labelled essay, preceded by its index, for reading.
for idx, pair in enumerate(zip(X, y)):
    body, is_positive = pair
    if not is_positive:
        continue
    print(f'{idx}\n\n{body}\n\n\n\n')

In [None]:
def split_ft(X, y, train_split=0.75):
    """Format examples as fastText lines and split them into train and dev.

    The split is positional: the leading ``train_split`` share of the examples
    goes to the training set and the rest to the development set. Nothing is
    shuffled.

    Args:
        X: Sequence of texts.
        y: Sequence of labels, parallel to ``X``.
        train_split: Fraction of the examples assigned to the training set.

    Returns:
        A tuple ``(train, dev)`` of lists of ``'__label__<label> <text>'``
        strings, each guaranteed to be a single line.
    """
    # Loop-invariant threshold, computed once.
    cutoff = train_split * len(X)
    train, dev = [], []
    for i, (text, label) in enumerate(zip(X, y)):
        # fastText reads one example per line, so a newline inside an essay
        # would cut the example in two and leave an unlabelled fragment.
        # Collapsing whitespace runs keeps the token sequence unchanged.
        flat_text = ' '.join(text.split())
        line = f'__label__{label} {flat_text}'
        if i < cutoff:
            train.append(line)
        else:
            dev.append(line)
    return train, dev

# Format and split the examples, then write each part to its own file with
# one example per line (no trailing newline after the last one).
train_ft, dev_ft = split_ft(X, y)

train_path = 'fasttext.train'
dev_path = 'fasttext.dev'

for _out_path, _out_lines in ((train_path, train_ft), (dev_path, dev_ft)):
    with open(_out_path, 'w') as fh:
        fh.write('\n'.join(_out_lines))

In [None]:
def ft_experiment(model, path):
    """Evaluate a fastText classifier on a labelled file and print the scores.

    Each non-blank line of the file is expected to look like
    ``'__label__<label> <text>'``. Precision, recall and F-score are computed
    for the ``'__label__True'`` class and printed; nothing is returned.

    Args:
        model: Object whose ``predict(text)`` returns a pair whose first item
            is a sequence of predicted labels, best label first.
        path: Path of the fastText-formatted file to evaluate on.
    """
    positive = '__label__True'
    preds = set()  # predicted true
    trues = set()  # gold true
    total = 0
    with open(path) as fh:
        for i, raw in enumerate(fh):
            fields = raw.strip().split(maxsplit=1)
            if not fields:
                # Blank lines hold no example; unpacking them used to raise.
                continue
            total += 1
            labl = fields[0]
            if labl == positive:
                trues.add(i)
            if len(fields) < 2:
                # A label with no text still counts as a gold example, but
                # there is nothing to feed to the model.
                continue
            pred = model.predict(fields[1])[0][0]
            if pred == positive:
                preds.add(i)
    n_crct = len(preds & trues)
    n_pred = len(preds)
    n_true = len(trues)
    print(f'Total data points: {total}')
    print(f'Correct/predicted/gold: {n_crct}, {n_pred}, {n_true}')
    # With nothing predicted (or nothing to find) the ratio is defined as 1.0.
    p = n_crct / n_pred if n_pred else 1.0
    r = n_crct / n_true if n_true else 1.0
    f = 2 * p * r / (p + r) if p + r > 0 else 0.0
    print(f'Precision/recall/f-score: {p:.2}/{r:.2}/{f:.2}')

# train_supervised uses the same arguments and defaults as the fastText cli
# Baseline: train on the training file using word bigrams (wordNgrams=2),
# then report precision/recall/F-score on the development file.
model = train_supervised(
    input=train_path, epoch=25, lr=1.0, wordNgrams=2, verbose=2, minCount=1
)
ft_experiment(model, dev_path)
print()

In [None]:
# train_supervised uses the same arguments and defaults as the fastText cli
# The default vector size is 100, while the pretrained file name indicates
# 300-dimensional vectors; fastText rejects a size mismatch, so dim is set
# explicitly.
# NOTE(review): pretrainedVectors is normally a plain-text .vec file - confirm
# that the '.zip' path used here is really readable by fastText.
model = train_supervised(
    input=train_path, epoch=25, lr=1.0, dim=300,
    pretrainedVectors='crawl-300d-2M.vec.zip'
)
ft_experiment(model, dev_path)
print()

In [None]:
# specify true sizes as a proportion of negatives
SIZES = [0.5, 1.0, 2.0, 4.0, 8.0, 16.0]

# split True and False instances (independent of size, so done only once)
true = []
fool = []
for line in train_ft:
    labl, text = line.strip().split(maxsplit=1)
    if labl == '__label__True':
        true.append(line)
    else:
        fool.append(line)
n_fool = len(fool)

for size in SIZES:

    # resample True instances (with replacement) and shuffle
    n_true = int(size * n_fool)
    new = list(np.random.choice(true, size=n_true, replace=True))
    new += fool
    np.random.shuffle(new)

    # write to fastText-formatted file
    path = f'fasttext.train.{int(100*size):0>3}'
    with open(path, 'w') as fh:
        fh.write('\n'.join(new))

    # train_supervised uses the same arguments and defaults as the fastText cli
    print(f'{path} ({n_true} True / {n_fool} False)')
    #model = train_supervised(
    #    input=path, epoch=25, lr=1.0, wordNgrams=2, verbose=2, minCount=1
    #)
    # Train on the resampled file just written. Training on train_path made
    # every iteration fit the same data regardless of size.
    # dim is set to 300 because the default is 100, while the pretrained file
    # name indicates 300-dimensional vectors.
    # NOTE(review): confirm that fastText can read the '.zip' vectors path.
    model = train_supervised(
        input=path, epoch=25, lr=1.0, dim=300,
        pretrainedVectors='crawl-300d-2M.vec.zip'
    )
    ft_experiment(model, dev_path)
    print()

In [None]:
# number of True instances to draw, as a proportion of the negatives
size = 1.0

# split True and False instances
true = []
fool = []
for line in train_ft:
    labl, text = line.strip().split(maxsplit=1)
    if labl == '__label__True':
        true.append(line)
    else:
        fool.append(line)

# resample True instances (with replacement) and shuffle
n_fool = len(fool)
n_true = int(size * n_fool)
new = list(np.random.choice(true, size=n_true, replace=True))
new += fool
np.random.shuffle(new)

# write to fastText-formatted file
path = f'fasttext.train.{int(100*size):0>3}'
with open(path, 'w') as fh:
    fh.write('\n'.join(new))

# train_supervised uses the same arguments and defaults as the fastText cli
print(f'{path} ({n_true} True / {n_fool} False)')
#model = train_supervised(
#    input=path, epoch=25, lr=1.0, wordNgrams=2, verbose=2, minCount=1
#)
# Train on the resampled file just written; train_path would ignore the
# upsampling done above.
# dim is set to 300 because the default is 100, while the pretrained file name
# indicates 300-dimensional vectors.
# NOTE(review): confirm that fastText can read the '.zip' vectors path.
model = train_supervised(
    input=path, epoch=25, lr=1.0, dim=300,
    pretrainedVectors='crawl-300d-2M.vec.zip'
)
ft_experiment(model, dev_path)
model.save_model("fasttext.bin")
print()

#model.quantize(input=train_path, qnorm=True, retrain=True, cutoff=100000)
#ft_experiment(model, dev_path)
#model.save_model("fasttext.ftz")

In [None]:
# Show the sentence vector the supervised model gives the first dev example.
# The dev lines live in `dev_ft`; no variable named `dev` is defined at the
# top level of this notebook.
for i, t in enumerate(dev_ft):
    labl, text = t.strip().split(maxsplit=1)
    v = model.get_sentence_vector(text)
    print(i, v.shape, v)
    break

In [None]:
# Unsupervised embeddings initialised from the pretrained vectors. dim is set
# to 300 because the default is 100, while the pretrained file name indicates
# 300-dimensional vectors.
# NOTE(review): confirm that fastText can read the '.zip' vectors path.
m = train_unsupervised(
    train_path, dim=300, pretrainedVectors='crawl-300d-2M.vec.zip'
)

In [None]:
# Show the signature and documentation of train_unsupervised.
help(train_unsupervised)

In [None]:
# Show the sentence vector the unsupervised model gives the first dev example.
# The dev lines live in `dev_ft`; no variable named `dev` is defined at the
# top level of this notebook.
for i, t in enumerate(dev_ft):
    labl, text = t.strip().split(maxsplit=1)
    v = m.get_sentence_vector(text)
    print(i, v.shape, v)
    break

In [None]:
# Print the model's prediction for every gold-positive dev example.
# The dev lines live in `dev_ft`; no variable named `dev` is defined at the
# top level of this notebook.
# NOTE(review): the whole line, label prefix included, is passed to predict -
# confirm that this is intended.
for i, line in enumerate(dev_ft):
    if line.startswith('__label__True'):
        pred = model.predict(line)
        print(f'{i}\t{pred}\t{line}')

### Replace words with synsets

In [None]:
from nltk.corpus import wordnet as wn
from spacy.symbols import ADJ, ADV, NOUN, VERB

# Maps the spaCy part-of-speech symbols of interest to the matching WordNet
# part-of-speech constants; tokens with any other tag are not looked up.
pos_map = {
    ADJ: wn.ADJ,
    ADV: wn.ADV,
    NOUN: wn.NOUN,
    VERB: wn.VERB,
}

def iter_token_hypernyms(token):
    """Yield the hypernym synset names of every sense of a token.

    Senses are looked up in WordNet by the token's lemma and mapped part of
    speech. Each name is yielded with its dots replaced by underscores
    (``'dog.n.01'`` becomes ``'dog_n_01'``).

    Args:
        token: A spaCy token whose ``pos`` is a key of ``pos_map``.
    """
    wordnet_pos = pos_map[token.pos]
    for sense in wn.synsets(token.lemma_, pos=wordnet_pos):
        yield from (h.name().replace('.', '_') for h in sense.hypernyms())

def hypernym_text(text):
    """Yield the lines of a hypernym rendering of a text.

    For every sentence, one ``'WORD: <hypernym names>'`` line is yielded per
    token whose part of speech is in ``pos_map``, followed by an empty line
    that closes the sentence.

    Args:
        text: Raw text to analyse with the module-level ``nlp`` pipeline.
    """
    doc = nlp(text)
    for sentence in doc.sents:
        content_tokens = (tok for tok in sentence if tok.pos in pos_map)
        for tok in content_tokens:
            yield 'WORD: ' + ' '.join(iter_token_hypernyms(tok))
        yield ''

In [None]:
# Store the hypernym rendering of each essay on its record, joined into one
# newline-separated string.
for i in train:
    i['hypernym_text'] = '\n'.join(hypernym_text(i['essay']))

# Show the first record as a spot check.
print(train[0])

In [None]:
def print_hyponyms(synset, max_depth=1, indent=' '*4):
    """Print a synset and its hyponym tree down to a depth limit.

    Args:
        synset: Synset name in underscore form, e.g. ``'dog_n_01'``; its last
            two underscores are turned back into dots before the lookup.
        max_depth: Deepest level printed; the synset itself is level 0.
        indent: String repeated once per level in front of each line.
    """
    def _walk(node, depth=0):
        if depth <= max_depth:
            lemmas = ', '.join(node.lemma_names())
            print('{}{} — {}'.format(indent*depth, node.name(), lemmas))
            for child in node.hyponyms():
                _walk(child, depth+1)

    # Splitting from the right leaves underscores inside the lemma untouched.
    name_parts = synset.rsplit('_', 2)
    root = wn.synset('.'.join(name_parts))
    _walk(root)

In [None]:
def text_make_hypernym_xy(data, label_names):
    """Collect hypernym texts with a boolean "every score is at least 4" target.

    Args:
        data: Iterable of record dicts with a 'hypernym_text' key and all of
            ``label_names``.
        label_names: Keys of the numeric scores to combine.

    Returns:
        A tuple ``(X, y)`` of parallel lists. Records in which any of the
        labels is the empty string are left out.
    """
    X, y = [], []
    for record in data:
        incomplete = any(record[name] == '' for name in label_names)
        if not incomplete:
            X.append(record['hypernym_text'])
            y.append(all(record[name] >= 4 for name in label_names))
    return X, y


def text_hypernym_experiment(data, label_names):
    """Rank hypernym features associated with consistently high scores by chi2.

    The target is true only when every label in ``label_names`` is >= 4. Of
    the 100 features selected, those whose chi2 score exceeds 2.71 are printed
    together with their hyponym tree.

    Args:
        data: Iterable of record dicts with a 'hypernym_text' key and all of
            ``label_names``.
        label_names: Keys of the numeric scores to combine.

    Returns:
        The fitted vectoriser + ``SelectKBest`` pipeline.
    """
    X, y = text_make_hypernym_xy(data, label_names)

    pipeline = make_pipeline(
        #CountVectorizer(),
        #CountVectorizer(ngram_range=(1,2)),
        CountVectorizer(max_df=0.5, min_df=5, stop_words='english'),
        #CountVectorizer(max_df=0.5, min_df=5, stop_words='english', binary=True),
        #CountVectorizer(max_df=0.5, min_df=5, stop_words='english', ngram_range=(1,2)),
        #TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df=5, stop_words='english'),
        #TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df=5, stop_words='english', ngram_range=(1,2)),
        SelectKBest(chi2, k=100),
    )
    clf = pipeline.fit(X, y)

    vectorizer, selector = clf.steps[0][1], clf.steps[1][1]

    # Newer scikit-learn releases replace get_feature_names with
    # get_feature_names_out; use whichever this installation provides.
    get_names = (getattr(vectorizer, 'get_feature_names_out', None)
                 or vectorizer.get_feature_names)
    features = get_names()  # feature names
    mask = selector.get_support()  # list of booleans
    weights = selector.scores_

    # scores_ holds one value per *input* feature, so it has to be filtered
    # with the same mask as the names. Indexing it by the running count of
    # selected features paired each feature with some other feature's score.
    weighted = [
        (feature_name, w)
        for feature_name, w, selected in zip(features, weights, mask)
        if selected
    ]

    print('\nRanked coefficients:\n')
    for k, v in sorted(weighted, key=lambda i: abs(i[1]), reverse=True):
        if v > 2.71:
            # independence rejected at 90% confidence with chi2
            # https://nlp.stanford.edu/IR-book/html/htmledition/feature-selectionchi2-feature-selection-1.html
            print(f'{v:.3f}\t', end='')
            print_hyponyms(k)
            print()

    print('\n')
    return clf

In [None]:
# Rank hypernym features for the joint target built from `label_names`.
clf = text_hypernym_experiment(train, label_names)