In [67]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold

from easydict import EasyDict

import pandas as pd
import numpy as np
import string


In [68]:
# Pipeline outline:
# train/test split
# strip the copyright notice (keep only the text before '©')
# count-vectorize
# delete common (stop) words
# delete any token that doesn't start with an ASCII letter
# select the top words to make the vocabulary
# try different values of C and different vocabulary sizes
# make cross-validation splits
# get tf-idf features
# run logistic regression
# get scores
# for the best configuration, retrain on the full training set
# get the test score
# save the model

In [69]:
# Load the labelled abstracts; get_splits() reads the `text` and `relevant` columns.
df = pd.read_csv(
    filepath_or_buffer='/Users/anush/veg2vec/samples_1/abstracts_labelled_tmp.csv',
)

In [170]:
def get_splits(df, test_size=0.1, n_splits=5, random_state=None):
    """Split the labelled abstracts into train/test sets and CV folds.

    Parameters
    ----------
    df : DataFrame with a `text` column (strings), a `relevant` column
        (castable to int) and a default 0..n-1 index.
    test_size : passed to `train_test_split` (default 0.1, as before).
    n_splits : number of K-fold splits made over the training part
        (default 5, as before).
    random_state : forwarded to both `train_test_split` and `KFold`.
        With the default `None` both draw from NumPy's global RNG, so
        seeding `np.random` beforehand keeps the old behaviour.

    Returns
    -------
    EasyDict with
        corpus.train / corpus.test : text arrays,
        labels.train / labels.test : int label arrays,
        inds.train / inds.test     : original row indices,
        splits                     : list of (train_idx, valid_idx) pairs
                                     indexing into the training arrays,
        train_test_splits          : (train_positions, test_positions) for
                                     the array obtained by concatenating the
                                     train part followed by the test part.
    """
    # Keep only what precedes the first '©' so the copyright notice is dropped.
    texts = np.array([t.split('©')[0] for t in df.text.values])
    labels = df.relevant.values.astype('int')
    inds = df.index.values
    assert set(np.unique(inds)) == set(range(len(inds))), \
        'df must have a default 0..n-1 index'

    txt_trn, txt_test, y_trn, y_test, inds_trn, inds_test = train_test_split(
        texts, labels, inds,
        test_size=test_size, shuffle=True, random_state=random_state)

    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    splits = list(splitter.split(txt_trn))

    return EasyDict(
        corpus=EasyDict(train=txt_trn, test=txt_test),
        labels=EasyDict(train=y_trn, test=y_test),
        inds=EasyDict(train=inds_trn, test=inds_test),
        splits=splits,
        train_test_splits=(np.arange(len(txt_trn)),
                           np.arange(len(txt_trn), len(texts)))
    )

In [171]:
def get_features(trn, val, max_features=None):
    """Build tf-idf features for a train/validation pair of text arrays.

    A `CountVectorizer` (English stop words removed, accents stripped,
    optionally capped at `max_features` terms) is fitted on `trn` to discover
    candidate terms. Terms are then dropped when their first character is not
    an ASCII letter or when they start with one of the boilerplate prefixes
    in `exclude`. The surviving terms become the fixed vocabulary of a
    `TfidfVectorizer` fitted on `trn` and applied to `val`.

    Because the filtering happens after the `max_features` cap, the final
    vocabulary can be smaller than `max_features`.

    Returns
    -------
    (x_trn, x_val, cvec, tidvec) : the two tf-idf matrices, the fitted
    CountVectorizer and the fitted TfidfVectorizer.
    """
    cvec = CountVectorizer(stop_words='english',
                           strip_accents='unicode',
                           max_features=max_features)
    cvec.fit(trn)
    exclude = ('author', 'abstract', 'copyright', 'journal', 'article')

    # Read the terms (in column order) from the fitted vocabulary_ mapping.
    # get_feature_names() no longer exists in recent scikit-learn releases and
    # get_feature_names_out() does not exist in old ones; vocabulary_ is
    # available in both.
    terms = sorted(cvec.vocabulary_, key=cvec.vocabulary_.get)
    vocab = [
        str(term) for term in terms
        if term[0] in string.ascii_letters and not term.startswith(exclude)
    ]

    # The vocabulary was built from accent-stripped tokens, so the tf-idf
    # vectorizer must strip accents too; otherwise accented occurrences in the
    # text would never match their vocabulary entry.
    tidvec = TfidfVectorizer(
        vocabulary=vocab,
        strip_accents='unicode',
    )
    x_trn = tidvec.fit_transform(trn)
    x_val = tidvec.transform(val)
    return x_trn, x_val, cvec, tidvec

In [172]:
def make_dataset(corpus, labels, split, max_features=None):
    """Assemble tf-idf features and labels for one (train, valid) index pair.

    `split` is a pair of index arrays into `corpus` / `labels`. The
    vectorizers are fitted by `get_features` on the training rows only.
    Returns an EasyDict with `features.train/valid`, `labels.train/valid`
    and the fitted vectorizers under `cvec` and `tvec`.
    """
    train_idx, valid_idx = split
    train_matrix, valid_matrix, count_vec, tfidf_vec = get_features(
        corpus[train_idx], corpus[valid_idx], max_features
    )
    feature_sets = EasyDict(train=train_matrix, valid=valid_matrix)
    label_sets = EasyDict(train=labels[train_idx], valid=labels[valid_idx])
    return EasyDict(
        features=feature_sets,
        labels=label_sets,
        cvec=count_vec,
        tvec=tfidf_vec,
    )

In [173]:
def train_logistic(features, labels, C=None, max_features=None):
    """Fit a class-balanced logistic regression and score it with F1.

    Parameters
    ----------
    features : object with `.train` and `.valid` feature matrices.
    labels : object with `.train` and `.valid` label arrays.
    C : inverse regularisation strength. `None` (the default) falls back to
        1.0, scikit-learn's own default, because `LogisticRegression` does
        not accept `C=None`.
    max_features : unused; kept so existing callers keep working.

    Returns
    -------
    EasyDict with `model` (the fitted estimator) and `score` (F1 of the
    model's predictions on `features.valid` against `labels.valid`).
    """
    if C is None:
        C = 1.0
    model = LogisticRegression(C=C, class_weight='balanced')
    model.fit(features.train, labels.train)

    predictions = model.predict(features.valid)
    score = f1_score(y_true=labels.valid, y_pred=predictions)

    return EasyDict(
        model=model,
        score=score
    )

In [174]:
def run_cross_val(corpus, labels, splits,
                  max_features_grid=(3000, 6000, 9000, None),
                  C_grid=(1, 1.5, 2, 2.5, 3.)):
    """Grid-search C and the vocabulary cap with cross-validation.

    For every fold in `splits` and every `max_features` value a dataset is
    built once from `corpus.train` / `labels.train`; a logistic regression is
    then trained on it for every value of C. The validation F1 scores are
    averaged over the folds.

    Parameters
    ----------
    corpus, labels : objects exposing the training arrays as `.train`.
    splits : iterable of (train_idx, valid_idx) index pairs.
    max_features_grid : vocabulary caps to try (`None` means no cap).
    C_grid : values of C to try.

    Returns
    -------
    DataFrame with columns `C`, `max_features`, `score`, one row per
    (C, max_features) pair. A cap of `None` is written as the string
    'None', so callers must translate it back before reusing it.
    """
    fold_scores = {}

    for split in splits:
        for mf in max_features_grid:
            # The features depend only on (split, mf), so build them once and
            # reuse them for every value of C.
            dt = make_dataset(corpus.train, labels.train, split, mf)
            for c in C_grid:
                outputs = train_logistic(dt.features, dt.labels, C=c)
                fold_scores.setdefault((c, mf), []).append(outputs.score)

    rows = [
        [c, mf if mf is not None else 'None', np.mean(per_fold)]
        for (c, mf), per_fold in fold_scores.items()
    ]
    return pd.DataFrame(columns=['C', 'max_features', 'score'], data=rows)

In [175]:
# Seed NumPy's global RNG first: this call gives the splitters inside
# get_splits() no explicit random_state, so they draw from it.
np.random.seed(1)
splits = get_splits(df)
scores = run_cross_val(
    corpus=splits.corpus,
    labels=splits.labels,
    splits=splits.splits,
)

In [176]:
# Show the (C, max_features) pair with the highest mean cross-validated F1.
scores.nlargest(n=1, columns='score')

Unnamed: 0,C,max_features,score
4,3.0,3000,0.844992


In [177]:
# Refit with the best cross-validated configuration on the whole training
# part and evaluate on the held-out test rows.
best_params = scores.nlargest(1, 'score').squeeze()
# run_cross_val() writes "no vocabulary cap" as the string 'None';
# CountVectorizer needs a real None (or an int), so translate it back.
best_max_features = (None if isinstance(best_params.max_features, str)
                     else int(best_params.max_features))
# train_test_splits indexes the concatenation [train, test], so the vectorizers
# and the model are fitted on the training rows only.
dt = make_dataset(np.concatenate([splits.corpus.train, splits.corpus.test]),
                  np.concatenate([splits.labels.train, splits.labels.test]),
                  splits.train_test_splits, max_features=best_max_features)
outputs = train_logistic(dt.features, dt.labels, C=best_params.C)

In [178]:
# F1 of the refitted model on the held-out test rows (the "valid" part of
# splits.train_test_splits).
outputs.score

0.8035714285714285

In [179]:
import pickle

In [180]:
# Save the whole run in one bundle: the data splits, the CV score table, the
# selected hyper-parameters, the final train/test dataset (which carries its
# fitted vectorizers) and the trained model with its test score.
with open('/Users/anush/veg2vec/model.pkl', 'wb') as f:
    pickle.dump(
        EasyDict(
            splits=splits,
            scores=scores,
            best_params=best_params,
            train_test_dataset=dt,
            outputs=outputs,
        ),
        f,
    )

In [181]:
# Read the saved bundle back from disk.
# NOTE: unpickling executes code from the file, so only load files you trust.
with open('/Users/anush/veg2vec/model.pkl', mode='rb') as f:
    p = pickle.loads(f.read())

In [182]:
# Inspect the estimator restored from the pickle file.
p.outputs.model

LogisticRegression(C=3.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)