# Classification

In [1]:
import os
import pickle
import logging
from datetime import datetime
from itertools import product
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV

# Log destination. The parent directory is created on demand because
# logging.basicConfig does not create missing directories and would otherwise
# raise FileNotFoundError on a fresh checkout.
LOG_FILE = 'logs/classification.log'
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
logging.basicConfig(filename=LOG_FILE, filemode='w', level=logging.INFO,
                    format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')

# Seed derived from the code points of the word 'time' (evaluates to 431).
RANDOM_SEED = sum(map(ord, 'time'))
# Grid of 10 log-spaced values from 1e-4 to 1e-2, passed as `Cs` to LogisticRegressionCV.
Cs = np.logspace(start=-4, stop=-2, base=10, num=10)
# Passed as `cv` to LogisticRegressionCV.
CV = 10

In [3]:
def _true_class_probability(class_probs, classes, labels):
    """Pick, for every row, the predicted probability of that row's true label.

    Args:
        class_probs: 2-D array-like with one row per sample and one column per
            class (e.g. the output of ``predict_proba``).
        classes: Class labels in the column order of ``class_probs``.
        labels: True label of each row, aligned positionally with ``class_probs``.

    Returns:
        1-D numpy array holding one probability per row.

    Raises:
        KeyError: If a label does not occur in ``classes``.
    """
    column_of_label = pd.Index(classes).get_indexer(labels)
    missing = column_of_label < 0
    if missing.any():
        # get_indexer marks unknown labels with -1, which numpy indexing would
        # silently interpret as "last column"; fail loudly instead.
        unknown = pd.unique(np.asarray(labels)[missing]).tolist()
        raise KeyError('Labels not among the model classes: {}'.format(unknown))
    return np.asarray(class_probs)[np.arange(len(column_of_label)), column_of_label]


def classify(lg, model_name, dim):
    """Fit an L1-regularised logistic regression on one representation file and save the results.

    Reads ``representations/<lg>/<model_name>-<dim>.csv``. The file must contain
    a ``train`` column used as a boolean mask (True = training row), a ``tense``
    column holding the class label, and predictor columns whose names consist
    only of digits.

    Writes two files into ``classification/<lg>/`` (created if missing):

    * ``<model_name>-<dim>-results.csv``: the non-predictor columns of the test
      rows plus ``prediction`` and ``prob_correct`` (the predicted probability
      of the true label).
    * ``<model_name>-<dim>-model.pkl``: a pickled dict with the keys
      ``accuracy``, ``weights``, ``classes`` and ``model``.

    The elapsed time is written to the log.

    Args:
        lg: Language code; used as the sub-directory name for input and output.
        model_name: Name of the representation; used in the file names.
        dim: Expected number of predictor columns.

    Returns:
        None.

    Raises:
        AssertionError: If the number of digit-named columns differs from ``dim``.
        KeyError: If a test row carries a label the fitted model has no class for.
    """
    start = datetime.now()
    fname = os.path.join('representations', lg, '{}-{}.csv'.format(model_name, dim))
    data = pd.read_csv(fname)
    train = data[data['train']]
    test = data[~data['train']].copy()
    predictors = [c for c in data if c.isdigit()]
    assert len(predictors) == dim, "Something's wrong with the dimensions"
    response = 'tense'
    X_train, X_test = train[predictors], test[predictors]
    y_train, y_test = train[response], test[response]

    # Create the output directory before the expensive fit so that a missing
    # directory cannot discard a finished model.
    out_dir = os.path.join('classification', lg)
    os.makedirs(out_dir, exist_ok=True)

    model = LogisticRegressionCV(Cs=Cs, penalty='l1', cv=CV, solver='liblinear', refit=True, random_state=RANDOM_SEED)
    model = model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)

    # Keep only the non-predictor columns in the per-row results file.
    results = test.drop(columns=predictors)
    results['prediction'] = model.predict(X_test)
    # Vectorised lookup; Series.iteritems() no longer exists in pandas >= 2.0.
    results['prob_correct'] = _true_class_probability(
        model.predict_proba(X_test), model.classes_, y_test)
    fname = os.path.join(out_dir, '{}-{}-results.csv'.format(model_name, dim))
    results.to_csv(fname, index=True)

    obj = {'accuracy': accuracy, 'weights': model.coef_, 'classes': model.classes_, 'model': model}
    fname = os.path.join(out_dir, '{}-{}-model.pkl'.format(model_name, dim))
    with open(fname, 'wb') as f:
        pickle.dump(obj, f)

    end = datetime.now()
    msg = 'Classifying {} {} {} took {}'.format(lg, model_name, dim, end-start)
    logging.info(msg)

In [4]:
# Experiment grid: languages, base embedding models, dimensionalities and
# the suffixes that select which representation file is read.
lgs = ['en']
models = ['skipgram']
dims = [20, 40, 60, 80]
fs = ['sum', 'mean', 'max', 'min', 't2b-0.0001']

# Run the classifier once for every combination in the grid.
for lg, base_model, dim, f in product(lgs, models, dims, fs):
    # The representation files are named '<base model>-<suffix>'.
    model = '{}-{}'.format(base_model, f)
    classify(lg, model, dim)