# Classification

In [1]:
import os
import pickle
import logging
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV

# The logging file handler opens its target as soon as it is created, so the
# directory has to exist before basicConfig runs; create it rather than
# crashing on a fresh checkout.
os.makedirs('logs', exist_ok=True)
# filemode='w' truncates the log on every run.
logging.basicConfig(filename='logs/classification.log', filemode='w', level=logging.INFO,
                    format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')

# Fixed seed: sum of the character codes of 'time' (116 + 105 + 109 + 101 = 431).
RANDOM_SEED = sum(map(ord, 'time'))
# Ten log-spaced candidate values from 1e-4 to 1e-2, passed as `Cs` to
# LogisticRegressionCV in classify().
Cs = np.logspace(start=-4, stop=-2, base=10, num=10)
# Passed as `cv` to LogisticRegressionCV in classify().
CV = 10

In [7]:
def read_sentence_embedding(lg, model_name, dim):
    """Load the CSV stored at ``representations/<lg>/<model_name>-<dim>.csv``.

    Args:
        lg: Sub-directory name under ``representations``.
        model_name: First component of the file name.
        dim: Second component of the file name.

    Returns:
        The file contents as a ``pandas.DataFrame`` (default ``read_csv``
        settings).
    """
    csv_path = os.path.join(
        'representations',
        lg,
        '{model}-{size}.csv'.format(model=model_name, size=dim),
    )
    return pd.read_csv(csv_path)
    
def collect_results(trained_model, X_test, y_test, test):
    """Attach predictions to ``test`` and summarise the fitted model.

    ``test`` is modified in place: a ``'prediction'`` column receives the
    predicted label for each row, and a ``'prob_correct'`` column receives the
    probability the model assigned to the row's true label (taken from the
    ``'tense'`` column).

    Args:
        trained_model: Fitted classifier exposing ``predict``,
            ``predict_proba``, ``score``, ``classes_`` and ``coef_``.
        X_test: Feature matrix whose rows are in the same order as ``test``.
        y_test: True labels, passed to ``trained_model.score``.
        test: DataFrame with a ``'tense'`` column holding the true labels.

    Returns:
        A ``(test, obj)`` tuple, where ``obj`` is a dict with keys
        ``'accuracy'``, ``'weights'``, ``'classes'`` and ``'model'``.

    Raises:
        KeyError: If a true label in ``test['tense']`` is not among
            ``trained_model.classes_``.
    """
    test['prediction'] = trained_model.predict(X_test)

    # Probability columns are matched to trained_model.classes_ by position,
    # so map each label to its column once and read the rows positionally.
    # This avoids Series.iteritems() (removed in pandas 2.0) and does not
    # depend on the index labels of `test` being unique.
    class_column = {label: pos for pos, label in enumerate(trained_model.classes_)}
    probabilities = trained_model.predict_proba(X_test)
    test['prob_correct'] = [
        row[class_column[true_label]]
        for row, true_label in zip(probabilities, test['tense'])
    ]

    accuracy = trained_model.score(X_test, y_test)
    obj = {
        'accuracy': accuracy,
        'weights': trained_model.coef_,
        'classes': trained_model.classes_,
        'model': trained_model,
    }
    return test, obj

def write_results(lg, model_name, dim, df, obj):
    """Persist classification output under ``classification/<lg>/``.

    Writes two files sharing the stem ``<model_name>-<dim>``:

    * ``.csv`` -- ``df`` without its index.
    * ``.pkl`` -- ``obj`` serialised with ``pickle``.

    The output directory is created if it does not exist, so the results of
    a completed fit are not lost to a missing folder.

    Args:
        lg: Sub-directory name under ``classification``.
        model_name: First component of the file stem.
        dim: Second component of the file stem.
        df: DataFrame to write as CSV.
        obj: Picklable object to write alongside it.
    """
    out_dir = os.path.join('classification', lg)
    os.makedirs(out_dir, exist_ok=True)
    stem = '{}-{}'.format(model_name, dim)

    df.to_csv(os.path.join(out_dir, stem + '.csv'), index=False)
    with open(os.path.join(out_dir, stem + '.pkl'), 'wb') as f:
        pickle.dump(obj, f)

def classify(lg, model_name, dim):
    """Fit and evaluate a classifier on one embedding file, then save results.

    Loads the embedding table for ``(lg, model_name, dim)``, splits it on the
    ``'train'`` column, fits an L1-penalised ``LogisticRegressionCV`` on the
    columns whose names consist only of digits to predict ``'tense'``, writes
    the held-out rows and model summary via ``write_results``, and logs the
    elapsed wall-clock time.

    Args:
        lg: Passed through to ``read_sentence_embedding`` / ``write_results``.
        model_name: Passed through to the same helpers.
        dim: Passed through to the same helpers.
    """
    started = datetime.now()

    embeddings = read_sentence_embedding(lg, model_name, dim)
    in_train = embeddings['train']
    train = embeddings[in_train]
    # Copy so the held-out frame can be modified below without touching
    # the loaded table.
    test = embeddings[~in_train].copy()

    # Feature columns are the ones whose header is purely numeric.
    feature_cols = [col for col in embeddings if col.isdigit()]
    X_train = train[feature_cols]
    y_train = train['tense']
    X_test = test[feature_cols]
    y_test = test['tense']

    classifier = LogisticRegressionCV(
        Cs=Cs,
        penalty='l1',
        cv=CV,
        solver='liblinear',
        refit=True,
        random_state=RANDOM_SEED,
    )
    classifier = classifier.fit(X_train, y_train)

    # Features were extracted into X_test above; remove them from the frame
    # that gets written out.
    test.drop(feature_cols, axis=1, inplace=True)
    test, obj = collect_results(classifier, X_test, y_test, test)
    write_results(lg, model_name, dim, test, obj)

    finished = datetime.now()
    logging.info('Classifying {} {} {} took {}'.format(lg, model_name, dim, finished - started))