In [1]:
import csv
import itertools
import numpy as np
import os

RANDOM_SEED = 1337
np.random.seed(RANDOM_SEED)  # for reproducibility

In [13]:
from sklearn.linear_model import (
    LinearRegression, TheilSenRegressor, RANSACRegressor, HuberRegressor, Ridge)
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.pipeline import Pipeline, make_pipeline
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from math import sqrt

from scipy.stats import mode, kendalltau, pearsonr, spearmanr

In [3]:
import stanza
stanza.download('en')
parser = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')

def tokenize_text(text, lemmatize=False):
    doc = parser(text)
    if lemmatize:
        return [word.text for sent in doc.sentences for word in sent.words]
    else:
        return [word.lemma for sent in doc.sentences for word in sent.words]

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.0.0.json: 115kB [00:00, 5.88MB/s]                    
2020-07-08 12:19:04 INFO: Downloading default packages for language: en (English)...
2020-07-08 12:19:05 INFO: File exists: /Users/elisa/stanza_resources/en/default.zip.
2020-07-08 12:19:11 INFO: Finished downloading models and saved to /Users/elisa/stanza_resources.
2020-07-08 12:19:11 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| pos       | ewt     |
| lemma     | ewt     |

2020-07-08 12:19:11 INFO: Use device: cpu
2020-07-08 12:19:11 INFO: Loading: tokenize
2020-07-08 12:19:11 INFO: Loading: pos
2020-07-08 12:19:12 INFO: Loading: lemma
2020-07-08 12:19:12 INFO: Done loading processors!


In [4]:
def rmse_loss(y_true, y_pred):
    loss = np.sqrt(mean_squared_error(y_true, y_pred))
    print(y_pred)
    print('rmse: ', loss)
    return loss

rmse_score = make_scorer(rmse_loss, greater_is_better=False)

In [5]:
def get_splits(splits_dir, train, dev, is_logistic=False, id_column=0, target_column=26, text_column=2):
    train_x, train_y = get_split(splits_dir, train, is_logistic, id_column, target_column, text_column)
    dev_x, dev_y = get_split(splits_dir, dev, is_logistic, id_column, target_column, text_column)
    return train_x, train_y, dev_x, dev_y

def get_split(splits_dir, split_file, is_logistic, id_column, target_column, text_column):
    split_file_path = os.path.join(splits_dir, split_file)
    with open(split_file_path) as f:
        split_reader = csv.reader(f, delimiter='\t')
        split_data = list(split_reader)[1:]  # skip header
    
    x = []
    y = []
    for line in split_data:
        if is_logistic:
            y.append(int(float((line[target_column]))*10))
        else:
            y.append(float(line[target_column]))
        x.append(line[text_column])
    return np.array(x), np.array(y)

In [21]:
def create_train_dev_cv(train_x, train_y, dev_x, dev_y):
    x = np.concatenate([train_x, dev_x])
    y = np.concatenate([train_y, dev_y])

    # create cv iterator object
    test_fold = np.concatenate([
                                    # The training data
                                    np.ones(train_x.shape[0], dtype=np.int8)*-1,
                                    # The development data
                                    np.zeros(dev_x.shape[0], dtype=np.int8)])
    cv_train_dev = PredefinedSplit(test_fold)

    return x, y, cv_train_dev

def run_grid_search(x, y, pipeline, parameters, cv_iter):
    grid_search = GridSearchCV(pipeline, parameters, cv=cv_iter, n_jobs=-1, verbose=1, scoring='neg_root_mean_squared_error')
    grid_search.fit(x, y)

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    return best_parameters

def search_linear(train_x, train_y, dev_x, dev_y):
    best_params = []
    # perform grid search over train and dev
    x, y, cv_train_dev = create_train_dev_cv(train_x, train_y, dev_x, dev_y)
    
    estimators = [#('OLS', LinearRegression()),
                  ('Ridge', Ridge(random_state=RANDOM_SEED)),
                  #('KernelRidge', KernelRidge(alpha=1, kernel='linear', gamma=None, degree=3, coef0=1))
              #('Theil-Sen', TheilSenRegressor(random_state=42)), gets error: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.
              #('RANSAC', RANSACRegressor(min_samples=len(dev_x), random_state=42)),
              #('HuberRegressor', HuberRegressor())
                ]
    for estimator in estimators:
        pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize_text, lowercase=True)),
        ('tfidf', TfidfTransformer()),
        estimator,
        ])
        # uncommenting more parameters will give better exploring power but will
        # increase processing time in a combinatorial way
        parameters = {
            #'vect__max_df': (0.75, 1.0),
            #'vect__min_df': (0.75, 1.0),
            'vect__ngram_range': ((1, 1), (1, 2), (1,3)),  # unigrams, bigrams, or trigrams
            #'tfidf__use_idf': (True, False),
            'tfidf__norm': ('l1', 'l2'),
            'Ridge__alpha': (0.5, 1.0, 1.5),
            'Ridge__tol': (0.0001, 0.01),
            #'clf__estimator__kernel': ('linear','poly', 'rbf', 'sigmoid'),
        }
        print('Running grid search for estimator: ', estimator)
        best_params.append(run_grid_search(x, y, pipeline, parameters, cv_train_dev))
    return best_params

In [23]:
splits_dir = '../../private_data/splits_folds_ordered_response_06-19/fold0'
train='train.tsv'
dev='test.tsv'

train_x, train_y, dev_x, dev_y = get_splits(splits_dir, train, dev)
best_params = search_linear(train_x, train_y, dev_x, dev_y)
best_params

Running grid search for estimator:  ('Ridge', Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=1337, solver='auto', tol=0.001))
Fitting 1 folds for each of 36 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed: 40.1min finished


Best score: -0.306
Best parameters set:
	Ridge__alpha: 1.5
	Ridge__tol: 0.0001
	tfidf__norm: 'l2'
	vect__ngram_range: (1, 3)


[{'memory': None,
  'steps': [('vect',
    CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                    dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                    lowercase=True, max_df=1.0, max_features=None, min_df=1,
                    ngram_range=(1, 3), preprocessor=None, stop_words=None,
                    strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                    tokenizer=<function tokenize_text at 0x7fa65e7ec7a0>,
                    vocabulary=None)),
   ('tfidf',
    TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
   ('Ridge',
    Ridge(alpha=1.5, copy_X=True, fit_intercept=True, max_iter=None,
          normalize=False, random_state=1337, solver='auto', tol=0.0001))],
  'verbose': False,
  'vect': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                  dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                  lowerc

In [24]:
pipeline = Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize_text, lowercase=True, 
                                     ngram_range=(1, 3))),
            ('tfidf', TfidfTransformer(norm='l2')),
            ('Ridge', Ridge(alpha=1.5, tol=0.0001, random_state=RANDOM_SEED)),
            ])
pipeline.fit(train_x, train_y)
preds = pipeline.predict(dev_x)
rmse = np.sqrt(mean_squared_error(dev_y, preds))
print('Ridge RMSE: ', rmse)
print('Ridge Kendall: ', kendalltau(dev_y, preds)[0])
print('Ridge Pearson: ', pearsonr(dev_y, preds)[0])
print('Ridge Spearman: ', spearmanr(dev_y, preds)[0])

Ridge RMSE:  0.30574382199995925
Ridge Kendall:  0.2030462612784842
Ridge Pearson:  0.3683692257369324
Ridge Spearman:  0.28202613821840333


In [20]:
entropy_most_freq = mode(train_y)[0][0]
entropy_mean = np.mean(train_y)

rmse_most_freq = np.sqrt(mean_squared_error(dev_y, [entropy_most_freq]*len(dev_y)))
rmse_mean_ent = np.sqrt(mean_squared_error(dev_y, [entropy_mean]*len(dev_y)))
print('Most Freq entropy:', entropy_most_freq)
print('RMSE using most freq entropy:', rmse_most_freq)
print('Mean entropy: ', entropy_mean)
print('RMSE using mean entropy:', rmse_mean_ent)

Most Freq entropy: 0.0
RMSE using most freq entropy: 0.5102494215256714
Mean entropy:  0.33980704680727497
RMSE using mean entropy: 0.33243672623925663


In [31]:
np.mean(train_y[np.where(train_y !=0)])


0.6598195083636408

### Ordinal Regression

In [16]:
import warnings

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, accuracy_score, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PowerTransformer, FunctionTransformer

import torch
from torch import nn
import mord as m
warnings.simplefilter('ignore')

ModuleNotFoundError: No module named 'mord'

In [17]:
def run_estimators_logistic(train_x, train_y, dev_x, dev_y, 
                   lad_epsilon=0.0, lad_tol=0.0001, lad_C=1.0, lad_loss='l1',
                   it_alpha=1.0,
                   at_alpha=1.0):
    model_to_preds = {}
    estimators = [('LAD', m.LAD(epsilon=lad_epsilon, tol=lad_tol, C=lad_C, loss=lad_loss, fit_intercept=True, intercept_scaling=1.0, dual=True, verbose=0, max_iter=1000, random_state=RANDOM_SEED)),
                  #('LogisticIT', m.LogisticIT(alpha=it_alpha, verbose=0)),
                  #('LogisticAT', m.LogisticAT(alpha=at_alpha, verbose=0))
                  #Values in y must be [0 1 2 3 4]
                 ]
    for estimator in estimators:
        pipeline = Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize_text, lowercase=True)),
            ('tfidf', TfidfTransformer()),
            estimator,
            ])
        pipeline.fit(train_x, train_y)
        preds = pipeline.predict(dev_x)
        rmse = np.sqrt(mean_squared_error(dev_y, preds))
        model_to_preds[pipeline] = preds
        print('Estimator: ', estimator, ' RMSE: ', rmse)
    return model_to_preds

def run_estimators(train_x, train_y, dev_x, dev_y, 
                   ridge_alpha=1.0, ridge_tol=0.001):
    model_to_preds = {}
    estimators = [('OrdinalRidge', m.OrdinalRidge(alpha=ridge_alpha, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=ridge_tol, solver='auto', random_state=RANDOM_SEED))]
    for estimator in estimators:
        pipeline = Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize_text, lowercase=True)),
            ('tfidf', TfidfTransformer()),
            estimator,
            ])
        pipeline.fit(train_x, train_y)
        preds = pipeline.predict(dev_x)
        rmse = np.sqrt(mean_squared_error(dev_y, preds))
        kend = kendalltau(dev_y, preds)[0]
        pear = pearsonr(dev_y, preds)[0]
        spear = spearmanr(dev_y, preds)[0]
        model_to_preds[pipeline] = preds
        print('Estimator: ', estimator, '\n  RMSE:', rmse, '\tKendall:', kend, '\tPearson:', pear, '\tSpearman: ', spear)
    return model_to_preds

In [None]:
splits_dir = '../../data/splits_folds_ordered_response_06-19/fold0'
train='train.tsv'
dev='test.tsv'

train_x, train_y, dev_x, dev_y = get_splits(splits_dir, train, dev)

In [None]:
ridge_alphas = [0.5, 1.0, 1.5]
ridge_tols = [0.0001, 0.001, 0.01]
model_dicts=[]
for alpha in ridge_alphas:
    for tol in ridge_tols:
        print('alpha: ', alpha, ', tol: ', tol)
        model_dicts.append(run_estimators(train_x, train_y, dev_x, dev_y, ridge_alpha=alpha, ridge_tol=tol))

### Ordinal Logistic Regression

In [None]:
train_x_int, train_y_int, dev_x_int, dev_y_int = get_splits(splits_dir, train, dev, is_logistic=True)

In [None]:
logistic_dicts = run_estimators_logistic(train_x_int, train_y_int, dev_x_int, dev_y_int)

In [None]:
rmse_log = np.sqrt(mean_squared_error(dev_y_int, [0]*len(dev_y_int)))
print(rmse_log)