In [1]:
import csv
import itertools
import numpy as np
import os

# Seed for NumPy's global RNG; also reused as `random_state` by several estimators below.
RANDOM_SEED = 1337
np.random.seed(RANDOM_SEED)  # for reproducibility

In [18]:
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import recall_score, precision_score, f1_score, classification_report
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC

In [3]:
import stanza
# Fetch the default English stanza models (the log below shows the archive being reused when it already exists).
stanza.download('en')
# Module-level stanza pipeline; `tokenize_text` reads it as a global.
# The `lemma` processor is required because `tokenize_text` accesses `word.lemma`.
parser = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')

def tokenize_text(text, lemmatize=False):
    """Split ``text`` into word strings using the module-level stanza ``parser``.

    Args:
        text: Raw string to analyse.
        lemmatize: When True, return each word's lemma; when False (the
            default), return each word's surface form.

    Returns:
        A flat list of strings, one per word, in document order.

    NOTE(review): the flag used to be inverted (the default returned lemmas).
    Callers that pass this function as a tokenizer without arguments now get
    surface forms; wrap it with ``functools.partial(tokenize_text,
    lemmatize=True)`` if lemmas were the intended features.
    """
    doc = parser(text)
    words = [word for sent in doc.sentences for word in sent.words]
    if lemmatize:
        # Fall back to the surface form if a word carries no lemma, so the
        # result never contains None.
        return [word.lemma if word.lemma is not None else word.text for word in words]
    return [word.text for word in words]

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.0.0.json: 115kB [00:00, 3.05MB/s]                    
2020-06-25 10:35:57 INFO: Downloading default packages for language: en (English)...
2020-06-25 10:35:59 INFO: File exists: /Users/elisa/stanza_resources/en/default.zip.
2020-06-25 10:36:06 INFO: Finished downloading models and saved to /Users/elisa/stanza_resources.
2020-06-25 10:36:06 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| pos       | ewt     |
| lemma     | ewt     |

2020-06-25 10:36:06 INFO: Use device: cpu
2020-06-25 10:36:06 INFO: Loading: tokenize
2020-06-25 10:36:06 INFO: Loading: pos
2020-06-25 10:36:07 INFO: Loading: lemma
2020-06-25 10:36:08 INFO: Done loading processors!


In [8]:
# Names of the six response labels; passed as `target_names` to classification_report.
# NOTE(review): presumably entry i corresponds to the i-th digit of the label string parsed in get_split — confirm.
class_labels = ["answer", "answer_overans-sway", "shift-dodge", "shift-correct", "cant-answer-lying",
                "cant-answer-sincere"]

def get_splits(splits_dir, train, dev, test):
    """Load the train, optional dev, and test splits from ``splits_dir``.

    Args:
        splits_dir: Directory containing the split files.
        train: File name of the training split.
        dev: File name of the development split, or a falsy value to skip it.
        test: File name of the test split.

    Returns:
        ``(train_x, train_y, dev_x, dev_y, test_x, test_y)``; the two dev
        entries are None when ``dev`` is falsy.
    """
    train_pair = get_split(splits_dir, train)
    # Only touch the dev file when one was actually requested.
    dev_pair = get_split(splits_dir, dev) if dev else (None, None)
    test_pair = get_split(splits_dir, test)
    return (*train_pair, *dev_pair, *test_pair)

def get_split(splits_dir, split_file):
    """Read one tab-separated split file into a text array and a label array.

    The first row is treated as a header and skipped. In every other
    non-empty row, column 1 is a string of digits (one digit per label) and
    column 2 is the text.

    Args:
        splits_dir: Directory containing the split file.
        split_file: File name of the split inside ``splits_dir``.

    Returns:
        ``(x, y)`` where ``x`` is an array of texts and ``y`` is an array with
        one row of integer digits per text.
    """
    split_file_path = os.path.join(splits_dir, split_file)

    texts = []
    label_rows = []
    # newline='' is what the csv module asks for; an explicit encoding keeps
    # decoding independent of the platform default.
    with open(split_file_path, newline='', encoding='utf-8') as f:
        split_reader = csv.reader(f, delimiter='\t')
        next(split_reader, None)  # skip the header row (no-op on an empty file)
        for row in split_reader:
            if not row:
                # csv yields [] for blank lines (e.g. a trailing newline); ignore them
                continue
            label_rows.append([int(digit) for digit in row[1]])
            texts.append(row[2])
    return np.array(texts), np.array(label_rows)

In [None]:
def search_svm(train_x, train_y, dev_x, dev_y):
    """Grid-search the bag-of-words + one-vs-rest SVC pipeline on a fixed train/dev split.

    Args:
        train_x, train_y: Training texts and label rows.
        dev_x, dev_y: Development texts and label rows used as the single
            validation fold.

    Returns:
        Whatever ``run_grid_search`` returns (the parameters of the best
        estimator found).
    """
    # perform grid search over train and dev
    x, y, cv_train_dev = create_train_dev_cv(train_x, train_y, dev_x, dev_y)

    # define pipeline and parameters
    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize_text, lowercase=True)),
        ('tfidf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(SVC(random_state=RANDOM_SEED))),
    ])

    # uncommenting more parameters will give better exploring power but will
    # increase processing time in a combinatorial way
    # NOTE(review): min_df uses the same fractions as max_df; a float min_df
    # of 0.75/1.0 keeps only terms present in most documents — confirm that
    # small integer counts were not intended.
    parameters = {
        'vect__max_df': (0.75, 1.0),
        'vect__min_df': (0.75, 1.0),
        'vect__ngram_range': ((1, 1), (1, 2), (1,3)),  # unigrams, bigrams, or trigrams
        'tfidf__use_idf': (True, False),
        'tfidf__norm': ('l1', 'l2'),
        #'clf__estimator__kernel': ('linear','poly', 'rbf', 'sigmoid'),
    }
    # Hand the result back so callers can write `best_parameters = search_svm(...)`.
    return run_grid_search(x, y, pipeline, parameters, cv_train_dev)
    
def create_train_dev_cv(train_x, train_y, dev_x, dev_y):
    """Stack train and dev data and build a one-fold splitter that validates on dev.

    Args:
        train_x, train_y: Training inputs and targets (NumPy arrays).
        dev_x, dev_y: Development inputs and targets (NumPy arrays).

    Returns:
        ``(x, y, cv_train_dev)``: the concatenated inputs, the concatenated
        targets, and a ``PredefinedSplit`` built from the fold markers.

    Raises:
        ValueError: If ``dev_x`` or ``dev_y`` is None (no dev split was loaded).
    """
    if dev_x is None or dev_y is None:
        raise ValueError(
            "create_train_dev_cv requires a dev split, but dev_x/dev_y is None")

    x = np.concatenate([train_x, dev_x])
    y = np.concatenate([train_y, dev_y])

    # create cv iterator object: -1 marks the training rows, 0 marks the
    # development rows (the only validation fold)
    test_fold = np.concatenate([
        np.full(train_x.shape[0], -1, dtype=np.int8),
        np.zeros(dev_x.shape[0], dtype=np.int8),
    ])
    cv_train_dev = PredefinedSplit(test_fold)

    return x, y, cv_train_dev

def run_grid_search(x, y, pipeline, parameters, cv_iter):
    """Run ``GridSearchCV`` over ``parameters`` and report the winning setting.

    Args:
        x, y: Inputs and targets passed to ``fit``.
        pipeline: Estimator to tune.
        parameters: Mapping of parameter name to candidate values.
        cv_iter: Cross-validation splitter handed to ``GridSearchCV``.

    Returns:
        The full ``get_params()`` dict of the best estimator (not only the
        searched keys).
    """
    search = GridSearchCV(pipeline, parameters, cv=cv_iter, n_jobs=-1, verbose=1)
    search.fit(x, y)

    print("Best score: %0.3f" % search.best_score_)
    print("Best parameters set:")
    winner_params = search.best_estimator_.get_params()
    # Only the searched parameters are printed, in alphabetical order.
    for name in sorted(parameters):
        print("\t%s: %r" % (name, winner_params[name]))
    return winner_params

In [9]:
# Location of the fold-0 split files (relative to the notebook's working directory).
splits_dir = '../../data/splits_folds_ordered_response_06-19/fold0'
train='train.tsv'
# No dev file is requested, so get_splits returns None for dev_x and dev_y.
# NOTE(review): later cells pass dev_x/dev_y to eval_classifier and pipeline.predict —
# confirm whether a dev file name should be set here.
dev=None
test='test.tsv'

train_x, train_y, dev_x, dev_y, test_x, test_y = get_splits(splits_dir, train, dev, test)
# Grid search is disabled; it needs a dev split because create_train_dev_cv reads dev_x.shape.
#best_parameters = search_svm(train_x, train_y, dev_x, dev_y)

In [None]:
def eval_classifier(clf, train_x, train_y, test_x, test_y, best_parameters):
    """Fit a count-vectorizer + TF-IDF pipeline around ``clf`` and evaluate it.

    Args:
        clf: Final estimator placed in the pipeline's ``'clf'`` step.
        train_x, train_y: Data the pipeline is fitted on.
        test_x, test_y: Data the pipeline is evaluated on.
        best_parameters: Pipeline parameters applied with ``set_params``
            before fitting.

    Returns:
        ``(set_accuracy, metrics_dict)``. ``set_accuracy`` is the fraction of
        rows whose predicted labels all match when the final step is a
        ``MultiOutputRegressor``, and the pipeline's own ``score`` otherwise.
        ``metrics_dict`` is the ``classification_report`` output as a dict.
    """
    steps = [
        ('vect', CountVectorizer(tokenizer=tokenize_text, lowercase=True)),
        ('tfidf', TfidfTransformer()),
        ('clf', clf),
    ]
    clf_pipeline = Pipeline(steps)
    clf_pipeline.set_params(**best_parameters)
    print(clf_pipeline)
    clf_pipeline.fit(train_x, train_y)
    predict_y = clf_pipeline.predict(test_x)

    # Look the step up on the pipeline rather than using `clf` directly:
    # set_params may have replaced it.
    if not isinstance(clf_pipeline['clf'], MultiOutputRegressor):
        set_accuracy = clf_pipeline.score(test_x, test_y)
    else:
        # Exact-match ratio computed by hand for the regressor wrapper.
        rows_all_equal = np.equal(test_y, predict_y).all(axis=1)
        set_accuracy = np.sum(rows_all_equal) / test_y.shape[0]
    metrics_dict = classification_report(
        test_y, predict_y, target_names=class_labels, output_dict=True)

    return set_accuracy, metrics_dict

In [None]:
# Vectorizer / TF-IDF settings applied to the evaluation pipelines through set_params.
# NOTE(review): presumably the winning setting of an earlier search_svm run — confirm.
best_params = {'tfidf__norm': 'l1', 'tfidf__use_idf': True, 
               'vect__max_df': 1.0, 'vect__min_df': 0.75,
               'vect__ngram_range': (1, 2)}

In [None]:
# One-vs-rest SVC evaluated through the shared bag-of-words pipeline.
# NOTE(review): dev_x/dev_y are None while `dev=None` in the split-loading cell —
# confirm the intended evaluation set.
svm_clf = OneVsRestClassifier(SVC(random_state=RANDOM_SEED))
set_accuracy_svm, metrics_dict_svm = eval_classifier(svm_clf, train_x, train_y, dev_x, dev_y, best_params)

In [None]:
# Show the SVM results returned by eval_classifier: set accuracy, then the per-label metrics dict.
print(set_accuracy_svm)
print(metrics_dict_svm)

In [14]:
def eval_majority(train_x, train_y, test_x, test_y, dummy_strategy, constant_value=None):
    """Fit a ``DummyClassifier`` baseline and evaluate it on the given test data.

    Args:
        train_x, train_y: Data the baseline is fitted on.
        test_x, test_y: Data the baseline is evaluated on.
        dummy_strategy: Value for ``DummyClassifier``'s ``strategy`` argument.
        constant_value: Forwarded as ``constant`` only when ``dummy_strategy``
            is ``'constant'``.

    Returns:
        ``(set_accuracy, metrics_dict)``: the baseline's ``score`` on the test
        data and the ``classification_report`` output as a dict.
    """
    dummy_kwargs = {'strategy': dummy_strategy, 'random_state': RANDOM_SEED}
    if dummy_strategy == 'constant':
        dummy_kwargs['constant'] = constant_value
    baseline = DummyClassifier(**dummy_kwargs)

    baseline.fit(train_x, train_y)
    baseline_pred = baseline.predict(test_x)
    metrics_dict = classification_report(
        test_y, baseline_pred, target_names=class_labels, output_dict=True)
    set_accuracy = baseline.score(test_x, test_y)
    return set_accuracy, metrics_dict

In [20]:
# Three DummyClassifier baselines, all evaluated on the test split:
# the 'prior' strategy, the 'most_frequent' strategy, and a constant prediction of 1 for all six labels.
set_accuracy_prior, metrics_dict_prior = eval_majority(train_x, train_y, test_x, test_y, 'prior')
set_accuracy_freq, metrics_dict_freq = eval_majority(train_x, train_y, test_x, test_y, 'most_frequent')
set_accuracy_1, metrics_dict_1 = eval_majority(train_x, train_y, test_x, test_y, 'constant',np.ones(6))

  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
# Print each baseline's set accuracy followed by its classification_report dict.
print("Prior")
print(set_accuracy_prior)
print(metrics_dict_prior)
print("Freq")
print(set_accuracy_freq)
print(metrics_dict_freq)
print("Predict 1")
print(set_accuracy_1)
print(metrics_dict_1)

Prior
0.31
{'answer': {'precision': 0.665, 'recall': 1.0, 'f1-score': 0.7987987987987988, 'support': 133}, 'answer_overans-sway': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 17}, 'shift-dodge': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 91}, 'shift-correct': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 40}, 'cant-answer-lying': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 31}, 'cant-answer-sincere': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 25}, 'micro avg': {'precision': 0.665, 'recall': 0.39465875370919884, 'f1-score': 0.4953445065176909, 'support': 337}, 'macro avg': {'precision': 0.11083333333333334, 'recall': 0.16666666666666666, 'f1-score': 0.13313313313313313, 'support': 337}, 'weighted avg': {'precision': 0.26244807121661723, 'recall': 0.39465875370919884, 'f1-score': 0.315252938398339, 'support': 337}, 'samples avg': {'precision': 0.665, 'recall': 0.48, 'f1-score': 0.5391666666666667, 's

In [None]:
# Logistic regression fitted per output through MultiOutputRegressor; eval_classifier
# computes the exact-match accuracy by hand for this wrapper type.
# NOTE(review): dev_x/dev_y are None while `dev=None` in the split-loading cell —
# confirm the intended evaluation set.
lr_clf = MultiOutputRegressor(LogisticRegression(random_state=RANDOM_SEED, multi_class='multinomial'))
set_accuracy_lr, metrics_dict_lr = eval_classifier(lr_clf, train_x, train_y, dev_x, dev_y, best_params)

In [None]:
# Show the logistic-regression results returned by eval_classifier.
print(set_accuracy_lr)
print(metrics_dict_lr)

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from gensim.utils import simple_preprocess
from gensim.sklearn_api.ftmodel import FTTransformer
# NOTE(review): this re-seeds NumPy's global RNG with 0, replacing the RANDOM_SEED
# seeding done in the first cell — confirm this is intentional.
np.random.seed(0)


class FTTransformer2(FTTransformer):
    """``FTTransformer`` whose ``fit`` accepts raw document strings.

    NOTE(review): ``transform`` is inherited unchanged, so it receives
    whatever the pipeline passes in (raw strings in this notebook) — confirm
    that is the intended input.
    """

    def fit(self, x, y=None):
        """Tokenise every document with ``simple_preprocess`` and fit the parent model.

        Args:
            x: Iterable of document strings.
            y: Ignored. It defaults to None so the transformer can be fitted
                with or without targets.

        Returns:
            ``self``.
        """
        super().fit([simple_preprocess(doc) for doc in x])
        return self



# Classifiers to try on top of the FastText document features (the LR variant is disabled).
classifiers = [OneVsRestClassifier(SVC(random_state=RANDOM_SEED))]
#[MultiOutputRegressor(LogisticRegression(random_state=RANDOM_SEED, multi_class='multinomial'))]

for classifier in classifiers:

    # FastText features feed straight into the classifier; no vectorizer / TF-IDF steps here.
    pipeline = Pipeline([
            ('ftt', FTTransformer2(size=12, min_count=1, seed=0,batch_words=100)),
            ('clf', classifier)
        ])

    print(pipeline)
    pipeline.fit(train_x, train_y)
    # NOTE(review): dev_x/dev_y are None while `dev=None` in the split-loading cell — confirm.
    predict_y = pipeline.predict(dev_x)
    print(classification_report(dev_y, predict_y, target_names=class_labels, output_dict=True))
    #print(np.sum(np.equal(dev_y, predict_y).all(axis=1)) / dev_y.shape[0])
    print(pipeline.score(dev_x, dev_y))

In [None]:
from skmultilearn.problem_transform import ClassifierChain

# Every ordering of the label indices: with six labels this is 6! = 720 chains,
# each of which re-fits the whole pipeline (tokenisation included).
label_orders = list(itertools.permutations(range(len(class_labels))))
for label_order in label_orders:
    print("Results for label order:", label_order)
    # NOTE(review): SVC() is created without random_state here, unlike the earlier SVM cell — confirm.
    classifier = ClassifierChain(
        classifier = SVC(),
        require_dense = [False, True],
        order=label_order
    )

    pipeline = Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize_text, lowercase=True)),
                ('tfidf', TfidfTransformer()),
                ('clf', classifier)
            ])

    # Same settings as the module-level best_params cell; redefined on every iteration.
    best_params = {'tfidf__norm': 'l1', 'tfidf__use_idf': True, 
                   'vect__max_df': 1.0, 'vect__min_df': 0.75,
                   'vect__ngram_range': (1, 2)}

    pipeline.set_params(**best_params)

    pipeline.fit(train_x, train_y)
    # NOTE(review): dev_x/dev_y are None while `dev=None` in the split-loading cell — confirm.
    predict_y = pipeline.predict(dev_x)
    print(classification_report(dev_y, predict_y, target_names=class_labels, output_dict=True))
        #print(np.sum(np.equal(dev_y, predict_y).all(axis=1)) / dev_y.shape[0])
    print(pipeline.score(dev_x, dev_y))


In [None]:
from skmultilearn.problem_transform import LabelPowerset
# Label-powerset transformation with an SVC base classifier.
classifier = LabelPowerset(
    classifier = SVC(),
    require_dense = [False, True]
)

pipeline = Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize_text, lowercase=True)),
            ('tfidf', TfidfTransformer()),
            ('clf', classifier)
        ])

# Same settings as the module-level best_params cell.
best_params = {'tfidf__norm': 'l1', 'tfidf__use_idf': True, 
               'vect__max_df': 1.0, 'vect__min_df': 0.75,
               'vect__ngram_range': (1, 2)}

pipeline.set_params(**best_params)

pipeline.fit(train_x, train_y)
# NOTE(review): dev_x/dev_y are None while `dev=None` in the split-loading cell — confirm.
predict_y = pipeline.predict(dev_x)
print(classification_report(dev_y, predict_y, target_names=class_labels, output_dict=True))
#print(np.sum(np.equal(dev_y, predict_y).all(axis=1)) / dev_y.shape[0])
print(pipeline.score(dev_x, dev_y))

In [None]:
from skmultilearn.ensemble import RakelD
n_labels = len(class_labels)
# Sweep labelset_size over 1 .. n_labels - 1 (the full label set size is not tried).
for i in range(1,n_labels):
    classifier = RakelD(
        base_classifier=SVC(),
        base_classifier_require_dense=[True, True],
        labelset_size=i
    )
    pipeline = Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize_text, lowercase=True)),
            ('tfidf', TfidfTransformer()),
            ('clf', classifier)
        ])

    # Same settings as the module-level best_params cell; redefined on every iteration.
    best_params = {'tfidf__norm': 'l1', 'tfidf__use_idf': True, 
                   'vect__max_df': 1.0, 'vect__min_df': 0.75,
                   'vect__ngram_range': (1, 2)}

    pipeline.set_params(**best_params)

    pipeline.fit(train_x, train_y)
    # NOTE(review): dev_x/dev_y are None while `dev=None` in the split-loading cell — confirm.
    predict_y = pipeline.predict(dev_x)
    print("Label set size: ", i)
    print(classification_report(dev_y, predict_y, target_names=class_labels, output_dict=True))
        #print(np.sum(np.equal(dev_y, predict_y).all(axis=1)) / dev_y.shape[0])
    print(pipeline.score(dev_x, dev_y))


In [None]:
from skmultilearn.adapt import BRkNNaClassifier, BRkNNbClassifier, MLkNN, MLARAM, MLTSVM

print('BRkNNaClassifier')
# Sweep the neighbour count k over 3..7.
for k_val in range(3,8):
    print('k=',k_val)
    classifier = BRkNNaClassifier(k=k_val)
    pipeline = Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize_text, lowercase=True)),
            ('tfidf', TfidfTransformer()),
            ('clf', classifier)
        ])

    # Same settings as the module-level best_params cell; redefined on every iteration.
    best_params = {'tfidf__norm': 'l1', 'tfidf__use_idf': True, 
                   'vect__max_df': 1.0, 'vect__min_df': 0.75,
                   'vect__ngram_range': (1, 2)}

    pipeline.set_params(**best_params)

    pipeline.fit(train_x, train_y)
    # NOTE(review): dev_x/dev_y are None while `dev=None` in the split-loading cell — confirm.
    predict_y = pipeline.predict(dev_x)
    print(classification_report(dev_y, predict_y, target_names=class_labels, output_dict=True))
    # Exact-match ratio computed by hand instead of pipeline.score.
    # NOTE(review): assumes predict_y is a dense array comparable with dev_y — confirm for this classifier.
    print(np.sum(np.equal(dev_y, predict_y).all(axis=1)) / dev_y.shape[0])
    #print(pipeline.score(dev_x, dev_y))

In [None]:
print('BRkNNbClassifier')
# Sweep the neighbour count k over 4..7.
for k_val in range(4,8):
    print('k=',k_val)
    classifier = BRkNNbClassifier(k=k_val)
    pipeline = Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize_text, lowercase=True)),
            ('tfidf', TfidfTransformer()),
            ('clf', classifier)
        ])

    # Same settings as the module-level best_params cell; redefined on every iteration.
    best_params = {'tfidf__norm': 'l1', 'tfidf__use_idf': True, 
                   'vect__max_df': 1.0, 'vect__min_df': 0.75,
                   'vect__ngram_range': (1, 2)}

    pipeline.set_params(**best_params)

    pipeline.fit(train_x, train_y)
    # NOTE(review): dev_x/dev_y are None while `dev=None` in the split-loading cell — confirm.
    predict_y = pipeline.predict(dev_x)
    print(classification_report(dev_y, predict_y, target_names=class_labels, output_dict=True))
        #print(np.sum(np.equal(dev_y, predict_y).all(axis=1)) / dev_y.shape[0])
    print(pipeline.score(dev_x, dev_y))

In [None]:
print('MLkNN')
# Sweep the neighbour count k over 1..7.
for k_val in range(1,8):
    print('k=',k_val)
    classifier = MLkNN(k=k_val)
    pipeline = Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize_text, lowercase=True)),
            ('tfidf', TfidfTransformer()),
            ('clf', classifier)
        ])

    # Same settings as the module-level best_params cell; redefined on every iteration.
    best_params = {'tfidf__norm': 'l1', 'tfidf__use_idf': True, 
                   'vect__max_df': 1.0, 'vect__min_df': 0.75,
                   'vect__ngram_range': (1, 2)}

    pipeline.set_params(**best_params)

    pipeline.fit(train_x, train_y)
    # NOTE(review): dev_x/dev_y are None while `dev=None` in the split-loading cell — confirm.
    predict_y = pipeline.predict(dev_x)
    print(classification_report(dev_y, predict_y, target_names=class_labels, output_dict=True))
        #print(np.sum(np.equal(dev_y, predict_y).all(axis=1)) / dev_y.shape[0])
    print(pipeline.score(dev_x, dev_y))

In [None]:
print('MLARAM')
# Sweep the vigilance parameter over four values.
for vig in [.8,.85,.9,.99]:
    print('vigilance=',vig)
    classifier = MLARAM(vigilance=vig)
    pipeline = Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize_text, lowercase=True)),
            ('tfidf', TfidfTransformer()),
            ('clf', classifier)
        ])

    # Same settings as the module-level best_params cell; redefined on every iteration.
    best_params = {'tfidf__norm': 'l1', 'tfidf__use_idf': True, 
                   'vect__max_df': 1.0, 'vect__min_df': 0.75,
                   'vect__ngram_range': (1, 2)}

    pipeline.set_params(**best_params)

    pipeline.fit(train_x, train_y)
    # NOTE(review): dev_x/dev_y are None while `dev=None` in the split-loading cell — confirm.
    predict_y = pipeline.predict(dev_x)
    print(classification_report(dev_y, predict_y, target_names=class_labels, output_dict=True))
        #print(np.sum(np.equal(dev_y, predict_y).all(axis=1)) / dev_y.shape[0])
    print(pipeline.score(dev_x, dev_y))
    # Reset the classifier after each run.
    # NOTE(review): a fresh MLARAM instance is created every iteration, so this may be redundant — confirm.
    classifier.reset()

In [None]:
print('MLTSVM')
# Sweep c_k over 2**-5, 2**-3, 2**-1, 2**1, 2**3.
for c_k_val in [2**i for i in range(-5, 5, 2)]:
    print('c_k=',c_k_val)
    classifier = MLTSVM(c_k=c_k_val)
    pipeline = Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize_text, lowercase=True)),
            ('tfidf', TfidfTransformer()),
            ('clf', classifier)
        ])

    # Same settings as the module-level best_params cell; redefined on every iteration.
    best_params = {'tfidf__norm': 'l1', 'tfidf__use_idf': True, 
                   'vect__max_df': 1.0, 'vect__min_df': 0.75,
                   'vect__ngram_range': (1, 2)}

    pipeline.set_params(**best_params)

    pipeline.fit(train_x, train_y)
    # NOTE(review): dev_x/dev_y are None while `dev=None` in the split-loading cell — confirm.
    predict_y = pipeline.predict(dev_x)
    print(classification_report(dev_y, predict_y, target_names=class_labels, output_dict=True))
    # Exact-match ratio computed by hand instead of pipeline.score.
    # NOTE(review): assumes predict_y is a dense array comparable with dev_y — confirm for this classifier.
    print(np.sum(np.equal(dev_y, predict_y).all(axis=1)) / dev_y.shape[0])
    #print(pipeline.score(dev_x, dev_y))

In [None]:
# Display the predictions left over from whichever experiment cell ran last (depends on execution order).
predict_y

In [None]:
def autolabel(rects, ax=None):
    """Attach a text label above each bar in *rects*, displaying its height.

    Args:
        rects: Iterable of bar patches; each must provide ``get_height``,
            ``get_x`` and ``get_width``.
        ax: Axes to annotate. When omitted, every bar is annotated on the
            axes it belongs to (``rect.axes``), so no global ``ax`` is needed.
    """
    for rect in rects:
        target_ax = ax if ax is not None else rect.axes
        height = rect.get_height()
        target_ax.annotate('{:.1f}'.format(height),
                           xy=(rect.get_x() + rect.get_width() / 2, height),
                           xytext=(0, 3),  # 3 points vertical offset
                           textcoords="offset points",
                           ha='center', va='bottom')

In [None]:
def barplot(ax, models, classes, data):
    '''
    Draw a grouped bar chart on ``ax``: one group per class, one bar per model.

    Args:
        ax: Axes to draw on; every call goes through this object, so the
            result does not depend on pyplot's current axes.
        models: Sequence of model names, used as legend labels.
        classes: Sequence of category names, used as x tick labels.
        data: 2-D array indexed as ``data[model_index, class_index]``.

    Bars are placed with ``align='edge'`` so that each group spans
    ``tick - 0.35 .. tick + 0.35`` and is centred on its tick.
    '''

    # the space between each set of bars
    space = 0.3
    n_models = len(models)
    # max(..., 1) keeps an empty model list from dividing by zero
    width = (1 - space) / max(n_models, 1)
    # Computed outside the loop so the tick setup also works with no models.
    indices = range(1, len(classes) + 1)

    # Create a set of bars at each position
    for i, model in enumerate(models):
        vals = data[i, :]
        pos = [j - (1 - space) / 2. + i * width for j in indices]
        ax.bar(pos, vals, width=width, align='edge', label=model)

    # Set the x-axis tick labels to be equal to the categories
    ax.set_xticks(list(indices))
    ax.set_xticklabels(classes, rotation=90)
    ax.set_ylim([0, 1.2])

    # Add the axis labels
    ax.set_ylabel("F1")
    ax.set_xlabel("Response Label")

    # Add a legend
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles, labels, loc='upper right')
        


In [None]:
# One row per model: average each metrics array over axis 0, then take row 2.
# NOTE(review): majority_metrics, bow*_metrics and svc_metrics are not defined in any
# visible cell — this cell relies on kernel state from elsewhere.
# NOTE(review): row 2 is presumably the F1 row (barplot labels the y axis "F1") — confirm.
data = [np.mean(majority_metrics, axis=0)[2,:], 
                 np.mean(bow1_metrics, axis=0)[2,:],
                 np.mean(bow2_metrics, axis=0)[2,:],
                 np.mean(bow3_metrics, axis=0)[2,:],
                 np.mean(bow2_tfidf_norml2_metrics, axis=0)[2,:],
                 np.mean(bow2_tfidf_norml1_metrics, axis=0)[2,:]]

# Append one more row per entry of svc_metrics (the second item of each pair is unused).
for svc_metric,_ in svc_metrics:
    data.append(np.mean(svc_metric, axis=0)[2,:])

# Convert to a 2-D array so barplot can index it as data[i, :].
data = np.array(data) 

In [None]:
# Mean of each row of `data`, i.e. one average per model across the label columns.
np.mean(data, axis=1)

In [None]:
fig = plt.figure()
ax = fig.add_subplot(111)
# Nine model names are passed; `data` has six fixed rows plus one per svc_metrics entry.
# NOTE(review): the 5th and 6th names say "BOW1-TFIDF" while the matching data rows come
# from bow2_tfidf_* variables — confirm which is correct.
barplot(ax, ['Majority', 'SVC-L-BOW-1', 'SVC-L-BOW-2', 'SVC-L-BOW-3','SVC-L-BOW1-TFIDF-L2', 'SVC-L-BOW1-TFIDF-L1', 'SVC-P-BOW-1', 'SVC-R-BOW-1', 'SVC-S-BOW-1'], class_labels, data) 
plt.show()