# Paraphrase

## Read test and train data

In [220]:
def split_tags(string):
    return [tuple(i.split("/")) for i in string.split()]

def readTrainData(filename):
    data = []
    for line in open(filename):
        line = line.strip()
        #read in training or dev data with labels
        if len(line.split('\t')) == 7:
            (trendid, trendname, origsent, candsent, judge, origsenttag, candsenttag) = \
            line.split('\t')
        else:
            continue
        # ignoring the training data that has middle label 
        nYes = eval(judge)[0]            
        if nYes >= 3:
            amt_label = True
            data.append((split_tags(origsenttag), split_tags(candsenttag), amt_label))
        elif nYes <= 1:
            amt_label = False
            data.append((split_tags(origsenttag), split_tags(candsenttag), amt_label))
    return data

def readTestData(filename):
    data = []
    for line in open(filename):
        line = line.strip()
        #read in training or dev data with labels
        if len(line.split('\t')) == 7:
            (trendid, trendname, origsent, candsent, judge, origsenttag, candsenttag) = \
            line.split('\t')
        else:
            continue
        # ignoring the training data that has middle label 
        nYes = int(judge[0])
        if nYes >= 4:
            expert_label = True
        elif nYes <= 2:
            expert_label = False
        else:
            expert_label = None
        data.append((split_tags(origsenttag), split_tags(candsenttag), expert_label))
    return data

train_data = readTrainData("SemEval-PIT2015-py3/data/train.data")
dev_data = readTrainData("SemEval-PIT2015-py3/data/dev.data")
test_data = readTestData("SemEval-PIT2015-py3/data/test.data")

## Extract features

In [221]:
from pyemd import emd
import gensim
from gensim.models import KeyedVectors

word2vec_model = KeyedVectors.load_word2vec_format("numberbatch-en.txt", binary=False)

In [230]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))
lem = WordNetLemmatizer()

def preprocess_text(words):
    words = [w for w in words if w not in stop_words]
    words = [w for w in words if len(w) > 3]
    words = [lem.lemmatize(w) for w in words]
    return words

def sample_x(sample, sub=False):
    get_words = lambda ann_tw: [ann_w[0] for ann_w in ann_tw]
    to_vec = lambda w : word2vec_model[w] if w in word2vec_model else np.zeros(300)
    
    def tw_vec(vecs):
        if not vecs:
            return np.zeros(300)
        return np.sum(np.array(vecs), axis=0) / len(vecs)
    
    text0 = preprocess_text(get_words(sample[0]))
    text1 = preprocess_text(get_words(sample[1]))

    vec0 = tw_vec([to_vec(w) for w in text0])
    vec1 = tw_vec([to_vec(w) for w in text1])
    #dist = model.wmdistance(get_words(sample[0]), get_words(sample[1]))
    #dist = min(dist, 100)

    if sub:
        return vec0 - vec1
    else:
        return np.concatenate((vec0, vec1), axis=None)

    #return np.concatenate((vec0, vec0), axis=None)
    #return {'vecs0': vecs0, 'vecs1': vecs1}

def sample_y(sample):
    return sample[2]

X_train = [sample_x(sample) for sample in train_data]
Y_train = [sample_y(sample) for sample in train_data]

X_dev = [sample_x(sample) for sample in dev_data]
Y_dev = [sample_y(sample) for sample in dev_data]

X_test = [sample_x(sample) for sample in test_data]

## Logistic Regression

In [231]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

lrc = LogisticRegression(random_state=42, solver="sag", max_iter=1000)
lrc.fit(X_train, Y_train)
print(classification_report(lrc.predict(X_dev), Y_dev))

              precision    recall  f1-score   support

       False       0.93      0.65      0.76      3862
        True       0.07      0.36      0.12       280

   micro avg       0.63      0.63      0.63      4142
   macro avg       0.50      0.51      0.44      4142
weighted avg       0.87      0.63      0.72      4142



## FNN

In [269]:
def vectorize_sequences(train_vecs):
    results = np.zeros((len(train_vecs), len(train_vecs[0])))
    for i, train_vec in enumerate(train_vecs):
        results[i] = train_vec
    return results

X_train = vectorize_sequences([sample_x(sample, True) for sample in train_data])
X_test = vectorize_sequences([sample_x(sample, True) for sample in test_data])
Y_train = np.asarray(Y_train).astype('float32')

In [273]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import regularizers

model = keras.Sequential([
    keras.layers.Dense(10, activation='relu', input_shape=(300,)),
    keras.layers.Dense(2, activation='softmax')
])

model.compile(optimizer='adam', 
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.fit(X_train, Y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History object at 0x10862d160>

## Evaluate

In [274]:
def OutputPredictions(Y_pred, outfile):
    # output the results into a file
    outf = open(outfile,'w') 

    for y in Y_pred:
        if y >= 0.5:
            outf.write("true\t" + "{0:.4f}".format(y) + "\n")
        else:
            outf.write("false\t" + "{0:.4f}".format(y) + "\n")

    outf.close()

In [275]:
%cd -

#Y_pred = [1] * len(test_data)
#Y_pred = lrc.predict(X_test)
#OutputPredictions(Y_pred, 'SemEval-PIT2015-py3/systemoutputs/PIT2015_SASHA_01_LG.output')


Y_pred = model.predict(X_test)
Y_pred = [int(y[0]>y[1]) for y in Y_pred]
OutputPredictions(Y_pred, 'SemEval-PIT2015-py3/systemoutputs/PIT2015_SASHA_01_FNN.output')

/Users/sasha/Documents/prj-nlp-2019_/practice_11


In [276]:
%cd SemEval-PIT2015-py3/scripts
!python3 pit2015_eval_single.py ../data/test.label ../systemoutputs/PIT2015_BASELINE_02_LG.output
!python3 pit2015_eval_single.py ../data/test.label ../systemoutputs/PIT2015_SASHA_01_LG.output
!python3 pit2015_eval_single.py ../data/test.label ../systemoutputs/PIT2015_SASHA_01_FNN.output

/Users/sasha/Documents/prj-nlp-2019_/practice_11/SemEval-PIT2015-py3/scripts
838	BASELINE	02_LG		F: 0.589	Prec: 0.679	Rec: 0.520		P-corr: 0.511	F1: 0.601	Prec: 0.674	Rec: 0.543
838	SASHA	01_LG		F: 0.163	Prec: 0.515	Rec: 0.097		P-corr: 0.131	F1: 0.389	Prec: 0.258	Rec: 0.783
838	SASHA	01_FNN		F: 0.344	Prec: 0.209	Rec: 0.983		P-corr: -0.027	F1: 0.375	Prec: 0.251	Rec: 0.743
