# In [None]:
import argparse
import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import os

def get_indices(s):
    """Return the positions of all tab characters in ``s``.

    Args:
        s: The sequence (normally a string) to scan.

    Returns:
        A list of zero-based indices, in ascending order, at which an
        element equal to ``"\\t"`` occurs. Empty if there are none.
    """
    tab_positions = []
    for position, char in enumerate(s):
        if char == "\t":
            tab_positions.append(position)
    return tab_positions


def get_n_gram_overlap(first, second):
    """Find the distinct items shared by two collections.

    Args:
        first: Iterable of hashable items (e.g. n-gram tuples).
        second: Iterable of hashable items to compare against ``first``.

    Returns:
        A two-element list ``[shared, count]`` where ``shared`` is the set
        of items present in both inputs and ``count`` is its size.
        Duplicates within an input are counted once.
    """
    shared = set(first).intersection(second)
    return [shared, len(shared)]


def set_n_grams(first_tokens, second_tokens, n):
    """Build the n-grams of two token sequences.

    Args:
        first_tokens: Tokens of the first sentence.
        second_tokens: Tokens of the second sentence.
        n: Size of each n-gram, passed straight to ``nltk.ngrams``.

    Returns:
        A ``(first_n_grams, second_n_grams)`` tuple; each element is the
        fully materialised list of whatever ``nltk.ngrams`` yields for the
        corresponding token sequence.
    """
    return tuple(
        list(nltk.ngrams(tokens, n))
        for tokens in (first_tokens, second_tokens)
    )


def create_training_data(s, n=2):
    """Parse a tab-separated sentence-pair file into a dict of records.

    The first line of the file is treated as a header and skipped. Every
    remaining non-blank line must contain at least five tab-separated
    columns: an integer quality label, two columns that are ignored here,
    the first sentence, and the second sentence (which keeps any further
    tabs it may contain).

    Args:
        s: Path of the file to read.
        n: Size of the n-grams used for the overlap features (default 2).

    Returns:
        A dict keyed by consecutive 1-based record numbers. Each value is a
        dict with the keys ``'quality'`` (int label), ``'first'`` and
        ``'second'`` (lists of word tokens), ``'ngram overlap'`` (set of
        n-grams common to both sentences) and ``'overlap count'`` (size of
        that set).

    Raises:
        IndexError: If a non-blank data line has fewer than five columns.
        ValueError: If the label column is not an integer.
    """
    # The context manager guarantees the handle is closed even on error.
    with open(s) as train_file:
        lines = train_file.read().split('\n')[1:]

    # Kept from the original implementation: echo the last raw line of the
    # file. Guarded so a header-only file no longer raises IndexError.
    if lines:
        print(lines[-1])

    # Loop-invariant, so build it once instead of once per row.
    tokenizer = RegexpTokenizer(r'\w+')

    # Skipping blank lines (typically the empty string after a trailing
    # newline) replaces the old "drop the last element" zip trick, which
    # lost the final record whenever the file had no trailing newline.
    rows = (line for line in lines if line.strip())

    train_dict = {}

    for i, line in enumerate(rows, start=1):
        # maxsplit=4 leaves everything after the fourth tab in fields[4].
        fields = line.split('\t', 4)
        quality = int(fields[0])
        first = fields[3]
        second = fields[4]

        first_tokens = tokenizer.tokenize(first)
        second_tokens = tokenizer.tokenize(second)

        first_n_grams, second_n_grams = set_n_grams(first_tokens,
                                                    second_tokens, n)

        shared, shared_count = get_n_gram_overlap(first_n_grams,
                                                  second_n_grams)

        train_dict[i] = {
            'quality': quality,
            'first': first_tokens,
            'second': second_tokens,
            'ngram overlap': shared,
            'overlap count': shared_count,
        }

    return train_dict


# Bare expression: the result is discarded when run as a script and only
# shows up in an interactive session. The relative paths below are resolved
# against this working directory.
os.getcwd()

# Parse the train split, the test split and the complete corpus with the
# default n-gram size.
train = create_training_data(
    "../../ParaphraseDetection/data/msr_paraphrase_train.txt"
)
test = create_training_data(
    "../../ParaphraseDetection/data/msr_paraphrase_test.txt"
)
full = create_training_data(
    "../../ParaphraseDetection/data/msr_paraphrase_data.txt"
)

# Record counts per dataset (evaluated and discarded outside an
# interactive session).
len(train)
len(test)
len(full)

# Label distribution per split. Any record whose label is not 1 is counted
# in the "0" bucket. create_training_data keys its records 1..len(dict), so
# walking the dict values visits exactly the records the index loop did.
quality_1_train = sum(
    1 for record in train.values() if record['quality'] == 1
)
quality_0_train = len(train) - quality_1_train

quality_1_test = sum(
    1 for record in test.values() if record['quality'] == 1
)
quality_0_test = len(test) - quality_1_test

# Running totals of the per-pair mean token count, i.e. the mean of the
# two sentences' lengths summed over every pair in the split.
sent_len_train = sum(
    (len(record['first']) + len(record['second'])) / 2
    for record in train.values()
)
sent_len_test = sum(
    (len(record['first']) + len(record['second'])) / 2
    for record in test.values()
)

# Bare expressions: only displayed in an interactive session.
quality_1_train
quality_0_train
quality_1_test
quality_0_test

# Average token count per sentence for each split.
average_sent_train = sent_len_train / len(train)
average_sent_test = sent_len_test / len(test)

average_sent_train
average_sent_test
