In [1]:
import os
import csv
import re
from nltk import sent_tokenize

In [10]:
def get_sentences(filepath):
    """Read a text file and split its contents into sentences.

    The file is decoded as ISO-8859-1 and its full text is handed to
    ``sent_tokenize`` in a single call.

    Args:
        filepath: Path of the text file to read.

    Returns:
        Whatever ``sent_tokenize`` returns for the file's text.
    """
    with open(filepath, mode='r', encoding="ISO-8859-1") as handle:
        raw_text = handle.read()
    return sent_tokenize(raw_text)

In [8]:
def create_labelled_text(text, novs, divs, label):
    """Build a position-keyed dictionary of labelled text records.

    Args:
        text: Iterable of text items (e.g. sentences).
        novs: Novelty values, indexed by the position of each text item.
        divs: Diversity values, indexed by the position of each text item.
        label: Value attached unchanged to every record.

    Returns:
        A dict mapping each position ``i`` in ``text`` to the tuple
        ``(text item, novs[i], divs[i], label)``.

    Note:
        ``novs`` and ``divs`` are indexed directly, so a list shorter than
        ``text`` raises ``IndexError``; extra trailing values are ignored.
    """
    return {
        position: (item, novs[position], divs[position], label)
        for position, item in enumerate(text)
    }


def write_to_csv(text_dict, filename):
    """Write labelled text records to a CSV file.

    A header row ``index,text,novelty,diversity,label`` is written first,
    followed by one row per entry of ``text_dict``.

    Args:
        text_dict: Mapping of row index to an indexable record; items 0-3 of
            the record are written as text, novelty, diversity and label.
        filename: Path of the CSV file to create or overwrite.
    """
    # real labelled as 1, fake labelled as 0
    columns = ["index", "text", "novelty", "diversity", "label"]
    with open(file=filename, mode='w', newline='') as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=columns)
        csv_writer.writeheader()
        # Build every row before writing so a malformed record cannot leave
        # a partially written body behind.
        rows = []
        for key, record in text_dict.items():
            rows.append({
                "index": key,
                "text": record[0],
                "novelty": record[1],
                "diversity": record[2],
                "label": record[3],
            })
        csv_writer.writerows(rows)

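# NOTE: bad function name -- this function has the same body as create_labelled_text (only the parameter names differ).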
def create_labelled_dictionary(text, novelties, diversities, label):
    """Pair every text item with its novelty, diversity and a label.

    Args:
        text: Iterable of text items (e.g. sentences).
        novelties: Novelty values, indexed by the position of each item.
        diversities: Diversity values, indexed by the position of each item.
        label: Value attached unchanged to every record.

    Returns:
        A dict keyed by position in ``text``; each value is the tuple
        ``(text item, novelty, diversity, label)`` for that position.

    Note:
        The metric sequences are indexed directly, so a list shorter than
        ``text`` raises ``IndexError``; extra trailing values are ignored.
    """
    labelled = {}
    for position, sentence in enumerate(text):
        record = (sentence, novelties[position], diversities[position], label)
        labelled[position] = record
    return labelled


def write_to_file(text_dict, filename):
    """Dump labelled text records to ``filename`` in CSV format.

    The output starts with the header ``index,text,novelty,diversity,label``
    and then contains one row per ``text_dict`` entry, in iteration order.

    Args:
        text_dict: Mapping of row index to an indexable record whose items
            0-3 are the text, novelty, diversity and label of that row.
        filename: Path of the CSV file to create or overwrite.
    """
    # real labelled as 1, fake labelled as 0
    header = ["index", "text", "novelty", "diversity", "label"]
    with open(file=filename, mode='w', newline='') as sink:
        dict_writer = csv.DictWriter(sink, fieldnames=header)
        dict_writer.writeheader()
        # All rows are materialised first, then written in one call.
        records = []
        for row_id, entry in text_dict.items():
            cells = (row_id, entry[0], entry[1], entry[2], entry[3])
            records.append({name: cell for name, cell in zip(header, cells)})
        dict_writer.writerows(records)


def read_list(filename: str) -> list:
    """Read the decimal numbers contained in a text file.

    The file is decoded as ISO-8859-1 and scanned for every token of the
    form ``<digits>.<digits>`` (for example ``0.25``).

    Args:
        filename: Path of the text file to scan.

    Returns:
        A list of the matched tokens in file order. Each item is the matched
        text (``str``), not a ``float``. Integers without a decimal point are
        not matched, and for signed or exponent-form numbers only the
        ``<digits>.<digits>`` part is captured.
    """
    with open(file=filename, mode='r', encoding="ISO-8859-1") as f:
        # Raw string: "\d" and "\." in a plain literal are invalid escape
        # sequences (DeprecationWarning, SyntaxWarning from Python 3.12).
        # A match holds only digits and one dot, so it can never span a
        # ",\n" separator; scanning the whole text at once therefore yields
        # the same tokens as splitting on ",\n" and scanning each piece.
        return re.findall(r"\d+\.\d+", f.read())

In [11]:
# Driver cell: tokenize the test and generated text files, load the
# pre-computed Jaccard novelty/diversity values, and write one labelled CSV
# per class (label 1 for real text, label 0 for generated text).

# Input text files, relative to the notebook's working directory.
DATA_FILE = "../data/emnlp_news.txt"
TEST_FILE = "../data/test_emnlp.txt"
GENERATED_FILE = "../data/generated_text3.txt"

# save these sentences and novelties to save computation time
# NOTE(review): the sentence counts in the inline comments below are the
# original author's and may be stale -- they are not verified here.
# corpus_sentences = get_sentences(DATA_FILE)  # 304222 sentences
real_sentences = get_sentences(TEST_FILE) # 10785 sentences
fake_sentences = get_sentences(GENERATED_FILE)  # 11055 sentences

# Pre-computed metric values; read_list returns them as strings, in file order.
fake_diversities = read_list('jaccard_diversities_fake3.txt')
fake_novelties = read_list('jaccard_novelties_fake3.txt')

real_novelties = read_list('jaccard_novelties_real.txt')
real_diversities = read_list('jaccard_diversities_real.txt')

# Manual sanity check: create_labelled_dictionary indexes the metric lists by
# sentence position, so each line should print three equal numbers. A shorter
# metric list raises IndexError below; extra metric values are ignored.
print(len(fake_sentences), len(fake_diversities), len(fake_novelties))
print(len(real_sentences), len(real_diversities), len(real_novelties))

# NOTE(review): real_text and fake_text are not used anywhere in this cell.
# The first call tokenizes DATA_FILE -- presumably the full corpus that the
# commented-out line above skips to save time -- and the second repeats the
# tokenization already stored in fake_sentences. Confirm no later cell needs
# them before removing.
real_text = get_sentences(DATA_FILE)
fake_text = get_sentences(GENERATED_FILE)

# label real text with 1
real_dict = create_labelled_dictionary(real_sentences, real_novelties, real_diversities, 1)
write_to_file(real_dict, filename='labelled_real_metrics_jaccard.csv')

# label fake text with 0
fake_dict = create_labelled_dictionary(fake_sentences, fake_novelties, fake_diversities, 0)
write_to_file(fake_dict, filename='labelled_fake_metrics_jaccard.csv')

10769 10769 10769
