# Creating the description corpora for NN training

This script creates two corpora containing descriptive sentences, `description-corpus-115k.txt` with 115,735 sentences (3,153,298 tokens) and `description-corpus-20k.txt` with 20,680 sentences (557,969 tokens); the latter is a subset of the former.

These corpora are combinations of all of the concordances contained in the CSV files saved in the two subdirectories, `encow16a-nano/` and `encow16a/`.
The CSV files in these two folders contain concordances for various descriptive constructions. We intend to use these concordances to train our text-generating NN, so that it becomes familiar with typical descriptive sentences and might then be able to create better ones itself.
The folders are named according to the corpora from which the concordances are drawn: ENCOW16A (for the 115k-sentence corpus) and ENCOW16A-NANO (for the 20k-sentence corpus).
These are web corpora, containing texts crawled from many genres of website, from chat forums to official business pages, so diverse styles of language use are represented.
(For more detail, see the [Corpora from the Web website](https://corporafromtheweb.org/).)

A list of which descriptive constructions we searched for and the CQL queries used to find them in the corpora can be found in `constructions_and_cql.txt`, and Elizabeth's scripts used to query each corpus are found in the respective subfolders and are both called `descr_sents.py`.
(Even though these can only be run on the COW server, they are included here for transparency's sake.)

In [1]:
import os


def get_sents_from_csv(subdir, filename):
    """
    Reformats the tab-separated outputs of SeaCOW queries for each descriptive construction into a list of sentences.

    Lines beginning with '#' or with the header text 'doc' are ignored. For every other line, the first
    tab-separated column is dropped and the remaining columns are joined with single spaces. Rows that carry no
    sentence text at all (blank lines, or a first column with nothing after it) are skipped rather than raising
    an IndexError.

    Args:
        subdir: a string with the subdirectory containing the CSV file.
        filename: a string ending in '.csv'; the file to read in.
    Returns:
        A list of sentences as strings from the given concordance file.
    """

    # Initialise empty list to fill as we go through the concordance file.
    sent_list = []

    path = os.path.join(subdir, filename)

    # Open desired file and go through it line by line.
    with open(path, encoding='utf-8') as file:
        for line in file:

            # We only care about the actual concordance, so skip the lines beginning with # or the header 'doc'.
            if line.startswith(('#', 'doc')):
                continue

            # Remove the newline and split by tabs (the output is actually tab-separated), then join everything
            # but the first column (the original URL of the sentence) back into one space-separated string.
            fields = line.strip('\n').split('\t')
            sent_as_str = " ".join(fields[1:])

            # Nothing left after dropping the first column: there is no sentence on this row.
            if not sent_as_str:
                continue

            # If this joining results in a space being the first character of the line, remove that space.
            if sent_as_str[0] == " ":
                sent_as_str = sent_as_str[1:]

            sent_list.append(sent_as_str)

    return sent_list


def create_total_sent_list(subdir):
    """
    Goes through each CSV file in the given subdirectory and combines all of their concordances into one single list.

    Files are processed in sorted filename order, so that the order of the combined list does not depend on the
    (arbitrary) order in which the operating system lists the directory.

    Args:
        subdir: a string, the name of the subdirectory to gather concordance files from.
    Returns:
        A list containing all sentences as strings.
    """

    # Get all files with a '.csv' extension in the given subdir, in a reproducible order.
    conc_files = sorted(name for name in os.listdir(subdir) if name.endswith('.csv'))

    # Extract the sentences from each file in turn and concatenate them into a single list.
    return [sent for conc_file in conc_files for sent in get_sents_from_csv(subdir, conc_file)]


def get_token_count(sents_list):
    """
    Counts the total number of tokens across all sentences of a corpus.

    Arg:
        sents_list: A list of each sentence as a string.
    Returns:
        The summed number of pieces obtained by splitting every sentence on single spaces.
    """

    # The sentences are already tokenised and space-separated in ENCOW, so splitting each exported sentence
    # on a single space recovers the original tokenisation; the per-sentence lengths are simply added up.
    return sum(len(sentence.split(' ')) for sentence in sents_list)


def write_to_corpus_file(sents_list, out_filename):
    """
    Creates a new file containing all of the sentences, one on each line.

    An existing file with the same name is overwritten. The file is written as UTF-8.

    Args:
        sents_list: A list of each sentence as a string.
        out_filename: A string ending in '.txt'; the name of the file to be created.
    """

    # The context manager guarantees the file is closed even if a write fails part-way through.
    with open(out_filename, 'w', encoding='utf-8') as file:
        # Write each sentence followed by a newline.
        file.writelines(sent + '\n' for sent in sents_list)

In [2]:
SUBDIR_NANO = 'encow16a-nano'  # Folder holding the concordances for the smaller corpus
SUBDIR_FULL = 'encow16a'       # Folder holding the concordances for the larger corpus


def _print_corpus_stats(heading, sents):
    """Prints the heading followed by the sentence count and token count of the given sentence list."""
    print(heading)
    print(' Number of sentences:\t',  len(sents))
    print(' Number of tokens:\t',  get_token_count(sents))


# Smaller corpus: gather the ENCOW16A-NANO concordances, write them out, then report the statistics.
all_sents_nano = create_total_sent_list(SUBDIR_NANO)
write_to_corpus_file(all_sents_nano, 'description-corpus-20k.txt')
_print_corpus_stats('description-corpus-20k:', all_sents_nano)


# Larger corpus: the same steps for the ENCOW16A concordances (a blank line separates the two reports).
all_sents_full = create_total_sent_list(SUBDIR_FULL)
write_to_corpus_file(all_sents_full, 'description-corpus-115k.txt')
_print_corpus_stats('\ndescription-corpus-115k:', all_sents_full)

description-corpus-20k:
 Number of sentences:	 20680
 Number of tokens:	 557969

description-corpus-115k:
 Number of sentences:	 115735
 Number of tokens:	 3153298
