- This file is for developing a script that reformats the individual CSV files generated for each descriptive construction into a single corpus file.
- The corpus query results are tab-separated and span three columns, so they have to be tidied up first.
- All sentences are then combined into one list, which can be exported.

In [2]:
import os

In [27]:
def get_sents_from_csv(subdir, filename):
    """
    Reformats the tab-separated output of a SeaCOW query for one descriptive construction into a list of sentences.

    Lines beginning with '#' or with the header marker 'doc' are ignored. For every other line, the first
    tab-separated field (the original URL of the sentence) is dropped and the remaining fields are joined with
    single spaces. Blank lines and lines that carry no text after the first field are skipped.

    Args:
        subdir: a string with the subdirectory containing the CSV file.
        filename: a string ending in '.csv'; the file to read in.
    Returns:
        A list of sentences as strings from the given concordance file.
    Raises:
        OSError: if the file cannot be opened.
    """

    path = os.path.join(subdir, filename)

    sentences = []

    with open(path, encoding='utf-8') as file:
        for line in file:

            # Only the actual concordance lines matter: skip comment lines and the 'doc' header line.
            if line.startswith(('#', 'doc')):
                continue

            # The output is tab-separated; everything after the first field belongs to the sentence.
            fields = line.strip('\n').split('\t')
            sentence = ' '.join(fields[1:])

            # An empty first sentence column leaves one leading space after joining; drop it.
            # startswith() is safe on an empty string, unlike indexing with [0].
            if sentence.startswith(' '):
                sentence = sentence[1:]

            # Blank lines and rows without any sentence text produce an empty string; leave those out.
            if sentence:
                sentences.append(sentence)

    return sentences


def create_total_sent_list(subdir):
    """
    Goes through each CSV file in the given subdirectory and combines all of their concordances into one single list.

    Files are processed in sorted name order so that the resulting list is the same on every run
    (os.listdir itself returns names in arbitrary order).

    Args:
        subdir: a string, the name of the subdirectory to gather concordance files from.
    Returns:
        A list containing all sentences as strings.
    """

    # Get all files with a '.csv' extension in the given subdir, in a reproducible order.
    conc_files = sorted(name for name in os.listdir(subdir) if name.endswith('.csv'))

    # Extract the sentences from each file in turn and collect them in one list.
    all_sents = []
    for conc_file in conc_files:
        all_sents.extend(get_sents_from_csv(subdir, conc_file))

    return all_sents

In [30]:
# Subdirectories holding the concordance files for the two corpus versions.
# The smaller version of the corpus; results in 20,680 sentences.
SUBDIR_NANO = 'encow16a-nano'
# The larger version; results in 115,735 sentences.
SUBDIR_FULL = 'encow16a'

# Combined sentence list for the smaller corpus (check the size with len(all_sents_nano)).
all_sents_nano = create_total_sent_list(subdir=SUBDIR_NANO)

# Combined sentence list for the larger corpus (check the size with len(all_sents_full)).
all_sents_full = create_total_sent_list(subdir=SUBDIR_FULL)

In [29]:
# Show the first ten sentences of the nano sentence list.
print(all_sents_nano[:10])

['A cookie is a text file that is placed on your hard disk by a Web page server .', 'A listing is a product placed in the directory for sale .', 'After all , a home is the largest ( and most emotional ) investment most people will ever make .', "While emotions are probably in high gear once you 've found a home you love , it 's important to remember that a home is an investment .", "With so many questions , a consultation is the start of contributing positively to one 's future .", 'A classroom is a small community in and of itself .', "The recent studies like studies before them for 30 years showed cancer , heart attacks , strokes , obesity , diabetes , etc. It is said where there is smoke there is fire , and the aspartame studies that continually show aspartame as a killer are a blazing conflagration that EFSA does n't know how to deal with .", 'An NTD is an opening in the spinal cord or brain that occurs very early in human development .', 'A pinhole is a small hole used in Confocal

In [31]:
# Write these lists to new text files.

def write_to_corpus_file(sents_list, out_filename):
    """
    Writes the given sentences to a text file, one sentence per line.

    The file is created if it does not exist and overwritten if it does. It is written as UTF-8, matching
    the encoding used when the concordance files are read in.

    Args:
        sents_list: an iterable of sentences as strings.
        out_filename: a string, the path of the text file to write.
    Raises:
        OSError: if the file cannot be opened for writing.
    """
    # The context manager closes the file even if a write fails part-way through.
    with open(out_filename, 'w', encoding='utf-8') as file:
        file.writelines(sent + '\n' for sent in sents_list)
    
# Full export of the nano sentence list, currently disabled:
# write_to_corpus_file(all_sents_nano, 'description-corpus-20k.txt')
# Write only the first ten nano sentences to 'test.txt'.
write_to_corpus_file(all_sents_nano[:10], 'test.txt')