### This code reads in training data, splits and writes out train and test data for experimentations

In [1]:
from pathlib import Path
import re
import random

In [2]:
# Set paths
qiqc_train_in = Path('/data/data1/datasets/kaggle/quora_insincere/train.csv')

qiqc_expt_train_out = Path('/data/data1/datasets/kaggle/quora_insincere/output/qiqc_expt_train_set.txt')
qiqc_expt_test_out = Path('/data/data1/datasets/kaggle/quora_insincere/output/qiqc_expt_test_set.txt')

In [3]:
# Set seed for repeatibility
random.seed(42)
# What percent of data to save separately as test data
percent_test_data = 0.10

In [4]:
# Function that converts string to lowercase and replaces any punctuations to <space>punctuation
def strip_formatting(string):
    string = string.lower()
    string = re.sub(r"([.!?,'/()])", r" \1 ", string)
    return string


In [5]:
def check_train_line_format(line):
    ''' Checks that line in the training dataset is in the correct format
    
        Correct format is: qid,question,(0|1)
        Where:
            qid: is a 20 digit hex number
            question: is a free-format string
            (0|1): is, well, 0 or 1
            
        Parameters:
            line: String to be checked
        Returns:
            True: if line is in correct pattern
            False: If line is not in correct pattern
    '''
    
    # The regex pattern is checked as follows:
    #   a) ^[0-9a-f]{20} => line starts with a hex characters 20 hex digits long
    #   b) .*: followed by any characters of any length
    #   c) \,(0|1): Line end with a comma followed by 0 or 1
    return True if re.match(r'^[0-9a-f]{20}.*\,(0|1)', line) else False
    

In [6]:
def check_test_line_format(line):
    ''' Checks that line in the test dataset is in the correct format
    
        Correct format is: qid,question
        Where:
            qid: is a 20 digit hex number
            question: is a free-format string
            
        Parameters:
            line: String to be checked
        Returns:
            True: if line is in correct pattern
            False: If line is not in correct pattern
    '''
    
    # The regex pattern is checked as follows:
    #   a) ^[0-9a-f]{20} => line starts with a hex characters 20 hex digits long
    #   b) .*: followed by any characters of any length
    return True if re.match(r'^[0-9a-f]{20}.*', line) else False
    

In [9]:
def prep_data(train_in, expt_train_out, expt_test_out):
    ''' Convert each line of in_data to format of fastText.
    
        For example, the in_data line: 00002165364db923c7e6,How did Quebec nationalists see their province as a nation in the 1960s?,0
        Needs to change ot: __label__0: how did quebec nationalists see their province as a nation in the 1960s ?
    
        So basically:
            1. Question ID is removed
            2. A "__label__0" or "__label__1" is prepended to each string
            3. String changes to all lower case
            4. All punctuation change to <space>punctuation. eg: in above example, 1960s? changes to 1960s ?
        
        Parameters:
            in_data: Input data (csv) with each question on a separate line in the format: qid, question, 0/1
            train_out: pathlib.Path name of training data
            test_out: pathlib. Path name of test data
            
    '''
    print('Preparing Data . . .')
    with train_in.open() as infh,                 \
         expt_train_out.open('w') as train_out_fh, \
         expt_test_out.open('w') as test_out_fh:

        for line in infh:
            # print(line)
            if line.strip().startswith('qid'):
                continue
            # If line is not in correct format, ignore it
            if not check_train_line_format(line):
                print(f'Ignoring line: {line.strip()}')
                continue
            # Using list slicing because I could not use split on comma. because commas appear withing the question.
            try:
                question = line.strip()[21:-2]
                question = strip_formatting(question)
                label = line.strip()[-1]
                line_out = '__label__{} {}'.format(label, question)
            except:
                print('Errored out')
                print(line)
            # print(line_out)
            
            # Write out formatted data
            if random.random() <= percent_test_data:
                test_out_fh.write(line_out + "\n")  
            else:
                train_out_fh.write(line_out + "\n")
                

In [10]:
# Run the main functions
prep_data(qiqc_train_in, qiqc_expt_train_out, qiqc_expt_test_out)

Preparing Data . . .
Ignoring line: 663c7523d48f5ee66a3e,"In ""Star Trek 2013"" why did they :
Ignoring line: 
Ignoring line: *Spoilers*
Ignoring line: *Spoilers*
Ignoring line: *Spoilers*
Ignoring line: *Spoilers*
Ignoring line: 
Ignoring line: 1)Make warping look quite a bit like an hyperspace jump
Ignoring line: 2)what in the world were those bright particles as soon as they jumped.
Ignoring line: 3)Why in the world did they make it possible for two entities to react in warp space in separate jumps.
Ignoring line: 4)Why did Spock get emotions for this movie.
Ignoring line: 5)What was the point of hiding the ""Enterprise"" underwater.
Ignoring line: 6)When they were intercepted by the dark ship, how come they reached Earth when they were far away from her.(I don't seem to remember the scene where they warp to earth).
Ignoring line: 7)How did the ship enter earth's atmosphere when it wasnt even in orbit.
Ignoring line: 8)When Scotty opened the door of the black ship , how come pike an