# Raw Data Cleaning

**File:** `files/Help a CS Student Graduate.csv`  
**Data format:** ```| Timestamp | Username | Ano | Bakit | Paano | Kailan | Sino | Alin | Saan |```

In [4]:
import os
import csv
import re

from nltk.tokenize import RegexpTokenizer

# --------------------------------------
#### Methods

- Convert each row of the CSV file into strings.

In [5]:
def remove_unwanted_chars(my_list):
    """
    Remove the numbering at the beginning of each sentence and drop
    entries that carry no text.

    my_list - list of sentences in a row; it is cleaned in place

    Returns the same list object, so callers may use either the return
    value or the list they passed in.
    """

    cleaned = []
    for item in my_list:
        # A leading ASCII digit marks an enumerated answer ("1. ...",
        # "2- ..."); strip the number and the punctuation that follows it.
        if re.match("^[0-9]", item) is not None:
            item = item.lstrip('0123456789.- ')

        # Skip blank entries, including ones left empty by the stripping.
        if not item.strip():
            continue

        cleaned.append(item)

    # Build a new list and assign it back through a slice: removing items
    # from a list while iterating over it skips the element that follows
    # every removal.
    my_list[:] = cleaned
    return my_list

In [6]:
def extract(input, output):
    """
    Clean the answer cells of the survey CSV and write them out.

    input  - open file object of the raw CSV file
    output - open, writable file object that receives the cleaned sentences

    A header row ("Questions", "Category") is written first. For every
    input row, the cells from column index 2 onward are visited: a cell
    containing ":" is skipped, any other cell is split on newlines,
    cleaned with remove_unwanted_chars and written as one record whose
    fields are separated by newlines. Every field is quoted.
    """

    header_writer = csv.writer(output, delimiter=',', quoting=csv.QUOTE_ALL)
    header_writer.writerow(["Questions", "Category"])

    # Newline-delimited writer: each cleaned sentence lands on its own line.
    sentence_writer = csv.writer(output, delimiter='\n',
                                 quoting=csv.QUOTE_ALL)

    for record in csv.reader(input):
        for cell in record[2:]:
            if ":" in cell:
                continue
            sentences = remove_unwanted_chars(cell.split('\n'))
            sentence_writer.writerow(sentences)

In [7]:
def to_wordpos_dict(file):
    """
    Group tab-separated word/tag lines into one dictionary per sentence.

    file - open file object; on each line the first tab-separated field
           is taken as the word and the last field as its tag

    A line whose first field is exactly "?" closes the current sentence:
    the dictionary collected so far is appended to the result and a new
    one is started. Words read after the last "?" line are not returned.

    Returns a list of {word: tag} dictionaries; tags are stripped of
    surrounding whitespace.
    """
    sentences = []
    current = {}

    for line in file.readlines():
        fields = line.split("\t")
        if fields[0] == "?":
            sentences.append(current)
            current = {}
            continue
        current[fields[0]] = fields[-1].strip()

    return sentences

In [8]:
def get_tags(tag, level):
    """
    Return one part of a hyphenated tag.

    tag   - tag string such as "XX-YYZZ"
    level - 1: the text before the first "-"
            2: the first two characters of the part after the first "-"
            3: the part after the first "-" without its first two characters
            any other value: the tag is returned unchanged

    Levels 2 and 3 raise IndexError when the tag contains no "-".
    """
    if level == 1:
        return tag.split("-")[0]

    if level in (2, 3):
        detail = tag.split("-")[1]
        return detail[:2] if level == 2 else detail[2:]

    return tag

In [9]:
def transform(data):
    """
    Count how often each tag occurs in every sentence.

    data - iterable of {word: tag} dictionaries, one per sentence

    Returns a list with one dictionary per sentence, mapping each of the
    ten known tags to its number of occurrences (0 when absent).
    A tag outside the known set raises KeyError.
    """
    tag_names = ('NN', 'PR', 'DT', 'LM', 'CC', 'VB', 'JJ', 'RB', 'CD', 'TS')
    vectors = []

    for sentence in data:
        # Fresh zeroed counter for every sentence.
        counts = dict.fromkeys(tag_names, 0)
        for word in sentence:
            counts[sentence[word]] += 1
        vectors.append(counts)

    return vectors

In [10]:
def category_vector(file):
    """
    Map the category label of every data row to its numeric code.

    file - open file object of the labelled CSV; column 1 holds the
           category label. The file is closed before returning.

    Rows that contain a "Questions" cell (the header) are skipped.

    Returns a list with one entry per remaining row: [code] for a
    labelled row, [] when the label is empty.
    Raises KeyError for a label that is not one of the six categories and
    IndexError for a row with fewer than two columns.
    """
    category = {'entity': 1, 'abbreviation': 2, 'description': 3, 'human': 4,
                'location': 5, 'numeric': 6}
    cat = []

    try:
        for row in csv.reader(file):
            if "Questions" in row:
                continue

            label = row[1]
            codes = []
            # Test for a non-empty label; ordering a str against the
            # int 0 raises TypeError on Python 3.
            if label != "Category" and label:
                codes.append(category[label.lower()])
            cat.append(codes)
    finally:
        # This function closes the handle it was given, even on error.
        file.close()

    return cat

In [11]:
def count_qmark(file):
    """
    Print every line that contains more than two "?" characters.

    file - open file object to scan

    Lines are printed as read, so each keeps its own trailing newline in
    addition to the one added by print.
    """
    for line in file.readlines():
        if line.count("?") <= 2:
            continue
        print(line)

In [12]:
def get_wh_question(file):
    """
    Tag each question with the code of the first wh-word it contains.

    file - open file object of the labelled CSV; column 0 holds the
           question text

    The question is split on single spaces and the words are compared
    case-insensitively against the base wh-words and their inflected or
    reduplicated variants. Only the first matching word is used.

    Returns a list of one-element lists [code]. Rows without a wh-word
    and blank rows add nothing to the result.
    """
    q_word_tags = {
        'alin': 1, 'saan': 2, 'ano': 3, 'kailan': 4, 'paano': 5,
        'sino': 6, 'bakit': 7,
    }
    # Each variant is mapped explicitly to its base word. A substring
    # search is not reliable here: "paanong" and "papaano" also contain
    # "ano" and would be tagged as 'ano' instead of 'paano'.
    q_word_variants = {
        'aling': 'alin', 'alin-alin': 'alin', 'alin-aling': 'alin',
        'saang': 'saan', 'saan-saan': 'saan', 'nasaan': 'saan',
        'nasaang': 'saan',
        'anong': 'ano', 'anu-ano': 'ano', 'anu-anong': 'ano',
        'inaano': 'ano',
        'paanong': 'paano', 'papaano': 'paano', 'papaanong': 'paano',
        'sinong': 'sino', 'sinu-sino': 'sino', 'sino-sinong': 'sino',
        'kailang': 'kailan',
    }
    vec = []

    for row in csv.reader(file):
        if not row:
            # csv.reader yields [] for a blank line.
            continue

        for word in row[0].split(" "):
            token = word.lower()
            if token in q_word_tags:
                base = token
            else:
                base = q_word_variants.get(token)

            if base is not None:
                vec.append([q_word_tags[base]])
                # Stop at the first wh-word so a question contributes at
                # most one entry.
                break

    return vec

In [13]:
def tokenize_word_data(file):
    """
    Split the question text of every data row into word tokens.

    file - open file object of the labelled CSV; column 0 holds the
           question text

    Returns a list with one list of tokens per data row. A token is a
    maximal run of word characters (regex \\w+). The header row
    ['Questions', 'Category'] and blank rows are skipped.
    """
    # Plain `re` yields the same \w+ tokens, so nltk is not needed here.
    word_pattern = re.compile(r'\w+')
    header = ['Questions', 'Category']
    data = []

    for row in csv.reader(file):
        # Compare by value: an identity test (`is not`) against a list
        # literal is always true, so it never skips the header.
        if not row or row == header:
            continue
        data.append(word_pattern.findall(row[0]))

    return data

# --------------------------------------
#### The Main Function

In [14]:
def main():
    """
    Run the cleaning step, then the dataset sanity check.

    1. Reads 'files/Help a CS Student Graduate.csv' and writes the cleaned
       sentences to 'files/cleaning_output.csv'.
    2. Loads 'files/dataset_pos.out', prints how many sentences it holds
       and tokenizes 'files/labelled_data.csv'.

    All paths are resolved against the current working directory.
    """
    # 1. Preprocessing the raw data.
    raw_path = os.path.abspath('files/Help a CS Student Graduate.csv')
    cleaned_path = os.path.abspath('files/cleaning_output.csv')

    # Context managers close both files even if the cleaning step raises.
    with open(raw_path) as input_file, \
            open(cleaned_path, 'w') as output_file:
        extract(input_file, output_file)

    # 2. DEBUGGING:
    # Checking the equality of the pos tags, wh-words and category gathered.
    # Total should be 3077.

    # Output file of the POS tagger
    dataset = 'files/dataset_pos.out'

    # Input file
    labelled_data = 'files/labelled_data.csv'

    with open(os.path.abspath(dataset)) as pos_file:
        pos_data = to_wordpos_dict(pos_file)

    # The wh-word and category vectors can be length-checked the same way
    # with get_wh_question(...) and category_vector(...) on labelled_data.
    print("Data Length: ", len(pos_data))

    with open(os.path.abspath(labelled_data)) as labelled_file:
        tokenize_word_data(labelled_file)


In [15]:
# Run the pipeline only when this file is executed as the main program.
if __name__ == '__main__':
    main()

Data Length:  3077
