# POS Tagging using CRF++
After stemming, run the part-of-speech tagger using the command below. This will produce a file that contains the sentences written vertically, with the corresponding stemming and tagging information beside each word.

Command format:

`stemmer.out` - output file of stemming notebook

`tagger.out` - output file of pos tagging

`crf_test -m path/to/Bigram.model path/to/stemmer.out -o path/to/tagger.out`

In [3]:
%%bash
# Tag the stemmer output with the model given by -m and write the result to the -o file.
crf_test -m ../pos_tagger/Bigram.model ../files/stemmer.out -o ../files/pos_tagger.out

## ----------------------------------------------------------------
# Data Transformation
- Word-pos pair Extraction
- Coarse tag Extraction
- Data conversion for the rule-based algorithm consumption

In [4]:
import os
import csv
import math
import pandas

import logging

This function reads the tagger output and pairs each word with only its coarse tag.

In [5]:
def get_tags(file):
    """
    Transform the tagger output into a list of sentences.

    Every sentence is a list of (word, coarse_tag) tuples. Each input line is
    split on tabs: the first column is the word and the last column is the
    POS tag, of which only the coarse part is kept. A line whose first column
    is exactly "?" closes the current sentence (the "?" itself is not stored).
    Blank lines are skipped.

    file - an open text file, or any iterable of lines, holding the tagger
           output

    Returns a list of lists of (word, tag) tuples. Two consecutive "?" lines
    produce an empty sentence in the result.
    """

    sentences = []
    sentence = []

    for line in file:
        data = line.split("\t")

        if data[0] == "?":
            # The question mark marks the end of the current sentence
            sentences.append(sentence)
            sentence = []
        elif line.strip():
            # Extract only the coarse tag of the word
            pos = get_coarsetag(data[-1], 1)
            sentence.append((data[0], pos.strip()))

    # Tokens after the last "?" would otherwise be silently dropped
    if sentence:
        sentences.append(sentence)

    return sentences

Helper function for the `get_tags` method.

In [6]:
def get_coarsetag(tag, level):
    """
    Separate the coarse tags from the fine pos tags.

    tag - the complete pos tag; its parts are separated by "-"
    level - desired tag level:
              1 - the text before the first "-"
              2 - the first two characters of the part after the first "-"
              3 - the remaining characters of that same part

    Returns the requested piece of the tag. For levels 2 and 3 an empty
    string is returned when the tag has no "-" separated part. Any other
    level returns the tag unchanged.
    """

    parts = tag.split("-")

    if level == 1:
        return parts[0]

    if level in (2, 3):
        # A tag without a fine part has nothing to offer at these levels
        fine = parts[1] if len(parts) > 1 else ""
        return fine[:2] if level == 2 else fine[2:]

    return tag

In [7]:
# Close the tagger output as soon as it has been parsed
with open(os.path.abspath('../files/pos_tagger.out')) as tagged_file:
    tuples = get_tags(tagged_file)
len(tuples)

3077

In [8]:
def get_raw_data(file, column_no):
    """
    Get every data in the specified column_no.

    file - an open CSV file (any iterable of CSV lines)
    column_no - 0 for the first column, 1 for the second column; any other
                value yields an empty list

    Cells equal to the header text ("Questions" for column 0, "Category" for
    column 1) are left out. Values of column 0 are stripped of surrounding
    whitespace; values of column 1 are returned as they are. Completely
    blank rows are skipped instead of raising an IndexError.
    """

    if column_no == 0:
        header = "Questions"
    elif column_no == 1:
        header = "Category"
    else:
        return []

    data = []

    for row in csv.reader(file):
        # csv.reader yields an empty list for a blank line
        if not row:
            continue

        value = row[column_no]
        if value != header:
            data.append(value.strip() if column_no == 0 else value)

    return data

In [9]:
# Close the labelled data file as soon as the questions have been read
with open(os.path.abspath('../files/raw_data/labelled_data.csv')) as labelled_file:
    sentences = get_raw_data(labelled_file, 0)
len(sentences)

3077

In [10]:
# Show raw question 85 next to its tagged tokens to eyeball that both lists line up.
sentences[85]
tuples[85]

'Bakit ka masyadong praning?'

[('Bakit', 'PR'), ('ka', 'PR'), ('masyadong', 'RB'), ('praning', 'RB')]

In [11]:
def prune(tuples_array):
    """
    Extract the independent clause of compound questions.

    tuples_array - a list of sentences, each a list of (word, tag) tuples

    The work is done in two passes over every sentence:
      1. For each token that is a known conjunction: if the sentence has at
         least five tokens and the conjunction sits in the second half, the
         sentence is cut right before it; if the conjunction is the very
         first token, the sentence is cut to start at its first wh-word.
      2. A sentence that still does not start with a wh-word is cut to start
         at its first wh-word, if it has one.
    Sentences matching none of these rules are left untouched. Empty
    sentences are reported with a warning and skipped.

    NOTE: the sentences are replaced inside tuples_array itself, and that
    same list object is returned.
    """

    # NOTE(review): tokens are single words, so the multi-word entries below
    # can never match a token as the code stands.
    conj = {"sapagkat", "dahil", "dahil sa", "at saka", "at hindi", "ni hindi",
            "pero", "datapwat", "ngunit", "subalit", "o", "o kaya",
            "gayon pa man", "gayumpaman", "gayunman", "kaya", "kung kaya't",
            "kung kaya", "man", "maging", "hindi lamang", "kundi", "bagaman",
            "bagama't", "kapag", "kasi", "dahilan sa", "gawa ng", "porke",
            "porke at", "porke't", "pagkat", "kaysa", "kahit",
            "gayong", "kung", "kung gayon", "habang", "nang", "nang sa gayon",
            "maliban kung", "palibhasa", "para", "upang", "parang",
            "pansamantala", "hanggang"
            }

    q_words = {'aling', 'alin-alin', 'alin-aling', 'saang', 'saan-saan',
               'nasaan', 'nasaang', 'anong', 'anu-ano', 'anu-anong', 'inaano',
               'paanong', 'papaano', 'papaanong', 'sinong', 'sinu-sino',
               'sino-sinong', 'kailang', 'alin', 'saan', 'ano', 'kailan',
               'paano', 'sino', 'bakit'
               }

    # NOTE(review): this is an alias, not a copy, so every assignment to
    # with_tags[i] below also changes tuples_array[i].
    with_tags = tuples_array

    # Scan data for compound question and extract the independent clause
    for i in range(len(with_tags)):
        text = with_tags[i]

        if not text:
            logging.warning("Sentence %d has no tokens; nothing to prune", i)
            continue

        for pair in text:
            # NOTE(review): this match is case-sensitive, unlike the wh-word
            # checks which lowercase the token first.
            if pair[0] in conj:
                index = text.index(pair)

                # Conjunction should not be too near the beginning of the
                # question
                if len(text) >= 5 and index >= math.ceil(len(text) / 2):
                    with_tags[i] = tuples_array[i][:index]
                elif index == 0:
                    for j in range(index, len(text)):
                        if text[j][0].lower() in q_words:
                            with_tags[i] = tuples_array[i][j:]
                            break

    # Scan questions that still do not start with a wh-word
    for i in range(len(with_tags)):
        tagged_text = with_tags[i]

        if not tagged_text:
            continue

        if tagged_text[0][0].lower() not in q_words:
            for j in range(len(tagged_text)):
                if tagged_text[j][0].lower() in q_words:
                    # Slice the sentence as it stood before this pass and stop
                    # at the first wh-word; slicing again for later wh-words
                    # would apply their offsets to an already shortened list.
                    with_tags[i] = tagged_text[j:]
                    break

    return with_tags

In [12]:
# The preceding cells should be run first before running this cell!
# prune() replaces the sentences inside `tuples` and returns that same list,
# so `a` and `tuples` refer to one object afterwards.
a = prune(tuples)

In [13]:
def get_sampling_data(sentence_list, category, dataset):
    """
    Return the requested portion of the data as rows of lowercase strings.

    Every row is built from one sentence: the first entry is the first word
    of the sentence, the following entries are the pos tags of the remaining
    words, and the last entry is the label of the sentence. The sentences are
    assumed to contain only the independent clause.

    sentence_list -- a list of sentences, each a list of (word, pos) tuples.
    category -- the labels; category[i] is the label of sentence_list[i].
    dataset -- the type of data needed:
                1 - training set (the leading 80% of the sentences)
                2 - testing set (the trailing 20% of the sentences)
                3 - all
               Any other value gives an empty list.
    """

    total = len(sentence_list)

    # Pick which sentence positions belong to the requested set
    if dataset == 1:
        positions = range(0, int(math.ceil(total * 0.8)))
    elif dataset == 2:
        positions = range(int(total - math.floor(total * 0.2)), total)
    elif dataset == 3:
        positions = range(0, total)
    else:
        positions = range(0)

    rows = []

    for position in positions:
        sentence = sentence_list[position]

        # Word for the first token, pos tag for every other token
        row = [pair[0].lower() if offset == 0 else pair[1].lower()
               for offset, pair in enumerate(sentence)]

        row.append(category[position].lower())
        rows.append(row)

    return rows

In [14]:
def format(sampling_type):
    """
    Splits the dataset into training and testing data.

    sampling_type -- how the split is made:
        'random'     - the leading 80% of all sentences become the training
                       set and the trailing 20% the testing set (the data is
                       not shuffled)
        'stratified' - the same 80/20 split is applied separately to the
                       sentences of each category, in the order
                       abbreviation, description, entity, human, location,
                       numeric; sentences with any other label are left out

    Returns a (training, testing) tuple of row lists as produced by
    get_sampling_data. Raises ValueError for any other sampling_type.

    NOTE: category[i] should be the label of sentence[i] in the labelled_data.csv file.
    """

    logging.getLogger().setLevel(logging.INFO)

    # Extract the word and tag pairs from the pos tagging output file
    with open(os.path.abspath('../files/pos_tagger.out')) as tagged_file:
        tuples = get_tags(tagged_file)

    # Get all the categories in the file
    with open(os.path.abspath('../files/raw_data/labelled_data.csv')) as labelled_file:
        category = get_raw_data(labelled_file, 1)

    pruned_array = prune(tuples)

    if len(pruned_array) != len(category):
        logging.warning("%d tagged sentences but %d labels; pairs may be misaligned",
                        len(pruned_array), len(category))

    if sampling_type == 'random':
        training = get_sampling_data(pruned_array, category, 1)
        testing = get_sampling_data(pruned_array, category, 2)

        return training, testing

    if sampling_type == 'stratified':
        category_names = ['abbreviation', 'description', 'entity', 'human', 'location', 'numeric']

        training = []
        testing = []
        labelled = list(zip(pruned_array, category))

        for label in category_names:
            # Gather every sentence carrying this label. Selecting by label
            # rather than by index arithmetic keeps the last sentence of each
            # stratum and still works when a label is absent.
            strata_sentence_list = [sentence for sentence, name in labelled
                                    if name.lower() == label]
            strata_category = [name for _, name in labelled
                               if name.lower() == label]

            training.extend(get_sampling_data(strata_sentence_list, strata_category, 1))
            testing.extend(get_sampling_data(strata_sentence_list, strata_category, 2))

        return training, testing

    raise ValueError("sampling_type must be 'random' or 'stratified', got %r"
                     % (sampling_type,))

In [15]:
# Build the training and testing rows with the per-category (stratified) split.
training, testing = format('stratified')

The method **write_to_file()** accepts an arbitrary number of keyword arguments. It treats each argument as a dictionary key-value pair with the *variable name* as the key and its *assigned value* as the value. The *kwargs keys* are the headers used in the CSV output file.

In [16]:
def write_to_file(**kwargs):
    """
    Write the given data sets side by side into transformed_data.csv.

    Each keyword argument becomes one column of the csv file: the keyword is
    the column header and its value is a list of rows, where every row is a
    list of strings that is joined with commas into a single cell. Nothing
    is returned.
    """

    path = os.path.abspath('../files/transformed_data.csv')

    # Start from an empty frame so the call also works without any argument
    columns = [pandas.DataFrame()]

    for header, rows in kwargs.items():
        cells = [','.join(row) for row in rows]
        columns.append(pandas.DataFrame(data=cells, columns=[header]))

    table = pandas.concat(columns, axis=1)
    table.to_csv(path, index=False)

Write the training and testing data to a file.

In [17]:
# Each keyword becomes a column header in ../files/transformed_data.csv.
write_to_file(training=training, testing=testing)