In [333]:
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np

In [360]:
def make_sentences(tokens,tags):
    """
    Function converts list of words into sentences with sentences of corresponding tags

    INPUT : Dataframe of tokens, Dataframe of tags

    OUTPUT : Zip of list of sentences, list of tags sentences
    """
    data = tokens.join(tags, on="id", how = "inner", rsuffix = "_tag").drop("id_tag",axis=1)
    sentences = []
    tags_list = []
    temp_tokens = []
    temp_tags = []
    for row in data.itertuples():
        word = row[2]
        tag = row[3]
        if word!='-DOCSTART-' and word!='.':
            temp_tokens.append(word)
            temp_tags.append(tag)
        if word=='.':
            sentences.append(' '.join(temp_tokens))
            tags_list.append(' '.join(temp_tags))
            temp_tokens = []
            temp_tags = []
    
    return zip(sentences,tags_list)

def load_data(sentence_file, tag_file=None):
    """Loads data from two files: one containing sentences and one containing tags.

    tag_file is optional, so this function can be used to load the test data.

    Suggested to split the data by the document-start symbol.

    """
    tokens = pd.read_csv(sentence_file)
    if tag_file:
        tags = pd.read_csv(tag_file)
    else:
        #dummy tags for test file
        tags = pd.DataFrame()
        tags['id'] = range(len(tokens))
        tags['tag'] = ['NNP']*len(tokens)

    return make_sentences(tokens,tags)

In [361]:
train_data = load_data("../data/train_x.csv", "../data/train_y.csv")

   id  tag
0   0    O
1   1  NNP
2   2  NNP
3   3    ,
4   4   CD
5   5  NNS
6   6   JJ
7   7    ,
8   8   MD
9   9   VB


In [363]:
len(train_data)

28510

In [344]:
def nGramTagger(n):
    dic = {}
    tags = zip(*train_data)[1]
    for line in tags:
        line = line.split(' ')
        line = ['*']*n + line
        for i in range(n,len(line)):
            if n==1:
                item = line[i]
            else:
                item = tuple(line[i-n:i])
            if item in dic:
                dic[item]+=1
            else:
                dic[item]=1
    return dic

def wordTagger():
    dic = defaultdict(int) 
    for line1,line2 in train_data:
        for word,tag in zip(line1.split(' '),line2.split(' ')):
            dic[(word,tag)]+=1
    return dic

unigram = nGramTagger(1)
bigram = nGramTagger(2)
trigram = nGramTagger(3)

wordtag = wordTagger()

In [347]:
train_data

[('Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29',
  'id'),
 ('Mr. Vinken is chairman of Elsevier N.V. , the Dutch publishing group',
  'tag')]

In [345]:
tag_set = unigram.keys()

In [326]:
def get_q(tag_penult,tag_last,tag_current):
    if (tag_penult, tag_last) not in bigram:
        return 0.0
    return float(trigram.get((tag_penult, tag_last, tag_current),0.0))/bigram[(tag_penult, tag_last)]

def get_e(word,tag):
    return float(wordtag.get((word,tag),0.0))/unigram[tag]

In [327]:
def prob_distibution(k):
    prob = []
    tag_list = zip(*unigram.keys())[0]
    for key1 in tag_list :
        for key2 in tag_list:
            if (key1,key2) in bigram:
                num = bigram[(key1,key2)] + k
            else:
                num=k
            den = unigram[(key1,)] + (k*len(tag_list))
            prob.append(float(num)/den)
    print prob.count(0.0)
    plt.plot(sorted(prob,reverse=True))
    plt.show()

In [328]:
def sequence_probability(self, sequence, tags):
        """Computes the probability of a tagged sequence given the emission/transition
        probabilities.
        """
        tag_penult = '*'
        tag_prev = '*'
        prod = 1
        for word, tag in zip(sequence,tags):
            q = self.get_q(tag,tag_prev,tag_penult)
            e = self.get_e(word,tag)
            tag_penult = tag_prev
            tag_prev = tagger
            prod *= q*e

        return prod


In [329]:
def inference(sequence):
        """Tags a sequence with part of speech tags.

        You should implement different kinds of inference (suggested as separate
        methods):

            - greedy decoding
            - decoding with beam search
            - viterbi
        """
        #Method 1: Greedy Decoding
        tag_sequence = []
        tag_penult = '*'
        tag_prev = '*'
        print len(sequence)
        for word in sequence.split():
            scores = []
            for tag in tag_set:
                scores.append(get_q(tag_penult,tag_prev,tag)*get_e(word,tag))
            final_tag = tag_set[np.argmax(scores)]
            tag_sequence.append(final_tag)
            tag_penult = tag_prev
            tag_prev = final_tag
            
        return tag_sequence

In [None]:

dev_data = load_data("data/dev_x.csv", "data/dev_y.csv")
test_data = load_data("data/test_x.csv")