In [261]:
import pandas as pd
from collections import defaultdict
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import csv
import string
from sklearn.metrics import accuracy_score

In [262]:
def make_sentences(tokens,tags):
    """
    Function converts list of words into sentences with sentences of corresponding tags

    INPUT : Dataframe of tokens, Dataframe of tags

    OUTPUT : Zip of list of sentences, list of tags sentences
    """
    data = tokens.join(tags, on="id", how = "inner", rsuffix = "_tag").drop("id_tag",axis=1)
    sentences = []
    tags_list = []
    temp_tokens = []
    temp_tags = []
    for row in data.itertuples():
        word = row[2]
        tag = row[3]
        if word!='-DOCSTART-' and word!='.':
            temp_tokens.append(word)
            temp_tags.append(tag)
        if word=='.':
            sentences.append(' '.join(temp_tokens) + ' .')
            tags_list.append(' '.join(temp_tags) + ' .')
            temp_tokens = []
            temp_tags = []
    
    return zip(sentences,tags_list)

def load_data(sentence_file, tag_file=None):
    """Loads data from two files: one containing sentences and one containing tags.

    tag_file is optional, so this function can be used to load the test data.

    Suggested to split the data by the document-start symbol.

    """
    tokens = pd.read_csv(sentence_file)
    if tag_file:
        tags = pd.read_csv(tag_file)
    else:
        #dummy tags for test file
        tags = pd.DataFrame()
        tags['id'] = range(len(tokens))
        tags['tag'] = ['NNP']*len(tokens)

    return make_sentences(tokens,tags)

In [263]:
train_data = load_data("../data/train_x.csv", "../data/train_y.csv")

In [264]:
#creating word_count dictionary
word_count = Counter()
for sentence in zip(*train_data)[0]:
    for word in sentence.split(' '):
        word_count[word]+=1

In [265]:
def categorize_word(word, idx):
    if word.isdigit():
        if len(word) == 2:
            return 'twoDigitNum'
        elif len(word) == 4:
            return 'fourDigitNum'
        else:
            return 'othernum'
    elif word.isalpha():
        if word.islower():
            return 'lowercase'
        elif word.isupper():
            return 'allCaps'
        elif word[0].isupper() and word[1:].islower() and idx == 0:
            return 'firstWord'
        elif word[0].isupper() and word[1:].islower():
            return 'initCap'
        else:
            return 'other'    
    else:
        without_punct = word.translate(None, string.punctuation)
        if without_punct.isdigit() and ',' in word:
            return 'containsDigitAndComma'
        elif without_punct.isdigit() and '-' in word:
            return 'containsDigitAndDash'
        elif without_punct.isdigit() and '/' in word:
            return 'containsDigitAndSlash'
        elif without_punct.isdigit() and '.' in word:
            return 'containsDigitAndPeriod'
        elif len(word) == 2 and word[0].isupper() and word[1] == '.':
            return 'capPeriod'
        elif without_punct.isalnum():
            return 'containsDigitAndAlpha'
        else:
            return 'other'
    

In [266]:
def nGramTagger(n):
    dic = {}
    tags = zip(*train_data)[1]
    for line in tags:
        line = line.split(' ')
        line = ['*']*n + line
        for i in range(n,len(line)):
            if n==1:
                item = line[i]
            else:
                item = tuple(line[i-n:i])
            if item in dic:
                dic[item]+=1
            else:
                dic[item]=1
    return dic

def wordTagger():
    dic = defaultdict(int) 
    for line1,line2 in train_data:
        for word,tag in zip(line1.split(' '),line2.split(' ')):
            dic[(word,tag)]+=1
            if word_count[word]<5:
                idx = line1.split(' ').index(word)
                category = categorize_word(word, idx)
                dic[(category,tag)]+=1
    return dic

unigram = nGramTagger(1)
bigram = nGramTagger(2)
trigram = nGramTagger(3)
fourgram = nGramTagger(4)

wordtag = wordTagger()

In [267]:
# def good_turing_smoothing(dic):
#     reverse_dic = {}
#     k=5
#     #smoothing for rare words
#     for key,value in dic.items():
#         if value<=5:
#             reverse_dic[value] = reverse_dic.get(value,0)+1
    
#     for key,value in dic.items():
#         if value<5:
#             dic[key] = (value+1)*reverse_dic[value+1]/reverse_dic[value]
#     print reverse_dic
#     #handling for unseen words or combinations
# #     dic['UNK'] = reverse_dic.get(1,1)
    
#     return dic
    
# unigram = good_turing_smoothing(unigram)
# bigram = good_turing_smoothing(bigram)
# trigram = good_turing_smoothing(trigram)

In [268]:
def KNSmoothing():
    probs = {}
    total = len(trigram)
    for tag in unigram.keys():
        c=0
        for item in trigram.keys():
            if tag==item[2]:
                c+=1
        probs[tag]=float(c)/total
        
    bigram_kn_counts={}
    for key in bigram.keys():
        for item in trigram.keys():
            if key[0]==item[0] and key[1]==item[1]:
                bigram_kn_counts[key] = bigram_kn_counts.get(key,0)+1
    
    return probs, bigram_kn_counts

In [269]:
probs,bigram_kn_counts = KNSmoothing()

In [270]:
tag_set = unigram.keys()

In [271]:
wordtag

defaultdict(int,
            {('course', 'NN'): 86,
             ('Close', 'NNP'): 2,
             ('Rosencrants', 'NNP'): 1,
             ('devastating', 'JJ'): 4,
             ('TAX', 'NN'): 2,
             ('good-quality', 'JJ'): 1,
             ('handily', 'RB'): 1,
             ('Said', 'VBD'): 2,
             ('Ca', 'JJ'): 1,
             ('Insiders', 'NNS'): 2,
             ('soundtrack', 'NN'): 2,
             ('competitor', 'NN'): 15,
             ('failed', 'VBD'): 65,
             ('63-year-old', 'JJ'): 2,
             ('policeman', 'NN'): 2,
             ('markets', 'NNS'): 291,
             ('pre-1967', 'JJ'): 2,
             ('Seacomb', 'NNP'): 1,
             ('368.3', 'CD'): 1,
             ('B-1', 'JJ'): 1,
             ('postings', 'NNS'): 1,
             ('3642.90', 'CD'): 1,
             ('8.24', 'CD'): 1,
             ('watchdog', 'NN'): 2,
             ('prone', 'JJ'): 3,
             ('3,609,800', 'CD'): 1,
             ('To', 'TO'): 117,
             ('junk', 'N

In [272]:
sum_bigram_counts = sum(bigram.values())

In [273]:
def get_q(tag_penult,tag_prev,tag_current):
    k=1
    d=0.75

    num = float(trigram.get((tag_penult, tag_prev, tag_current),0.0)) + k -d
    den = float(bigram.get((tag_penult, tag_prev), 0.0)) + k*len(bigram)
    lambd = float(d*bigram_kn_counts.get((tag_penult,tag_prev),0.0))/den
    Pcont  = probs.get(tag_current,0.0)
    
    return (num/den) + (lambd*Pcont)
    

def get_e(word,tag,idx):
    if word not in word_count:
        category = categorize_word(word,idx)
        return float(wordtag[(category,tag)])/unigram[tag]
    else:
        return float(wordtag[(word,tag)])/unigram[tag]
  
        
        

In [274]:
def prob_distibution(k):
    prob = []
    tag_list = zip(*unigram.keys())[0]
    for key1 in tag_list :
        for key2 in tag_list:
            if (key1,key2) in bigram:
                num = bigram.get((key1,key2),0.0) + k
            else:
                num=k
            den = unigram.get(key1,0.0) + (k*len(tag_list))
            prob.append(float(num)/den if den!=0 else 0.0)
    print prob.count(0.0)
    plt.plot(sorted(prob,reverse=True))
    plt.show()

In [275]:
def sequence_probability(self, sequence, tags):
        """Computes the probability of a tagged sequence given the emission/transition
        probabilities.
        """
        tag_penult = '*'
        tag_prev = '*'
        prod = 1
        for word, tag in zip(sequence,tags):
            q = self.get_q(tag,tag_prev,tag_penult)
            e = self.get_e(word,tag)
            tag_penult = tag_prev
            tag_prev = tagger
            prod *= q*e

        return prod


In [276]:
def inference(sequence,mode):
        """Tags a sequence with part of speech tags.

        You should implement different kinds of inference (suggested as separate
        methods):

            - greedy decoding
            - decoding with beam search
            - viterbi
        """
        #Method 1: Greedy Decoding
        if mode.lower()=='greedy':
            tag_sequence = []
            tag_penult = '*'
            tag_prev = '*'
            for idx,word in enumerate(sequence.split(' ')):
                scores = []
                for tag in tag_set:
                    scores.append(get_q(tag_penult,tag_prev,tag)*get_e(word,tag,idx))
                final_tag = tag_set[np.argmax(scores)]
                tag_sequence.append(final_tag)
                tag_penult = tag_prev
                tag_prev = final_tag
                
            return tag_sequence
        
        #Method 2: Beam Search
        elif mode.lower()=='beam':
            k=1
            sequence = sequence.split()
            best_sequences = [['*','*']]
            for idx,word in enumerate(sequence):
                scores = {}
                for item in best_sequences:
                    item = list(item)
                    tag_penult = item[-2]
                    tag_prev = item[-1]
                    for tag in tag_set:
                        score = get_q(tag_penult,tag_prev,tag)*get_e(word,tag,idx)
                        item.append(tag)
                        scores[tuple(item)] = score
                        item.pop()
                topk_scores = sorted(scores.items(), key=lambda x: x[1], reverse = True)
                best_sequences = list(zip(*topk_scores)[0][:k])
            
            best_sequence =  best_sequences[0][2:]
            

            return best_sequence
        
        elif mode.lower()=='viterbi':
            tag_sequence= []
            path = {}
            path['*','*'] = []
            pi_func = defaultdict(float)
            pi_func[(0, "*", '*')] = 0.0

            # v = Tag current, u = Tag previous, w = Tag penult
            sequence = sequence.split(" ")
            n = len(sequence)

            for k in range(1,n+1):
                temp_path = {}
                for u in possible_tags(k-1):
                    for v in possible_tags(k):
                        max_tag = ""
                        max_score = float("-Inf")
                        for w in possible_tags(k - 2):
                            score = pi_func.get((k-1, w, u),float('-Inf'))*get_q(w,u,v)*get_e(sequence[k-1],v,k-1)
                            if score > max_score:
                                max_score = score
                                max_tag = w
                        pi_func[(k, u, v)] = max_score
                        temp_path[u,v] = path[max_tag,u] + [v]
                path = temp_path

            prob,umax,vmax = max([(pi_func.get((n,u,v))*get_q(u,v,'.'),u,v) for u in possible_tags(n-1) for v in possible_tags(n)])

            return path[umax,vmax]
        
def possible_tags(k):
    if k in (-1, 0):
        return set('*')
    else:
        return tag_set


In [277]:
evaluate(dev_data,'beam')

1000
2000
3000
4000
5000
6000
7000
8000
9000
242559


(0.9456627047440004, 0.0)

In [225]:
def evaluate(data,mode):
    individual_score = 0
    sentence_score = 0
    total_word_count = 0
    final_results = []
    idx=0
    for sentence, tag_sequence in data:
        #checking number of sentences processed to gauge runtime
        idx+=1
        if idx%1000==0:
            print idx
        tag_sequence = tag_sequence.split(' ')
        result = inference(sentence,mode)
        final_results += result
        if result==tag_sequence:
            sentence_score+=1
            individual_score+=len(result)
            total_word_count += len(result)
        else:
            for predicted, actual in zip(result,tag_sequence):
                total_word_count+=1
                if predicted==actual:
                    individual_score+=1
    
    save_results(final_results,'dev')
    
    return float(individual_score)/total_word_count, float(sentence_score)/len(data)

        

In [221]:
def save_results(predicted_tags, mode):

    if mode.lower()=='dev':
        data_x = pd.read_csv('../data/dev_x.csv')
        data_y = pd.read_csv('../data/dev_y.csv')
    
    elif mode.lower()=='test':
        data_x = pd.read_csv('../data/test_x.csv')
#         data_y = pd.read_csv('../results/test_y.csv')
    
    predicitons=[]
    print len(predicted_tags)
    for row in data_x.itertuples():
        if row[2]=='-DOCSTART-':
            predicted_tags.insert(row[1],'O')
        
    df = pd.DataFrame()
    df['id'] = range(len(predicted_tags))
    df['tag'] = predicted_tags
#     print "Accuracy score: " + str(accuracy_score(data_y['tag'],predicted_tags)) 
    
    if mode.lower()=='dev':
        df.to_csv('../predictions.csv',index=False)
    elif mode.lower()=='test':
        df.to_csv('../results/test_y.csv',index=False)


In [16]:
dev_data = load_data("../data/dev_x.csv", "../data/dev_y.csv")
test_data = load_data("../data/test_x.csv")

In [114]:
df = pd.read_csv('../predictions.csv')
df_dev = pd.read_csv('../data/dev_y.csv')

In [231]:
df_x_dev = pd.read_csv('../data/train_x.csv')
df_y_dev = pd.read_csv('../data/dev_y.csv')

In [233]:
new_df = df_x_dev.join(df_y_dev, on="id", how = "inner", rsuffix = "_tag").drop("id_tag",axis=1)

In [251]:
get_e('There','EX',0)

0.36589147286821705

In [238]:
new_df.loc[df.tag=='RBR']

Unnamed: 0,id,word,tag
899,899,closer,JJR
1216,1216,more,JJR
2115,2115,earlier,RB
2193,2193,earlier,RB
2267,2267,Earlier,RB
2531,2531,More,RBR
3397,3397,longer,RB
4829,4829,better,JJR
5522,5522,more,JJR
6032,6032,earlier,RBR


In [254]:
accuracies = []
for i in range(len(df)-1):
    true = list(df_dev['tag'])
    pred = list(df['tag'])
    pred = pred[:i]+pred[i+1:]
    accuracy = accuracy_score(true,pred)
    if accuracy>0.93:
        print accuracy,i
    accuracies.append(accuracy_score(true,pred))
    if i%10000==0:
        print "Indexes tested: " + str(i)

Indexes tested: 0


KeyboardInterrupt: 

In [None]:
max(accuracies)