# In [5]:
import numpy as np
import pandas as pd
import scipy as sc
import time

""" Contains the part of speech tagger class. """

def load_data(sentence_file, tag_file=None, data_type=None):
    """Load tokens, and optionally their gold tags, from CSV files.

    sentence_file must contain a "word" column and tag_file (when it is read)
    a "tag" column. Rows of the two files are paired by position. The result
    is one flat list over the whole file; it is not split into sentences or
    documents.

    INPUT :
        sentence_file : path of the CSV file holding the tokens.
        tag_file : path of the CSV file holding the tags. Only read when
            data_type is "train" or "dev"; optional so this function can be
            used to load the test data.
        data_type : "train", "dev", or anything else for untagged data.
    OUTPUT :
        "train" : tuple (data, known_words, tag_list) where data is a list of
            (token, tag) tuples, known_words is the set of distinct tokens
            and tag_list is the set of distinct tags.
        "dev" : list of (token, tag) tuples.
        otherwise : list of (token, "NN") tuples, "NN" being a placeholder.
    RAISES :
        ValueError if the sentence and tag files have different row counts.

    """
    def read_column(path, column):
        # dtype=str and na_filter=False keep every cell as the literal text
        # from the file. With pandas' defaults, tokens such as "NA", "null"
        # or "nan" would be converted to float NaN.
        frame = pd.read_csv(path, delimiter=",", header=0, dtype=str,
                            na_filter=False)
        return frame[column].tolist()

    tokens = read_column(sentence_file, "word")

    if data_type not in ("train", "dev"):
        # No gold tags available: pair every token with the default tag NN.
        return [(token, "NN") for token in tokens]

    tags = read_column(tag_file, "tag")
    if len(tokens) != len(tags):
        raise ValueError(
            "sentence file has %d rows but tag file has %d rows"
            % (len(tokens), len(tags)))

    data = list(zip(tokens, tags))
    if data_type == "train":
        return data, set(tokens), set(tags)
    return data

def evaluate_per_token(data, model, datatype="None"):
    """Predict a tag for every token and optionally report per-token accuracy.

    Each token is tagged independently by calling
    model.baseline_token_probability(token).

    INPUT :
        data : iterable of (token, gold_tag) pairs.
        model : object providing baseline_token_probability(token).
        datatype : "train" or "dev" to print the accuracy (in percent) under
            that label; any other value skips the accuracy report.
    OUTPUT :
        list of predicted tags, in the same order as data.

    The elapsed evaluation time is always printed.

    """
    start_time = time.time()
    pred_tags = []
    count_right = 0
    for token, tag in data:
        pred = model.baseline_token_probability(token)
        pred_tags.append(pred)
        if pred == tag:
            count_right += 1

    if datatype == "train":
        label = "Train"
    elif datatype == "dev":
        label = "Dev"
    else:
        label = None

    if label is not None:
        total = len(pred_tags)
        # 100.0 forces true division (plain "/" truncates ints on Python 2);
        # an empty data set reports 0.0 instead of dividing by zero.
        acc = 100.0 * count_right / total if total else 0.0
        print(label + " set accuracy : " + str(acc))

    print("Evaluation complete in --- %s seconds ---" % (time.time() - start_time))
    return pred_tags

def evaluate_whole_sentence(data, model):
    """Placeholder for sentence-level evaluation of the POS model.

    Not implemented yet: both arguments are ignored and None is returned.

    Evaluation ideas noted for this function:
        - whole-sentence accuracy
        - per-token accuracy
        - comparing the probabilities computed by different styles of decoding

    """
    return None

class POSTagger(object):
    """Baseline part-of-speech tagger.

    train() counts how often each token carries each tag;
    baseline_token_probability() then returns the most frequent tag seen for
    a token. sequence_probability() and inference() are unimplemented stubs.
    """

    # Class-level defaults, kept so the attributes remain visible on the class.
    # __init__ gives every instance its own token_tag_prob so that separately
    # trained taggers never share counts.
    token_tag_prob = {}  # token -> {tag: number of occurrences in training data}
    known_words = []  # Collection of all known words
    tag_list = []  # Collection of all possible tags

    def __init__(self, known_words, tag_list):
        """Store the vocabulary and tag inventory and start with empty counts.

        INPUT :
            known_words : collection of the tokens seen in training.
            tag_list : collection of the possible tags.
        """
        self.known_words = known_words
        self.tag_list = tag_list
        # Per-instance table; mutating the class-level dict would leak counts
        # between every POSTagger instance.
        self.token_tag_prob = {}

    def train(self, data):
        """Count token/tag co-occurrences over the training data.

        INPUT :
            data : iterable of (token, tag) pairs.

        Counts are added to whatever is already stored, so calling train()
        again accumulates. Despite its name, token_tag_prob holds raw counts,
        not normalised probabilities. The elapsed time is printed.
        """
        start_time = time.time()
        for token, tag in data:
            # setdefault is a single hash lookup; "in dict.keys()" scans a
            # list on Python 2.
            tag_counts = self.token_tag_prob.setdefault(token, {})
            tag_counts[tag] = tag_counts.get(tag, 0) + 1
        print("Training complete in --- %s seconds ---" % (time.time() - start_time))

    def baseline_token_probability(self, token):
        """Return the tag seen most often with token during training.

        INPUT :
            token : the token to tag.
        OUTPUT :
            the most frequent tag for the token, or "NNP" when the token was
            never seen (or cannot be used as a dictionary key).

        When several tags share the highest count, the one that comes last
        in the dictionary's iteration order is returned.
        """
        try:
            possible_tags = self.token_tag_prob[token]
        except (KeyError, TypeError):
            return "NNP"  # Unknown token: default to proper noun

        pred = ""
        count = 0
        for tag, val in possible_tags.items():
            # ">=" (not ">") so that a later tag wins a tie.
            if val >= count:
                count = val
                pred = tag
        return pred

    def sequence_probability(self, sequence, tags):
        """Compute the probability of a tagged sequence given the
        emission/transition probabilities.

        Not implemented yet: always returns None.
        """
        return

    def inference(self, sequence):
        """Tag a sequence with part of speech tags.

        Not implemented yet: always returns an empty list.

        Different kinds of inference are suggested as separate methods:

            - greedy decoding
            - decoding with beam search
            - viterbi
        """
        return []

if __name__ == "__main__":
    # Script entry point: load the three data sets, train the baseline tagger,
    # report per-token accuracy and write the test predictions to a CSV file.

    # USE APPROPRIATE FILE PATH
    train_data, known_words, tag_list = load_data("../data/train_x.csv", "../data/train_y.csv", "train")
    dev_data = load_data("../data/dev_x.csv", "../data/dev_y.csv", "dev")
    test_data = load_data("../data/test_x.csv")

    # Call form of print so the script parses on Python 3 as well as Python 2,
    # matching every other print in this file.
    print("Done loading datasets")

    pos_tagger = POSTagger(known_words, tag_list)
    pos_tagger.train(train_data)

    # Experiment with your decoder using greedy decoding, beam search, viterbi...

    # Here you can also implement experiments that compare different styles of decoding,
    # smoothing, n-grams, etc.

    train_predictions = evaluate_per_token(train_data, pos_tagger, "train")
    dev_predictions = evaluate_per_token(dev_data, pos_tagger, "dev")
    # No datatype given: the test tags are placeholders, so no accuracy is printed.
    test_predictions = evaluate_per_token(test_data, pos_tagger)

    # Predict tags for the test set MUST move to evaluate_whole_sentence()
    #test_predictions = []
    #for sentence in test_data:
    #    test_predictions.extend(pos_tagger.inference(sentence))

    # Write them to a file to update the leaderboard
    # TODO
    # Separate name so test_predictions stays the plain list of tags.
    submission = pd.DataFrame({"id": np.arange(len(test_predictions)), "tag": test_predictions})
    submission.to_csv("../results/test_y.csv", sep=',', index=False)

# Recorded notebook output: NameError: global name 'token' is not defined