In [1]:
import re 
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

In [253]:
def read_file(file_name):
    print("Start Reading File, {}".format(file_name.rsplit('/', 1)[-1]))
    
    # open file
    with open(training_file) as f:
        words, tags = [], []
        # iterate the file line by line
        for line in tqdm(f.readlines()):
            # split the line by last / to seperate the word and tag
            word, tag = line.strip().rsplit('/', 1)
            words.append(word) # append in words list
            tags.append(tag) # append in tags list
            
    print("Prepare Dataset for File {}...".format(file_name.rsplit('/', 1)[-1]))
    df = pd.DataFrame([words, tags], index=['Words', 'Tags']).T # prepare dataset
    print("Successfuly Read and Prepare File, {} \U0001f600 \n\n".format(file_name.rsplit('/', 1)[-1])) 
    return df

def train_tagger(dataframe):
    print("Start Training of POS Tagger....")
    
    tagger_words = []
    tagger_tags = []
    distinct_words = dataframe['Words'].unique()
    for word in tqdm(distinct_words):
        temp_df = dataframe[dataframe['Words'] == word]
        max_prob_tag = temp_df['Tags'].value_counts().index[0]
        tagger_words.append(word)
        tagger_tags.append(max_prob_tag)
        
    print("Saving the Probabilities of Tagger..")
    tagger_df = pd.DataFrame([tagger_words, tagger_tags], index=['Words', 'Tags']).T
    tagger_df.to_csv("tagger_df.csv", index=False)
    print("Successfuly Train the POS Tagger! \U0001f600 \n\n")
    return tagger_df

def prediction_0(testing_file, tagger_df):
    print("Start POS Tagging of Test Words....")
    # open file
    with open(testing_file) as f:
        words, tags = [], []
        # remove previouslt existing file
        os.remove('data/pos-test-answers-0.txt') if os.path.exists('data/pos-test-answers-0.txt') else None
        pred_files = open("data/pos-test-answers-0.txt","w+") # create new file
        
        # iterate the file line by line
        for line in tqdm(f.readlines()):
            word = line.strip() # remove extra white spaces from both side of the word
            # assign tag according to given critaira
            tag = tagger_df[tagger_df['Words'] == word]['Tags'].values[0] if (word in tagger_df['Words'].values) else 'NN'
                
            words.append(word) # append words
            tags.append(tag) # append tags
            pred_files.write(word+"/"+tag+"\n") # write words and their tags
    print("Successfuly Tagged POS Tags to Test Words! \U0001f600 \n\n")

# POS Tagger (Mode 0)

In [15]:
training_file = "data/pos-train.txt"
testing_file = 'data/pos-test.txt'
train_df = read_file(training_file)
tagger_df = train_tagger(train_df)
prediction_0(testing_file, tagger_df)

Start Reading File, pos-train.txt


100%|███████████████████████████████████████████████████████████████████| 1232377/1232377 [00:00<00:00, 1535617.97it/s]


Prepare Dataset for File pos-train.txt...
Successfuly Read and Prepare File, pos-train.txt 😀 


Start Training of POS Tagger....


100%|████████████████████████████████████████████████████████████████████████████| 50496/50496 [59:17<00:00, 14.19it/s]


Saving the Probabilities of Tagger..
Successfuly Train the POS Tagger! 😀 




In [106]:
prediction(testing_file, tagger_df)

Start POS Tagging of Test Words....


100%|███████████████████████████████████████████████████████████████████████████| 56824/56824 [08:32<00:00, 110.79it/s]

Successfuly Tagged POS Tags to Test Words! 😀 







# POS Tagger (Mode 1)

In [339]:
## NN filling rules
def is_special_character_only(word):
    regex = re.compile("[A-Za-z0-9]")
    if(regex.search(word) != None):
        return "NN" 
    regex = re.compile("[^A-Za-z0-9]")
    return re.findall(regex, word)[0] if(regex.search(word) != None) else "NN"

def is_digit_only(word):
    return "CD" if(re.match('^\d+(\.\d+)*$', word) != None) else "NN"

def is_hyphenated_digits(word):
    regex = re.compile(r'\d+(?:-\d+)+')
    return "NNP" if(regex.search(word) != None) else "NN"

def is_hyphenated_words(word):
    regex = re.compile(r'[a-zA-Z]+(?:-[a-zA-Z]+)+')
    return "JJ" if(regex.search(word) != None) else "NN"

def is_contain_year(word):
    string = 'year'
    return "JJ" if string in word and (len(string) < len(word)) else "NN"


# error removing rules
def is_selling(word, tag):
    return "NN" if word == "selling" and tag == "VBG" else tag

def is_calls(word, tag):
    return "NNS" if word == "calls" and tag == "VBZ" else tag

def is_fall(word, tag):
    return "VB" if word == "fall" and tag == "NN" else tag

def validate_last_character(word, tag):
    if word[-1] == "s" and tag == "NNPS":
        countries = ['Americans', 'Soviets', 'Olympics', 'Workers', 'Yankees', 'Greeks',
       'Germans', 'Moslems', 'Europeans', 'Jews', 'Republicans',
       'Democrats', 'Representatives', 'Treasurys']
        return tag if word in countries else "NNP"
    return tag


# prediction module using updated rules
def prediction_1(testing_file, tagger_df):
    print("Start POS Tagging of Test Words....")
    # open file
    with open(testing_file) as f:
        words, tags = [], []
        # remove previouslt existing file
        os.remove('data/pos-test-answers-1.txt') if os.path.exists('data/pos-test-answers-1.txt') else None
        pred_files = open("data/pos-test-answers-1.txt","w+") # create new file
        
        # iterate the file line by line
        for line in tqdm(f.readlines()):
            word = line.strip() # remove extra white spaces from both side of the word
            # assign tag according updated critaria of NN
            if word in tagger_df['Words'].values:
                tag = tagger_df[tagger_df['Words'] == word]['Tags'].values[0]
            else:
                tag = is_special_character_only(word)
                tag = is_digit_only(word) if tag == "NN" else tag
                tag = is_hyphenated_words(word) if tag == "NN" else tag
                tag = is_hyphenated_digits(word) if tag == "NN" else tag
                tag = is_contain_year(word) if tag == "NN" else tag
                
            # remove error by manual rules
            tag = is_selling(word, tag)
            tag = is_calls(word, tag)
            tag = is_fall(word, tag)
            tag = validate_last_character(word, tag)
                
            words.append(word) # append words
            tags.append(tag) # append tags
            pred_files.write(word+"/"+tag+"\n") # write words and their tags
    print("Successfuly Tagged POS Tags to Test Words! \U0001f600 \n\n")

In [340]:
training_file = "data/pos-train.txt"
testing_file = 'data/pos-test.txt'
# train_df = read_file(training_file)
# tagger_df = train_tagger(train_df)
tagger_df = pd.read_csv("tagger_df.csv")
prediction_1(testing_file, tagger_df)

Start POS Tagging of Test Words....


100%|███████████████████████████████████████████████████████████████████████████| 56824/56824 [05:07<00:00, 184.69it/s]

Successfuly Tagged POS Tags to Test Words! 😀 







# POS Tagger Evaluation

In [341]:
def read_tags(file_name):
    # open file
    with open(file_name) as f:
        tags = []
        words = []
        # ite rate the file line by line
        for line in f.readlines():
            # split the line by last / to seperate the word and tag
            word, tag = line.strip().rsplit('/', 1)
            tags.append(tag) # append in tags list
            words.append(word)
            
    df = pd.DataFrame([words, tags], index=['Words', 'Tags']).T # prepare dataset
    ff = str(file_name.rsplit('/', 1)[-1].rsplit('.', 1)[0])+".csv"
    df.to_csv(ff, index=False)
    
    return tags

def evaluate_tags(output_file, test_tags, pred_tags):
    # compute accuracy score segment
    accuracy=[]
    for i in range(len(test_tags)):
        accuracy.append(1) if test_tags[i]==pred_tags[i] else accuracy.append(0)
            
    acc_score = np.mean(accuracy)
    print("Accuracy Score: {}".format(acc_score))
    
    # calculate confusion metrix block
    tags_name = sorted(set(test_tags) | set (pred_tags))
    c = len(tags_name) # Number of classes 
    confusion_metrix_ = np.zeros((c, c))

    for i in range(len(test_tags)):
        confusion_metrix_[tags_name.index(pred_tags[i])][tags_name.index(test_tags[i])] += 1

    # write confusion metrix in a file
    os.remove(output_file) if os.path.exists(output_file) else None # remove previouslt existing file
    eval_file = open(output_file,"w+") # create new file
    
    for i in tqdm(range(0, len(tags_name))):
        preds = confusion_metrix_[i]
        true_false_positives = np.where(preds != 0)[0]
        for index in true_false_positives:
            eval_file.write(tags_name[i]+" "+tags_name[index]+" : "+str(int(preds[index]))+" \n")
    
    print("Confusion Metrix Results are Write Successfuly in {} \U0001f600 \n\n".format(output_file.rsplit('/', 1)[-1]))

In [342]:
test_tag_file = "data/pos-key.txt"
pred_tag_file = "data/pos-test-answers-1.txt"
output_file = 'data/pos-test-1-eval.txt'
test_tags = read_tags(test_tag_file)
pred_tags = read_tags(pred_tag_file)
evaluate_tags(output_file, test_tags, pred_tags)

Accuracy Score: 0.926386737998029


100%|███████████████████████████████████████████████████████████████████████████████| 45/45 [00:00<00:00, 44399.83it/s]

Confusion Metrix Results are Write Successfuly in pos-test-1-eval.txt 😀 





