In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

# POS Tagger (Mode 0)

In [105]:
def read_file(file_name):
    print("Start Reading File, {}".format(file_name.rsplit('/', 1)[-1]))
    
    # open file
    with open(training_file) as f:
        words, tags = [], []
        # iterate the file line by line
        for line in tqdm(f.readlines()):
            # split the line by last / to seperate the word and tag
            word, tag = line.strip().rsplit('/', 1)
            words.append(word) # append in words list
            tags.append(tag) # append in tags list
            
    print("Prepare Dataset for File {}...".format(file_name.rsplit('/', 1)[-1]))
    df = pd.DataFrame([words, tags], index=['Words', 'Tags']).T # prepare dataset
    print("Successfuly Read and Prepare File, {} \U0001f600 \n\n".format(file_name.rsplit('/', 1)[-1])) 
    return df

def train_tagger(dataframe):
    print("Start Training of POS Tagger....")
    
    tagger_words = []
    tagger_tags = []
    distinct_words = dataframe['Words'].unique()
    for word in tqdm(distinct_words):
        temp_df = dataframe[dataframe['Words'] == word]
        max_prob_tag = temp_df['Tags'].value_counts().index[0]
        tagger_words.append(word)
        tagger_tags.append(max_prob_tag)
        
    print("Saving the Probabilities of Tagger..")
    tagger_df = pd.DataFrame([tagger_words, tagger_tags], index=['Words', 'Tags']).T
    tagger_df.to_csv("tagger_df.csv", index=False)
    print("Successfuly Train the POS Tagger! \U0001f600 \n\n")
    return tagger_df

def prediction(testing_file, tagger_df):
    print("Start POS Tagging of Test Words....")
    # open file
    with open(testing_file) as f:
        words, tags = [], []
        # remove previouslt existing file
        os.remove('data/pos-test-answers-0.txt') if os.path.exists('data/pos-test-answers-0.txt') else None
        pred_files = open("data/pos-test-answers-0.txt","w+") # create new file
        
        # iterate the file line by line
        for line in tqdm(f.readlines()):
            word = line.strip() # remove extra white spaces from both side of the word
            # assign tag according to given critaira
            tag = tagger_df[tagger_df['Words'] == word]['Tags'].values[0] if (word in tagger_df['Words'].values) else 'NN'
                
            words.append(word) # append words
            tags.append(tag) # append tags
            pred_files.write(word+"/"+tag+"\n") # write words and their tags
    print("Successfuly Tagged POS Tags to Test Words! \U0001f600 \n\n")

In [15]:
training_file = "data/pos-train.txt"
testing_file = 'data/pos-test.txt'
train_df = read_file(training_file)
tagger_df = train_tagger(train_df)
prediction(testing_file, tagger_df)

Start Reading File, pos-train.txt


100%|███████████████████████████████████████████████████████████████████| 1232377/1232377 [00:00<00:00, 1535617.97it/s]


Prepare Dataset for File pos-train.txt...
Successfuly Read and Prepare File, pos-train.txt 😀 


Start Training of POS Tagger....


100%|████████████████████████████████████████████████████████████████████████████| 50496/50496 [59:17<00:00, 14.19it/s]


Saving the Probabilities of Tagger..
Successfuly Train the POS Tagger! 😀 




In [106]:
prediction(testing_file, tagger_df)

Start POS Tagging of Test Words....


100%|███████████████████████████████████████████████████████████████████████████| 56824/56824 [08:32<00:00, 110.79it/s]

Successfuly Tagged POS Tags to Test Words! 😀 







# POS Tagger (Mode 1)

In [206]:
df_1 = pd.read_csv("pos-test-answers-0.csv")
df_2 = pd.read_csv("pos-key.csv")

In [207]:
df_1['Tags_original'] = df_2['Tags']

In [210]:
df_1 = df_1[df_1["Tags"] == "NN"]
df_1 = df_1[df_1["Tags_original"] != df_1["Tags"]]

In [212]:
df_1 = df_1[df_1["Words"] == "fall"]
# df_2 = df_1[df_1["Tags_original"] != df_1["Tags"]]
df_1.shape

(10, 3)

In [177]:
df_1.to_csv("tags_VBG.csv", index=False)

In [176]:
import re
 
# Function checks if the string
# contains any special character
def is_special_character_only(word):
    regex = re.compile("[^A-Za-z0-9]")
    return True if(regex.search(word) != None) else False


def is_digit_only(word):
    return bool(re.match('^\d+(\.\d+)*$', word))

def is_hyphenated_digits(word):
    regex = re.compile(r'\d+(?:-\d+)+')
    return True if(regex.search(word) != None) else False

def is_hyphenated_words(word):
    regex = re.compile(r'[a-zA-Z]+(?:-[a-zA-Z]+)+')
    return True if(regex.search(word) != None) else False

def is_contain_year(word):
    string = 'year'
    return True if string in word and (len(string) < len(word)) else False

In [143]:
word = "%"
if is_special_character_only(word):
    print("ok")
else:
    print("okk")

word = '123'
is_digit_only(word)

word = '324-423329'
is_hyphenated_word(word)

word = '454-45423'
is_hyphenated_digits(word)

word = '454-45423'
is_hyphenated_digits(word)

word = "year"
is_contain_year(word)

ok


False

In [None]:
def is_selling(word, tag):
    

# POS Tagger Evaluation

In [13]:
def read_tags(file_name):
    # open file
    with open(file_name) as f:
        tags = []
        words = []
        # ite rate the file line by line
        for line in f.readlines():
            # split the line by last / to seperate the word and tag
            word, tag = line.strip().rsplit('/', 1)
            tags.append(tag) # append in tags list
#             words.append(word)
            
#     df = pd.DataFrame([words, tags], index=['Words', 'Tags']).T # prepare dataset
#     ff = str(file_name.rsplit('/', 1)[-1].rsplit('.', 1)[0])+".csv"
#     df.to_csv(ff, index=False)
    
    return tags

def evaluate_tags(output_file, test_tags, pred_tags):
    # compute accuracy score segment
    accuracy=[]
    for i in range(len(test_tags)):
        accuracy.append(1) if test_tags[i]==pred_tags[i] else accuracy.append(0)
            
    acc_score = np.mean(accuracy)
    print("Accuracy Score: {}".format(acc_score))
    
    # calculate confusion metrix block
    tags_name = sorted(set(test_tags))
    c = len(tags_name) # Number of classes 
    confusion_metrix_ = np.zeros((c, c))

    for i in range(len(test_tags)):
        confusion_metrix_[tags_name.index(pred_tags[i])][tags_name.index(test_tags[i])] += 1

    # write confusion metrix in a file
    os.remove(output_file) if os.path.exists(output_file) else None # remove previouslt existing file
    eval_file = open(output_file,"w+") # create new file
    
    for i in tqdm(range(0, len(tags_name))):
        preds = confusion_metrix_[i]
        true_false_positives = np.where(preds != 0)[0]
        for index in true_false_positives:
            eval_file.write(tags_name[i]+" "+tags_name[index]+" : "+str(int(preds[index]))+" \n")
    
    print("Confusion Metrix Results are Write Successfuly in {} \U0001f600 \n\n".format(output_file.rsplit('/', 1)[-1]))

In [14]:
test_tag_file = "data/pos-key.txt"
pred_tag_file = "data/pos-test-answers-0.txt"
output_file = 'data/pos-test-0-eval.txt'
test_tags = read_tags(test_tag_file)
pred_tags = read_tags(pred_tag_file)
# evaluate_tags(output_file, test_tags, pred_tags)