/content/drive/MyDrive/NLP

In [3]:
import pandas as pd
import re
import csv
import codecs
import os
import sys
import time
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/NLP

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/NLP


Preprocessing Lambani Corpus (lambani.csv) and saving it to the lambani_preprocessed.csv

In [None]:
import pandas as pd
import re

lambani_word_pos_df = pd.read_csv('lambani.csv')
lambani_word_pos_df = lambani_word_pos_df.rename(columns={'Word':'word', 'POS tags': 'pos_tags'})
kannada_regex = re.compile(r'[\u0C80-\u0CFF]+')
regex_test = kannada_regex.findall('ಅರವಿಂದ karthik hey there ಒಲವು हिन्दी जिसके')

pos_alias_dict = {
    'conjunction': ['conjunction', 'conjunctin', 'conjuntion', 'coonjuction', 'cojunction', 'conjunctio', 'conjuncton', 'c'],
    'noun': ['noun', 'n', 'm', 'b'],
    'verb': ['verb', 'vernb', 'v', 'vn'],
    'adjective': ['adjective', 'adjectivev', 'adjctive', 'adj', 'j', 'adje', 'sdj', 'adjectve'],
    'adverb': ['adverb', 'adv', 'adveb', 'adc', 'adjverb', 'adverv'],
    'pronoun': ['pronoun', 'pronun', 'pro', 'pn', 'ppro', 'pto', 'pronnoun', 'prro'],
    'preposition': ['preposition', 'pre', 'prepositon', 'ppre', 'prepostion'],
    'interjection': ['interjection', 'interjectin', 'inetrjection']
}

pos_output_classes = ['adjective', 'adverb', 'conjunction', 'particle', 'noun', 'preposition', 'pronoun', 'verb', 'interjection']
pos_short = ['jj', 'rb', 'ccd', 'rpd', 'nn', 'psp', 'prp', 'vb', 'i']

valid_lambani_sentences: list[list[str]] = []
valid_sentences_pos_tags: list[list[str]] = []

current_sentence: list[str] = []
current_pos_tags: list[str] = []
is_sentence_valid = True

for index, row in lambani_word_pos_df.iterrows():
    if str(row['word']) == '<START>':
        is_sentence_valid = True
        current_sentence = []
        current_pos_tags = []
    elif str(row['word']) == '<END>':
        if is_sentence_valid and len(current_sentence) > 0:
            valid_lambani_sentences.append(current_sentence)
            valid_sentences_pos_tags.append(current_pos_tags)
    else:
        if is_sentence_valid == False:
            continue
        else:
            kannada_words: list = kannada_regex.findall(str(row['word']))
            if len(kannada_words) == 0:
                is_sentence_valid = False
            else:
                found_pos_tag = False
                for pos in pos_alias_dict:
                    if str(row['pos_tags']) in pos_alias_dict[pos]:
                        pos_tag = pos
                        found_pos_tag = True
                        break
                if found_pos_tag == True:
                    current_sentence.append(kannada_words[0])
                    current_pos_tags.append(pos_tag)
                else:
                    is_sentence_valid = False

# Save the preprocessed data into CSV without headers
with open('lambani_preprocessed.csv', 'w', encoding='utf-8-sig') as file:
    for sentence, tags in zip(valid_lambani_sentences, valid_sentences_pos_tags):
        file.write(' '.join(sentence) + ',' + ' '.join(tags) + '\n')

print("Preprocessed data saved to lambani_preprocessed.csv")


##Finding the unique POS tags present in the corpus

In [None]:
# Read the CSV file and extract unique POS tags
import csv
csv_file = 'lambani_preprocessed.csv'
unique_pos_tags = set()

with open(csv_file, 'r', encoding='utf-8') as file:
    next(file)  # Skip the header row
    for line in file:
        pos_tags = line.strip().split(',')[1].split()  # Assuming comma as delimiter
        unique_pos_tags.update(pos_tags)
print("Unique POS tags:", unique_pos_tags)

Unique POS tags: {'adverb', 'interjection', 'noun', 'verb', 'pronoun', 'preposition', 'adjective', 'conjunction'}


Splitting corpus into training and testing csv files

In [5]:
import pandas as pd
# Load the CSV file
data = pd.read_csv('lambani_preprocessed.csv')
# Select the last 100 lines for testing.csv
test_data = data.tail(100)
# Select the remaining lines for training.csv
train_data = data.iloc[:-100]
# Save the split data into separate CSV files
train_data.to_csv('training.csv', index=False)
test_data.to_csv('testing.csv', index=False)

In [9]:
import pandas as pd
df = pd.read_csv("training.csv", header=None)
# Function to convert POS tags to desired format
def convert_to_text(sentence, pos_tags):
    text = ""
    words = sentence.split()
    tags = pos_tags.split()
    for word, tag in zip(words, tags):
        text += f"{word} {tag}\n"
    return text
# Writing the converted text to a file
with open("lambani_training.txt", "w", encoding="utf-8") as f:
    for _, row in df.iterrows():
        sentence = row[0]  # Accessing the first column
        pos_tags = row[1]  # Accessing the second column
        f.write("<s> START\n")
        f.write(convert_to_text(sentence, pos_tags))
        f.write("</s> END\n")


NOTE: Keep lambani_testing.txt file in output folder

Output tagged text file lambani_tags.txt

In [11]:
%cd /content/drive/MyDrive/NLP
!python supervised.py 0 ./data/lambani_testing.txt

/content/drive/MyDrive/NLP
31.263862371444702 seconds for training
1.9558441638946533 seconds for testing 100 Sentences

Kindly check ./output/lambani_tags.txt file for POS tags.


## Calculating Accuracy

Firstly, preparing annotated testing file as ground truth

In [20]:
import csv

# Function to convert CSV data to desired format
def convert_csv_to_txt(csv_file, output_file):
    with open(csv_file, 'r', encoding='utf-8') as csvfile, open(output_file, 'w', encoding='utf-8') as txtfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            sentence = row[0].split()  # Split sentence into words
            pos_tags = row[1].split()  # Split POS tags
            annotated_sentence = []
            for word, pos in zip(sentence, pos_tags):
                annotated_sentence.append(f"{word}_{pos}")  # Combine word and POS tag
            txtfile.write(' '.join(annotated_sentence) + '\n')  # Write annotated sentence to file

# Provide input and output filenames
csv_input_file = 'testing.csv'
txt_output_file = 'annotated_testing.txt'

# Call the function to convert CSV to txt
convert_csv_to_txt(csv_input_file, txt_output_file)


Finding accuracy based on groundtruth (annotated_testing.txt) and output tagged file(lambani_tags.txt)

In [23]:
# Function to calculate precision
def precision(tp, fp):
    return tp / (tp + fp)

# Function to calculate recall
def recall(tp, fn):
    return tp / (tp + fn)

# Function to calculate F1 score
def f1_score(precision, recall):
    return 2 * (precision * recall) / (precision + recall)

# Function to calculate accuracy
def accuracy(tp, tn, fp, fn):
    return (tp + tn) / (tp + tn + fp + fn)


# Read output file and annotated test data
with open("output/lambani_tags.txt", "r", encoding="utf-8") as output_file:
    output_lines = output_file.readlines()

with open("annotated_testing.txt", "r", encoding="utf-8") as annotated_file:
    annotated_lines = annotated_file.readlines()

# Initialize counters
tp = tn = fp = fn = 0

# Iterate over lines and compare tagged words
for output_line, annotated_line in zip(output_lines, annotated_lines):
    output_tags = output_line.strip().split()[:-1]  # Ignore last character in each tag
    annotated_tags = annotated_line.strip().split()

    # Check if lengths match
    if len(output_tags) != len(annotated_tags):
        print("Error: Lengths of lines do not match")
        continue

    # Iterate over tags and compare
    for output_tag, annotated_tag in zip(output_tags, annotated_tags):
        output_word, output_pos = output_tag.split("_")
        annotated_word, annotated_pos = annotated_tag.split("_")

        if output_pos == annotated_pos:
            if output_pos == "noun":  # Considering only nouns for simplicity
                tp += 1
            else:
                tn += 1
        else:
            if output_pos == "noun":
                fp += 1
            else:
                fn += 1

# Calculate precision, recall, F1 score, and accuracy
prec = precision(tp, fp)
rec = recall(tp, fn)
f1 = f1_score(prec, rec)
acc = accuracy(tp, tn, fp, fn)

# Print results
print("Precision:", prec)
print("Recall:", rec)
print("F1 Score:", f1)
print("Accuracy:", acc)

Precision: 0.702020202020202
Recall: 0.5914893617021276
F1 Score: 0.6420323325635104
Accuracy: 0.7047619047619048
