In [1]:
import os
from datetime import datetime
import nltk
import re
import string

In [3]:
# All inputs and artefacts for this experiment live under dataset/irish.
data_dir = os.path.join("dataset", "irish")

train_path = os.path.join(data_dir, "train.txt")              # raw training corpus
test_path = os.path.join(data_dir, "test.txt")                # sentences to score
clean_data_path = os.path.join(data_dir, "clean_train2.txt")  # normalised training corpus
saved_model_path = os.path.join(data_dir, "my_model_")        # prefix for saved models

In [4]:
# Read the raw training corpus into memory as a single string.
# The `with` block guarantees the file handle is closed, even if reading fails.
with open(train_path, 'r', encoding='utf8') as f:
    train_data = f.read()

In [5]:
def clean_data(data, out_path=None):
    """Normalise raw text line by line and write the result to a file.

    Each line of `data` (split on "\n") has every character in
    `string.punctuation` and every ASCII digit removed, is lower-cased, and
    has its whitespace collapsed to single spaces with no leading or trailing
    space. One output line is written per input line, so a trailing newline
    in `data` produces a final empty line in the output.

    Args:
        data: The raw text to clean.
        out_path: Destination file. Defaults to the module-level
            `clean_data_path` when omitted. The file is overwritten.

    Returns:
        None. The cleaned text is only written to disk.
    """
    if out_path is None:
        out_path = clean_data_path

    # Build the deletion table once instead of once per line.
    strip_table = str.maketrans('', '', string.punctuation + string.digits)

    # `with` closes the output file even if a write raises part-way through.
    with open(out_path, 'w', encoding='utf8') as cleaned_file_out:
        for line in data.split("\n"):
            # split()/join both trims the line and collapses inner whitespace.
            cleaned = " ".join(line.translate(strip_table).lower().split())
            cleaned_file_out.write(cleaned + '\n')

In [6]:
# Let's clean the data.
# clean_data() writes its result to clean_data_path and returns None, so its
# return value must not be assigned back to train_data: doing so would discard
# the loaded text and make this cell fail when it is run a second time.
clean_data(train_data)

In [7]:
# Read in the cleaned data written by clean_data().
# This must open clean_data_path (not train_path); otherwise the cleaning
# step has no effect on the model.
with open(clean_data_path, 'r', encoding='utf8') as f:
    train_data = f.read()

In [8]:
# Split the training text into word tokens; from here on train_data is a
# list of tokens rather than one string.
# "english" is word_tokenize's default language, spelled out here so the
# choice of tokenizer model is visible.
train_data = nltk.word_tokenize(train_data, language="english")

In [27]:
# Vocabulary size: the number of distinct tokens in the training data.
# It is the V term in the add-alpha (Laplace) smoothing denominator later on.
# Tokens are counted exactly as tokenised: no case folding and no
# alphabetic-only filtering is applied here.
V = len(set(train_data))

In [23]:
# Get bigrams (adjacent token pairs) of the training data.
# nltk.bigrams returns a one-shot generator; materialise it as a list so that
# cells which iterate over train_bigrams can be re-run without silently
# seeing an empty sequence.
train_bigrams = list(nltk.bigrams(train_data))

In [24]:
# Tally how many times each token occurs in the training data.
count_unigrams = {}

for token in train_data:
    if token in count_unigrams:
        count_unigrams[token] += 1
    else:
        count_unigrams[token] = 1

# Total number of token occurrences (sum of all per-token counts).
total_train_unigrams = sum(count_unigrams.values())


In [25]:
# Tally how many times each (first, second) token pair occurs in training.
count_bigrams = {}

for pair in train_bigrams:
    try:
        count_bigrams[pair] += 1
    except KeyError:
        # First time this pair is seen.
        count_bigrams[pair] = 1

# Total number of bigram occurrences (sum of all per-pair counts).
total_train_bigrams = sum(count_bigrams.values())
    

In [29]:
# Score every test sentence and write a submission file.
#
# Each test line is expected to contain exactly one "{option1|option2}" group.
# For both options we compute an add-alpha smoothed bigram score using the
# word immediately to the left and right of the group, and write the relative
# score of option1, i.e. score1 / (score1 + score2).

# Matches the first "{...}" group in a sentence; group(1) is its content.
CANDIDATE_GROUP = re.compile(r"\{(.*?)\}")

# add-alpha constant: when a = 1 this is Laplace Smoothing (Add-1 smoothing)
a = 1


def _edge_token(tokens, index):
    """Return the lower-cased token at `index`, or "" if `tokens` is empty."""
    return tokens[index].lower() if tokens else ""


def _candidate_score(w1, candidate, w2):
    """Smoothed score of `candidate` appearing between `w1` and `w2`.

    The score is P(candidate | w1) * P(w2 | candidate), where each factor is
    (bigram count + a) / (unigram count of the first word + a * V). Counts
    come from count_bigrams / count_unigrams; unseen entries count as 0.
    """
    p_left = (count_bigrams.get((w1, candidate), 0) + a) / (count_unigrams.get(w1, 0) + a * V)
    p_right = (count_bigrams.get((candidate, w2), 0) + a) / (count_unigrams.get(candidate, 0) + a * V)
    return p_left * p_right


# Make each prediction file different, so append current time to file name
currentDT = str(datetime.now())
currentDT = currentDT.replace(" ", "_").replace(":", "_")
file_name = "my_submission_" + currentDT + ".csv"

output_file_path = os.path.join("dataset", "irish", "submissions", file_name)

# Make sure the submissions directory exists before trying to create the file.
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

# The test file is opened first so that a missing input does not leave an
# empty submission behind. Both files are closed even if scoring fails.
with open(test_path, 'r', encoding='utf8') as test_file, \
        open(output_file_path, 'w', encoding='utf8') as file_out:

    # First write the header in the file, as given in the sample submission
    file_out.write("Id,Expected\n")

    # Ids are the 1-based line numbers of the test file.
    for counter, test_sentence in enumerate(test_file, start=1):
        test_sentence = test_sentence.strip()

        match = CANDIDATE_GROUP.search(test_sentence)
        pairs = match.group(1).split('|') if match else []
        if len(pairs) < 2:
            raise ValueError(
                "test line %d has no '{option1|option2}' group: %r"
                % (counter, test_sentence)
            )
        pair1 = pairs[0].lower()
        pair2 = pairs[1].lower()

        # Text before the opening brace and after the closing brace.
        words_left = nltk.word_tokenize(test_sentence[:match.start()].strip())
        words_right = nltk.word_tokenize(test_sentence[match.end():].strip())

        # Nearest context token on each side ("" when the group is at an edge).
        w1 = _edge_token(words_left, -1)
        w2 = _edge_token(words_right, 0)

        score1 = _candidate_score(w1, pair1, w2)
        score2 = _candidate_score(w1, pair2, w2)

        # Relative score of the first option against the second.
        prob = score1 / (score1 + score2)

        # Build each line and write it to the output file
        file_out.write(str(counter) + "," + str(prob) + "\n")