In [10]:
### Youngjun Yu wordLangId2.ipynb ###

import os
import re
import math
from collections import defaultdict

# ------------------------------------------------------------
# Preprocessing and Update Function

def preprocess(sentence):
    """
    Preprocess a sentence by converting to lower-case and extracting only alphanumeric tokens. A word is defined as a contiguous sequence of letters and digits.
    """
    sentence = sentence.lower()
    tokens = re.findall(r'[a-z0-9]+', sentence)
    return tokens

def update_counts(sentence, unigram, bigram):
    """
    Update unigram and bigram counts for a given sentence. '<s>' and '</s>' are added to mark start and end.
    """
    tokens = preprocess(sentence)
    if not tokens:
        return
    tokens = ['<s>'] + tokens + ['</s>']
    for i in range(len(tokens)):
        unigram[tokens[i]] += 1
        if i != 0:
            bigram[(tokens[i-1], tokens[i])] += 1

# ------------------------------------------------------------
# Training the Model

def train_model(filepath):
    """
    Train a language model from the input.
    """
    unigram = defaultdict(int)
    bigram = defaultdict(int)
    with open(filepath, encoding='utf8') as f:
        for sentence in f:
            sentence = sentence.strip()
            if sentence:
                update_counts(sentence, unigram, bigram)
    return unigram, bigram

def get_vocab(unigram):
    """
    The vocabulary is the set of word tokens observed in the input data.
    """
    return set(unigram.keys())

# ------------------------------------------------------------
# Computing Bigram Log-Probability

def compute_log_prob(sentence, unigram, bigram, vocab):
    """
    Compute the log probability of a sentence using Good-Turing smoothing.
    """
    tokens = preprocess(sentence)
    if not tokens:
        return float('-inf')
    tokens = ['<s>'] + tokens + ['</s>']
    
    log_prob = 0.0
    
    k = 5  # threshold
    
    for i in range(1, len(tokens)):
        r = bigram.get((tokens[i-1], tokens[i]), 0)
        ft_context = bigram.get('ft', {}).get(tokens[i-1], None)
        denom = bigram.get('denom', {}).get(tokens[i-1], None)
        
        if ft_context is not None and denom is not None:
            if r < k and ft_context.get(r, 0) > 0 and ft_context.get(r+1, 0) > 0:
                prob = (r + 1) * (ft_context[r+1] / ft_context[r]) / denom
            else:
                prob = r / denom
        else:
            prob = (r + 1) / (unigram.get(tokens[i-1], 0) + len(vocab))
            
        if prob == 0:
            prob = 1e-10  # to avoid log(0)
        log_prob += math.log(prob)
        
    return log_prob

# ------------------------------------------------------------
# Compute Context Frequency-of-Frequency and Denominator for Good-Turing Smoothing

def compute_ft_denom(model):
    """
    For each previous token, compute a frequency-of-frequency table (context_ft) and the total adjusted count (denom).
    """
    k = 5  # threshold
    
    context_ft = {}
    denom = {}
    for prev in model['unigram'].keys():
        ft = defaultdict(int)
        for next in model['vocab']:
            r = model['bigram'].get((prev, next), 0)
            ft[r] += 1
        context_ft[prev] = dict(ft)
        
        total_adjusted = 0.0
        for next in model['vocab']:
            r = model['bigram'].get((prev, next), 0)
            if r < k and ft.get(r, 0) > 0 and ft.get(r+1, 0) > 0:
                total_adjusted += (r+1) * (ft[r+1] / ft[r])
            else:
                total_adjusted += r
        denom[prev] = total_adjusted
    return context_ft, denom

# ------------------------------------------------------------
# Main: Train Models

train_path = os.path.join("..", "Data", "Input")
models = {}

for lang in ['English', 'French', 'Italian']:
    filepath = os.path.join(train_path, 'LangId.train.' + lang)
    unigram, bigram = train_model(filepath)
    vocab = get_vocab(unigram)
    models[lang] = {'unigram': unigram, 'bigram': bigram, 'vocab': vocab}

# ------------------------------------------------------------
# Main: Test Models on Validation Data

test_file = os.path.join("..", "Data", "Validation", "LangId.test")
results = []

# Precompute Good-Turing data and Store in the Bigram Dictionary.
for lang in ['English', 'French', 'Italian']:
    ft, denom = compute_ft_denom(models[lang])
    models[lang]['bigram']['ft'] = ft
    models[lang]['bigram']['denom'] = denom

with open(test_file, encoding='utf8') as f:
    sentences = f.readlines()
    for idx, sentence in enumerate(sentences):
        sentence = sentence.strip()
        lang_probs = {}
        for lang in ['English', 'French', 'Italian']:
            model = models[lang]
            log_prob = compute_log_prob(sentence, model['unigram'], model['bigram'], model['vocab'])
            lang_probs[lang] = log_prob
        predicted_lang = max(lang_probs, key=lang_probs.get)
        results.append(f"{idx+1} {predicted_lang}")

# ------------------------------------------------------------
# Main: Save the Results

output_file = os.path.join("..", "Data", "Output", "wordLangId2.out")
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, 'w', encoding='utf8') as f:
    f.write("\n".join(results))

# ------------------------------------------------------------
# Main: Evaluate the Model by Computing Accuracy

solution_file = os.path.join("..", "Data", "Validation", "labels.sol")
with open(output_file, encoding='utf8') as f_out:
    output_sentences = []
    for sentence in f_out:
        sentence = sentence.strip()
        if sentence:
            output_sentences.append(sentence)

with open(solution_file, encoding='utf8') as f_sol:
    solution_sentences = []
    for sentence in f_sol:
        sentence = sentence.strip()
        if sentence:
            solution_sentences.append(sentence)

correct = 0
total = len(solution_sentences)
for output, sol in zip(output_sentences, solution_sentences):
    if output == sol:
        correct += 1

accuracy = correct / total * 100

print(f"Accuracy: {accuracy:.2f}%")


Accuracy: 99.00%
