# Set up

hello


In [20]:
# !git clone https://github.com/NLP-Reichman/2025_assignment_1.git
# !mv 2025_assignment_1/data data
# !rm 2025_assignment_1/ -r

# Introduction
In this assignment you will be creating tools for learning and testing language models. The corpora that you will be working with are lists of tweets in 8 different languages that use the Latin script. The data is provided either formatted as CSV or as JSON, for your convenience. The end goal is to write a set of tools that can detect the language of a given tweet.
The relevant files are under the data folder:

- en.csv (or the equivalent JSON file)
- es.csv (or the equivalent JSON file)
- fr.csv (or the equivalent JSON file)
- in.csv (or the equivalent JSON file)
- it.csv (or the equivalent JSON file)
- nl.csv (or the equivalent JSON file)
- pt.csv (or the equivalent JSON file)
- tl.csv (or the equivalent JSON file)

In [21]:
import os
import json
# from google.colab import files
import pandas as pd
import numpy as np
import math
from itertools import product
import random


# Implementation

## Part 1
Implement the function *preprocess* that iterates over all the data files and creates a single vocabulary, containing all the tokens in the data. Our token definition is a single UTF-8 encoded character. So, the vocabulary list is a simple Python list of all the characters that you see at least once in the data. The vocabulary should include the `<start>` and    `<end>` tokens.

Note - do NOT lowecase the sentences.

In [22]:
def preprocess() -> list[str]:
	'''
	Return a list of characters, representing the shared vocabulary of all languages
	'''
	vocab = []
	data_files = [file for file in os.listdir('data') if file.endswith('.csv')]
	for file in data_files:
		with open(os.path.join('data', file), 'r') as f:
			data = pd.read_csv(f)
			for tweet in data['tweet_text']:
				vocab.extend(list(tweet))
	
	vocab.append('<start>')
	vocab.append('<end>')        
	vocab = list(set(vocab))
	
	return vocab

In [23]:
vocab = preprocess()
print(f"vocab length: {len(vocab)}")
print(f"Some characters in the vocab: {vocab[:10]}")


vocab length: 1804
Some characters in the vocab: ['⭐', '외', 'خ', '📲', '🤚', '王', '写', '🍒', '⛳', 'ė']


## Part 2
Implement the function *build_lm* that generates a language model from a textual corpus. The function should return a dictionary (representing a model) where the keys are all the relevant *n*-1 sequences, and the values are dictionaries with the *n*_th tokens and their corresponding probabilities to occur. To ensure consistent probabilities calculation, please add n-1 `<start>` tokens to the beginning of a tweet and one `<end>` token at the end. For example, for a trigram model (tokens are characters), it should look something like:

{ "ab":{"c":0.5, "b":0.25, "d":0.25}, "ca":{"a":0.2, "b":0.7, "d":0.1} }

which means for example that after the sequence "ab", there is a 0.5 chance that "c" will appear, 0.25 for "b" to appear and 0.25 for "d" to appear.

Note - You should think how to add the add_one smoothing information to the dictionary and implement it.

Please add the `<unk>` token with $p(<unk>)=1|V|$ to the LM if buiulding a smoothed LM.

In [24]:
from functools import lru_cache

@lru_cache
def get_all_tweets(lang: str,number_of_starts: int = 1) -> list[str]:
	data_files = [file for file in os.listdir('data') if file.endswith('.csv') and file.startswith(lang)]
	all_tweets = []
	for file in data_files:
		with open(os.path.join('data', file), 'r') as f:
			data = pd.read_csv(f)
			for tweet in data['tweet_text']:
				text = ['<start>']*number_of_starts + list(tweet) + ['<end>']
				all_tweets.append(text)
	return all_tweets


def get_ngrams(text: str, n: int) -> list[str]:
	ngrams = []
	for i in range(len(text) - n + 1):
		ngrams.append(text[i:i+n])
	return ngrams

def count_next_tokens(LM: dict[str, dict[str, float]], n: int,all_tweets: list[str]) -> dict[str, dict[str, float]]:
	n_ngrams = (get_ngrams(tweet, n) for tweet in all_tweets)
	for sentence in n_ngrams:
		for word in sentence:
			LM["".join(word[:-1])][word[-1]] += 1
	return LM

def add_unknown_tokens(LM: dict[str, dict[str, float]]) -> dict[str, dict[str, float]]:
	for key in LM:
		LM[key]['<unk>'] = 1 
	return LM


def normalize_LM(LM: dict[str, dict[str, float]]) -> dict[str, dict[str, float]]:
	for key in LM:
		total = sum(LM[key].values())
		for next_token in LM[key]:
			if total != 0:
				LM[key][next_token] /= total
	return LM

def build_1_gram_LM(vocab: list[str], smoothed: bool = False, lang: str = 'en') -> dict[str, dict[str, float]]:
	all_chars = []
	data_files = [file for file in os.listdir('data') if file.endswith('.csv') and file.startswith(lang)]
	for file in data_files:
		with open(os.path.join('data', file), 'r') as f:
			data = pd.read_csv(f)
			for tweet in data['tweet_text']:
				all_chars.extend(list(tweet))
	
	all_chars.append('<start>')
	all_chars.append('<end>')
	if smoothed:
		all_chars.append('<unk>')
		vocab.append('<unk>')
	all_chars = list(all_chars)

 
	vocab_prob = {char: all_chars.count(char)/len(all_chars) for char in vocab}
	LM = {char: vocab_prob.copy() for char in vocab}
	return LM


def build_lm(lang: str, n: int, smoothed: bool = False) -> dict[str, dict[str, float]]:
	'''
	Return a language model for the given lang and n_gram (n)
	:param lang: the language of the model
	:param n: the n_gram value
	:param smoothed: boolean indicating whether to apply smoothing
	:return: a dictionary where the keys are n_grams and the values are dictionaries
	'''
	LM = {}
	if n == 1:
		return build_1_gram_LM(vocab, smoothed, lang)
	all_tweets = get_all_tweets(lang,n-1)
	keys_t = (get_ngrams(tweet, n-1) for tweet in all_tweets)
	sentences = (ngram for ngram in keys_t)
	keys = []
	for sentence in sentences:
		keys.extend(sentence)
	
	
	vocab_prob = {char: int(smoothed) for char in vocab}
	if smoothed:
		keys.append('<unk>')
	LM = {"".join(key): vocab_prob.copy() for key in keys if key[-1] != '<end>'}
	if smoothed:
		LM = add_unknown_tokens(LM)
	LM = count_next_tokens(LM, n, all_tweets)
	LM = normalize_LM(LM)
	return LM
	


In [25]:
# def test_build_lm():
# 	return {
# 		'english_2_gram_length': len(build_lm('en', 2, True)),
# 		'english_3_gram_length': len(build_lm('en', 3, True)),
# 		'french_3_gram_length': len(build_lm('fr', 3, True)),
# 		'spanish_3_gram_length': len(build_lm('es', 3, True)),
# 	}
	
# def g_test_build_lm(results):
# 	if results["english_2_gram_length"] != 748:
# 		return f"English 2-gram length is {results['english_2_gram_length']}, expected 748"
# 	if results["english_3_gram_length"] != 8239:
# 		return f"English 3-gram length is {results['english_3_gram_length']}, expected 8239"
# 	if results["french_3_gram_length"] != 8286:
# 		return f"French 3-gram length is {results['french_3_gram_length']}, expected 8286"
# 	if results["spanish_3_gram_length"] != 8469:
# 		return f"Spanish 3-gram length is {results['spanish_3_gram_length']}, expected 8469"
# 	return 1

In [26]:
# # LM = build_lm("en", 3, False)
# # print(f"English Language Model with 3-gram is of length: {len(LM)}")
# results = test_build_lm()
# print(g_test_build_lm(results))

## Part 3
Implement the function *eval* that returns the perplexity of a model (dictionary) running over the data file of the given target language.

The `<unk>` should be used for unknown contexts when calculating the perplexities.

In [27]:
def perplexity(model: dict, text: list, n:int) -> float:
	'''
	Calculates the perplexity of the given string using the given language model.
	:param model: The language model
	:param text: The tokenized text to calculate the perplexity for
	:param n: The n-gram of the model
	:return: The perplexity
	'''
	pp = 0
	log_prob_sum = 0.0	
	count = 0

	keys_t = (get_ngrams(tweet, n) for tweet in text)
	sentences = (ngram for ngram in keys_t)
	keys = []
	for sentence in sentences:
		keys.extend(sentence)
	for ngram in keys:
		context = ''.join(ngram[:-1])
		next_char = ngram[-1]	
		context_probs = model.get(context, model['<unk>'])
		prob = context_probs.get(next_char, context_probs['<unk>'])
		if prob == 0:
			prob = context_probs['<unk>']
		log_prob_sum += math.log(prob)
		count += 1	
	if count == 0:
		return float('inf')
	return math.exp(-log_prob_sum / count)	


In [28]:
def eval(model: dict, target_lang: str, n: int) -> float:
	'''
	Return the perplexity value calculated over applying the model on the text file
	of the target_lang language.
	:param model: the language model
	:param target_lang: the target language
	:param n: The n-gram of the model
	:return: the perplexity value
	'''
	pp = 0

	all_tweets = get_all_tweets(target_lang, n)
	pp = perplexity(model, all_tweets, n)
	return pp


In [29]:
LM = build_lm("en", 3, True)

In [30]:
# print("Perplexity of the English 3-gram model on datasets:")
# print(f"On English: {eval(LM, 'en', 3): .2f}")
# print(f"On French: {eval(LM, 'fr', 3): .2f}")
# print(f"On Dutch: {eval(LM, 'nl', 3): .2f}")
# print(f"On Tagalog: {eval(LM, 'tl', 3): .2f}")


In [31]:
# lm1 = build_lm("en", 1, True)
# lm2 = build_lm("en", 2, True)
# lm3 = build_lm("en", 3, True)
# lm4 = build_lm("en", 4, True)

# print("Perplexity on differnet n-gram models on English")
# print(f"On 1-gram: {eval(lm1, 'en', 1): .2f}")
# print(f"On 2-gram: {eval(lm2, 'en', 2): .2f}")
# print(f"On 3-gram: {eval(lm3, 'en', 3): .2f}")
# print(f"On 4-gram: {eval(lm4, 'en', 4): .2f}")

## Part 4
Implement the *match* function that calls *eval* using a specific value of *n* for every possible language pair among the languages we have data for. You should call *eval* for every language pair four times, with each call assign a different value for *n* (1-4). Each language pair is composed of the source language and the target language. Before you make the call, you need to call the *lm* function to create the language model for the source language. Then you can call *eval* with the language model and the target language. The function should return a pandas DataFrame with the following four columns: *source_lang*, *target_lang*, *n*, *perplexity*. The values for the first two columns are the two-letter language codes. The value for *n* is the *n* you use for generating the specific perplexity values which you should store in the forth column.

In [32]:
languages = ['en', 'es', 'fr', 'in', 'it', 'nl', 'pt', 'tl']


In [33]:

import time
def match() -> pd.DataFrame:
    
	'''
	Return a DataFrame containing one line per every language pair and n_gram.
	Each line will contain the perplexity calculated when applying the language model
	of the source language on the text of the target language.
	:return: a DataFrame containing the perplexity values
	'''
	number_of_ngrams = 4
	df = pd.DataFrame()
	for lang in languages:
		for n in range(1, number_of_ngrams + 1):
			lm = build_lm(lang, n, True)
			for target_lang in languages:
				perplexity = eval(lm, target_lang, n)
				df = pd.concat([df, pd.DataFrame({'source': lang, 'target': target_lang, 'n': n, 'perplexity': perplexity}, index=[0])], ignore_index=True)
	
	df.to_csv('match_test.csv', index=False)


	return df


In [34]:
# def test_match():
#     df = match()
#     df.to_csv('match_test.csv', index=False)
#     # df = pd.read_csv('match_test.csv')
#     return {
#         'df_shape': df.shape,
#         'en_en_3': df[(df['source'] == 'en') & (df['target'] == 'en') & (df['n'] == 3)]['perplexity'].values[0],
#         'en_tl_3': df[(df['source'] == 'en') & (df['target'] == 'tl') & (df['n'] == 3)]['perplexity'].values[0],
#         'en_nl_3': df[(df['source'] == 'en') & (df['target'] == 'nl') & (df['n'] == 3)]['perplexity'].values[0],
#     }
# def test_match_grader():
#     results = test_match()
#     perplexity_en_on_en = int(results["en_en_3"])  
#     perplexity_en_on_tl = int(results["en_tl_3"])  
#     perplexity_en_on_nl = int(results["en_nl_3"])  

#     perplexities = [
#         perplexity_en_on_en,
#         perplexity_en_on_tl,
#         perplexity_en_on_nl
#     ]

#     if min(perplexities) != perplexity_en_on_en:
#         return f"English model should perform best on English text. Results: {results}"

#     if not (perplexity_en_on_en <= max(perplexity_en_on_tl, perplexity_en_on_nl)):
#         return f"Expected increasing perplexity from English to other languages. Results: {results}"

#     return 1

# print(test_match_grader())

## Part 5
Implement the *generate* function which takes a language code, *n*, the prompt (the starting text), the number of tokens to generate, and *r*, which is the random seed for any randomized action you plan to take in your implementation. The function should start generating tokens, one by one, using the language model of the given source language and *n*. The prompt should be used as a starting point for aligning on the probabilities to be used for generating the next token.

Note - The generation of the next token should be from the LM's distribution with NO smoothing.

In [35]:
def generate(lang: str, n: int, prompt: str, number_of_tokens: int, r: int) -> str:
	'''
	Generate text in the given language using the given parameters.
	:param lang: the language of the model
	:param n: the n_gram value
	:param prompt: the prompt to start the generation
	:param number_of_tokens: the number of tokens to generate
	:param r: the random seed to use
	'''

	lm = build_lm(lang, n, False)
	random.seed(r)
	for i in range(number_of_tokens):
		if n == 1:
			prev_n_gram = prompt[-1]
			next_token = np.random.choice(list(lm[prev_n_gram].keys()),p=list(lm[prev_n_gram].values()))
		else:
			prev_n_gram = prompt[-n+1:]
			next_token = np.random.choice(list(lm[prev_n_gram].keys()),p=list(lm[prev_n_gram].values()))
		if next_token == '<unk>':
			next_token = random.choice(list(lm['<unk>'].keys()))
		if next_token == '<end>':
			break
		prompt += next_token
	
	return prompt

## Part 6
Play with your generate function, try to generate different texts in different language and various values of *n*. No need to submit anything of that.

In [36]:
print(generate('en', 1, "I am", 10, 5))
print(generate('en', 2, "I am", 10, 5))
print(generate('en', 3, "I am", 10, 5))
print(generate('en', 4, "I am ", 10, 5))
print(generate('es', 2, "Soy", 10, 5))
print(generate('es', 3, "Soy", 10, 5))
print(generate('fr', 2, "Je suis", 10, 5))
print(generate('fr', 3, "Je suis", 10, 5))

I ama.eovz Yna
I ame @F9: ool
I amn calwankl
I am some to th
Soya #Ceo mas
Soy Tie. Mun 
Je suis ri e fli 
Je suis rensommes


# Testing

Copy the content of the **tests.py** file from the repo and paste below. This will create the results.json file and download it to your machine.

In [37]:
########################################
# PLACE TESTS HERE #
# Create tests
def test_preprocess():
    return {
        'vocab_length': len(preprocess()),
    }

def test_build_lm():
    return {
        'english_2_gram_length': len(build_lm('en', 2, True)),
        'english_3_gram_length': len(build_lm('en', 3, True)),
        'french_3_gram_length': len(build_lm('fr', 3, True)),
        'spanish_3_gram_length': len(build_lm('es', 3, True)),
    }

def test_eval():
    lm = build_lm('en', 3, True)
    return {
        'en_on_en': round(eval(lm, 'en', 3), 2),
        'en_on_fr': round(eval(lm, 'fr', 3), 2),
        'en_on_tl': round(eval(lm, 'tl', 3), 2),
        'en_on_nl': round(eval(lm, 'nl', 3), 2),
    }

def test_match():
    df = match()
    return {
        'df_shape': df.shape,
        'en_en_3': df[(df['source'] == 'en') & (df['target'] == 'en') & (df['n'] == 3)]['perplexity'].values[0],
        'en_tl_3': df[(df['source'] == 'en') & (df['target'] == 'tl') & (df['n'] == 3)]['perplexity'].values[0],
        'en_nl_3': df[(df['source'] == 'en') & (df['target'] == 'nl') & (df['n'] == 3)]['perplexity'].values[0],
    }

def test_generate():
    return {
        'english_2_gram': generate('en', 2, "I am", 20, 5),
        'english_3_gram': generate('en', 3, "I am", 20, 5),
        'english_4_gram': generate('en', 4, "I Love", 20, 5),
        'spanish_2_gram': generate('es', 2, "Soy", 20, 5),
        'spanish_3_gram': generate('es', 3, "Soy", 20, 5),
        'french_2_gram': generate('fr', 2, "Je suis", 20, 5),
        'french_3_gram': generate('fr', 3, "Je suis", 20, 5),
    }

TESTS = [test_preprocess, test_build_lm, test_eval, test_match, test_generate]


# Run tests and save results
res = {}
for test in TESTS:
    try:
        cur_res = test()
        res.update({test.__name__: cur_res})
    except Exception as e:
        res.update({test.__name__: repr(e)})

with open('results.json', 'w') as f:
    json.dump(res, f, indent=2)

# Download the results.json file
# files.download('results.json')

# Download the results.json file
# files.download('results.json')
########################################

In [38]:
# Show the local files, results.json should be there now and
# also downloaded to your local machine
!ls -l

total 152
-rw-r--r--@  1 galdavidi  staff  29340 Apr 27 22:43 Assignment_1.ipynb
-rw-r--r--@  1 galdavidi  staff   1056 Apr 16 15:27 README.md
-rw-r--r--@  1 galdavidi  staff   4267 Apr 16 15:27 auto_grader.py
drwxr-xr-x@ 18 galdavidi  staff    576 Apr 16 17:42 [1m[36mdata[m[m
-rw-r--r--@  1 galdavidi  staff   3606 Apr 16 18:40 debug.py
-rw-r--r--@  1 galdavidi  staff   5904 Apr 27 22:18 match.csv
-rw-r--r--@  1 galdavidi  staff   6747 Apr 27 22:52 match_test.csv
-rw-r--r--@  1 galdavidi  staff      5 Apr 16 15:27 notebook_link.txt
drwxr-xr-x@  7 galdavidi  staff    224 Apr 27 21:47 [1m[36mpart4_json[m[m
-rw-r--r--@  1 galdavidi  staff    877 Apr 27 22:53 results.json
-rw-r--r--@  1 galdavidi  staff   2073 Apr 16 15:27 tests.py
