# Unicode Character Extraction and Filtering
This notebook extracts Unicode characters from a file and filters words that contain only Albanian letters.


In [1]:
# Load the file and extract unique Unicode characters
def extract_unicode_chars(filename):
    with open(filename, encoding='utf-8') as file:
        text = file.read()
    return set(text)

# Example usage:
file_path = "sq-sample.txt"
unicode_chars = extract_unicode_chars(file_path)
print("Unique Unicode characters:", unicode_chars)


Unique Unicode characters: {'м', '義', '婚', '俳', '鯉', 'œ', 'Ż', 'щ', '郑', 'ஆ', '流', '3', 'Κ', 'ة', 'უ', '+', 'ă', 'چ', 'Ј', '正', '¤', '¢', '限', 'λ', 'ფ', 'ʃ', 'đ', '´', 'ḥ', 'დ', '정', 't', 'ჩ', '்', '説', '金', '虚', '7', 'ř', '本', '伎', 'タ', 'م', 'Z', '忠', 'ἵ', 'C', 'ひ', '餅', 'ʾ', 'ą', 'φ', 'Œ', 'რ', 'β', 'د', 'ک', 'カ', '美', '越', 'Τ', '雷', '南', 'あ', 'し', 'o', 'р', '京', '士', '狂', 'Μ', 'Ч', '盟', 'प', '宗', 'Е', 'ლ', 'K', 'í', 'グ', 'I', 'ا', 'ạ', 'N', 'Ē', '한', 'ニ', '《', 'ǐ', 'ァ', 'נ', '鬼', 'ध', 'ز', '語', 'ש', 'ὴ', 'ç', 'Á', 'h', 'Б', '浮', '훈', '上', 'ο', '≠', 'z', 'у', '揚', 'â', 'ツ', 'à', 'க', '%', 'б', 'ф', '丸', 'Ф', '⌂', '\uf0b0', '議', '）', 'Î', 'リ', 'ə', 'ع', 'п', '先', '≡', 'ං', 'Р', 'ת', '六', 'ო', 'ʔ', 'º', 's', 'ق', 'ı', 'a', 'k', 'ε', 'ﻭ', 'Г', 'И', 'ː', '方', 'ὶ', 'ض', '6', '子', '¾', '古', 'ώ', 'г', '獄', 'Д', '不', 'Ë', 'パ', '野', 'ῖ', 'ż', '月', 'j', 'ς', '迪', '伝', 'Ḫ', '省', 'Ε', 'ê', 'ガ', 'ण', 'י', '¬', '蜂', '良', '酌', '舞', 'ී', 'ପ', 'マ', 'ダ', 'வ', 'Ν', 'ν', 'É', '郎', 'ର', '(', '话', 'A', 'σ

In [2]:
# Define the allowed Albanian letters (both uppercase and lowercase)
allowed_chars = set("abcçdeëfghijklmnopqrstuvxyzABCÇDEËFGHIJKLMNOPQRSTUVXYZ")

def is_albanian(word):
    return all(char in allowed_chars for char in word)




In [3]:
# Define the allowed Albanian letters (both uppercase and lowercase)
allowed_chars = set("abcçdeëfghijklmnopqrstuvxyzABCÇDEËFGHIJKLMNOPQRSTUVXYZ")

def is_albanian(word):
    return all(char in allowed_chars for char in word)

    # Example usage:
test_words = ["shtëpi", "hello", "tungjatjeta", "12345","şörle"]
filtered_words = [word for word in test_words if is_albanian(word)]
print("Filtered Albanian words:", filtered_words)




Filtered Albanian words: ['shtëpi', 'hello', 'tungjatjeta']


In [4]:
import random
import re

# Load the file
def load_text(filename):
    with open(filename, encoding='utf-8') as file:
        text = file.read()
    return text

file_path = "sq-sample.txt"
text = load_text(file_path)

In [5]:
# Split text into sentences
sentences = re.split(r'(?<=[.!?])\s+', text)

# Shuffle the sentences randomly
random.shuffle(sentences)

# Reconstruct scrambled text
scrambled_text = ' '.join(sentences)

In [6]:
# Tokenization: split at whitespaces and convert to lowercase
tokens = re.split(r'\s+', scrambled_text.lower())

In [7]:
# Remove punctuation and digits
tokens_cleaned = [re.sub(r'[^a-zA-ZçÇëË]', '', token) for token in tokens if token.strip()]

In [8]:
# Filter words using is_albanian function
albanian_tokens = [word for word in tokens_cleaned if is_albanian(word)]

print(f"Total Albanian tokens: {len(albanian_tokens)}")

Total Albanian tokens: 1916075


In [9]:
# Save tokens to a text file
with open("albanian_tokens.txt", "w", encoding="utf-8") as file:
    for token in albanian_tokens:
        file.write(token + "\n")

print("Tokens saved to albanian_tokens.txt")


Tokens saved to albanian_tokens.txt


In [10]:
import csv

# Save tokens to a CSV file
with open("albanian_tokens.csv", "w", encoding="utf-8", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Token"])  # Add a header
    for token in albanian_tokens:
        writer.writerow([token])

print("Tokens saved to albanian_tokens.csv")


Tokens saved to albanian_tokens.csv


In [11]:
from collections import Counter

# Split into two roughly equal parts
midpoint = len(albanian_tokens) // 2
subcorpus1 = albanian_tokens[:midpoint]
subcorpus2 = albanian_tokens[midpoint:]

In [12]:
# Create frequency counters
counter1 = Counter(subcorpus1)
counter2 = Counter(subcorpus2)

In [13]:
# Print top-50 most common words in each subcorpus

print("Top-50 most common words in Subcorpus 1:")
print(counter1.most_common(50))

print("\nTop-50 most common words in Subcorpus 2:")
print(counter2.most_common(50))


Top-50 most common words in Subcorpus 1:
[('të', 59102), ('e', 49333), ('në', 40851), ('', 27005), ('dhe', 26163), ('i', 22272), ('një', 16847), ('me', 15007), ('për', 12860), ('nga', 12350), ('që', 9385), ('është', 9165), ('më', 8858), ('u', 8190), ('si', 6568), ('ka', 6061), ('së', 5552), ('tij', 4824), ('se', 4619), ('ai', 4285), ('ishte', 3924), ('edhe', 3783), ('te', 3780), ('nuk', 3446), ('duke', 3428), ('janë', 3303), ('shumë', 2966), ('vitin', 2780), ('prej', 2654), ('pas', 2495), ('por', 2313), ('do', 2308), ('ne', 2248), ('tyre', 2140), ('mund', 2083), ('saj', 2054), ('ajo', 2050), ('parë', 1923), ('gjatë', 1902), ('disa', 1817), ('këtë', 1771), ('ose', 1753), ('dy', 1739), ('ku', 1648), ('kjo', 1620), ('deri', 1608), ('kishte', 1549), ('kur', 1489), ('kanë', 1437), ('atë', 1336)]

Top-50 most common words in Subcorpus 2:
[('të', 58612), ('e', 48948), ('në', 40664), ('', 26996), ('dhe', 26301), ('i', 22164), ('një', 16992), ('me', 15208), ('për', 12879), ('nga', 12689), ('që'

In [14]:
# Print top-50 most common words in each subcorpus one by one
print("Top-50 most common words in Subcorpus 1:")
print(counter1.most_common(50))


Top-50 most common words in Subcorpus 1:
[('të', 59102), ('e', 49333), ('në', 40851), ('', 27005), ('dhe', 26163), ('i', 22272), ('një', 16847), ('me', 15007), ('për', 12860), ('nga', 12350), ('që', 9385), ('është', 9165), ('më', 8858), ('u', 8190), ('si', 6568), ('ka', 6061), ('së', 5552), ('tij', 4824), ('se', 4619), ('ai', 4285), ('ishte', 3924), ('edhe', 3783), ('te', 3780), ('nuk', 3446), ('duke', 3428), ('janë', 3303), ('shumë', 2966), ('vitin', 2780), ('prej', 2654), ('pas', 2495), ('por', 2313), ('do', 2308), ('ne', 2248), ('tyre', 2140), ('mund', 2083), ('saj', 2054), ('ajo', 2050), ('parë', 1923), ('gjatë', 1902), ('disa', 1817), ('këtë', 1771), ('ose', 1753), ('dy', 1739), ('ku', 1648), ('kjo', 1620), ('deri', 1608), ('kishte', 1549), ('kur', 1489), ('kanë', 1437), ('atë', 1336)]


In [15]:
print("\nTop-50 most common words in Subcorpus 2:")
print(counter2.most_common(50))


Top-50 most common words in Subcorpus 2:
[('të', 58612), ('e', 48948), ('në', 40664), ('', 26996), ('dhe', 26301), ('i', 22164), ('një', 16992), ('me', 15208), ('për', 12879), ('nga', 12689), ('që', 9330), ('është', 9231), ('më', 8911), ('u', 8027), ('si', 6667), ('ka', 6189), ('së', 5509), ('se', 4840), ('tij', 4834), ('ai', 4320), ('te', 3931), ('ishte', 3897), ('edhe', 3826), ('nuk', 3488), ('duke', 3397), ('janë', 3316), ('shumë', 2882), ('vitin', 2826), ('prej', 2525), ('pas', 2439), ('ne', 2311), ('por', 2311), ('do', 2190), ('mund', 2171), ('saj', 2135), ('tyre', 2125), ('ajo', 2063), ('gjatë', 1996), ('parë', 1987), ('këtë', 1815), ('ose', 1781), ('dy', 1774), ('disa', 1731), ('kjo', 1654), ('deri', 1654), ('ku', 1556), ('kanë', 1539), ('kishte', 1538), ('kur', 1529), ('atë', 1338)]


In [16]:
# Words unique to each subcorpus
unique_to_1 = set(counter1.keys()) - set(counter2.keys())
unique_to_2 = set(counter2.keys()) - set(counter1.keys())

print(f"\nWords unique to Subcorpus 1: {unique_to_1}")



Words unique to Subcorpus 1: {'hapërsirë', 'vinçencin', 'jovanoviq', 'septumi', 'shakespearean', 'atmes', 'pumpernickel', 'protagoras', 'strehohen', 'kosha', 'katma', 'kershtlindjeve', 'përlarja', 'ekstremisht', 'inquiry', 'zuccotto', 'hiperplasisë', 'imagjinojmë', 'res', 'beranës', 'zejtaret', 'zuffi', 'rishpikur', 'paralizuara', 'joui', 'kaihuang', 'kreisau', 'ligamente', 'ulqinak', 'abdrefievich', 'bubar', 'parmenidi', 'shëpiake', 'antilinear', 'njëkohësish', 'camie', 'antithemeluese', 'laurini', 'flourin', 'shkanë', 'bërrylat', 'allma', 'bashkëredaktuar', 'ripërtëriu', 'mieloproliferative', 'ngadhënjimin', 'alxherrah', 'penestëve', 'popovica', 'prejt', 'nasale', 'mrekulluar', 'nëmur', 'boccacciano', 'pinguli', 'tongës', 'flagrancë', 'liruesi', 'usllupçan', 'hechtle', 'vetëquajtën', 'junçaj', 'kokaines', 'latinizua', 'chevalier', 'heydrich', 'bruçaj', 'ndërqelizor', 'oqeanikekurse', 'atintanit', 'ortakëve', 'mirënjohjesh', 'krebs', 'pafundtë', 'sanitizuara', 'prekomura', 'vendvarri

In [17]:
# Words unique to each subcorpus
unique_to_1 = set(counter1.keys()) - set(counter2.keys())
unique_to_2 = set(counter2.keys()) - set(counter1.keys())

print(f"\nWords unique to Subcorpus 1: {unique_to_1}")



Words unique to Subcorpus 1: {'hapërsirë', 'vinçencin', 'jovanoviq', 'septumi', 'shakespearean', 'atmes', 'pumpernickel', 'protagoras', 'strehohen', 'kosha', 'katma', 'kershtlindjeve', 'përlarja', 'ekstremisht', 'inquiry', 'zuccotto', 'hiperplasisë', 'imagjinojmë', 'res', 'beranës', 'zejtaret', 'zuffi', 'rishpikur', 'paralizuara', 'joui', 'kaihuang', 'kreisau', 'ligamente', 'ulqinak', 'abdrefievich', 'bubar', 'parmenidi', 'shëpiake', 'antilinear', 'njëkohësish', 'camie', 'antithemeluese', 'laurini', 'flourin', 'shkanë', 'bërrylat', 'allma', 'bashkëredaktuar', 'ripërtëriu', 'mieloproliferative', 'ngadhënjimin', 'alxherrah', 'penestëve', 'popovica', 'prejt', 'nasale', 'mrekulluar', 'nëmur', 'boccacciano', 'pinguli', 'tongës', 'flagrancë', 'liruesi', 'usllupçan', 'hechtle', 'vetëquajtën', 'junçaj', 'kokaines', 'latinizua', 'chevalier', 'heydrich', 'bruçaj', 'ndërqelizor', 'oqeanikekurse', 'atintanit', 'ortakëve', 'mirënjohjesh', 'krebs', 'pafundtë', 'sanitizuara', 'prekomura', 'vendvarri

In [18]:
print(f"Words unique to Subcorpus 2: {unique_to_2}")

Words unique to Subcorpus 2: {'vakufeve', 'fiziologjik', 'bakteriofagjeve', 'rujenit', 'serezin', 'urinare', 'tinca', 'brahmotsavamët', 'antologjitë', 'lotsjellës', 'kinsey', 'niueans', 'bunge', 'barletta', 'qiellzës', 'inkorporuara', 'kosovainfest', 'veshshpuari', 'stanislavsky', 'misvaku', 'akhetaton', 'bruiser', 'bashkëfshatar', 'rabotnicki', 'lucky', 'finland', 'harxhimet', 'czechoslovak', 'proper', 'sipërfaqjen', 'ministira', 'mda', 'dembelizimin', 'zhveshurpa', 'yorkvrau', 'gislaved', 'darko', 'soyons', 'shumllojshmeria', 'korcës', 'rabini', 'ecni', 'deni', 'prodhimtarine', 'emphatics', 'ndërnyje', 'herca', 'shëtitores', 'lucretia', 'pulcheriopolis', 'naklij', 'maitreya', 'pinguinin', 'treblovës', 'vancouver', 'engelbrekt', 'fruit', 'paftali', 'dhembë', 'albanologë', 'shëruesi', 'zjodhi', 'vëllaznie', 'kaçinarit', 'oviparous', 'këshillo', 'keery', 'unifikuara', 'perëndivet', 'decentralizim', 'kartala', 'inkorporimi', 'sigjecë', 'rrënuara', 'lumët', 'minitelekamera', 'mitër', 'liq

In [19]:
import csv

# Get the maximum length to ensure both columns align
max_length = max(len(unique_to_1), len(unique_to_2))

# Convert sets to lists and pad shorter list with empty strings
unique_to_1_list = list(unique_to_1) + [""] * (max_length - len(unique_to_1))
unique_to_2_list = list(unique_to_2) + [""] * (max_length - len(unique_to_2))

# Save both lists into a single CSV file
with open("unique_words_subcorpora.csv", "w", encoding="utf-8", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Unique Words in Subcorpus 1", "Unique Words in Subcorpus 2"])  # Header
    for word1, word2 in zip(unique_to_1_list, unique_to_2_list):
        writer.writerow([word1, word2])

print("Unique words from both subcorpora saved to unique_words_subcorpora.csv")

Unique words from both subcorpora saved to unique_words_subcorpora.csv


In [20]:
from collections import defaultdict

def split_into_quantiles(counter, k=10):
    """
    Splits tokens in a frequency counter into quantiles based on cumulative probability.

    Args:
        counter (Counter): A Counter object containing token frequencies.
        k (int): The number of quantiles (e.g., k=10 for deciles, k=4 for quartiles).

    Returns:
        defaultdict: A dictionary mapping quantile numbers (1, 2, ..., k) to sets of tokens.
    """
    # Sort tokens by frequency in descending order
    sorted_tokens = sorted(counter.items(), key=lambda x: x[1], reverse=True)

    # Calculate total number of tokens to determine probabilities
    total_count = sum(counter.values())

    # Initialize cumulative probability and quantile mapping
    cumulative_probability = 0
    quantiles = defaultdict(set)

    # Iterate through sorted tokens, tracking cumulative probability
    for token, freq in sorted_tokens:
        cumulative_probability += freq / total_count  # Update cumulative probability
        
        # Determine which quantile the token belongs to
        quantile = min(k, int(cumulative_probability * k) + 1)
        quantiles[quantile].add(token)

    return quantiles

In [21]:
quantiles_subcorpus1 = split_into_quantiles(counter1, k=10)
quantiles_subcorpus2 = split_into_quantiles(counter2, k=10)

# Print the quantiles
print("Quantiles for Subcorpus 1:")
for quantile, tokens in quantiles_subcorpus1.items():
    print(f"Quantile {quantile}: {tokens}")

print("\nQuantiles for Subcorpus 2:")
for quantile, tokens in quantiles_subcorpus2.items():
    print(f"Quantile {quantile}: {tokens}")


Quantiles for Subcorpus 1:
Quantile 1: {'të'}
Quantile 2: {'', 'në', 'e'}
Quantile 3: {'i', 'një', 'dhe', 'për', 'me', 'nga'}
Quantile 4: {'shumë', 'tij', 'së', 'se', 'vitin', 'te', 'prej', 'nuk', 'edhe', 'që', 'si', 'ka', 'duke', 'është', 'u', 'më', 'pas', 'ai', 'ishte', 'janë'}
Quantile 5: {'bë', 'dy', 'tek', 'bërë', 'madhe', 'do', 'ashtu', 'por', 'para', 'shqiptare', 'pjesë', 'kur', 'pasi', 'km', 'vend', 'saj', 'mënyrë', 'ata', 'sipas', 'dytë', 'gjitha', 'jetë', 'viteve', 'lartë', 'tjetër', 'kishin', 'luftës', 'kësaj', 'sa', 'fund', 'qenë', 'çdo', 'viti', 'kanë', 'mund', 'duhet', 'banorë', 'mbi', 'ky', 'disa', 'kundër', 'vitet', 'ndaj', 'cili', 'nën', 'cilat', 'ishin', 'këtë', 'ne', 'përbëhet', 'qe', 'ta', 'kështu', 'kishte', 'tyre', 'midis', 'ti', 'madh', 'këto', 'deri', 'tjera', 'komuna', 'apo', 'tjerë', 'ose', 'popullsi', 'rreth', 'ato', 'po', 'nje', 'jo', 'vonë', 'ndërsa', 'sipërfaqe', 'pa', 'pak', 'ndryshme', 'kohë', 'mirë', 'parë', 'cila', 'shkak', 'vitit', 'pasur', 'atë', 'ak

In [22]:
import csv

# Save quantile results to a CSV file
with open("quantile_results.csv", "w", encoding="utf-8", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Quantile", "Subcorpus 1 Tokens", "Subcorpus 2 Tokens"])  # Header

    # Get maximum number of tokens per quantile to align columns
    max_quantiles = max(len(quantiles_subcorpus1), len(quantiles_subcorpus2))
    
    for q in range(1, max_quantiles + 1):
        tokens1 = ", ".join(quantiles_subcorpus1.get(q, set()))  # Convert to string
        tokens2 = ", ".join(quantiles_subcorpus2.get(q, set()))  # Convert to string
        writer.writerow([q, tokens1, tokens2])

print("Quantile results saved to quantile_results.csv")

Quantile results saved to quantile_results.csv


In [23]:
import pandas as pd  # For table representation

# Compute overlap sizes between corresponding decile subsets
decile_overlap = {}

for q in range(1, 11):  # Since we have deciles (k=10)
    overlap_size = len(quantiles_subcorpus1.get(q, set()) & quantiles_subcorpus2.get(q, set()))
    decile_overlap[q] = overlap_size

# Convert results into a DataFrame for better formatting
overlap_table = pd.DataFrame(list(decile_overlap.items()), columns=["Decile", "Overlap Size"])
print(overlap_table)


   Decile  Overlap Size
0       1             1
1       2             3
2       3             6
3       4            20
4       5            96
5       6           427
6       7          1235
7       8          3094
8       9          8926
9      10         22246


In [24]:
# Merge token frequency counters from both subcorpora
combined_counter = counter1 + counter2

# Sort tokens by frequency
sorted_tokens = sorted(combined_counter.items(), key=lambda x: x[1], reverse=True)

# Calculate cumulative probability for different coverage thresholds
total_tokens = sum(combined_counter.values())
coverage_thresholds = [0.2, 0.5, 0.7, 0.8, 0.9]  # 20%, 50%, 70%, 80%, 90%
word_counts_needed = {}

cumulative_probability = 0
words_seen = 0

for token, freq in sorted_tokens:
    cumulative_probability += freq / total_tokens
    words_seen += 1

    # Check if we reach a coverage threshold
    for threshold in coverage_thresholds:
        if threshold not in word_counts_needed and cumulative_probability >= threshold:
            word_counts_needed[threshold] = words_seen

# Convert results into a table format
coverage_table = pd.DataFrame(list(word_counts_needed.items()), columns=["Coverage Percentage", "Words Needed"])
print(coverage_table)


   Coverage Percentage  Words Needed
0                  0.2             5
1                  0.5           131
2                  0.7          2122
3                  0.8          6439
4                  0.9         22676
