# Unicode Character Extraction and Filtering
This notebook extracts Unicode characters from a file and filters words that contain only Albanian letters.


In [1]:
# Load the file and extract unique Unicode characters
def extract_unicode_chars(filename):
    with open(filename, encoding='utf-8') as file:
        text = file.read()
    return set(text)

# Example usage:
file_path = "sq-sample.txt"
unicode_chars = extract_unicode_chars(file_path)
print("Unique Unicode characters:", unicode_chars)


Unique Unicode characters: {'9', '冷', 'Ō', 'ÿ', '騎', '吳', '.', '厚', 'ண', 'U', 'ボ', 'у', '美', '不', '会', '下', '夫', '~', '切', 'ニ', 'ש', '塔', 'ッ', '社', '抜', 'ń', 'А', 'd', '獄', 'ශ', '₄', '鯱', 'ø', 'ூ', 'Í', 'ḏ', 'ﻦ', '生', 'ھ', 'ム', 'ϝ', 'ч', '代', 'i', '天', '‐', 'ı', 'É', 'イ', 'ú', 'Ȅ', 'C', 'j', 'ス', '³', '，', '興', 'Å', 'A', 'm', 'ḍ', '嘯', 'υ', 'Q', 'ἀ', '文', 'ἠ', 'ق', 'ზ', 'С', '遊', 'ɛ', 'ウ', 'ἐ', 'あ', '卸', 'y', '栗', 'ì', 'ი', 'テ', 'מ', 'α', 'å', 'け', 'ま', 'F', 'Ć', 'ă', 'ه', 'ü', '擺', 'ر', 'В', '8', '«', 'ء', '’', 'х', '術', '‘', '説', '邸', 'à', 'カ', 'フ', 'ो', 'G', 'н', 'Ž', '宮', 'Š', 'ʏ', 'ର', 'ś', 'ا', '4', 'в', '话', 'Ł', '限', 'Ç', 'Ħ', 'Μ', 'ラ', '札', 'ł', '三', '5', 'Ë', '0', 'д', 'ص', ')', '仁', '板', '‰', '奈', 'ර', '餅', '₂', 'ј', 'ë', 'プ', 'Х', 'ν', '́', 'ň', 'Y', '碓', '方', '雄', '金', '”', 'ṭ', 'ƒ', 'ギ', '議', '鍵', 'ோ', '्', '嵐', 'ע', 'バ', 'ب', '講', '¾', 'ح', '敢', 'û', '蛍', 'ạ', '）', 'ﻭ', 'ක', 'Β', '相', 'ḥ', '先', 'Θ', '浮', '—', 'ك', 'ἵ', '්', 'ة', 'வ', 'ु', 'г', 'β', 't', 'ソ', ';', 'Ș', 'ვ

In [2]:
# Define the allowed Albanian letters (both uppercase and lowercase)
allowed_chars = set("abcçdeëfghijklmnopqrstuvxyzABCÇDEËFGHIJKLMNOPQRSTUVXYZ")

def is_albanian(word):
    return all(char in allowed_chars for char in word)




In [3]:
# Define the allowed Albanian letters (both uppercase and lowercase)
allowed_chars = set("abcçdeëfghijklmnopqrstuvxyzABCÇDEËFGHIJKLMNOPQRSTUVXYZ")

def is_albanian(word):
    return all(char in allowed_chars for char in word)

    # Example usage:
test_words = ["shtëpi", "hello", "tungjatjeta", "12345","şörle"]
filtered_words = [word for word in test_words if is_albanian(word)]
print("Filtered Albanian words:", filtered_words)




Filtered Albanian words: ['shtëpi', 'hello', 'tungjatjeta']


In [4]:
import random
import re

# Load the file
def load_text(filename):
    with open(filename, encoding='utf-8') as file:
        text = file.read()
    return text

file_path = "sq-sample.txt"
text = load_text(file_path)

In [5]:
# Split text into sentences
sentences = re.split(r'(?<=[.!?])\s+', text)

# Shuffle the sentences randomly
random.shuffle(sentences)

# Reconstruct scrambled text
scrambled_text = ' '.join(sentences)

In [6]:
# Tokenization: split at whitespaces and convert to lowercase
tokens = re.split(r'\s+', scrambled_text.lower())

In [7]:
# Remove punctuation and digits
tokens_cleaned = [re.sub(r'[^a-zA-ZçÇëË]', '', token) for token in tokens if token.strip()]

In [8]:
# Filter words using is_albanian function
albanian_tokens = [word for word in tokens_cleaned if is_albanian(word)]

print(f"Total Albanian tokens: {len(albanian_tokens)}")

Total Albanian tokens: 1916075


In [9]:
# Save tokens to a text file
with open("albanian_tokens.txt", "w", encoding="utf-8") as file:
    for token in albanian_tokens:
        file.write(token + "\n")

print("Tokens saved to albanian_tokens.txt")


Tokens saved to albanian_tokens.txt


In [10]:
import csv

# Save tokens to a CSV file
with open("albanian_tokens.csv", "w", encoding="utf-8", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Token"])  # Add a header
    for token in albanian_tokens:
        writer.writerow([token])

print("Tokens saved to albanian_tokens.csv")


Tokens saved to albanian_tokens.csv


In [11]:
from collections import Counter

# Split into two roughly equal parts
midpoint = len(albanian_tokens) // 2
subcorpus1 = albanian_tokens[:midpoint]
subcorpus2 = albanian_tokens[midpoint:]

In [12]:
# Create frequency counters
counter1 = Counter(subcorpus1)
counter2 = Counter(subcorpus2)

In [13]:
# Print top-50 most common words in each subcorpus

print("Top-50 most common words in Subcorpus 1:")
print(counter1.most_common(50))

print("\nTop-50 most common words in Subcorpus 2:")
print(counter2.most_common(50))


Top-50 most common words in Subcorpus 1:
[('të', 58923), ('e', 49167), ('në', 40912), ('', 27169), ('dhe', 26192), ('i', 22235), ('një', 16848), ('me', 15112), ('për', 13055), ('nga', 12498), ('që', 9378), ('është', 9240), ('më', 8900), ('u', 8037), ('si', 6583), ('ka', 6178), ('së', 5540), ('se', 4815), ('tij', 4798), ('ai', 4281), ('ishte', 3942), ('te', 3765), ('edhe', 3760), ('duke', 3420), ('nuk', 3420), ('janë', 3226), ('shumë', 2941), ('vitin', 2808), ('prej', 2580), ('pas', 2511), ('do', 2342), ('ne', 2283), ('por', 2225), ('mund', 2177), ('tyre', 2120), ('saj', 2079), ('ajo', 2027), ('parë', 1965), ('gjatë', 1942), ('këtë', 1793), ('disa', 1768), ('ose', 1766), ('dy', 1738), ('kjo', 1694), ('deri', 1609), ('ku', 1583), ('kishte', 1526), ('kur', 1524), ('kanë', 1475), ('atë', 1331)]

Top-50 most common words in Subcorpus 2:
[('të', 58791), ('e', 49114), ('në', 40603), ('', 26832), ('dhe', 26272), ('i', 22201), ('një', 16991), ('me', 15103), ('për', 12684), ('nga', 12541), ('që'

In [14]:
# Print top-50 most common words in each subcorpus one by one
print("Top-50 most common words in Subcorpus 1:")
print(counter1.most_common(50))


Top-50 most common words in Subcorpus 1:
[('të', 58923), ('e', 49167), ('në', 40912), ('', 27169), ('dhe', 26192), ('i', 22235), ('një', 16848), ('me', 15112), ('për', 13055), ('nga', 12498), ('që', 9378), ('është', 9240), ('më', 8900), ('u', 8037), ('si', 6583), ('ka', 6178), ('së', 5540), ('se', 4815), ('tij', 4798), ('ai', 4281), ('ishte', 3942), ('te', 3765), ('edhe', 3760), ('duke', 3420), ('nuk', 3420), ('janë', 3226), ('shumë', 2941), ('vitin', 2808), ('prej', 2580), ('pas', 2511), ('do', 2342), ('ne', 2283), ('por', 2225), ('mund', 2177), ('tyre', 2120), ('saj', 2079), ('ajo', 2027), ('parë', 1965), ('gjatë', 1942), ('këtë', 1793), ('disa', 1768), ('ose', 1766), ('dy', 1738), ('kjo', 1694), ('deri', 1609), ('ku', 1583), ('kishte', 1526), ('kur', 1524), ('kanë', 1475), ('atë', 1331)]


In [15]:
print("\nTop-50 most common words in Subcorpus 2:")
print(counter2.most_common(50))


Top-50 most common words in Subcorpus 2:
[('të', 58791), ('e', 49114), ('në', 40603), ('', 26832), ('dhe', 26272), ('i', 22201), ('një', 16991), ('me', 15103), ('për', 12684), ('nga', 12541), ('që', 9337), ('është', 9156), ('më', 8869), ('u', 8180), ('si', 6652), ('ka', 6072), ('së', 5521), ('tij', 4860), ('se', 4644), ('ai', 4324), ('te', 3946), ('ishte', 3879), ('edhe', 3849), ('nuk', 3514), ('duke', 3405), ('janë', 3393), ('shumë', 2907), ('vitin', 2798), ('prej', 2599), ('pas', 2423), ('por', 2399), ('ne', 2276), ('do', 2156), ('tyre', 2145), ('saj', 2110), ('ajo', 2086), ('mund', 2077), ('gjatë', 1956), ('parë', 1945), ('këtë', 1793), ('disa', 1780), ('dy', 1775), ('ose', 1768), ('deri', 1653), ('ku', 1621), ('kjo', 1580), ('kishte', 1561), ('kanë', 1501), ('kur', 1494), ('atë', 1343)]


In [16]:
# Words unique to each subcorpus
unique_to_1 = set(counter1.keys()) - set(counter2.keys())
unique_to_2 = set(counter2.keys()) - set(counter1.keys())

print(f"\nWords unique to Subcorpus 1: {unique_to_1}")



Words unique to Subcorpus 1: {'fanatic', 'epilogu', 'sllavishtjaavic', 'cilësoheshin', 'bukuren', 'fdd', 'sunnite', 'kristja', 'begum', 'lëngëzorë', 'iatrogenic', 'njerëzitpeshq', 'bases', 'variolës', 'sucka', 'aulick', 'lakimi', 'shipckes', 'breech', 'shejbes', 'jylland', 'pensionistëve', 'prozen', 'rimbau', 'vector', 'shjta', 'kokteji', 'tangsilla', 'jashtëbotë', 'intonacionit', 'mccartney', 'çertifikate', 'tiganëve', 'tolstojët', 'proteza', 'harxhim', 'orenditë', 'dëmtonin', 'kelelës', 'paleontologjistë', 'kolgecaj', 'kvha', 'celësit', 'injektojnë', 'oblastima', 'cln', 'pasuronte', 'prononcuan', 'përforcojnë', 'istaias', 'antlers', 'ngulmin', 'shkroj', 'shqipet', 'kaosin', 'thonjeve', 'transvestizmin', 'bënjës', 'presidential', 'parnassum', 'pasagjereve', 'lirijon', 'diodori', 'alegar', 'plaines', 'giordano', 'amaratit', 'logjikes', 'terbinafine', 'feto', 'shumëllojshmëria', 'lennonu', 'vejsel', 'xcm', 'udherrefyes', 'stonington', 'provim', 'gjalpa', 'mëshiroheni', 'sabotatorësh', 

In [17]:
# Words unique to each subcorpus
unique_to_1 = set(counter1.keys()) - set(counter2.keys())
unique_to_2 = set(counter2.keys()) - set(counter1.keys())

print(f"\nWords unique to Subcorpus 1: {unique_to_1}")



Words unique to Subcorpus 1: {'fanatic', 'epilogu', 'sllavishtjaavic', 'cilësoheshin', 'bukuren', 'fdd', 'sunnite', 'kristja', 'begum', 'lëngëzorë', 'iatrogenic', 'njerëzitpeshq', 'bases', 'variolës', 'sucka', 'aulick', 'lakimi', 'shipckes', 'breech', 'shejbes', 'jylland', 'pensionistëve', 'prozen', 'rimbau', 'vector', 'shjta', 'kokteji', 'tangsilla', 'jashtëbotë', 'intonacionit', 'mccartney', 'çertifikate', 'tiganëve', 'tolstojët', 'proteza', 'harxhim', 'orenditë', 'dëmtonin', 'kelelës', 'paleontologjistë', 'kolgecaj', 'kvha', 'celësit', 'injektojnë', 'oblastima', 'cln', 'pasuronte', 'prononcuan', 'përforcojnë', 'istaias', 'antlers', 'ngulmin', 'shkroj', 'shqipet', 'kaosin', 'thonjeve', 'transvestizmin', 'bënjës', 'presidential', 'parnassum', 'pasagjereve', 'lirijon', 'diodori', 'alegar', 'plaines', 'giordano', 'amaratit', 'logjikes', 'terbinafine', 'feto', 'shumëllojshmëria', 'lennonu', 'vejsel', 'xcm', 'udherrefyes', 'stonington', 'provim', 'gjalpa', 'mëshiroheni', 'sabotatorësh', 

In [18]:
import csv

# Get the maximum length to ensure both columns align
max_length = max(len(unique_to_1), len(unique_to_2))

# Convert sets to lists and pad shorter list with empty strings
unique_to_1_list = list(unique_to_1) + [""] * (max_length - len(unique_to_1))
unique_to_2_list = list(unique_to_2) + [""] * (max_length - len(unique_to_2))

# Save both lists into a single CSV file
with open("unique_words_subcorpora.csv", "w", encoding="utf-8", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Unique Words in Subcorpus 1", "Unique Words in Subcorpus 2"])  # Header
    for word1, word2 in zip(unique_to_1_list, unique_to_2_list):
        writer.writerow([word1, word2])

print("Unique words from both subcorpora saved to unique_words_subcorpora.csv")

Unique words from both subcorpora saved to unique_words_subcorpora.csv


In [19]:
from collections import defaultdict

def split_into_quantiles(counter, k=10):
    """
    Splits tokens in a frequency counter into quantiles based on cumulative probability.

    Args:
        counter (Counter): A Counter object containing token frequencies.
        k (int): The number of quantiles (e.g., k=10 for deciles, k=4 for quartiles).

    Returns:
        defaultdict: A dictionary mapping quantile numbers (1, 2, ..., k) to sets of tokens.
    """
    # Sort tokens by frequency in descending order
    sorted_tokens = sorted(counter.items(), key=lambda x: x[1], reverse=True)

    # Calculate total number of tokens to determine probabilities
    total_count = sum(counter.values())

    # Initialize cumulative probability and quantile mapping
    cumulative_probability = 0
    quantiles = defaultdict(set)

    # Iterate through sorted tokens, tracking cumulative probability
    for token, freq in sorted_tokens:
        cumulative_probability += freq / total_count  # Update cumulative probability
        
        # Determine which quantile the token belongs to
        quantile = min(k, int(cumulative_probability * k) + 1)
        quantiles[quantile].add(token)

    return quantiles

In [20]:
quantiles_subcorpus1 = split_into_quantiles(counter1, k=10)
quantiles_subcorpus2 = split_into_quantiles(counter2, k=10)

# Print the quantiles
print("Quantiles for Subcorpus 1:")
for quantile, tokens in quantiles_subcorpus1.items():
    print(f"Quantile {quantile}: {tokens}")

print("\nQuantiles for Subcorpus 2:")
for quantile, tokens in quantiles_subcorpus2.items():
    print(f"Quantile {quantile}: {tokens}")


Quantiles for Subcorpus 1:
Quantile 1: {'të'}
Quantile 2: {'', 'në', 'e'}
Quantile 3: {'një', 'i', 'dhe', 'nga', 'për', 'me'}
Quantile 4: {'janë', 'prej', 'tij', 'ai', 'vitin', 'më', 'ishte', 'u', 'edhe', 'që', 'ka', 'duke', 'pas', 'te', 'shumë', 'si', 'është', 'se', 'së', 'nuk'}
Quantile 5: {'ndaj', 'midis', 'ta', 'para', 'mund', 'madhe', 'gjithë', 'luftës', 'sepse', 'pasur', 'ndërsa', 'ne', 'tre', 'gjithashtu', 'ku', 'lartë', 'vitit', 'nën', 'banorë', 'kishin', 'dytë', 'tek', 'fundit', 'ata', 'km', 'filloi', 'vonë', 'sipërfaqe', 'kishte', 'do', 'rreth', 'mënyrë', 'kështu', 'deri', 'fund', 'sot', 'jo', 'qe', 'saj', 'kundër', 'kësaj', 'kjo', 'bërë', 'bë', 'sipas', 'gjitha', 'madh', 'cila', 'përbëhet', 'komuna', 'shkak', 'pa', 'pjesë', 'vend', 'parë', 'nje', 'mori', 'atë', 'pasi', 'cilat', 'mes', 'po', 'kanë', 'tyre', 'duhet', 'tjetër', 'ajo', 'jetë', 'pak', 'kur', 'aktualisht', 'ishin', 'viti', 'ti', 'ky', 'kohë', 'nëse', 'shqiptare', 'vetëm', 'ato', 'gjatë', 'herë', 'këto', 'mbi', 'di

In [21]:
import csv

# Save quantile results to a CSV file
with open("quantile_results.csv", "w", encoding="utf-8", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Quantile", "Subcorpus 1 Tokens", "Subcorpus 2 Tokens"])  # Header

    # Get maximum number of tokens per quantile to align columns
    max_quantiles = max(len(quantiles_subcorpus1), len(quantiles_subcorpus2))
    
    for q in range(1, max_quantiles + 1):
        tokens1 = ", ".join(quantiles_subcorpus1.get(q, set()))  # Convert to string
        tokens2 = ", ".join(quantiles_subcorpus2.get(q, set()))  # Convert to string
        writer.writerow([q, tokens1, tokens2])

print("Quantile results saved to quantile_results.csv")

Quantile results saved to quantile_results.csv


In [22]:
import pandas as pd  # For table representation

# Compute overlap sizes between corresponding decile subsets
decile_overlap = {}

for q in range(1, 11):  # Since we have deciles (k=10)
    overlap_size = len(quantiles_subcorpus1.get(q, set()) & quantiles_subcorpus2.get(q, set()))
    decile_overlap[q] = overlap_size

# Convert results into a DataFrame for better formatting
overlap_table = pd.DataFrame(list(decile_overlap.items()), columns=["Decile", "Overlap Size"])
print(overlap_table)


   Decile  Overlap Size
0       1             1
1       2             3
2       3             6
3       4            20
4       5            92
5       6           426
6       7          1247
7       8          3091
8       9          8888
9      10         22137


In [23]:
# Merge token frequency counters from both subcorpora
combined_counter = counter1 + counter2

# Sort tokens by frequency
sorted_tokens = sorted(combined_counter.items(), key=lambda x: x[1], reverse=True)

# Calculate cumulative probability for different coverage thresholds
total_tokens = sum(combined_counter.values())
coverage_thresholds = [0.2, 0.5, 0.7, 0.8, 0.9]  # 20%, 50%, 70%, 80%, 90%
word_counts_needed = {}

cumulative_probability = 0
words_seen = 0

for token, freq in sorted_tokens:
    cumulative_probability += freq / total_tokens
    words_seen += 1

    # Check if we reach a coverage threshold
    for threshold in coverage_thresholds:
        if threshold not in word_counts_needed and cumulative_probability >= threshold:
            word_counts_needed[threshold] = words_seen

# Convert results into a table format
coverage_table = pd.DataFrame(list(word_counts_needed.items()), columns=["Coverage Percentage", "Words Needed"])
print(coverage_table)


   Coverage Percentage  Words Needed
0                  0.2             5
1                  0.5           131
2                  0.7          2122
3                  0.8          6439
4                  0.9         22676


In [24]:
# Filter words using is_albanian function
albanian_tokens = [word for word in tokens_cleaned if is_albanian(word)]

print( {len(albanian_tokens)})

{1916075}


In [25]:
import string
from collections import Counter

# Define allowed Albanian letters (lowercase)
albanian_letters = set("abcçdeëfghijklmnopqrstuvxyz")

def is_pure_albanian(word):
    return all(c.lower() in albanian_letters for c in word)

# Read the file
with open("sq-sample.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Preprocess text: lowercase, split, strip punctuation
words = text.lower().split()
words = [word.strip(string.punctuation) for word in words]

# Filter pure Albanian words
pure_albanian_words = [word for word in words if is_pure_albanian(word)]

# Count total and unique words
total_words = len(pure_albanian_words)
unique_words = set(pure_albanian_words)
print(f"Total valid Albanian word tokens: {total_words}")
print(f"Number of unique valid Albanian words: {len(unique_words)}")

# Count word frequencies
word_counts = Counter(pure_albanian_words)
most_common = word_counts.most_common()

# CEFR levels and vocabulary sizes
cefr_levels = {
    "A1": 625,
    "A2": 1250,
    "B1": 2500,
    "B2": 5000
}

# Calculate cumulative coverage for each CEFR level
print("\nCEFR Coverage Estimates:")
print(f"{'Level':<5} {'Words':<6} {'Coverage (%)':<15} {'% of Unique Words Covered'}")
for level, cutoff in cefr_levels.items():
    top_words = [word for word, _ in most_common[:cutoff]]
    coverage = sum(word_counts[word] for word in top_words if word in word_counts)
    coverage_percent = (coverage / total_words) * 100
    unique_covered = len([word for word in unique_words if word in top_words])
    unique_percent = (unique_covered / len(unique_words)) * 100
    print(f"{level:<5} {cutoff:<6} {coverage_percent:<15.2f} {unique_percent:.2f}")


Total valid Albanian word tokens: 1836581
Number of unique valid Albanian words: 130663

CEFR Coverage Estimates:
Level Words  Coverage (%)    % of Unique Words Covered
A1    625    59.79           0.48
A2    1250   65.29           0.96
B1    2500   71.34           1.91
B2    5000   77.84           3.83
