# Unicode Character Extraction and Filtering
This notebook extracts Unicode characters from a file and filters words that contain only Albanian letters.


In [1]:
# Load the file and extract unique Unicode characters
def extract_unicode_chars(filename):
    with open(filename, encoding='utf-8') as file:
        text = file.read()
    return set(text)

# Example usage:
file_path = "sq-sample.txt"
unicode_chars = extract_unicode_chars(file_path)
print("Unique Unicode characters:", unicode_chars)


Unique Unicode characters: {'ِ', 'Π', '揚', '遊', 'К', 'T', '里', 'д', '婚', '8', '9', '鯉', 'ூ', '段', 'ê', '₈', 'ス', 'ソ', 'Ş', 'ł', 'म', 'ἀ', 'ε', '越', 'り', 'î', 'ι', 'з', 'じ', '者', 'у', 'v', 'ح', '«', '領', 'ת', 'л', 'ल', '敢', '山', '迪', 'ž', 'h', 'õ', 'ὴ', 'र', 'ˇ', '無', 'リ', '木', '上', 'ϝ', 'ợ', 'უ', 'வ', '!', 'כ', '郑', '省', 'µ', 'ø', 'ҡ', '한', '界', 'ĭ', '士', 'ấ', ';', 'ר', '井', 'μ', 'の', '⁄', 'p', '¤', '’', 'б', 'ב', 'し', 'й', '赤', '大', 'ί', 'Š', '(', 'ん', 'ď', 'É', '雷', '郎', 'A', 'i', '훈', 'ţ', 'Т', 'प', '雀', 'ギ', 'す', 'ҫ', 'Ć', '星', '石', 'ė', 'م', 'щ', '号', '‑', '板', 'ַ', 'с', '-', '邸', 'ƒ', 'ء', '砕', '座', '語', '김', 'χ', '&', 'ī', 'ヘ', 'ج', 'ˤ', '舞', 'ἠ', 'Ή', 'l', '>', 'в', 'г', 'Ś', 'Z', '議', '式', '羽', '̠', 'Α', '珍', '雨', 'ˁ', 'ך', 'ῦ', '冷', '™', 'א', "'", 'ஆ', 'ô', '№', 'ல', '安', '伎', 'サ', 'н', 'Đ', '´', 'ı', 'ō', 'Р', 'ع', 'Ἱ', '心', '校', '健', 'È', '写', 'Ö', 'М', 'L', 'フ', '眠', '鞭', 'В', ':', 'ä', 'Ω', '奈', 'ボ', '四', 'n', '禅', 'ダ', '代', '‖', 'э', 'х', '⟨', '門', 'न', 'ق', 'შ', '公', '堂

In [2]:
# Define the allowed Albanian letters (both uppercase and lowercase)
allowed_chars = set("abcçdeëfghijklmnopqrstuvxyzABCÇDEËFGHIJKLMNOPQRSTUVXYZ")

def is_albanian(word):
    return all(char in allowed_chars for char in word)




In [3]:
# Define the allowed Albanian letters (both uppercase and lowercase)
allowed_chars = set("abcçdeëfghijklmnopqrstuvxyzABCÇDEËFGHIJKLMNOPQRSTUVXYZ")

def is_albanian(word):
    return all(char in allowed_chars for char in word)

    # Example usage:
test_words = ["shtëpi", "hello", "tungjatjeta", "12345","şörle"]
filtered_words = [word for word in test_words if is_albanian(word)]
print("Filtered Albanian words:", filtered_words)




Filtered Albanian words: ['shtëpi', 'hello', 'tungjatjeta']


In [4]:
import random
import re

# Load the file
def load_text(filename):
    with open(filename, encoding='utf-8') as file:
        text = file.read()
    return text

file_path = "sq-sample.txt"
text = load_text(file_path)

In [5]:
# Split text into sentences
sentences = re.split(r'(?<=[.!?])\s+', text)

# Shuffle the sentences randomly
random.shuffle(sentences)

# Reconstruct scrambled text
scrambled_text = ' '.join(sentences)

In [6]:
# Tokenization: split at whitespaces and convert to lowercase
tokens = re.split(r'\s+', scrambled_text.lower())

In [7]:
# Remove punctuation and digits
tokens_cleaned = [re.sub(r'[^a-zA-ZçÇëË]', '', token) for token in tokens if token.strip()]

In [8]:
# Filter words using is_albanian function
albanian_tokens = [word for word in tokens_cleaned if is_albanian(word)]

print(f"Total Albanian tokens: {len(albanian_tokens)}")

Total Albanian tokens: 1916075


In [9]:
# Save tokens to a text file
with open("albanian_tokens.txt", "w", encoding="utf-8") as file:
    for token in albanian_tokens:
        file.write(token + "\n")

print("Tokens saved to albanian_tokens.txt")


Tokens saved to albanian_tokens.txt


In [10]:
import csv

# Save tokens to a CSV file
with open("albanian_tokens.csv", "w", encoding="utf-8", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Token"])  # Add a header
    for token in albanian_tokens:
        writer.writerow([token])

print("Tokens saved to albanian_tokens.csv")


Tokens saved to albanian_tokens.csv


In [11]:
from collections import Counter

# Split into two roughly equal parts
midpoint = len(albanian_tokens) // 2
subcorpus1 = albanian_tokens[:midpoint]
subcorpus2 = albanian_tokens[midpoint:]

In [12]:
# Create frequency counters
counter1 = Counter(subcorpus1)
counter2 = Counter(subcorpus2)

In [13]:
# Print top-50 most common words in each subcorpus

print("Top-50 most common words in Subcorpus 1:")
print(counter1.most_common(50))

print("\nTop-50 most common words in Subcorpus 2:")
print(counter2.most_common(50))


Top-50 most common words in Subcorpus 1:
[('të', 58746), ('e', 49357), ('në', 40946), ('', 26935), ('dhe', 26220), ('i', 22106), ('një', 17044), ('me', 15264), ('për', 12804), ('nga', 12499), ('që', 9212), ('është', 9126), ('më', 8865), ('u', 8084), ('si', 6579), ('ka', 6143), ('së', 5592), ('tij', 4892), ('se', 4717), ('ai', 4288), ('ishte', 3941), ('te', 3831), ('edhe', 3805), ('nuk', 3483), ('duke', 3466), ('janë', 3290), ('shumë', 2963), ('vitin', 2778), ('prej', 2577), ('pas', 2450), ('ne', 2349), ('por', 2266), ('do', 2211), ('mund', 2164), ('saj', 2122), ('tyre', 2108), ('ajo', 2051), ('parë', 1975), ('gjatë', 1958), ('këtë', 1795), ('disa', 1788), ('ose', 1770), ('dy', 1750), ('kjo', 1686), ('deri', 1631), ('ku', 1619), ('kishte', 1584), ('kanë', 1501), ('kur', 1490), ('atë', 1338)]

Top-50 most common words in Subcorpus 2:
[('të', 58968), ('e', 48924), ('në', 40569), ('', 27066), ('dhe', 26244), ('i', 22330), ('një', 16795), ('me', 14951), ('për', 12935), ('nga', 12540), ('që'

In [14]:
# Print top-50 most common words in each subcorpus one by one
print("Top-50 most common words in Subcorpus 1:")
print(counter1.most_common(50))


Top-50 most common words in Subcorpus 1:
[('të', 58746), ('e', 49357), ('në', 40946), ('', 26935), ('dhe', 26220), ('i', 22106), ('një', 17044), ('me', 15264), ('për', 12804), ('nga', 12499), ('që', 9212), ('është', 9126), ('më', 8865), ('u', 8084), ('si', 6579), ('ka', 6143), ('së', 5592), ('tij', 4892), ('se', 4717), ('ai', 4288), ('ishte', 3941), ('te', 3831), ('edhe', 3805), ('nuk', 3483), ('duke', 3466), ('janë', 3290), ('shumë', 2963), ('vitin', 2778), ('prej', 2577), ('pas', 2450), ('ne', 2349), ('por', 2266), ('do', 2211), ('mund', 2164), ('saj', 2122), ('tyre', 2108), ('ajo', 2051), ('parë', 1975), ('gjatë', 1958), ('këtë', 1795), ('disa', 1788), ('ose', 1770), ('dy', 1750), ('kjo', 1686), ('deri', 1631), ('ku', 1619), ('kishte', 1584), ('kanë', 1501), ('kur', 1490), ('atë', 1338)]


In [15]:
print("\nTop-50 most common words in Subcorpus 2:")
print(counter2.most_common(50))


Top-50 most common words in Subcorpus 2:
[('të', 58968), ('e', 48924), ('në', 40569), ('', 27066), ('dhe', 26244), ('i', 22330), ('një', 16795), ('me', 14951), ('për', 12935), ('nga', 12540), ('që', 9503), ('është', 9270), ('më', 8904), ('u', 8133), ('si', 6656), ('ka', 6107), ('së', 5469), ('tij', 4766), ('se', 4742), ('ai', 4317), ('te', 3880), ('ishte', 3880), ('edhe', 3804), ('nuk', 3451), ('duke', 3359), ('janë', 3329), ('shumë', 2885), ('vitin', 2828), ('prej', 2602), ('pas', 2484), ('por', 2358), ('do', 2287), ('ne', 2210), ('tyre', 2157), ('mund', 2090), ('saj', 2067), ('ajo', 2062), ('gjatë', 1940), ('parë', 1935), ('këtë', 1791), ('ose', 1764), ('dy', 1763), ('disa', 1760), ('deri', 1631), ('kjo', 1588), ('ku', 1585), ('kur', 1528), ('kishte', 1503), ('kanë', 1475), ('atë', 1336)]


In [16]:
# Words unique to each subcorpus
unique_to_1 = set(counter1.keys()) - set(counter2.keys())
unique_to_2 = set(counter2.keys()) - set(counter1.keys())

print(f"\nWords unique to Subcorpus 1: {unique_to_1}")



Words unique to Subcorpus 1: {'panterave', 'milloshit', 'teknollogjike', 'maurepas', 'annën', 'gratia', 'përdridhen', 'sockhenheim', 'vikefortin', 'qliruara', 'vegim', 'pasqyronte', 'garantuesi', 'pickër', 'tidusin', 'nëntëmbëdhjetin', 'prosimianë', 'franksët', 'engine', 'kahrom', 'kundërvihej', 'mbrojtjesh', 'bouglon', 'çiftimin', 'gura', 'detailer', 'thks', 'bodrume', 'nivellartesi', 'dig', 'langsamen', 'mojo', 'hilafetin', 'skenarësh', 'musikhochschule', 'pastërtisë', 'ambjentim', 'bojan', 'vinçat', 'ehailës', 'sistemimit', 'abdulvehabi', 'zakonshmet', 'anomalike', 'remarque', 'vrinë', 'suga', 'telajo', 'ishrit', 'duodeni', 'sikistët', 'rrezikosh', 'kufo', 'kallabllak', 'ngurtësi', 'kalimtaret', 'mekanine', 'doktorinës', 'qëdy', 'qyre', 'masses', 'myzaferi', 'producuar', 'mesimore', 'riarmatosur', 'tijnuk', 'emaili', 'varroset', 'lyliinen', 'mirësevini', 'meziu', 'shtatëmbdhjetë', 'repishti', 'grosco', 'gardat', 'lundrime', 'filizin', 'dridhjëtvjeçare', 'shkencorprofesor', 'anglish

In [None]:
# Words unique to each subcorpus
unique_to_1 = set(counter1.keys()) - set(counter2.keys())
unique_to_2 = set(counter2.keys()) - set(counter1.keys())

print(f"\nWords unique to Subcorpus 1: {unique_to_1}")


In [17]:
import csv

# Get the maximum length to ensure both columns align
max_length = max(len(unique_to_1), len(unique_to_2))

# Convert sets to lists and pad shorter list with empty strings
unique_to_1_list = list(unique_to_1) + [""] * (max_length - len(unique_to_1))
unique_to_2_list = list(unique_to_2) + [""] * (max_length - len(unique_to_2))

# Save both lists into a single CSV file
with open("unique_words_subcorpora.csv", "w", encoding="utf-8", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Unique Words in Subcorpus 1", "Unique Words in Subcorpus 2"])  # Header
    for word1, word2 in zip(unique_to_1_list, unique_to_2_list):
        writer.writerow([word1, word2])

print("Unique words from both subcorpora saved to unique_words_subcorpora.csv")

Unique words from both subcorpora saved to unique_words_subcorpora.csv


In [18]:
from collections import defaultdict

def split_into_quantiles(counter, k=10):
    """
    Splits tokens in a frequency counter into quantiles based on cumulative probability.

    Args:
        counter (Counter): A Counter object containing token frequencies.
        k (int): The number of quantiles (e.g., k=10 for deciles, k=4 for quartiles).

    Returns:
        defaultdict: A dictionary mapping quantile numbers (1, 2, ..., k) to sets of tokens.
    """
    # Sort tokens by frequency in descending order
    sorted_tokens = sorted(counter.items(), key=lambda x: x[1], reverse=True)

    # Calculate total number of tokens to determine probabilities
    total_count = sum(counter.values())

    # Initialize cumulative probability and quantile mapping
    cumulative_probability = 0
    quantiles = defaultdict(set)

    # Iterate through sorted tokens, tracking cumulative probability
    for token, freq in sorted_tokens:
        cumulative_probability += freq / total_count  # Update cumulative probability
        
        # Determine which quantile the token belongs to
        quantile = min(k, int(cumulative_probability * k) + 1)
        quantiles[quantile].add(token)

    return quantiles

In [19]:
import csv

# Save quantile results to a CSV file
with open("quantile_results.csv", "w", encoding="utf-8", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Quantile", "Subcorpus 1 Tokens", "Subcorpus 2 Tokens"])  # Header

    # Get maximum number of tokens per quantile to align columns
    max_quantiles = max(len(quantiles_subcorpus1), len(quantiles_subcorpus2))
    
    for q in range(1, max_quantiles + 1):
        tokens1 = ", ".join(quantiles_subcorpus1.get(q, set()))  # Convert to string
        tokens2 = ", ".join(quantiles_subcorpus2.get(q, set()))  # Convert to string
        writer.writerow([q, tokens1, tokens2])

print("Quantile results saved to quantile_results.csv")

NameError: name 'quantiles_subcorpus1' is not defined

In [None]:
import pandas as pd  # For table representation

# Compute overlap sizes between corresponding decile subsets
decile_overlap = {}

for q in range(1, 11):  # Since we have deciles (k=10)
    overlap_size = len(quantiles_subcorpus1.get(q, set()) & quantiles_subcorpus2.get(q, set()))
    decile_overlap[q] = overlap_size

# Convert results into a DataFrame for better formatting
overlap_table = pd.DataFrame(list(decile_overlap.items()), columns=["Decile", "Overlap Size"])
print(overlap_table)


In [None]:
# Merge token frequency counters from both subcorpora
combined_counter = counter1 + counter2

# Sort tokens by frequency
sorted_tokens = sorted(combined_counter.items(), key=lambda x: x[1], reverse=True)

# Calculate cumulative probability for different coverage thresholds
total_tokens = sum(combined_counter.values())
coverage_thresholds = [0.2, 0.5, 0.7, 0.8, 0.9]  # 20%, 50%, 70%, 80%, 90%
word_counts_needed = {}

cumulative_probability = 0
words_seen = 0

for token, freq in sorted_tokens:
    cumulative_probability += freq / total_tokens
    words_seen += 1

    # Check if we reach a coverage threshold
    for threshold in coverage_thresholds:
        if threshold not in word_counts_needed and cumulative_probability >= threshold:
            word_counts_needed[threshold] = words_seen

# Convert results into a table format
coverage_table = pd.DataFrame(list(word_counts_needed.items()), columns=["Coverage Percentage", "Words Needed"])
print(coverage_table)


In [None]:
# Filter words using is_albanian function
albanian_tokens = [word for word in tokens_cleaned if is_albanian(word)]

print( {len(albanian_tokens)})

In [None]:
import string
from collections import Counter

# Define allowed Albanian letters (lowercase)
albanian_letters = set("abcçdeëfghijklmnopqrstuvxyz")

def is_pure_albanian(word):
    return all(c.lower() in albanian_letters for c in word)

# Read the file
with open("sq-sample.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Preprocess text: lowercase, split, strip punctuation
words = text.lower().split()
words = [word.strip(string.punctuation) for word in words]

# Filter pure Albanian words
pure_albanian_words = [word for word in words if is_pure_albanian(word)]

# Count total and unique words
total_words = len(pure_albanian_words)
unique_words = set(pure_albanian_words)
print(f"Total valid Albanian word tokens: {total_words}")
print(f"Number of unique valid Albanian words: {len(unique_words)}")

# Count word frequencies
word_counts = Counter(pure_albanian_words)
most_common = word_counts.most_common()

# CEFR levels and vocabulary sizes
cefr_levels = {
    "A1": 625,
    "A2": 1250,
    "B1": 2500,
    "B2": 5000
}

# Calculate cumulative coverage for each CEFR level
print("\nCEFR Coverage Estimates:")
print(f"{'Level':<5} {'Words':<6} {'Coverage (%)':<15} {'% of Unique Words Covered'}")
for level, cutoff in cefr_levels.items():
    top_words = [word for word, _ in most_common[:cutoff]]
    coverage = sum(word_counts[word] for word in top_words if word in word_counts)
    coverage_percent = (coverage / total_words) * 100
    unique_covered = len([word for word in unique_words if word in top_words])
    unique_percent = (unique_covered / len(unique_words)) * 100
    print(f"{level:<5} {cutoff:<6} {coverage_percent:<15.2f} {unique_percent:.2f}")
