# Unicode Character Extraction and Filtering
This notebook extracts Unicode characters from a file and filters words that contain only Albanian letters.


In [1]:
# Load the file and extract unique Unicode characters
def extract_unicode_chars(filename):
    with open(filename, encoding='utf-8') as file:
        text = file.read()
    return set(text)

# Example usage:
file_path = "sq-sample.txt"
unicode_chars = extract_unicode_chars(file_path)
print("Unique Unicode characters:", unicode_chars)


Unique Unicode characters: {'ශ', '，', '¼', '腹', 'φ', 'Ч', 'δ', 'უ', 'Č', '仁', 'Ş', 'س', 'ة', 'Ħ', 'ч', 't', 'و', 'Е', '日', 'Φ', 'დ', 'ï', 'đ', 'γ', 'ė', 'É', '준', 'ה', 'Ḫ', '¾', 'S', '′', '領', 'h', 'ˤ', '/', 'ὄ', '½', '紅', '松', '>', '円', '<', '擺', 'ṭ', '餅', '越', '邸', 'у', 'ῖ', '慈', '鬼', '"', ',', '‑', '蛍', 'ạ', 'シ', 'Θ', '看', 'з', '取', '密', 'ﻭ', 'Ć', 'ј', 'Δ', 'в', '校', '本', 'е', '⟩', '男', '須', '歌', 'ῦ', 'ド', 'ං', '„', 'ì', 'ñ', 'ი', 'd', 'Ä', 'ﬁ', '्', '屠', '鯱', 'ك', '夫', '¥', 'त', '二', 'И', '先', 'Ἱ', 'ड', 'ש', 'ρ', 'ë', 'მ', '³', '流', 'श', '¤', 'イ', '絵', 'υ', 'ך', 'ゅ', 'ư', 'j', '星', 'а', 'گ', '€', 'I', 'ү', '龙', 'ι', 'ῳ', '恵', '切', 'ợ', ';', 'ר', '훈', '‘', 'マ', 'А', '三', 'ண', '₈', '美', 'Œ', '鯉', '良', 'ז', '\u2009', 'ɛ', 'の', 'ク', '−', '術', '四', 'ட', ')', '代', '公', '—', 'Ё', '天', 'ú', 'ž', 'ლ', 'ェ', 'r', 'ý', '‖', 'н', '浮', '₄', 'ø', '獄', 'ා', 'ầ', 'â', 'ग', 'ص', 'Ε', '\u200e', 'Ј', 'D', 'n', '相', 'ძ', 'H', 'М', '議', 'Ø', 'w', 'ර', 'N', 'ṃ', '鞭', '«', 'ḏ', 'ύ', '$', 'О', 'ж', '\u200b

In [2]:
# Define the allowed Albanian letters (both uppercase and lowercase)
allowed_chars = set("abcçdeëfghijklmnopqrstuvxyzABCÇDEËFGHIJKLMNOPQRSTUVXYZ")

def is_albanian(word):
    return all(char in allowed_chars for char in word)




In [3]:
# Define the allowed Albanian letters (both uppercase and lowercase)
allowed_chars = set("abcçdeëfghijklmnopqrstuvxyzABCÇDEËFGHIJKLMNOPQRSTUVXYZ")

def is_albanian(word):
    return all(char in allowed_chars for char in word)

    # Example usage:
test_words = ["shtëpi", "hello", "tungjatjeta", "12345","şörle"]
filtered_words = [word for word in test_words if is_albanian(word)]
print("Filtered Albanian words:", filtered_words)




Filtered Albanian words: ['shtëpi', 'hello', 'tungjatjeta']


In [4]:
import random
import re

# Load the file
def load_text(filename):
    with open(filename, encoding='utf-8') as file:
        text = file.read()
    return text

file_path = "sq-sample.txt"
text = load_text(file_path)

In [5]:
# Split text into sentences
sentences = re.split(r'(?<=[.!?])\s+', text)

# Shuffle the sentences randomly
random.shuffle(sentences)

# Reconstruct scrambled text
scrambled_text = ' '.join(sentences)

In [6]:
# Tokenization: split at whitespaces and convert to lowercase
tokens = re.split(r'\s+', scrambled_text.lower())

In [7]:
# Remove punctuation and digits
tokens_cleaned = [re.sub(r'[^a-zA-ZçÇëË]', '', token) for token in tokens if token.strip()]

In [8]:
# Filter words using is_albanian function
albanian_tokens = [word for word in tokens_cleaned if is_albanian(word)]

print(f"Total Albanian tokens: {len(albanian_tokens)}")

Total Albanian tokens: 1916075


In [9]:
# Save tokens to a text file
with open("albanian_tokens.txt", "w", encoding="utf-8") as file:
    for token in albanian_tokens:
        file.write(token + "\n")

print("Tokens saved to albanian_tokens.txt")


Tokens saved to albanian_tokens.txt


In [10]:
import csv

# Save tokens to a CSV file
with open("albanian_tokens.csv", "w", encoding="utf-8", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Token"])  # Add a header
    for token in albanian_tokens:
        writer.writerow([token])

print("Tokens saved to albanian_tokens.csv")


Tokens saved to albanian_tokens.csv


In [11]:
from collections import Counter

# Split into two roughly equal parts
midpoint = len(albanian_tokens) // 2
subcorpus1 = albanian_tokens[:midpoint]
subcorpus2 = albanian_tokens[midpoint:]

In [12]:
# Create frequency counters
counter1 = Counter(subcorpus1)
counter2 = Counter(subcorpus2)

In [13]:
# Print top-50 most common words in each subcorpus

print("Top-50 most common words in Subcorpus 1:")
print(counter1.most_common(50))

print("\nTop-50 most common words in Subcorpus 2:")
print(counter2.most_common(50))


Top-50 most common words in Subcorpus 1:
[('të', 58976), ('e', 49124), ('në', 40961), ('', 27064), ('dhe', 26457), ('i', 22153), ('një', 17008), ('me', 15081), ('për', 12852), ('nga', 12418), ('që', 9387), ('është', 9243), ('më', 8970), ('u', 8055), ('si', 6646), ('ka', 6045), ('së', 5479), ('tij', 4756), ('se', 4655), ('ai', 4234), ('ishte', 3887), ('edhe', 3761), ('te', 3712), ('nuk', 3463), ('duke', 3401), ('janë', 3251), ('shumë', 2874), ('vitin', 2789), ('prej', 2591), ('pas', 2497), ('por', 2258), ('do', 2243), ('ne', 2232), ('ajo', 2130), ('tyre', 2125), ('saj', 2114), ('mund', 2114), ('parë', 1924), ('gjatë', 1891), ('ose', 1782), ('këtë', 1773), ('disa', 1755), ('dy', 1751), ('deri', 1652), ('kjo', 1651), ('ku', 1609), ('kishte', 1569), ('kur', 1538), ('kanë', 1455), ('atë', 1322)]

Top-50 most common words in Subcorpus 2:
[('të', 58738), ('e', 49157), ('në', 40554), ('', 26937), ('dhe', 26007), ('i', 22283), ('një', 16831), ('me', 15134), ('për', 12887), ('nga', 12621), ('që'

In [14]:
# Print top-50 most common words in each subcorpus one by one
print("Top-50 most common words in Subcorpus 1:")
print(counter1.most_common(50))


Top-50 most common words in Subcorpus 1:
[('të', 58976), ('e', 49124), ('në', 40961), ('', 27064), ('dhe', 26457), ('i', 22153), ('një', 17008), ('me', 15081), ('për', 12852), ('nga', 12418), ('që', 9387), ('është', 9243), ('më', 8970), ('u', 8055), ('si', 6646), ('ka', 6045), ('së', 5479), ('tij', 4756), ('se', 4655), ('ai', 4234), ('ishte', 3887), ('edhe', 3761), ('te', 3712), ('nuk', 3463), ('duke', 3401), ('janë', 3251), ('shumë', 2874), ('vitin', 2789), ('prej', 2591), ('pas', 2497), ('por', 2258), ('do', 2243), ('ne', 2232), ('ajo', 2130), ('tyre', 2125), ('saj', 2114), ('mund', 2114), ('parë', 1924), ('gjatë', 1891), ('ose', 1782), ('këtë', 1773), ('disa', 1755), ('dy', 1751), ('deri', 1652), ('kjo', 1651), ('ku', 1609), ('kishte', 1569), ('kur', 1538), ('kanë', 1455), ('atë', 1322)]


In [15]:
print("\nTop-50 most common words in Subcorpus 2:")
print(counter2.most_common(50))


Top-50 most common words in Subcorpus 2:
[('të', 58738), ('e', 49157), ('në', 40554), ('', 26937), ('dhe', 26007), ('i', 22283), ('një', 16831), ('me', 15134), ('për', 12887), ('nga', 12621), ('që', 9328), ('është', 9153), ('më', 8799), ('u', 8162), ('si', 6589), ('ka', 6205), ('së', 5582), ('tij', 4902), ('se', 4804), ('ai', 4371), ('te', 3999), ('ishte', 3934), ('edhe', 3848), ('nuk', 3471), ('duke', 3424), ('janë', 3368), ('shumë', 2974), ('vitin', 2817), ('prej', 2588), ('pas', 2437), ('por', 2366), ('ne', 2327), ('do', 2255), ('mund', 2140), ('tyre', 2140), ('saj', 2075), ('gjatë', 2007), ('parë', 1986), ('ajo', 1983), ('këtë', 1813), ('disa', 1793), ('dy', 1762), ('ose', 1752), ('kjo', 1623), ('deri', 1610), ('ku', 1595), ('kanë', 1521), ('kishte', 1518), ('kur', 1480), ('atë', 1352)]


In [16]:
# Words unique to each subcorpus
unique_to_1 = set(counter1.keys()) - set(counter2.keys())
unique_to_2 = set(counter2.keys()) - set(counter1.keys())

print(f"\nWords unique to Subcorpus 1: {unique_to_1}")



Words unique to Subcorpus 1: {'percaktimin', 'klimatografia', 'rrëfejnë', 'dedushajve', 'prediqi', 'veshtira', 'garrigue', 'mclaren', 'demaliaj', 'tarantinos', 'shitnin', 'procesesit', 'groggy', 'lohan', 'cikopulla', 'insistuar', 'panathinaikos', 'arcooh', 'polonizatimi', 'gërmuese', 'kërs', 'parvata', 'xheva', 'shkencoreteknologjike', 'kinematorafik', 'herderi', 'fierak', 'imponimit', 'soyons', 'stratigrafi', 'antidepresivët', 'draganit', 'sepredoni', 'ryshfeti', 'girona', 'germëvogël', 'postrribës', 'revestsaintmartin', 'gkova', 'arsimorë', 'pamps', 'jerusalemin', 'cëntrali', 'nasima', 'breslin', 'amjad', 'gypash', 'relativist', 'arvanishte', 'autorizoheshin', 'pequonnock', 'brae', 'bahias', 'epahto', 'filluarnga', 'vizitimit', 'gene', 'tarsierët', 'ngjyrosej', 'unicornis', 'ipje', 'patër', 'hebraicae', 'fondib', 'uluni', 'revizionistë', 'prodani', 'mynyrë', 'njëmbëdhjetëshin', 'parasyssh', 'rezistuese', 'teg', 'leshuar', 'koty', 'volkskammer', 'llagapit', 'daler', 'uzurpatori', 'st

In [17]:
# Words unique to each subcorpus
unique_to_1 = set(counter1.keys()) - set(counter2.keys())
unique_to_2 = set(counter2.keys()) - set(counter1.keys())

print(f"\nWords unique to Subcorpus 1: {unique_to_1}")



Words unique to Subcorpus 1: {'percaktimin', 'klimatografia', 'rrëfejnë', 'dedushajve', 'prediqi', 'veshtira', 'garrigue', 'mclaren', 'demaliaj', 'tarantinos', 'shitnin', 'procesesit', 'groggy', 'lohan', 'cikopulla', 'insistuar', 'panathinaikos', 'arcooh', 'polonizatimi', 'gërmuese', 'kërs', 'parvata', 'xheva', 'shkencoreteknologjike', 'kinematorafik', 'herderi', 'fierak', 'imponimit', 'soyons', 'stratigrafi', 'antidepresivët', 'draganit', 'sepredoni', 'ryshfeti', 'girona', 'germëvogël', 'postrribës', 'revestsaintmartin', 'gkova', 'arsimorë', 'pamps', 'jerusalemin', 'cëntrali', 'nasima', 'breslin', 'amjad', 'gypash', 'relativist', 'arvanishte', 'autorizoheshin', 'pequonnock', 'brae', 'bahias', 'epahto', 'filluarnga', 'vizitimit', 'gene', 'tarsierët', 'ngjyrosej', 'unicornis', 'ipje', 'patër', 'hebraicae', 'fondib', 'uluni', 'revizionistë', 'prodani', 'mynyrë', 'njëmbëdhjetëshin', 'parasyssh', 'rezistuese', 'teg', 'leshuar', 'koty', 'volkskammer', 'llagapit', 'daler', 'uzurpatori', 'st

In [18]:
print(f"Words unique to Subcorpus 2: {unique_to_2}")

Words unique to Subcorpus 2: {'redeveloped', 'jinchuriki', 'sulley', 'thermia', 'gjermanoaziatik', 'nairobi', 'kzn', 'trasmetuar', 'fotogrammetrisë', 'cmpie', 'fjalëpërfjaltë', 'tredhë', 'angjelokastrës', 'umuti', 'challandes', 'shpërdorimet', 'siguroheshin', 'lpuani', 'cettie', 'sistemikë', 'pashtershme', 'onkologjisë', 'shtytur', 'ricciardos', 'bashkëqytetari', 'matesich', 'qendrim', 'vidoe', 'athanasi', 'ishinsanya', 'istikam', 'oragnizata', 'çlirojnë', 'sikuliotëve', 'reperuarit', 'llagapet', 'rëpjetë', 'elnaimi', 'altiero', 'maoiste', 'rruzulin', 'prenden', 'vrapoj', 'kosovësnë', 'autograf', 'kalibe', 'ibbyt', 'surry', 'ulysess', 'rilindore', 'plenumit', 'gjuajte', 'balczar', 'riqarkullimit', 'tarsi', 'brengoset', 'napolitane', 'kompasi', 'kobalti', 'fnopcsa', 'radht', 'svirrca', 'bis', 'katoliket', 'shkëlqesisë', 'intrigon', 'myqerrem', 'zëvendësimit', 'arrti', 'mbullur', 'civilëndërsa', 'hepaticus', 'nënvizim', 'soufriere', 'teriparatide', 'ciftezuar', 'rivlerësimin', 'etimologj

In [19]:
import csv

# Get the maximum length to ensure both columns align
max_length = max(len(unique_to_1), len(unique_to_2))

# Convert sets to lists and pad shorter list with empty strings
unique_to_1_list = list(unique_to_1) + [""] * (max_length - len(unique_to_1))
unique_to_2_list = list(unique_to_2) + [""] * (max_length - len(unique_to_2))

# Save both lists into a single CSV file
with open("unique_words_subcorpora.csv", "w", encoding="utf-8", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Unique Words in Subcorpus 1", "Unique Words in Subcorpus 2"])  # Header
    for word1, word2 in zip(unique_to_1_list, unique_to_2_list):
        writer.writerow([word1, word2])

print("Unique words from both subcorpora saved to unique_words_subcorpora.csv")

Unique words from both subcorpora saved to unique_words_subcorpora.csv


In [20]:
from collections import defaultdict

def split_into_quantiles(counter, k=10):
    """
    Splits tokens in a frequency counter into quantiles based on cumulative probability.

    Args:
        counter (Counter): A Counter object containing token frequencies.
        k (int): The number of quantiles (e.g., k=10 for deciles, k=4 for quartiles).

    Returns:
        defaultdict: A dictionary mapping quantile numbers (1, 2, ..., k) to sets of tokens.
    """
    # Sort tokens by frequency in descending order
    sorted_tokens = sorted(counter.items(), key=lambda x: x[1], reverse=True)

    # Calculate total number of tokens to determine probabilities
    total_count = sum(counter.values())

    # Initialize cumulative probability and quantile mapping
    cumulative_probability = 0
    quantiles = defaultdict(set)

    # Iterate through sorted tokens, tracking cumulative probability
    for token, freq in sorted_tokens:
        cumulative_probability += freq / total_count  # Update cumulative probability
        
        # Determine which quantile the token belongs to
        quantile = min(k, int(cumulative_probability * k) + 1)
        quantiles[quantile].add(token)

    return quantiles

In [21]:
quantiles_subcorpus1 = split_into_quantiles(counter1, k=10)
quantiles_subcorpus2 = split_into_quantiles(counter2, k=10)

# Print the quantiles
print("Quantiles for Subcorpus 1:")
for quantile, tokens in quantiles_subcorpus1.items():
    print(f"Quantile {quantile}: {tokens}")

print("\nQuantiles for Subcorpus 2:")
for quantile, tokens in quantiles_subcorpus2.items():
    print(f"Quantile {quantile}: {tokens}")
,

Quantiles for Subcorpus 1:
Quantile 1: {'të'}
Quantile 2: {'', 'e', 'në'}
Quantile 3: {'për', 'me', 'i', 'nga', 'një', 'dhe'}
Quantile 4: {'së', 'janë', 'edhe', 'më', 'duke', 'ishte', 'tij', 'ka', 'si', 'është', 'se', 'që', 'shumë', 'u', 'prej', 'ai', 'nuk', 'vitin', 'te', 'pas'}
Quantile 5: {'tek', 'gjatë', 'duhet', 'bë', 'cili', 'jo', 'ndaj', 'këtë', 'mënyrë', 'çdo', 'pasur', 'rreth', 'ndryshme', 'nje', 'cila', 'kjo', 'ajo', 'mori', 'pasi', 'fund', 'kundër', 'mund', 'vitit', 'popullsi', 'kohë', 'pjesë', 'dytë', 'do', 'dy', 'tyre', 'ta', 'kanë', 'cilat', 'para', 'qe', 'vitet', 'ose', 'tre', 'sa', 'pa', 'atë', 'përbëhet', 'kishte', 'ku', 'bërë', 'kur', 'vend', 'fundit', 'kështu', 'vetëm', 'madh', 'tjetër', 'sipas', 'kishin', 'viteve', 'nëse', 'madhe', 'lartë', 'sipërfaqe', 'parë', 'apo', 'ishin', 'ky', 'ato', 'saj', 'filloi', 'këto', 'herë', 'ne', 'ata', 'komuna', 'njohur', 'deri', 'midis', 'vonë', 'viti', 'kësaj', 'por', 'jetë', 'banorë', 'nën', 'ndërsa', 'mirë', 'emrin', 'marrë', 'di

''

In [None]:
import csv

# Save quantile results to a CSV file
with open("quantile_results.csv", "w", encoding="utf-8", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Quantile", "Subcorpus 1 Tokens", "Subcorpus 2 Tokens"])  # Header

    # Get maximum number of tokens per quantile to align columns
    max_quantiles = max(len(quantiles_subcorpus1), len(quantiles_subcorpus2))
    
    for q in range(1, max_quantiles + 1):
        tokens1 = ", ".join(quantiles_subcorpus1.get(q, set()))  # Convert to string
        tokens2 = ", ".join(quantiles_subcorpus2.get(q, set()))  # Convert to string
        writer.writerow([q, tokens1, tokens2])

print("Quantile results saved to quantile_results.csv")

In [None]:
import matplotlib.pyplot as plt

# Compute number of tokens in each quantile
quantile_counts1 = {q: len(tokens) for q, tokens in quantiles_subcorpus1.items()}
quantile_counts2 = {q: len(tokens) for q, tokens in quantiles_subcorpus2.items()}

# Sort quantiles for consistency
quantile_labels = sorted(set(quantile_counts1.keys()).union(set(quantile_counts2.keys())))

# Get frequencies in order
values1 = [quantile_counts1.get(q, 0) for q in quantile_labels]
values2 = [quantile_counts2.get(q, 0) for q in quantile_labels]

# Plot bar chart
plt.figure(figsize=(10, 5))
plt.bar(quantile_labels, values1, width=0.4, label="Subcorpus 1", alpha=0.7)
plt.bar([q + 0.4 for q in quantile_labels], values2, width=0.4, label="Subcorpus 2", alpha=0.7)

plt.xlabel("Quantile")
plt.ylabel("Number of Tokens")
plt.title("Token Distribution Across Quantiles")
plt.legend()
plt.xticks(quantile_labels)
plt.grid(axis="y", linestyle="--", alpha=0.5)
plt.show()
