# Unicode Character Extraction and Filtering
This notebook extracts Unicode characters from a file and filters words that contain only Albanian letters.


In [1]:
# Load the file and extract unique Unicode characters
def extract_unicode_chars(filename):
    with open(filename, encoding='utf-8') as file:
        text = file.read()
    return set(text)

# Example usage:
file_path = "sq-sample.txt"
unicode_chars = extract_unicode_chars(file_path)
print("Unique Unicode characters:", unicode_chars)


Unique Unicode characters: {'ע', '政', 'ἠ', '話', '南', '吳', 'К', 'ण', '領', '⃗', '—', '藤', '古', 'ま', '盟', '座', '京', 'k', 'َ', '₈', '今', 'ď', '’', 'ü', 'ε', 'し', '-', 'ί', '舞', 'ב', 'х', 'ニ', '居', 'ъ', 'ő', '秀', 'ż', 'ˤ', 'Ṣ', 'ι', '°', '恵', 'Ј', 'ċ', '9', 'δ', '.', '¬', 'ḍ', 'S', '글', '《', '蓮', 'ה', 'ჩ', 'X', 'ლ', 'ῦ', '0', '丸', 'ἴ', '‚', 'U', '官', 'ـ', 'フ', 'ˁ', '一', '′', 'ž', 'ギ', '栗', '宫', 'ò', 'ي', 'த', 'Ś', 'न', '（', '\n', '‐', 'ḫ', '講', 'Í', '¥', 'Ō', '佛', '雨', 'ή', 'С', 'E', 'L', '∟', 'ﻳ', '須', 'स', '野', 'რ', '門', 'з', 'Ä', '伎', 'い', 'ė', 'æ', '猛', '\u2009', '्', 'Č', 'カ', '議', 'ର', 'ו', 'A', '8', 'j', 'ﻭ', 'ो', '&', '川', '越', 'д', 'Χ', 'а', '鯉', 'ش', 'ク', '赋', '結', '山', 'q', 'ِ', 'נ', '》', 'ś', '肇', 'Г', 'ბ', 'ŷ', 'უ', '美', '眼', '為', '人', '宮', '婚', 'G', '省', '地', '狂', 'グ', 'त', 'c', 'ب', 'ვ', 'ă', 'ソ', '会', 'ग', '≥', 'び', '₆', '正', 'მ', 'V', 'î', 'र', 'D', 'ァ', 'ャ', '²', 'o', 'a', 'ƒ', 'µ', 'ё', '%', '˚', '‰', ' ', '́', '浮', 'Ή', 'Ć', 'ி', '堂', 'Ἱ', '界', 'ř', 'э', 'テ', 'μ', '×', '

In [2]:
# Define the allowed Albanian letters (both uppercase and lowercase)
allowed_chars = set("abcçdeëfghijklmnopqrstuvxyzABCÇDEËFGHIJKLMNOPQRSTUVXYZ")

def is_albanian(word):
    return all(char in allowed_chars for char in word)




In [3]:
# Define the allowed Albanian letters (both uppercase and lowercase)
allowed_chars = set("abcçdeëfghijklmnopqrstuvxyzABCÇDEËFGHIJKLMNOPQRSTUVXYZ")

def is_albanian(word):
    return all(char in allowed_chars for char in word)

    # Example usage:
test_words = ["shtëpi", "hello", "tungjatjeta", "12345","şörle"]
filtered_words = [word for word in test_words if is_albanian(word)]
print("Filtered Albanian words:", filtered_words)




Filtered Albanian words: ['shtëpi', 'hello', 'tungjatjeta']


In [4]:
import random
import re

# Load the file
def load_text(filename):
    with open(filename, encoding='utf-8') as file:
        text = file.read()
    return text

file_path = "sq-sample.txt"
text = load_text(file_path)

In [5]:
# Split text into sentences
sentences = re.split(r'(?<=[.!?])\s+', text)

# Shuffle the sentences randomly
random.shuffle(sentences)

# Reconstruct scrambled text
scrambled_text = ' '.join(sentences)

In [6]:
# Tokenization: split at whitespaces and convert to lowercase
tokens = re.split(r'\s+', scrambled_text.lower())

In [7]:
# Remove punctuation and digits
tokens_cleaned = [re.sub(r'[^a-zA-ZçÇëË]', '', token) for token in tokens if token.strip()]

In [8]:
# Filter words using is_albanian function
albanian_tokens = [word for word in tokens_cleaned if is_albanian(word)]

print(f"Total Albanian tokens: {len(albanian_tokens)}")

Total Albanian tokens: 1916075


In [9]:
# Save tokens to a text file
with open("albanian_tokens.txt", "w", encoding="utf-8") as file:
    for token in albanian_tokens:
        file.write(token + "\n")

print("Tokens saved to albanian_tokens.txt")


Tokens saved to albanian_tokens.txt


In [10]:
import csv

# Save tokens to a CSV file
with open("albanian_tokens.csv", "w", encoding="utf-8", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Token"])  # Add a header
    for token in albanian_tokens:
        writer.writerow([token])

print("Tokens saved to albanian_tokens.csv")


Tokens saved to albanian_tokens.csv


In [11]:
from collections import Counter

# Split into two roughly equal parts
midpoint = len(albanian_tokens) // 2
subcorpus1 = albanian_tokens[:midpoint]
subcorpus2 = albanian_tokens[midpoint:]

In [12]:
# Create frequency counters
counter1 = Counter(subcorpus1)
counter2 = Counter(subcorpus2)

In [13]:
# Print top-50 most common words in each subcorpus

print("Top-50 most common words in Subcorpus 1:")
print(counter1.most_common(50))

print("\nTop-50 most common words in Subcorpus 2:")
print(counter2.most_common(50))


Top-50 most common words in Subcorpus 1:
[('të', 58836), ('e', 49184), ('në', 40578), ('', 26912), ('dhe', 26303), ('i', 22208), ('një', 16838), ('me', 15164), ('për', 12894), ('nga', 12601), ('që', 9409), ('është', 9164), ('më', 8852), ('u', 8113), ('si', 6592), ('ka', 6075), ('së', 5587), ('tij', 4911), ('se', 4708), ('ai', 4274), ('te', 3995), ('ishte', 3833), ('edhe', 3760), ('nuk', 3439), ('duke', 3426), ('janë', 3321), ('shumë', 2939), ('vitin', 2857), ('prej', 2611), ('pas', 2474), ('por', 2298), ('do', 2272), ('ne', 2267), ('mund', 2159), ('saj', 2124), ('tyre', 2085), ('ajo', 2057), ('parë', 1981), ('gjatë', 1920), ('disa', 1793), ('këtë', 1778), ('dy', 1756), ('ose', 1750), ('deri', 1640), ('ku', 1611), ('kjo', 1589), ('kishte', 1526), ('kanë', 1514), ('kur', 1502), ('atë', 1344)]

Top-50 most common words in Subcorpus 2:
[('të', 58878), ('e', 49097), ('në', 40937), ('', 27089), ('dhe', 26161), ('i', 22228), ('një', 17001), ('me', 15051), ('për', 12845), ('nga', 12438), ('që'

In [14]:
# Print top-50 most common words in each subcorpus one by one
print("Top-50 most common words in Subcorpus 1:")
print(counter1.most_common(50))


Top-50 most common words in Subcorpus 1:
[('të', 58836), ('e', 49184), ('në', 40578), ('', 26912), ('dhe', 26303), ('i', 22208), ('një', 16838), ('me', 15164), ('për', 12894), ('nga', 12601), ('që', 9409), ('është', 9164), ('më', 8852), ('u', 8113), ('si', 6592), ('ka', 6075), ('së', 5587), ('tij', 4911), ('se', 4708), ('ai', 4274), ('te', 3995), ('ishte', 3833), ('edhe', 3760), ('nuk', 3439), ('duke', 3426), ('janë', 3321), ('shumë', 2939), ('vitin', 2857), ('prej', 2611), ('pas', 2474), ('por', 2298), ('do', 2272), ('ne', 2267), ('mund', 2159), ('saj', 2124), ('tyre', 2085), ('ajo', 2057), ('parë', 1981), ('gjatë', 1920), ('disa', 1793), ('këtë', 1778), ('dy', 1756), ('ose', 1750), ('deri', 1640), ('ku', 1611), ('kjo', 1589), ('kishte', 1526), ('kanë', 1514), ('kur', 1502), ('atë', 1344)]


In [15]:
print("\nTop-50 most common words in Subcorpus 2:")
print(counter2.most_common(50))


Top-50 most common words in Subcorpus 2:
[('të', 58878), ('e', 49097), ('në', 40937), ('', 27089), ('dhe', 26161), ('i', 22228), ('një', 17001), ('me', 15051), ('për', 12845), ('nga', 12438), ('që', 9306), ('është', 9232), ('më', 8917), ('u', 8104), ('si', 6643), ('ka', 6175), ('së', 5474), ('se', 4751), ('tij', 4747), ('ai', 4331), ('ishte', 3988), ('edhe', 3849), ('te', 3716), ('nuk', 3495), ('duke', 3399), ('janë', 3298), ('shumë', 2909), ('vitin', 2749), ('prej', 2568), ('pas', 2460), ('por', 2326), ('ne', 2292), ('do', 2226), ('tyre', 2180), ('mund', 2095), ('saj', 2065), ('ajo', 2056), ('gjatë', 1978), ('parë', 1929), ('këtë', 1808), ('ose', 1784), ('dy', 1757), ('disa', 1755), ('kjo', 1685), ('deri', 1622), ('ku', 1593), ('kishte', 1561), ('kur', 1516), ('kanë', 1462), ('atë', 1330)]


In [16]:
# Words unique to each subcorpus
unique_to_1 = set(counter1.keys()) - set(counter2.keys())
unique_to_2 = set(counter2.keys()) - set(counter1.keys())

print(f"\nWords unique to Subcorpus 1: {unique_to_1}")



Words unique to Subcorpus 1: {'hipokrenit', 'nikel', 'parashurama', 'kërkuesja', 'andhas', 'sobën', 'siecus', 'escape', 'nesma', 'tafa', 'konsanguinile', 'atëdheut', 'lidasit', 'demostrojë', 'northumberland', 'dreqin', 'beth', 'spekulua', 'dobrune', 'tanker', 'robbie', 'kanopi', 'rifillonte', 'yliopisto', 'spektrometria', 'kliko', 'përzotshme', 'palavli', 'epromet', 'ndëraleate', 'sajojë', 'asimilua', 'zpetek', 'kompjuteret', 'hysejni', 'junction', 'idrizaj', 'rajamalla', 'mullabazimici', 'organizonte', 'arkivuara', 'sidornos', 'saintpierredescorps', 'furrave', 'treletër', 'privatizuara', 'decahedron', 'egzarhitë', 'kajetana', 'danzigprusia', 'gligorov', 'ankorimi', 'fabrikua', 'ossu', 'shtjellojnë', 'pillitu', 'përqëndrime', 'distalisht', 'zzap', 'sarmatian', 'ultësirave', 'vishishtadvaita', 'spahinj', 'knull', 'anglofrëngjisht', 'evitur', 'ndërtosh', 'instikteve', 'ndërshtesa', 'biennio', 'yamaha', 'pompei', 'bilobed', 'projektmarrëveshje', 'melin', 'vendosuara', 'nysj', 'rusvelt', 

In [17]:
# Words unique to each subcorpus
unique_to_1 = set(counter1.keys()) - set(counter2.keys())
unique_to_2 = set(counter2.keys()) - set(counter1.keys())

print(f"\nWords unique to Subcorpus 1: {unique_to_1}")



Words unique to Subcorpus 1: {'hipokrenit', 'nikel', 'parashurama', 'kërkuesja', 'andhas', 'sobën', 'siecus', 'escape', 'nesma', 'tafa', 'konsanguinile', 'atëdheut', 'lidasit', 'demostrojë', 'northumberland', 'dreqin', 'beth', 'spekulua', 'dobrune', 'tanker', 'robbie', 'kanopi', 'rifillonte', 'yliopisto', 'spektrometria', 'kliko', 'përzotshme', 'palavli', 'epromet', 'ndëraleate', 'sajojë', 'asimilua', 'zpetek', 'kompjuteret', 'hysejni', 'junction', 'idrizaj', 'rajamalla', 'mullabazimici', 'organizonte', 'arkivuara', 'sidornos', 'saintpierredescorps', 'furrave', 'treletër', 'privatizuara', 'decahedron', 'egzarhitë', 'kajetana', 'danzigprusia', 'gligorov', 'ankorimi', 'fabrikua', 'ossu', 'shtjellojnë', 'pillitu', 'përqëndrime', 'distalisht', 'zzap', 'sarmatian', 'ultësirave', 'vishishtadvaita', 'spahinj', 'knull', 'anglofrëngjisht', 'evitur', 'ndërtosh', 'instikteve', 'ndërshtesa', 'biennio', 'yamaha', 'pompei', 'bilobed', 'projektmarrëveshje', 'melin', 'vendosuara', 'nysj', 'rusvelt', 

In [18]:
print(f"Words unique to Subcorpus 2: {unique_to_2}")

Words unique to Subcorpus 2: {'czerny', 'horizontëve', 'osroene', 'odrin', 'daler', 'ghuha', 'gotrës', 'egërsinë', 'parashtresë', 'indoevropianishtes', 'cronaca', 'egistin', 'kojubi', 'shqipërie', 'gillespie', 'kërkuam', 'ljudi', 'kapërcime', 'braces', 'autobiographical', 'italienisch', 'contest', 'përtu', 'katedralsës', 'deit', 'vrapo', 'vacherin', 'ngiste', 'mmol', 'republikine', 'alemdari', 'tpmt', 'spartakasë', 'noimagepng', 'vejçare', 'pasditën', 'qikat', 'preshevar', 'çautsun', 'përdoruesiuser', 'glennes', 'kundërvajtësit', 'polakja', 'hidrosanitare', 'txhelozojn', 'gërçanjë', 'konaparvatamin', 'mercuer', 'xhibril', 'ote', 'primus', 'egaleo', 'cornificiusin', 'paapelueshëm', 'nevojshem', 'brisku', 'mahindra', 'mirëpop', 'natori', 'perkusion', 'bec', 'kongjestive', 'moderneditës', 'budall', 'gjembezimi', 'astraphobia', 'arbeitsbuch', 'kameleonë', 'respektueshëm', 'lekure', 'kusrse', 'ishtiaq', 'balestra', 'lazareviqit', 'bellosi', 'adlerbeth', 'kyçeve', 'vixhen', 'osmanishtes', 'g

In [19]:
import csv

# Get the maximum length to ensure both columns align
max_length = max(len(unique_to_1), len(unique_to_2))

# Convert sets to lists and pad shorter list with empty strings
unique_to_1_list = list(unique_to_1) + [""] * (max_length - len(unique_to_1))
unique_to_2_list = list(unique_to_2) + [""] * (max_length - len(unique_to_2))

# Save both lists into a single CSV file
with open("unique_words_subcorpora.csv", "w", encoding="utf-8", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Unique Words in Subcorpus 1", "Unique Words in Subcorpus 2"])  # Header
    for word1, word2 in zip(unique_to_1_list, unique_to_2_list):
        writer.writerow([word1, word2])

print("Unique words from both subcorpora saved to unique_words_subcorpora.csv")

Unique words from both subcorpora saved to unique_words_subcorpora.csv
