### D7041-Lab 2

In [None]:
"""
D7041-Lab 2
Deborah Aittoklllio debait-2
Joel Willén Joewil-2

"""

### Part 1: Language Classification with High-Dimensional Distributed Representations


#### Task 1.1: Import Datasets


In [18]:
# Import necessary libraries
import numpy as np              # For numerical operations and arrays
import os                        # For file and folder operations
import re                        # For text cleaning (remove punctuation)
from collections import defaultdict  # For storing data in dictionaries
import glob                      # For finding files with patterns

In [19]:
# Define the path to my language data folder
data_folder = r"C:\Users\Deborah Aittokallio\OneDrive - ltu.se\University Courses\Applied Artificial Intelligence D7041E (CURRENT)\Labs\Lab 2\lab2_HD_SOM_BACKPROP\lab2_HD_SOM_BACKPROP\News_Languages"

# Check if the folder exists
print("Checking folder contents...")
print(f"Folder path: {data_folder}")
print(f"Folder exists: {os.path.exists(data_folder)}")
print(f"Is it a directory: {os.path.isdir(data_folder)}")

# List all files and folders in the directory
if os.path.exists(data_folder):
    all_items = os.listdir(data_folder)
    print(f"\nTotal items found: {len(all_items)}")
    print("\nFirst 20 items:")
    for i, item in enumerate(all_items[:20]):
        full_path = os.path.join(data_folder, item)
        item_type = "FOLDER" if os.path.isdir(full_path) else "FILE"
        print(f"  [{item_type}] {item}")

Checking folder contents...
Folder path: C:\Users\Deborah Aittokallio\OneDrive - ltu.se\University Courses\Applied Artificial Intelligence D7041E (CURRENT)\Labs\Lab 2\lab2_HD_SOM_BACKPROP\lab2_HD_SOM_BACKPROP\News_Languages
Folder exists: True
Is it a directory: True

Total items found: 21

First 20 items:
  [FOLDER] bul_news_2020_100K
  [FOLDER] ces_news_2020_100K
  [FOLDER] dan_news_2020_100K
  [FOLDER] deu_news_2020_100K
  [FOLDER] ell_news_2020_100K
  [FOLDER] eng_news_2020_100K
  [FOLDER] est_news_2020_100K
  [FOLDER] fin_news_2020_100K
  [FOLDER] fra_news_2020_100K
  [FOLDER] hun_news_2020_100K
  [FOLDER] ita_news_2020_100K
  [FOLDER] lav_news_2020_100K
  [FOLDER] lit_news_2020_100K
  [FOLDER] nld_news_2020_100K
  [FOLDER] pol_news_2020_100K
  [FOLDER] por_news_2020_100K
  [FOLDER] ron_news_2020_100K
  [FOLDER] slk_news_2020_100K
  [FOLDER] slv_news_2020_100K
  [FOLDER] spa_news_2020_100K


In [20]:
# Function to load text from a file
def load_text_file(file_path):
    """
    Read text from a file
    Input: file path (string)
    Output: text content (string)
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        return text
    except:
        # If UTF-8 doesn't work, try latin-1 encoding
        with open(file_path, 'r', encoding='latin-1') as f:
            text = f.read()
        return 
    



In [21]:
# Function to clean text (remove punctuation and extra spaces)
def clean_text(text):
    """
    Remove punctuation and convert to lowercase
    Input: raw text (string)
    Output: cleaned text (string)
    """
    # Convert to lowercase
    text = text.lower()
    
    # Keep only letters and spaces (remove punctuation, numbers, etc.)
    text = re.sub(r'[^a-zäöüßàáâãäåæçèéêëìíîïñòóôõöøùúûüýÿ\s]', '', text)
    
    # Replace multiple spaces with single space
    text = re.sub(r'\s+', ' ', text)
    
    # Remove leading/trailing spaces
    text = text.strip()
    
    return text

In [25]:
def load_language_data_from_nested_subfolders(base_folder):
    """
    Load all language text files from nested subfolders
    Input: base folder path (string)
    Output: dictionary with language names as keys and text as values
    """
    language_data = {}  # Create empty dictionary to store data
    
    # Get all subfolders in the base folder
    subfolders = [f for f in os.listdir(base_folder) if os.path.isdir(os.path.join(base_folder, f))]
    
    print(f"Found {len(subfolders)} language folders")
    
    # Loop through each language subfolder
    for subfolder in subfolders:
        # Extract language code (first 3 letters: bul, eng, fra, etc.)
        language_code = subfolder[:3]
        
        # Path to the language folder
        lang_folder_path = os.path.join(base_folder, subfolder)
        
        # Check if there's a nested folder with the same name
        nested_folder_path = os.path.join(lang_folder_path, subfolder)
        
        # Use nested folder if it exists, otherwise use the main folder
        if os.path.isdir(nested_folder_path):
            search_path = nested_folder_path
        else:
            search_path = lang_folder_path
        
        # Find all .txt files in the search path
        txt_files = glob.glob(os.path.join(search_path, "*.txt"))
        
        if len(txt_files) == 0:
            print(f"  ⚠ {language_code}: No .txt files found in {subfolder}")
            continue
        
        # We'll use the "sentences.txt" file if it exists (cleanest data)
        # Otherwise combine all .txt files
        sentences_file = os.path.join(search_path, f"{subfolder}-sentences.txt")
        
        if os.path.exists(sentences_file):
            # Use only the sentences file (best for language classification)
            text = load_text_file(sentences_file)
            cleaned_text = clean_text(text)
            print(f"  ✓ {language_code}: Loaded sentences file, {len(cleaned_text)} characters")
        else:
            # Combine all text files for this language
            all_text = ""
            for txt_file in txt_files:
                text = load_text_file(txt_file)
                all_text += text + " "  # Add space between files
            
            cleaned_text = clean_text(all_text)
            print(f"  ✓ {language_code}: Loaded {len(txt_files)} file(s), {len(cleaned_text)} characters")
        
        # Store in dictionary
        language_data[language_code] = cleaned_text
    
    return language_data

In [26]:
# Load all language data from nested subfolders
print("Loading language data from nested subfolders...")
language_texts = load_language_data_from_nested_subfolders(data_folder)
print(f"\n✓ Total languages loaded: {len(language_texts)}")
print(f"Languages: {sorted(list(language_texts.keys()))}")

# Split each language's data into train/val/test
train_data = {}  # Dictionary to store training data for each language
val_data = {}    # Dictionary to store validation data for each language
test_data = {}   # Dictionary to store test data for each language

print("\nSplitting data into train/validation/test sets (70%/15%/15%)...")

for language, text in language_texts.items():
    # Split the text for this language
    train_text, val_text, test_text = split_data(text)
    
    # Store the splits
    train_data[language] = train_text
    val_data[language] = val_text
    test_data[language] = test_text
    
    print(f"  {language}: Train={len(train_text)} | Val={len(val_text)} | Test={len(test_text)} characters")

print("\n✓ Data loading and splitting complete!")

Loading language data from nested subfolders...
Found 21 language folders
  ✓ bul: Loaded sentences file, 39908 characters
  ✓ ces: Loaded sentences file, 8541308 characters
  ✓ dan: Loaded sentences file, 10426448 characters
  ✓ deu: Loaded sentences file, 10360606 characters
  ✓ ell: Loaded sentences file, 180880 characters
  ✓ eng: Loaded sentences file, 11234400 characters
  ✓ est: Loaded sentences file, 10606005 characters
  ✓ fin: Loaded sentences file, 9620216 characters
  ✓ fra: Loaded sentences file, 11535536 characters
  ✓ hun: Loaded sentences file, 11758448 characters
  ✓ ita: Loaded sentences file, 11654444 characters
  ✓ lav: Loaded sentences file, 10856836 characters
  ✓ lit: Loaded sentences file, 9894916 characters
  ✓ nld: Loaded sentences file, 8593819 characters
  ✓ pol: Loaded sentences file, 8804624 characters
  ✓ por: Loaded sentences file, 11499232 characters
  ✓ ron: Loaded sentences file, 10705686 characters
  ✓ slk: Loaded sentences file, 9667070 characters
 

In [27]:
# Display a sample from one language to verify
sample_language = sorted(list(train_data.keys()))[0]  # Get first language alphabetically
print(f"\nSample from '{sample_language}' training data (first 200 characters):")
print(train_data[sample_language][:200])


Sample from 'bul' training data (first 200 characters):
kritikar aaaa abbvie astrazeneca aci iata adobe magento leader magic quadrant gartner it airbnb ako ako allianz amazon anna koleva kadirova anthrax spreading the disease accept pandemic anton petrov a


In [None]:
# TODO:


"""Import libraries (numpy, os, re, etc.) ✓
Load News Wortschatz Corpora ✓
Load Euro Parliament Parallel Corpus
Preprocess data (remove punctuation, etc.) ✓
"""

#### Task 1.2: Constructing High-Dimensional Centroids


In [None]:
# TODO:

"""
Implement n-gram encoding (n=3, tri-grams) 
Create HD vectors with d=100 and d=1000
Build language centroids (21 languages)
Answer questions about conventional n-gram representations
"""

#### Task 1.3: Classification using Hyperdimensional Centroids


In [None]:
# TODO:

"""
Implement cosine similarity
Classify text samples
Display confusion matrix
Calculate accuracy and F1-score

"""

### Part 2: Unsupervised Learning with Self-Organizing 



#### Task 2.1: Unsupervised Learning of Hand-Written Digits with SOM



In [None]:
# TODO: Copy SOM code from D7041E-lab4_SOM.ipynb
# TODO: Adapt code to use MNIST instead of zoo.txt

"""
# Step 1: Load the MNIST dataset
# TODO: Load MNIST

# Step 2: Use the flattened (1D) array of pixels of each image as a feature vector
# TODO: Flatten images to 784 features

# Step 3: Initialize weights in SOM network randomly, train SOM with grid sizes
# TODO: Train SOM with grid 20x20
# TODO: Train SOM with grid 40x40
# TODO: Train SOM with grid 80x80

# Step 4: Display initial, intermediate (at 50%), and final learned weights
# TODO: Display initial weights as 28x28 images
# TODO: Display intermediate weights (at 50% of iterations) as 28x28 images
# TODO: Display final learned weights as 28x28 images

# Step 5: Assign labels to neurons by passing TRAINING examples through trained SOM
# TODO: Pass training examples and record statistics
# TODO: Assign labels to neurons
# TODO: Display confusion matrix for TRAINING SET
# TODO: Display confusion matrix for TEST SET

# Step 6: Experiment with learning rate
# TODO: Increase learning rate (fixed iterations)
# TODO: Decrease learning rate (fixed iterations)
# TODO: Answer: What is the resulting effect?


## Question 6: What is the resulting effect of changing learning rate? Answer: [YOUR ANSWER HERE]

# Step 7: Experiment with neighborhood decay
# TODO: For fixed iterations and best learning rate, increase exponential decay of neighbourhood parameter
# TODO: For fixed iterations and best learning rate, decrease exponential decay of neighbourhood parameter


## Question 8: What is the effect? Answer: [YOUR ANSWER HERE] 
## Question 9: What is a biological neuron? How does it relate to the concept of neurons in SOM? Answer: [YOUR ANSWER HERE]

"""

### Part 3: Fundamentals of Artificial Neural Networks and Backpropagation

#### Task 3.1: Multi-layer perceptron and backpropagation


In [None]:
# TODO: Copy backpropagation code from ANN_backprop.ipynb


### Task 3.1.1: Understand the implementation structure of the multilayer 
"""
Task 3.1.a: Be able to explain the principle of backpropagation algorithm Answer: [YOUR ANSWER HERE] 
Task 3.1.b: Be able to explain the meaning and the role of the Softmax function Answer: [YOUR ANSWER HERE] 
Task 3.1.c: Be able to name typically used non-linear output functions and implications of choosing one or another for implementation Answer: [YOUR ANSWER HERE] 
Task 3.1.d: Find the places in the code where execution breaks, answer the questions, comment out the exit line
# TODO: Find Question 1 in backpropagate() method
# TODO: Answer Question 1: What is computed in the next line of code?


Question 1: What is computed in the delta calculation? Answer: [YOUR ANSWER HERE]
# TODO: Comment out exit_with_err() for Question 1
# TODO: Find Question 2 in backpropagate() method
# TODO: Answer Question 2: What does this 'for' loop do?

Question 2: What does the backpropagation 'for' loop do? Answer: [YOUR ANSWER HERE]
# TODO: Comment out exit_with_err() for Question 2
# TODO: Find Question 3 in evaluate() method
# TODO: Answer Question 3: How is weight update implemented? What is eta?

Question 3: How is weight update implemented? What is eta? Answer: [YOUR ANSWER HERE]
# TODO: Comment out exit_with_err() for Question 3

"""

In [None]:
## Task 3.1.2: Run with default hyperparameters
# TODO: Run with epochs=70, learning_rate=0.05
# TODO: Record classification 

"""
Question: What is the classification accuracy? Answer: [YOUR ANSWER HERE]

"""

In [None]:
## Task 3.1.3: Run with different learning rates
# TODO: Run with learning_rate=0.005
# TODO: Run with learning_rate=0.5
# TODO: Compare results
"""
Question: Explain the observed differences in the functionality of the multi-layer perceptron Answer: [YOUR ANSWER HERE] 

"""

In [None]:
## Task 3.1.4: Implement ReLU activation function
# TODO: Implement f_relu() function with forward pass
# TODO: Implement f_relu() derivative
# TODO: Run perceptron with ReLU, epochs=70, learning_rate=0.05
# TODO: Record classification accuracy with ReLU
# TODO: Find learning rate values that give comparable accuracy to Sigmoid

"""
Question: What is the classification accuracy with ReLU? Answer: [YOUR ANSWER HERE] Question: What learning rate gives comparable accuracy to Sigmoid? Answer: [YOUR ANSWER HERE]

"""