<a href="https://colab.research.google.com/github/eiziiaizii1/ceng442-assignment1-GroupTAFB/blob/main/Notebook_ceng442_assignment1_GroupTAFB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CENG442 Assignment 1 - Azerbaijani Text Preprocessing & Word Embeddings

**Group Members:**
* Talha Ubeydullah Gamga | 20050111078
* Aziz Önder | 22050141021
* Muhammed Fatih Asan | 23050151026
* Buğra Bildiren | 20050111022

## Azerbaijani Text Preprocessing Pipeline

This notebook contains a full pipeline for cleaning, normalizing, and preparing Azerbaijani text data for machine learning, with a special focus on sentiment analysis.

In [1]:
# RUN THIS CELL INITIALLY, IF YOU ARE RUNNING IN COLAB
# Fetch the assignment repository into the current working directory.
!git clone https://github.com/eiziiaizii1/ceng442-assignment1-GroupTAFB.git
# Switch into the clone for the rest of the notebook.
%cd ceng442-assignment1-GroupTAFB
# Install the third-party packages named on the command line (no versions are pinned).
!pip install pandas gensim openpyxl regex ftfy scikit-learn

Cloning into 'ceng442-assignment1-GroupTAFB'...
remote: Enumerating objects: 85, done.[K
remote: Counting objects: 100% (85/85), done.[K
remote: Compressing objects: 100% (67/67), done.[K
remote: Total 85 (delta 41), reused 50 (delta 17), pack-reused 0 (from 0)[K
Receiving objects: 100% (85/85), 15.73 MiB | 18.26 MiB/s, done.
Resolving deltas: 100% (41/41), done.
/content/ceng442-assignment1-GroupTAFB
Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m55.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [10]:
import re, html, unicodedata
import pandas as pd
from pathlib import Path

# Prefer ftfy's encoding repair when the package can be imported.
try:
    from ftfy import fix_text
except Exception:
    # ftfy is optional: fall back to an identity function so the pipeline still runs.
    def fix_text(s):
        """Return ``s`` unchanged (stand-in used when ftfy cannot be imported)."""
        return s

## Core Normalization Helpers

First, we define a function for language-specific lowercasing, because Azerbaijani distinguishes a dotted and a dotless i: uppercase `I` lowercases to `ı`, and uppercase `İ` lowercases to `i`.

In [12]:
# Azerbaijani-aware lowercase
def lower_az(s: str) -> str:
    """Lowercase ``s`` with Azerbaijani rules for the dotted/dotless i.

    Args:
        s: Text to lowercase.

    Returns:
        The NFC-normalised, lowercased text, or ``""`` when ``s`` is not a
        string.
    """
    if not isinstance(s, str):
        return ""

    text = unicodedata.normalize("NFC", s)
    # Map the two capital i's by hand, because str.lower() would turn "I" into
    # "i" and "İ" into "i" followed by a combining dot.
    for upper, lower in (("I", "ı"), ("İ", "i")):
        text = text.replace(upper, lower)

    lowered = text.lower()
    # Collapse any remaining "i + combining dot above" into a plain "i".
    return lowered.replace("i̇", "i")

## Regex Definitions and Mappings

Here we define all the regular expressions (regex) and data maps we will use to find and replace patterns in the text.

In [13]:
# --- Regex Definitions ---

# Finds HTML tags (e.g., <br>, <strong>): "<", one or more non-">" characters, then ">"
HTML_TAG_RE = re.compile(r"<[^>]+>")
# Finds URLs: "http://" / "https://" or "www." followed by non-whitespace (case-insensitive)
URL_RE = re.compile(r"(https?://\S+|www\.\S+)", re.IGNORECASE)
# Finds email addresses: local part, "@", then a domain that contains at least one dot
EMAIL_RE = re.compile(r"\b[\w\.-]+@[\w\.-]+\.\w+\b", re.IGNORECASE)
# Finds phone-like runs: optional "+", a digit, 6+ digits/dashes/whitespace/parentheses, a final digit
PHONE_RE = re.compile(r"\+?\d[\d\-\s\(\)]{6,}\d")
# Finds user mentions (like @username)
USER_RE = re.compile(r"@\w+")
# Finds the same punctuation mark repeated back-to-back (e.g., "!!", "???"); group 1 holds the mark
MULTI_PUNCT = re.compile(r"([!?.,;:])\1{1,}")
# Finds any run of whitespace (spaces, tabs, newlines)
MULTI_SPACE = re.compile(r"\s+")
# Finds a character repeated 3+ times in a row (e.g., the "ooo" in "sooo"); group 1 holds the character
REPEAT_CHARS= re.compile(r"(.)\1{2,}", flags=re.UNICODE)

# The main rule to find (tokenize) words or special tags.
# The tag alternatives are listed before the word alternative and are anchored
# with \b, so a stand-alone tag such as EMO_POS is returned as one token instead
# of being split by the word rule, while ordinary words that merely start with
# a tag name (e.g. "URLs") are still matched as words.
TOKEN_RE = re.compile(
    r"<NUM>"  # Number placeholder
    r"|\b(?:URL|EMAIL|PHONE|USER|EMO_(?:POS|NEG))\b"  # Upper-case tags (same spelling as the EMO_MAP values)
    r"|[A-Za-zƏəĞğIıİiÖöÜüÇçŞşXxQq]+(?:'[A-Za-zƏəĞğIıİiÖöÜüÇçŞşXxQq]+)?"  # Words, optionally with one apostrophe part
)

# --- Mappings ---

# Map for converting emojis to tags (EMO_POS, EMO_NEG).
# Keys are single emoji strings; values are the tag text substituted for them.
EMO_MAP = {"🙂":"EMO_POS", "😀":"EMO_POS", "😍":"EMO_POS", "😊":"EMO_POS", "👍":"EMO_POS",
           "☹":"EMO_NEG", "🙁":"EMO_NEG", "😠":"EMO_NEG", "😡":"EMO_NEG", "👎":"EMO_NEG"}

# Map for correcting common slangs (ASCII/abbreviated spelling -> standard spelling).
# Lookups are exact, whole-token matches.
SLANG_MAP = {"slm":"salam", "tmm":"tamam", "sagol":"sağol", "cox":"çox", "yaxsi":"yaxşı"}

# Words that indicate negation; matched against whole lowercase tokens
NEGATORS = {"yox", "deyil", "heç", "qətiyyən", "yoxdur"}

## Domain-Specific Processing

These functions detect the "domain" (topic) of the text (e.g., News, Reviews) and apply special cleaning rules only for that domain.

In [14]:
# --- Domain detection ---
# Agency/source names that signal the 'news' domain
NEWS_HINTS = re.compile(r"\b(apa|trend|azertac|reuters|bloomberg|dha|aa)\b", re.I)
# Retweet marker, mention/hashtag symbols and a few emojis signal the 'social' domain
SOCIAL_HINTS = re.compile(r"\b(rt)\b|@|#|(?:😂|😍|😊|👍|👎|😡|🙂)")
# Price, purchase and rating vocabulary signals the 'reviews' domain
REV_HINTS = re.compile(r"\b(azn|manat|qiymət|aldım|ulduz|çox yaxşı|çox pis)\b", re.I)

def detect_domain(text: str) -> str:
    """Classify ``text`` as "news", "social", "reviews" or "general".

    The text is lowercased with ``str.lower()`` and tested against the hint
    patterns in priority order (news, then social, then reviews); the first
    pattern that matches decides the label.

    Args:
        text: Raw text to classify.

    Returns:
        The domain label, or "general" when no hint matches.
    """
    lowered = text.lower()
    ordered_hints = (
        ("news", NEWS_HINTS),
        ("social", SOCIAL_HINTS),
        ("reviews", REV_HINTS),
    )
    for label, pattern in ordered_hints:
        if pattern.search(lowered):
            return label
    return "general"

# --- Domain-specific normalization (reviews) ---
# Finds prices for reviews: a number followed by a currency word (e.g., "10 azn").
# The number may also be the "<NUM>" placeholder, because text that has already
# been through number replacement no longer contains the original digits.
PRICE_RE = re.compile(r"(?:\b\d+|<NUM>)\s*(azn|manat)\b", re.I)
# Finds star ratings (e.g., "5 ulduz") for reviews; needs the literal digit 1-5
STARS_RE = re.compile(r"\b([1-5])\s*ulduz\b", re.I)
# Finds positive phrases for reviews (expects lowercased text)
POS_RATE = re.compile(r"\bçox yaxşı\b")
# Finds negative phrases for reviews (expects lowercased text)
NEG_RATE = re.compile(r"\bçox pis\b")

# Function to apply special rules based on domain
def domain_specific_normalize(cleaned: str, domain: str) -> str:
    """Replace review-specific phrases in ``cleaned`` with tags.

    For ``domain == "reviews"`` the following substitutions are applied, in
    this order, and whitespace is then collapsed to single spaces:

    * a price ("10 azn", "<NUM> manat")  -> ``<PRICE>``
    * a star rating ("5 ulduz")          -> ``<STARS_5>``
    * "çox yaxşı"                        -> ``<RATING_POS>``
    * "çox pis"                          -> ``<RATING_NEG>``

    Args:
        cleaned: Text to rewrite.
        domain: Domain label; only "reviews" triggers any change.

    Returns:
        The rewritten text for reviews; ``cleaned`` unchanged for every other
        domain.
    """
    # Only apply these rules for 'reviews'
    if domain != "reviews":
        return cleaned

    s = PRICE_RE.sub(" <PRICE> ", cleaned)
    # The star count can only be recovered while the digit is still present;
    # once it has been replaced by "<NUM>" the rating value is gone.
    s = STARS_RE.sub(lambda m: f" <STARS_{m.group(1)}> ", s)
    s = POS_RATE.sub(" <RATING_POS> ", s)
    s = NEG_RATE.sub(" <RATING_NEG> ", s)
    # Clean up the padding spaces introduced by the substitutions
    return " ".join(s.split())

# --- Domain tag token for corpus (no punctuation) ---
def add_domain_tag(line: str, domain: str) -> str:
    """Prefix ``line`` with the domain token, e.g. 'domnews' or 'domreviews'.

    Args:
        line: Text of one corpus line.
        domain: Domain label appended to the literal "dom".

    Returns:
        ``line`` preceded by the domain token and a single space.
    """
    prefix = f"dom{domain} "
    return prefix + line

## Main Text Normalization Function

This is the main function that combines all the previous helpers to perform a full text-cleaning pipeline.

In [15]:
# Placeholder tags inserted by the normalizer. They have to survive lowercasing
# and tokenization verbatim: lower_az() would otherwise turn "EMAIL" into
# "emaıl" (dotless i) and the word rule would split "EMO_POS" at the underscore.
_SPECIAL_TOKENS = frozenset({"URL", "EMAIL", "PHONE", "USER", "EMO_POS", "EMO_NEG"})
# Tags that never receive the '_NEG' suffix and do not use up the negation window.
_NEG_EXEMPT = frozenset({"URL", "EMAIL", "PHONE", "USER"})


# The main function to clean one string of text
def normalize_text_az(s: str, numbers_to_token=True, keep_sentence_punct=False) -> str:
    """Normalize one piece of Azerbaijani text into a space-joined token string.

    Steps, in order: emojis -> EMO_POS/EMO_NEG tags, encoding repair, HTML
    entity/tag removal, URL/EMAIL/PHONE/USER tagging, hashtag splitting,
    Azerbaijani-aware lowercasing (tags are left untouched), repeated
    punctuation collapsing, optional number tagging, symbol removal,
    tokenization, repeated-character squeezing, slang correction and
    negation marking.

    Args:
        s: Raw text. Anything that is not a ``str`` yields ``""``.
        numbers_to_token: When true, every digit run becomes ``<NUM>``.
        keep_sentence_punct: When true, '.', '!' and '?' survive the symbol
            removal step. NOTE(review): the tokenizer only returns words and
            tags, so this punctuation does not appear in the returned string.

    Returns:
        The normalized tokens joined by single spaces. After a negator the
        next (up to) three non-exempt tokens carry a ``_NEG`` suffix.
        A whitespace-delimited, all-caps "URL", "EMAIL", "PHONE" or "USER"
        typed in the input is indistinguishable from an inserted tag and is
        kept as that tag.
    """
    if not isinstance(s, str):
        return ""

    # First, replace all emojis using the map
    for emo, tag in EMO_MAP.items():
        s = s.replace(emo, f" {tag} ")

    # Fix potential text encoding issues
    s = fix_text(s)
    # Convert HTML entities (like &amp;)
    s = html.unescape(s)
    # Remove HTML tags
    s = HTML_TAG_RE.sub(" ", s)
    # Replace URLs, emails and phone numbers with their tags
    s = URL_RE.sub(" URL ", s)
    s = EMAIL_RE.sub(" EMAIL ", s)
    s = PHONE_RE.sub(" PHONE ", s)
    # Handle hashtags: drop the '#' and split camelCase (needs the original case)
    s = re.sub(r"#([A-Za-z0-9_]+)", lambda m: " " + re.sub('([a-z])([A-Z])', r'\1 \2', m.group(1)) + " ", s)
    # Replace user mentions with 'USER' token
    s = USER_RE.sub(" USER ", s)

    # Apply the Azeri-specific lowercase to everything except the tags inserted
    # above. Splitting with a capturing group keeps the whitespace pieces, so
    # the text is reassembled exactly.
    s = "".join(
        piece if piece in _SPECIAL_TOKENS else lower_az(piece)
        for piece in re.split(r"(\s+)", s)
    )

    # Reduce repeated punctuation (e.g., "!!" -> "!")
    s = MULTI_PUNCT.sub(r"\1", s)

    if numbers_to_token:
        # Replace numbers with '<NUM>' token
        s = re.sub(r"\d+", " <NUM> ", s)

    if keep_sentence_punct:
        # Remove all symbols *except* sentence punctuation
        s = re.sub(r"[^\w\s<>'əğıöşüçƏĞIİÖŞÜÇxqXQ.!?]", " ", s)
    else:
        # Remove all symbols
        s = re.sub(r"[^\w\s<>'əğıöşüçƏĞIİÖŞÜÇxqXQ]", " ", s)

    # Fix extra spaces
    s = MULTI_SPACE.sub(" ", s).strip()

    # Tokenize chunk by chunk: tags were inserted with surrounding spaces, so a
    # tag is always a whole chunk and is kept as-is; everything else goes
    # through TOKEN_RE (whose patterns never span whitespace).
    toks = []
    for chunk in s.split():
        if chunk in _SPECIAL_TOKENS:
            toks.append(chunk)
        else:
            toks.extend(TOKEN_RE.findall(chunk))

    norm = []
    mark_neg = 0  # How many upcoming tokens still get the '_NEG' suffix

    for t in toks:
        # Reduce repeated chars (e.g., "goood" -> "good")
        t = REPEAT_CHARS.sub(r"\1\1", t)
        # Correct slang if found in the map
        t = SLANG_MAP.get(t, t)

        # A negation word is kept as-is and (re)opens a 3-token negation window
        if t in NEGATORS:
            norm.append(t)
            mark_neg = 3
            continue

        if mark_neg > 0 and t not in _NEG_EXEMPT:
            norm.append(t + "_NEG")
            mark_neg -= 1
        else:
            norm.append(t)

    # Final small clean-up (remove single letters except "o", "e", "ə")
    norm = [t for t in norm if not (len(t) == 1 and t not in {"o", "e", "ə"})]
    # Join tokens back into a string
    return " ".join(norm).strip()

## Sentiment Value Mapping

This function converts different types of labels (e.g., "positive", "1", "mənfi") into a single, standard numeric format (0.0, 0.5, 1.0).

In [16]:
# Function to convert different labels to a standard number
def map_sentiment_value(v, scheme: str):
    """Map a raw sentiment label to 1.0 (positive), 0.5 (neutral) or 0.0 (negative).

    Args:
        v: Raw label (number or string).
        scheme: ``"binary"`` treats the label as an integer, where 1 is
            positive and every other integer is negative. Any other scheme
            matches the label against the positive/neutral/negative
            vocabularies below.

    Returns:
        The mapped float, or ``None`` when the label cannot be interpreted.
    """
    # For binary (0/1) classification
    if scheme == "binary":
        try:
            return 1.0 if int(v) == 1 else 0.0
        except Exception:
            pass
        # int() rejects numeric strings such as "1.0"; retry through float().
        # NaN/inf make int() raise, so missing labels still map to None.
        try:
            return 1.0 if int(float(str(v).strip())) == 1 else 0.0
        except Exception:
            return None

    s = str(v).strip().lower()

    # Integer-valued floats (e.g. labels from a numeric column stored as float)
    # stringify as "1.0"; reduce them to "1" so they match the sets below.
    try:
        as_number = float(s)
        if as_number.is_integer():
            s = str(int(as_number))
    except ValueError:
        pass

    # Check for positive labels
    if s in {"pos", "positive", "1", "müsbət", "pozitiv", "good"}:
        return 1.0
    # Check for neutral labels
    if s in {"neu", "neutral", "2", "neytral"}:
        return 0.5
    # Check for negative labels
    if s in {"neg", "negative", "0", "mənfi", "neqativ", "bad"}:
        return 0.0

    return None

## Main File Processing Function

This final function reads an Excel file, applies all the cleaning and mapping functions to the correct columns, and saves the result as a new two-column (text, sentiment) Excel file.

In [17]:
# The main function to process an entire Excel file
def process_file(in_path, text_col, label_col, scheme, out_two_col_path,
                 remove_stopwords=False):
    """Clean one labelled Excel dataset and save it as a two-column Excel file.

    Args:
        in_path: Input Excel file.
        text_col: Name of the column holding the raw text.
        label_col: Name of the column holding the sentiment label.
        scheme: Label scheme forwarded to ``map_sentiment_value``.
        out_two_col_path: Destination Excel file; parent folders are created.
        remove_stopwords: When true, drop the built-in stopwords from the
            cleaned text.

    Raises:
        AssertionError: If ``text_col`` or ``label_col`` is missing.
    """
    frame = pd.read_excel(in_path)

    # Drop index-like leftover columns when present
    for leftover in ("Unnamed: 0", "index"):
        if leftover in frame.columns:
            frame = frame.drop(columns=[leftover])

    # Check that required columns exist
    assert text_col in frame.columns and label_col in frame.columns, f"Missing columns in {in_path}"

    # --- Data Cleaning Steps ---
    # Missing text, whitespace-only text and duplicated text are all removed
    frame = frame.dropna(subset=[text_col])
    has_text = frame[text_col].astype(str).str.strip().str.len() > 0
    frame = frame[has_text]
    frame = frame.drop_duplicates(subset=[text_col])

    raw_text = frame[text_col].astype(str)
    # General cleaning of every text
    frame["cleaned_text"] = raw_text.apply(normalize_text_az)
    # The domain is detected on the raw text, not on the cleaned one
    frame["__domain__"] = raw_text.apply(detect_domain)

    # Domain-specific normalization runs *after* general cleaning
    def _apply_domain_rules(row):
        return domain_specific_normalize(row["cleaned_text"], row["__domain__"])

    frame["cleaned_text"] = frame.apply(_apply_domain_rules, axis=1)

    # If stopword removal is enabled
    if remove_stopwords:
        stopwords = {
            "və", "ilə", "amma", "ancaq", "lakin", "ya", "həm", "artıq", "çox", "heç",
            "qətiyyən", "ki", "bu", "bir", "o", "biz", "siz", "sən", "mən", "az", "ən",
            "orada", "burada", "bütün", "hər", "də", "da", "üçün",
        }
        # Make sure *not* to remove these sentiment-related words
        stopwords.difference_update(("deyil", "yox", "yoxdur"))
        frame["cleaned_text"] = frame["cleaned_text"].apply(
            lambda text: " ".join(tok for tok in text.split() if tok not in stopwords)
        )

    # Map labels to numbers and keep only rows whose label could be mapped
    frame["sentiment_value"] = frame[label_col].apply(
        lambda label: map_sentiment_value(label, scheme)
    )
    frame = frame.dropna(subset=["sentiment_value"])
    frame["sentiment_value"] = frame["sentiment_value"].astype(float)

    # --- Save Output ---
    # Only the two final columns are written
    result = frame[["cleaned_text", "sentiment_value"]].reset_index(drop=True)

    Path(out_two_col_path).parent.mkdir(parents=True, exist_ok=True)
    result.to_excel(out_two_col_path, index=False)

    # Print a confirmation message
    print(f"Saved: {out_two_col_path} (rows={len(result)})")

# Step 6: Building the domain-tagged corpus_all.txt file

This process involves:
1.  Looping through the **raw** datasets defined in `datasets_to_process`.
2.  Using the `ozel_temizlik.detect_domain` function on the raw text.
3.  Using the `normalize_text_az` function to clean the text.
4.  Prepending the lowercase domain tag (e.g., `domsocial`) as specified in the PDF.
5.  Saving all lines to a single `corpus_all.txt` file.

In [7]:
# ----------------------------------------------------------------
# 6: Build the domain-tagged corpus_all.txt file
# (Based on the skeleton from PDF Section 7.1 )
# ----------------------------------------------------------------
print(f"Starting generation of 'corpus_all.txt'...")

# NOTE(review): `datasets_to_process` (a list of dicts with "in_file" and
# "text_col" keys) and the `ozel_temizlik` module are used below but are not
# defined in this cell; they must already exist in the kernel.

output_corpus_file = "corpus_all.txt"
total_lines = 0

# Open the single output file to write to
with open(output_corpus_file, "w", encoding="utf-8") as f_out:

    # Loop over the *raw* datasets
    for dataset in datasets_to_process:
        in_file = dataset["in_file"]
        text_col = dataset["text_col"]
        print(f"Reading raw data from: {in_file}...")

        try:
            # Read the file and drop rows with missing text (no in-place mutation)
            df = pd.read_excel(in_file).dropna(subset=[text_col])

            for raw_text in df[text_col].astype(str):
                # 1. Detect domain from raw text
                # (Using the function from ozel_temizlik.py)
                domain = ozel_temizlik.detect_domain(raw_text)

                # 2. Normalize text using the main pipeline.
                # Only the text is passed: the second positional parameter of
                # normalize_text_az is `numbers_to_token`, not a domain.
                cleaned_text = normalize_text_az(raw_text)

                # 3. Prepend the domain tag
                # e.g., "domsocial bu çox yaxşı"
                if cleaned_text:
                    # Using lowercase 'dom{domain}' format from PDF
                    line = f"dom{domain} {cleaned_text}"
                    f_out.write(line + "\n")
                    total_lines += 1

        except Exception as e:
            # Best effort: report the failing file and continue with the next one
            print(f"!!! ERROR processing {in_file} for corpus: {e}")

print("\n" + "="*30)
print(f"CORPUS GENERATION COMPLETE.")
print(f"Total lines saved to '{output_corpus_file}': {total_lines}")

Starting generation of 'corpus_all.txt'...
Reading raw data from: data/labeled-sentiment.xlsx...
Reading raw data from: data/test__1_.xlsx...
Reading raw data from: data/train__3_.xlsx...
Reading raw data from: data/train-00000-of-00001.xlsx...
Reading raw data from: data/merged_dataset_CSV__1_.xlsx...

CORPUS GENERATION COMPLETE.
Total lines saved to 'corpus_all.txt': 124431


## Step 7: Training Word2Vec and FastText embedding Models

Following the preprocessing steps, we now have five cleaned Excel files. The next task is to train the **Word2Vec** and **FastText** models as specified in the assignment.

The code below performs the following actions:
1.  Initializes an empty list called `sentences`.
2.  Loops through each of the five `_2col.xlsx` files and reads the `cleaned_text` column using `pandas`.
3.  Converts each row of cleaned text into a list of tokens (by splitting on spaces) and adds these lists to the main `sentences` collection.
4.  Creates the `embeddings/` directory if it doesn't already exist.
5.  Trains a `Word2Vec` model using the `sentences` corpus. Key parameters include `vector_size=300`, `window=5`, `min_count=3`, and `sg=1` (Skip-gram).
6.  Trains a `FastText` model using the same corpus and similar parameters, but also includes subword information (`min_n=3`, `max_n=6`).
7.  Saves both trained models to the `embeddings/` folder as `word2vec.model` and `fasttext.model`.

In [8]:
from gensim.models import Word2Vec, FastText
import pandas as pd
from pathlib import Path

# Cleaned two-column files whose `cleaned_text` column forms the training corpus.
# NOTE(review): OUTPUT_DIR is not defined in this cell; it must already exist in the kernel.
files = [
    f"{OUTPUT_DIR}/labeled-sentiment_2col.xlsx",
    f"{OUTPUT_DIR}/test_1_2col.xlsx",
    f"{OUTPUT_DIR}/train_3_2col.xlsx",
    f"{OUTPUT_DIR}/train-00000-of-00001_2col.xlsx",
    f"{OUTPUT_DIR}/merged_dataset_CSV_1_2col.xlsx",
]

# One token list per row of cleaned text
sentences = []
for f in files:
    df = pd.read_excel(f, usecols=["cleaned_text"])
    # Empty cells are read back as NaN; drop them first, otherwise astype(str)
    # would turn each one into a spurious "nan" token.
    texts = df["cleaned_text"].dropna().astype(str)
    sentences.extend(texts.str.split().tolist())

Path("embeddings").mkdir(exist_ok=True)
# Skip-gram Word2Vec (sg=1) with 10 negative samples
w2v = Word2Vec(sentences=sentences, vector_size=300, window=5, min_count=3, sg=1,
negative=10, epochs=10)
w2v.save("embeddings/word2vec.model")
# Skip-gram FastText with character n-grams of length 3 to 6
ft  = FastText(sentences=sentences, vector_size=300, window=5, min_count=3, sg=1,
min_n=3, max_n=6, epochs=10)
ft.save("embeddings/fasttext.model")
print("Saved embeddings.")

Saved embeddings.


## Step 8: Model Evaluation: Word2Vec vs. FastText (Quantitative & Qualitative Metrics)

This section presents a comparative evaluation of the trained Word2Vec and FastText models. To assess their respective strengths in capturing the semantics of the Azerbaijani corpus, the analysis uses three distinct evaluation metrics.

### 1. Lexical Coverage (Quantitative)

This metric quantifies the **vocabulary coverage** of each model: the fraction of token occurrences in each cleaned dataset (a repeated word is counted every time it appears) that are present in the model's learned vocabulary.

This is a critical test for comparing the two architectures. Word2Vec, being a word-level model, is inherently limited to its training vocabulary and cannot represent **out-of-vocabulary (OOV)** words. In contrast, FastText, which learns vectors for character n-grams (subwords), can construct vectors for *any* word, including neologisms, misspellings, or rare words not encountered during training.

### 2. Semantic Similarity (Quantitative)

A successful embedding model should capture meaningful **semantic relationships**, placing words with similar meanings close together in the vector space and words with opposite meanings far apart.

To quantify this, we measure the average **cosine similarity** for two predefined sets of word pairs:
* **Synonym Pairs** (e.g., `yaxşı`, `əla`): We expect a high similarity score (close to 1.0), indicating semantic proximity.
* **Antonym Pairs** (e.g., `yaxşı`, `pis`): We expect a low or negative similarity score (close to -1.0 or 0.0), indicating semantic distance.

A "Separation Score" (calculated as `Synonym Similarity - Antonym Similarity`) is then used to provide a single, robust measure of the model's ability to discriminate between semantic similarity and opposition. A higher separation score is better.

### 3. Nearest Neighbors Analysis (Qualitative)

Beyond quantitative scores, a **qualitative analysis** of the embedding space is performed by inspecting the **nearest neighbors** for a set of predefined seed words.

By examining the top 5 most similar words for a given seed (e.g., `bahalı` or `pis`), we can intuitively assess the quality of the learned representations. This helps us judge whether the model has learned logical contexts (e.g., are the neighbors of "expensive" other price-related words?) or if it has merely learned superficial co-occurrence patterns.

In [9]:
import pandas as pd
from gensim.models import Word2Vec, FastText
import re

# Load the two models saved under embeddings/
w2v = Word2Vec.load("embeddings/word2vec.model")
ft  = FastText.load("embeddings/fasttext.model")

# Words whose nearest neighbours are printed in the qualitative section
seed_words = ["yaxşı","pis","çox","bahalı","ucuz","mükəmməl","dəhşət","<PRICE>","<RATING_POS>"]
# Word pairs treated as synonyms (a high similarity is expected)
syn_pairs  = [("yaxşı","əla"), ("bahalı","qiymətli"), ("ucuz","sərfəli")]
# Word pairs treated as antonyms (a low similarity is expected)
ant_pairs  = [("yaxşı","pis"), ("bahalı","ucuz")]

def lexical_coverage(model, tokens):
    """Return the share of ``tokens`` found in the model's stored vocabulary.

    Every occurrence counts, so repeated tokens weigh more. An empty token
    list gives 0.0 rather than a division error.

    Args:
        model: Object exposing ``wv.key_to_index``.
        tokens: List of token strings.
    """
    known = model.wv.key_to_index
    hits = 0
    for token in tokens:
        if token in known:
            hits += 1
    return hits / max(1, len(tokens))

# Cleaned two-column files evaluated below.
# NOTE(review): OUTPUT_DIR is not defined in this cell; it must already exist in the kernel.
files = [
    f"{OUTPUT_DIR}/labeled-sentiment_2col.xlsx",
    f"{OUTPUT_DIR}/test_1_2col.xlsx",
    f"{OUTPUT_DIR}/train_3_2col.xlsx",
    f"{OUTPUT_DIR}/train-00000-of-00001_2col.xlsx",
    f"{OUTPUT_DIR}/merged_dataset_CSV_1_2col.xlsx",
]

def read_tokens(f):
    """Return every whitespace-separated token of the ``cleaned_text`` column of ``f``.

    Args:
        f: Path of an Excel file that has a ``cleaned_text`` column.

    Returns:
        A flat list of tokens, in row order.
    """
    df = pd.read_excel(f, usecols=["cleaned_text"])
    # Empty cells are read back as NaN; drop them first, otherwise astype(str)
    # would contribute a spurious "nan" token per empty row.
    texts = df["cleaned_text"].dropna().astype(str)
    return [t for row in texts for t in row.split()]

# Print the coverage of both models for every cleaned dataset
print("== Lexical coverage (per dataset) ==")
for f in files:
    toks = read_tokens(f)
    cov_w2v = lexical_coverage(w2v, toks)
    # Measured against FastText's stored vocabulary only, hence the "FT(vocab)" label
    cov_ftv = lexical_coverage(ft, toks)  # FT still embeds OOV via subwords
    print(f"{f}: W2V={cov_w2v:.3f}, FT(vocab)={cov_ftv:.3f}")

from numpy import dot
from numpy.linalg import norm

def cos(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b`` as a Python float."""
    numerator = dot(a, b)
    denominator = norm(a) * norm(b)
    return float(numerator / denominator)

def pair_sim(model, pairs):
    """Return the mean ``model.wv.similarity`` over ``pairs``.

    Pairs for which the similarity lookup raises ``KeyError`` are skipped.
    When no pair can be scored the result is NaN.

    Args:
        model: Object exposing ``wv.similarity(a, b)``.
        pairs: Iterable of 2-tuples of words.
    """
    scores = []
    for left, right in pairs:
        try:
            score = model.wv.similarity(left, right)
        except KeyError:
            continue
        scores.append(score)

    if not scores:
        return float('nan')
    return sum(scores) / len(scores)

# Mean similarity of the synonym and antonym pairs for each model
syn_w2v = pair_sim(w2v, syn_pairs)
syn_ft  = pair_sim(ft,  syn_pairs)
ant_w2v = pair_sim(w2v, ant_pairs)
ant_ft  = pair_sim(ft,  ant_pairs)

print("\n== Similarity (higher better for synonyms; lower better for antonyms) ==")
print(f"Synonyms: W2V={syn_w2v:.3f}, FT={syn_ft:.3f}")
print(f"Antonyms: W2V={ant_w2v:.3f}, FT={ant_ft:.3f}")
# Separation = synonym mean minus antonym mean; pair_sim returns NaN when no pair was scored
print(f"Separation (Syn - Ant): W2V={(syn_w2v - ant_w2v):.3f}, FT={(syn_ft - ant_ft):.3f}")

def neighbors(model, word, k=5):
    """Return the ``k`` words most similar to ``word``, best first.

    Args:
        model: Object exposing ``wv.most_similar(word, topn=k)``.
        word: Query word.
        k: Number of neighbours requested.

    Returns:
        A list of neighbour words (scores are dropped), or ``[]`` when the
        lookup raises ``KeyError``.
    """
    try:
        ranked = model.wv.most_similar(word, topn=k)
        return [neighbor for neighbor, _score in ranked]
    except KeyError:
        return []

# Print the nearest neighbours of every seed word for both models;
# neighbors() returns an empty list when the lookup raises KeyError
print("\n== Nearest neighbors (qualitative) ==")
for w in seed_words:
  print(f"  W2V NN for '{w}':", neighbors(w2v, w))
  print(f"  FT  NN for '{w}':", neighbors(ft,  w))

# (Optional) domain drift if you train domain-specific models separately:
# drift(word, model_a, model_b) = 1 - cos(vec_a, vec_b)

== Lexical coverage (per dataset) ==
clean_data/labeled-sentiment_2col.xlsx: W2V=0.920, FT(vocab)=0.920
clean_data/test_1_2col.xlsx: W2V=0.973, FT(vocab)=0.973
clean_data/train_3_2col.xlsx: W2V=0.976, FT(vocab)=0.976
clean_data/train-00000-of-00001_2col.xlsx: W2V=0.914, FT(vocab)=0.914
clean_data/merged_dataset_CSV_1_2col.xlsx: W2V=0.929, FT(vocab)=0.929

== Similarity (higher better for synonyms; lower better for antonyms) ==
Synonyms: W2V=0.329, FT=0.452
Antonyms: W2V=0.308, FT=0.411
Separation (Syn - Ant): W2V=0.021, FT=0.041

== Nearest neighbors (qualitative) ==
  W2V NN for 'yaxşı': ['olardı', 'yaxshi', 'iyi', 'zor', 'olar.']
  FT  NN for 'yaxşı': ['yaxşı-yaxşı', 'yaxşı!', 'yaxşıı', 'yaxşı)', 'yaxşıkı']
  W2V NN for 'pis': ['<STARS_LOW>', 'pisdir', 'vərdişlərə', 'pisdi', 'bərbad']
  FT  NN for 'pis': ['pis!', '(pis', 'pis,', 'pis.', 'piis']
  W2V NN for 'çox': ['tətbiqidir', 'çoox', 'gözəldir', 'temu', 'çöx']
  FT  NN for 'çox': ['çoxçox', 'çox.çox', '(çox', 'çoxx', '"çox']
  W2V