In [12]:
# -*- coding: utf-8 -*-
"""
CENG442 Assignment 1 - Azerbaijani Text Preprocessing + Word Embeddings
"""

# --- Core imports ---
import re, html, unicodedata
from pathlib import Path

# --- Data handling ---
import pandas as pd

# --- Text cleaning ---
try:
    from ftfy import fix_text
except Exception:
    def fix_text(s): return s

# --- Embeddings ---
from gensim.models import Word2Vec, FastText

# --- Utilities ---
import numpy as np


In [13]:
# -*- coding: utf-8 -*-
"""
CENG442 Assignment 1 - Azerbaijani Text Preprocessing + Word Embeddings
Combined pipeline script (from PDF content)
Author(s): <your names>
"""

# --- Core imports ---
import re
import html
import unicodedata
from pathlib import Path

# --- Data handling ---
import pandas as pd

# --- Text cleaning and normalization ---
try:
    from ftfy import fix_text
except Exception:
    def fix_text(s): return s

# --- Word embeddings ---
from gensim.models import Word2Vec, FastText

# --- Utilities ---
from sklearn.model_selection import train_test_split  # optional
import numpy as np

# ---------------------------
# Azerbaijani-aware lowercase
# ---------------------------
def lower_az(s: str) -> str:
    if not isinstance(s, str): return ""
    s = unicodedata.normalize("NFC", s)
    s = s.replace("I", "ı").replace("İ", "i")
    # normalize dotted i artifacts
    s = s.lower().replace("i ̇","i")
    return s

# ---------------------------
# Regular expressions & maps
# ---------------------------
HTML_TAG_RE = re.compile(r"<[^>]+>")
URL_RE      = re.compile(r"(https?://\S+|www\.\S+)", re.IGNORECASE)
EMAIL_RE    = re.compile(r"\b[\w\.-]+@[\w\.-]+\.\w+\b", re.IGNORECASE)
PHONE_RE    = re.compile(r"\+?\d[\d\-\s\(\)]{6,}\d")
USER_RE     = re.compile(r"@\w+")
MULTI_PUNCT = re.compile(r"([!?.,;:])\1{1,}")
MULTI_SPACE = re.compile(r"\s+")
REPEAT_CHARS= re.compile(r"(.)\1{2,}", flags=re.UNICODE)

TOKEN_RE = re.compile(
    r"[A-Za-zƏəĞğIıİiÖöÜüÇçŞşXxQq]+(?:'[A-Za-zƏəĞğIıİiÖöÜüÇçŞşXxQq]+)?"
    r"|<NUM>|URL|EMAIL|PHONE|USER|EMO_(?:POS|NEG)"
)

EMO_MAP = {
    "🙂":"EMO_POS","😀":"EMO_POS","😍":"EMO_POS","😊":"EMO_POS","👍":"EMO_POS",
    "☹":"EMO_NEG","🙁":"EMO_NEG","😠":"EMO_NEG","😡":"EMO_NEG","👎":"EMO_NEG"
}

SLANG_MAP = {"slm":"salam","tmm":"tamam","sagol":"sağol","cox":"çox","yaxsi":"yaxşı"}
NEGATORS  = {"yox","deyil","heç","qətiyyən","yoxdur"}

# ---------------------------
# Domain detection helpers
# ---------------------------
NEWS_HINTS   = re.compile(r"\b(apa|trend|azertac|reuters|bloomberg|dha|aa)\b", re.I)
SOCIAL_HINTS = re.compile(r"\b(rt)\b|@|#|(?:😂|😍|😊|👍|👎|😡|🙂)")
REV_HINTS    = re.compile(r"\b(azn|manat|qiymət|aldım|ulduz|çox yaxşı|çox pis)\b", re.I)

PRICE_RE     = re.compile(r"\b\d+\s*(azn|manat)\b", re.I)
STARS_RE     = re.compile(r"\b([1-5])\s*ulduz\b", re.I)
POS_RATE     = re.compile(r"\bçox yaxşı\b")
NEG_RATE     = re.compile(r"\bçox pis\b")

def detect_domain(text: str) -> str:
    s = str(text).lower()
    if NEWS_HINTS.search(s): return "news"
    if SOCIAL_HINTS.search(s): return "social"
    if REV_HINTS.search(s):   return "reviews"
    return "general"

def domain_specific_normalize(cleaned: str, domain: str) -> str:
    if domain == "reviews":
        s = PRICE_RE.sub(" <PRICE> ", cleaned)
        # STARS_RE replacement uses the match number
        s = STARS_RE.sub(lambda m: f" <STARS_{m.group(1)}> ", s)
        s = POS_RATE.sub(" <RATING_POS> ", s)
        s = NEG_RATE.sub(" <RATING_NEG> ", s)
        return " ".join(s.split())
    return cleaned

def add_domain_tag(line: str, domain: str) -> str:
    return f"dom{domain} " + line  # e.g., 'domnews ...'

# ---------------------------
# Normalization function
# ---------------------------
def normalize_text_az(s: str, numbers_to_token=True, keep_sentence_punct=False) -> str:
    if not isinstance(s, str): return ""
    # emoji map first
    for emo, tag in EMO_MAP.items():
        s = s.replace(emo, f" {tag} ")
    s = fix_text(s)
    s = html.unescape(s)
    s = HTML_TAG_RE.sub(" ", s)
    s = URL_RE.sub(" URL ", s)
    s = EMAIL_RE.sub(" EMAIL ", s)
    s = PHONE_RE.sub(" PHONE ", s)
    # Hashtag: keep text, split camelCase
    s = re.sub(r"#([A-Za-z0-9_]+)", lambda m: " " + re.sub('([a-z])([A-Z])', r'\1 \2', m.group(1)) + " ", s)
    s = USER_RE.sub(" USER ", s)
    s = lower_az(s)
    s = MULTI_PUNCT.sub(r"\1", s)
    if numbers_to_token:
        s = re.sub(r"\d+", " <NUM> ", s)
    if keep_sentence_punct:
        s = re.sub(r"[^\w\s<>'əğıöşüçƏĞIİÖŞÜÇxqXQ.!?]", " ", s)
    else:
        s = re.sub(r"[^\w\s<>'əğıöşüçƏĞIİÖŞÜÇxqXQ]", " ", s)
    s = MULTI_SPACE.sub(" ", s).strip()
    toks = TOKEN_RE.findall(s)
    norm = []
    mark_neg = 0
    for t in toks:
        t = REPEAT_CHARS.sub(r"\1\1", t)
        t = SLANG_MAP.get(t, t)
        if t in NEGATORS:
            norm.append(t); mark_neg = 3; continue
        if mark_neg > 0 and t not in {"URL","EMAIL","PHONE","USER"}:
            norm.append(t + "_NEG"); mark_neg -= 1
        else:
            norm.append(t)
    norm = [t for t in norm if not (len(t) == 1 and t not in {"o","e"})]
    return " ".join(norm).strip()

# ---------------------------
# Sentiment mapping
# ---------------------------
def map_sentiment_value(v, scheme: str):
    if scheme == "binary":
        try:
            return 1.0 if int(v) == 1 else 0.0
        except Exception:
            return None
    s = str(v).strip().lower()
    if s in {"pos","positive","1","müsbət","good","pozitiv","müsbət"}: return 1.0
    if s in {"neu","neutral","2","neytral","neutral"}: return 0.5
    if s in {"neg","negative","0","mənfi","bad","neqativ"}: return 0.0
    return None

# ---------------------------
# File processing pipeline
# ---------------------------
def process_file(in_path, text_col, label_col, scheme, out_two_col_path, remove_stopwords=False):
    df = pd.read_excel(in_path)
    for c in ["Unnamed: 0","index"]:
        if c in df.columns:
            df = df.drop(columns=[c])
    assert text_col in df.columns and label_col in df.columns, f"Missing columns in {in_path}"
    # original text kept for domain detection
    df = df.dropna(subset=[text_col])
    df = df[df[text_col].astype(str).str.strip().str.len() > 0]
    df = df.drop_duplicates(subset=[text_col])

    # base clean
    df["cleaned_text"] = df[text_col].astype(str).apply(lambda s: normalize_text_az(s))
    # domain-aware tweak
    df["__domain__"] = df[text_col].astype(str).apply(detect_domain)
    df["cleaned_text"] = df.apply(lambda r: domain_specific_normalize(r["cleaned_text"], r["__domain__"]), axis=1)

    # optional stopwords (kept minimal, sentiment words preserved)
    if remove_stopwords:
        sw = set(["və","ilə","amma","ancaq","lakin","ya","həm","ki","bu","bir","o","biz","siz","mən","sən","orada","burada","bütün","hər","artıq","çox","az","ən","də","da","üçün"])
        for keep in ["deyil","yox","heç","qətiyyən","yoxdur"]:
            sw.discard(keep)
        df["cleaned_text"] = df["cleaned_text"].apply(lambda s: " ".join([t for t in s.split() if t not in sw]))

    # sentiment mapping (0.0 / 0.5 / 1.0)
    df["sentiment_value"] = df[label_col].apply(lambda v: map_sentiment_value(v, scheme))
    df = df.dropna(subset=["sentiment_value"])
    df["sentiment_value"] = df["sentiment_value"].astype(float)

    # final two-column output
    out_df = df[["cleaned_text","sentiment_value"]].reset_index(drop=True)
    Path(out_two_col_path).parent.mkdir(parents=True, exist_ok=True)
    out_df.to_excel(out_two_col_path, index=False)
    print(f"Saved: {out_two_col_path} (rows={len(out_df)})")

# ---------------------------
# Build combined corpus (domain-tagged)
# ---------------------------
def build_corpus_txt(input_files, text_cols, out_txt="corpus_all.txt"):
    """Create domain-tagged, lowercase, punctuation-free corpus (one sentence per line)."""
    lines = []
    for (f, text_col) in zip(input_files, text_cols):
        df = pd.read_excel(f)
        for raw in df[text_col].dropna().astype(str):
            dom = detect_domain(raw)
            s = normalize_text_az(raw, keep_sentence_punct=True)
            parts = re.split(r"[.!?]+", s)
            for p in parts:
                p = p.strip()
                if not p: continue
                p = re.sub(r"[^\w\səğıöşüçƏĞIİÖŞÜÇxqXQ]", " ", p)  # remove punctuation
                p = " ".join(p.split()).lower()
                if p:
                    lines.append(f"dom{dom} " + p)
    Path(out_txt).parent.mkdir(parents=True, exist_ok=True)
    with open(out_txt, "w", encoding="utf-8") as w:
        for ln in lines:
            w.write(ln + "\n")
    print(f"Wrote {out_txt} with {len(lines)} lines")

# ---------------------------
# Train Word2Vec and FastText
# ---------------------------
def train_embeddings(two_col_files, emb_dir="embeddings", vector_size=300, window=5, min_count=3, epochs=10):
    Path(emb_dir).mkdir(parents=True, exist_ok=True)
    sentences = []
    for f in two_col_files:
        df = pd.read_excel(f, usecols=["cleaned_text"])
        sentences.extend(df["cleaned_text"].astype(str).str.split().tolist())

    print(f"Total sentences for training: {len(sentences)}")

    # --- Word2Vec ---
    w2v = Word2Vec(
        sentences=sentences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        sg=1,
        negative=10,
        epochs=epochs
    )
    w2v.save(str(Path(emb_dir) / "word2vec.model"))

    # --- FastText ---
    ft = FastText(
        sentences=sentences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        sg=1,
        min_n=3,
        max_n=6,
        epochs=epochs
    )
    ft.save(str(Path(emb_dir) / "fasttext.model"))

    print("Saved embeddings.")


# ---------------------------
# Simple comparison utilities
# ---------------------------
def lexical_coverage(model, tokens):
    vocab = model.wv.key_to_index
    return sum(1 for t in tokens if t in vocab) / max(1,len(tokens))

def pair_sim(model, pairs):
    vals = []
    for a,b in pairs:
        try: vals.append(model.wv.similarity(a,b))
        except KeyError: pass
    return sum(vals)/len(vals) if vals else float('nan')

def neighbors(model, word, k=5):
    try: return [w for w,_ in model.wv.most_similar(word, topn=k)]
    except KeyError: return []

# ---------------------------
# Example main (CFG from PDF)
# ---------------------------
if __name__ == "__main__":
    CFG = [
        ("sample_data/labeled-sentiment.xlsx",        "text", "sentiment", "tri"),
        ("sample_data/test__1_.xlsx",                 "text", "label",     "binary"),
        ("sample_data/train__3_.xlsx",                "text", "label",     "binary"),
        ("sample_data/train-00000-of-00001.xlsx",     "text", "labels",    "tri"),
        ("sample_data/merged_dataset_CSV__1_.xlsx",   "text", "labels",    "binary"),
    ]

    # If your files are in sample_data/ change the names e.g. "sample_data/labeled-sentiment.xlsx"
    # Create two-column outputs
    out_files = []
    for fname, tcol, lcol, scheme in CFG:
        out = f"{Path(fname).stem}_2col.xlsx"
        print(f"Processing {fname} -> {out}")
        try:
            process_file(fname, tcol, lcol, scheme, out, remove_stopwords=False)
            out_files.append(out)
        except AssertionError as e:
            print(f"Skipping {fname} - {e}")
        except FileNotFoundError:
            print(f"File not found: {fname} (skipping). Put your dataset in working dir or update CFG.")
        except Exception as ex:
            print(f"Error processing {fname}: {ex}")

    # combined domain-tagged, punctuation-free corpus
    # Use original text columns for corpus creation (CFG entries' text columns)
    files_for_corpus = [c[0] for c in CFG]
    text_cols_for_corpus = [c[1] for c in CFG]
    # Only build corpus if the files exist
    existing_pairs = []
    for f,tcol in zip(files_for_corpus, text_cols_for_corpus):
        if Path(f).exists():
            existing_pairs.append((f,tcol))
    if existing_pairs:
        build_corpus_txt([p[0] for p in existing_pairs], [p[1] for p in existing_pairs], out_txt="corpus_all.txt")
    else:
        print("No input files found for corpus build; place files or adjust CFG.")

    # Train embeddings if two-col outputs exist
    two_col_files = [f for f in out_files if Path(f).exists()]
    if two_col_files:
        train_embeddings(two_col_files, emb_dir="embeddings", vector_size=300, window=5, min_count=3, epochs=10)

        # Small comparison example
        try:
            w2v = Word2Vec.load("embeddings/word2vec.model")
            ft  = FastText.load("embeddings/fasttext.model")
            seed_words = ["yaxşı","pis","çox","bahalı","ucuz","mükəmməl","dəhşət","<PRICE>","<RATING_POS>"]
            syn_pairs  = [("yaxşı","əla"), ("bahalı","qiymətli"), ("ucuz","sərfəli")]
            ant_pairs  = [("yaxşı","pis"), ("bahalı","ucuz")]
            print("== Lexical coverage & sample neighbors ==")
            # read tokens from first two-col file as example
            sample_tokens = []
            if two_col_files:
                df = pd.read_excel(two_col_files[0], usecols=["cleaned_text"])
                sample_tokens = [t for row in df["cleaned_text"].astype(str) for t in row.split()]
            print(f"Coverage W2V (sample): {lexical_coverage(w2v, sample_tokens):.3f}")
            print(f"Coverage FT (sample): {lexical_coverage(ft, sample_tokens):.3f}")
            syn_w2v = pair_sim(w2v, syn_pairs)
            syn_ft  = pair_sim(ft,  syn_pairs)
            ant_w2v = pair_sim(w2v, ant_pairs)
            ant_ft  = pair_sim(ft,  ant_pairs)
            print(f"Synonyms: W2V={syn_w2v:.3f}, FT={syn_ft:.3f}")
            print(f"Antonyms: W2V={ant_w2v:.3f}, FT={ant_ft:.3f}")
            for w in seed_words:
                print(f"  W2V NN for '{w}':", neighbors(w2v, w))
                print(f"  FT  NN for '{w}':", neighbors(ft,  w))
        except Exception as e:
            print("Skipping embedding comparison due to error:", e)
    else:
        print("No two-column outputs produced; skip embedding training.")


Processing sample_data/labeled-sentiment.xlsx -> labeled-sentiment_2col.xlsx
Saved: labeled-sentiment_2col.xlsx (rows=2955)
Processing sample_data/test__1_.xlsx -> test__1__2col.xlsx
Saved: test__1__2col.xlsx (rows=4198)
Processing sample_data/train__3_.xlsx -> train__3__2col.xlsx
Saved: train__3__2col.xlsx (rows=19557)
Processing sample_data/train-00000-of-00001.xlsx -> train-00000-of-00001_2col.xlsx
Saved: train-00000-of-00001_2col.xlsx (rows=41756)
Processing sample_data/merged_dataset_CSV__1_.xlsx -> merged_dataset_CSV__1__2col.xlsx
Saved: merged_dataset_CSV__1__2col.xlsx (rows=55662)
Wrote corpus_all.txt with 124353 lines
Total sentences for training: 124128
Saved embeddings.
== Lexical coverage & sample neighbors ==
Coverage W2V (sample): 0.932
Coverage FT (sample): 0.932
Synonyms: W2V=0.356, FT=0.435
Antonyms: W2V=0.335, FT=0.421
  W2V NN for 'yaxşı': ['<RATING_POS>', 'iyi', 'yaxshi', 'yaxsı', 'awsome']
  FT  NN for 'yaxşı': ['yaxşıı', 'yaxşıkı', 'yaxşıca', 'yaxş', 'yaxşıya']
  