<a href="https://colab.research.google.com/github/goitstudent123/numerical_programming_python/blob/main/%D0%94%D0%973-%D0%93%D0%90%D0%A1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
!pip -q install gdown scikit-learn

import os, io, math, random, pickle
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import gdown

In [10]:
# Seed both random number generators up front so later sampling is repeatable.
np.random.seed(42)
random.seed(42)

# Google Drive file id, local destination for the download, and the
# direct-download URL built from the file id.
FILE_ID = "1281E0CDneuKdflWFBUvuyUzujpdGVImz"
PATH = "/content/word_embeddings_subset.p"
URL = f"https://drive.usercontent.google.com/u/0/uc?id={FILE_ID}&export=download"

# --- Download with fallback (URL → ID)
def download_if_needed(dest):
    """Make sure the embeddings file exists at ``dest``, downloading it if needed.

    The direct ``URL`` is tried first (only when ``dest`` is not already on
    disk). If the result is missing, smaller than 200 bytes, or looks like an
    HTML page, the file is fetched again through gdown's file-id API.

    Raises:
        FileNotFoundError: if no file exists at ``dest`` after the retry.
    """
    def _needs_retry(path):
        # A missing file counts as a failed download; checking existence first
        # keeps getsize() from raising when the first attempt wrote nothing.
        return (
            not os.path.exists(path)
            or os.path.getsize(path) < 200
            or looks_like_html(path)
        )

    # try URL first
    if not os.path.exists(dest):
        gdown.download(URL, dest, quiet=False)

    # if missing, suspicious (tiny) or HTML, retry via file id
    if _needs_retry(dest):
        print("Re-downloading via file id (to avoid HTML/partial file)...")
        if os.path.exists(dest):
            os.remove(dest)
        gdown.download(id=FILE_ID, output=dest, quiet=False)

    if not os.path.exists(dest):
        raise FileNotFoundError(
            f"Download failed: {dest} was not created (check access or file id)."
        )

def looks_like_html(path, n=256):
    """Tell whether the first ``n`` bytes of ``path`` contain an HTML marker.

    The check is case-insensitive and looks for ``<!doctype html`` or
    ``<html``. Any failure while reading the file is reported as ``False``.
    """
    try:
        with open(path, "rb") as fh:
            prefix = fh.read(n).lower()
    except Exception:
        return False
    return any(marker in prefix for marker in (b"<!doctype html", b"<html"))

# Fetch the embeddings file (the first download is skipped when PATH already
# exists), then report the size of whatever ended up on disk.
download_if_needed(PATH)
print(f"\n\nDownloaded file size: {os.path.getsize(PATH)} bytes")

Downloading...
From: https://drive.usercontent.google.com/u/0/uc?id=1281E0CDneuKdflWFBUvuyUzujpdGVImz&export=download
To: /content/word_embeddings_subset.p
100%|██████████| 21.0/21.0 [00:00<00:00, 11.9kB/s]


Re-downloading via file id (to avoid HTML/partial file)...


Downloading...
From: https://drive.google.com/uc?id=1281E0CDneuKdflWFBUvuyUzujpdGVImz
To: /content/word_embeddings_subset.p
100%|██████████| 309k/309k [00:00<00:00, 17.6MB/s]

Downloaded file size: 309156 bytes





In [11]:
# --- Detect file type (magic bytes) and load
def detect_type(path):
    """Guess the storage format of an embeddings file from its contents.

    Returns one of:
        "npz"     - starts with the ZIP signature ``PK\\x03\\x04``
        "npy"     - starts with the ``\\x93NUMPY`` magic
        "pickle"  - first byte is ``\\x80`` (pickle PROTO opcode; common, not guaranteed)
        "html"    - ``looks_like_html`` reports an HTML page
        "text"    - first non-empty line looks like ``word float float float ...``
        "unknown" - none of the above; callers may still try the loaders
    """
    with open(path, "rb") as f:
        head = f.read(8)
    # ZIP -> .npz
    if head.startswith(b"PK\x03\x04"):
        return "npz"
    # NPY magic
    if head.startswith(b"\x93NUMPY"):
        return "npy"
    # Pickle often starts with \x80 (PROTO opcode), not guaranteed, but common
    if head[:1] == b"\x80":
        return "pickle"
    # If it looks like HTML, signal it
    if looks_like_html(path):
        return "html"

    # Otherwise sniff for text. Undecodable bytes are dropped rather than
    # raising: the fixed 4096-byte read can cut a multibyte UTF-8 character in
    # half, and a strict decode would then misreport a valid text file.
    try:
        with open(path, "rb") as f:
            chunk = f.read(4096).decode("utf-8", errors="ignore")
    except OSError:
        return "unknown"

    lines = [ln.strip() for ln in chunk.splitlines() if ln.strip()]
    if lines:
        toks = lines[0].split()
        if len(toks) >= 4:
            # Count how many tokens after the word parse as floats, stopping
            # at the first one that does not.
            floatable = 0
            for t in toks[1:]:
                try:
                    float(t)
                except ValueError:
                    break
                floatable += 1
            if floatable >= 3:
                return "text"
    # final guess: let the loader try each format
    return "unknown"

# Report the detected format for the reader; load_any_embeddings() calls
# detect_type() again on its own, so ftype is informational only here.
ftype = detect_type(PATH)
print("Detected type:", ftype)

def _load_pickle_embeddings(path):
    """Read a pickled ``{word: vector}`` dict; return ``(words, X)`` or ``None`` if the payload is not a dict."""
    # SECURITY: unpickling can execute arbitrary code. Only load files from a
    # source you trust.
    with open(path, "rb") as f:
        obj = pickle.load(f)
    if not isinstance(obj, dict):
        return None
    words = list(obj.keys())
    X = np.vstack([np.asarray(obj[w], dtype=np.float32) for w in words])
    return words, X


def _load_npz_embeddings(path):
    """Read words/vectors from an ``.npz`` archive; return ``(words, X)`` or ``None`` if no layout matches."""
    # SECURITY: allow_pickle=True lets numpy unpickle object arrays (same risk as pickle).
    npz = np.load(path, allow_pickle=True)
    try:
        # common patterns
        if "words" in npz and "vectors" in npz:
            words = list(npz["words"])
            X = np.asarray(npz["vectors"], dtype=np.float32)
            return words, X
        # Heuristic: take the first two arrays, one string-like and one 2-D.
        keys = list(npz.keys())
        if len(keys) >= 2:
            a, b = npz[keys[0]], npz[keys[1]]
            if a.dtype.kind in {"U", "S", "O"} and b.ndim == 2:
                return list(a), np.asarray(b, dtype=np.float32)
            if b.dtype.kind in {"U", "S", "O"} and a.ndim == 2:
                return list(b), np.asarray(a, dtype=np.float32)
        return None
    finally:
        # For archives np.load returns an NpzFile that keeps the file open;
        # release it. Plain arrays have no close() and are left alone.
        close = getattr(npz, "close", None)
        if callable(close):
            close()


def _load_npy_embeddings(path):
    """Read an ``.npy`` file holding a dict (object array) or a bare 2-D matrix; return ``(words, X)`` or ``None``."""
    # SECURITY: allow_pickle=True lets numpy unpickle object arrays (same risk as pickle).
    arr = np.load(path, allow_pickle=True)
    # could be dict in npy
    if arr.dtype == object and hasattr(arr, "item"):
        d = arr.item()
        words = list(d["words"])
        X = np.asarray(d["vectors"], dtype=np.float32)
        return words, X
    # or just a 2D matrix with no words (synthesize tokens)
    if arr.ndim == 2:
        X = np.asarray(arr, dtype=np.float32)
        words = [f"token_{i}" for i in range(X.shape[0])]
        return words, X
    return None


def _load_text_embeddings(path):
    """Parse ``word float float ...`` lines; return ``(words, X)`` or ``None`` if no usable line was found."""
    words, vecs = [], []
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        for line in f:
            parts = line.split()
            if not parts:
                continue
            # skip header like "400000 300"
            if len(parts) == 2 and all(p.isdigit() for p in parts):
                continue
            w, nums = parts[0], parts[1:]
            # keep only the leading run of numeric tokens
            vals = []
            for n in nums:
                try:
                    vals.append(float(n))
                except ValueError:
                    break
            if len(vals) >= 3:
                words.append(w)
                vecs.append(vals)
    if not words:
        return None
    # Truncate every row to the shortest one so the matrix is rectangular.
    d = min(len(v) for v in vecs)
    X = np.array([v[:d] for v in vecs], dtype=np.float32)
    return words, X


def load_any_embeddings(path):
    """Load word embeddings from ``path``, whichever supported format it is in.

    The format is taken from ``detect_type(path)``. When it is "unknown" the
    pickle, npz and text loaders are tried in that order. A loader that raises
    is reported with a printed message and the next candidate is tried.

    Returns:
        (words, X): list of tokens and a float32 matrix with one row per token.

    Raises:
        ValueError: if the file is an HTML page, or no loader produced a result.
    """
    t = detect_type(path)
    if t == "html":
        raise ValueError("Downloaded HTML page instead of model (check access or file id).")

    # (label used in messages, detected types it applies to, loader)
    attempts = (
        ("Pickle", ("pickle", "unknown"), _load_pickle_embeddings),
        ("NPZ", ("npz", "unknown"), _load_npz_embeddings),
        ("NPY", ("npy",), _load_npy_embeddings),
        ("Text", ("text", "unknown"), _load_text_embeddings),
    )
    for label, kinds, loader in attempts:
        if t not in kinds:
            continue
        try:
            result = loader(path)
        except Exception as e:
            # Best effort: report and fall through to the next format.
            print(f"{label} loader failed:", repr(e))
            continue
        if result is not None:
            return result
    raise ValueError("Could not load embeddings from the downloaded file.")

# Load the vocabulary (list of tokens) and the embedding matrix (one row per
# token) that the rest of the notebook works with.
words_all, X_all = load_any_embeddings(PATH)
print(f"Loaded embeddings: {len(words_all)} words, dim={X_all.shape[1]}")


Detected type: pickle
Loaded embeddings: 243 words, dim=300


In [12]:

# --- Project the embeddings onto their first three principal components.
# At least three input dimensions are required, both for PCA with three
# components and for the 3-D cross product used later.
if X_all.shape[1] < 3:
    raise ValueError("Embeddings have <3 dimensions; need >=3 to project/cross product.")
pca = PCA(n_components=3, random_state=42)
X3 = pca.fit_transform(X_all).astype(np.float32)
print("Projected to 3D:", X3.shape)

# --- Tabular view: one row per word with its x / y / z coordinates.
df = pd.DataFrame(
    {"word": words_all, **{axis: X3[:, i] for i, axis in enumerate("xyz")}}
)
df.head()

Projected to 3D: (243, 3)


Unnamed: 0,word,x,y,z
0,country,0.746037,-0.387964,-0.482691
1,city,0.102492,0.140384,-1.189891
2,China,0.831055,-0.1295,-0.312599
3,Iraq,0.656337,-0.177791,-0.338381
4,oil,0.658345,-0.432464,-0.621213


In [15]:
# --- Prepare lookup and normalization
def safe_normalize(M, eps=1e-12):
    """Scale every row of ``M`` to unit L2 length.

    Row norms smaller than ``eps`` are clamped to ``eps`` before dividing, so
    an all-zero row stays zero instead of becoming NaN.
    """
    row_norms = np.linalg.norm(M, axis=1, keepdims=True)
    return M / np.maximum(row_norms, eps)

# V holds the projected coordinates, Vn the same rows scaled to unit length
# (used for cosine similarity).
V = X3
Vn = safe_normalize(V)
# Map each word to its row index in V / Vn.
word_to_idx = dict(zip(words_all, range(len(words_all))))

# --- Basic helpers
def vector_for(word):
    """Return the row of ``V`` stored for ``word``.

    Raises:
        KeyError: if ``word`` is not a key of ``word_to_idx``.
    """
    if word not in word_to_idx:
        raise KeyError(f"Word not in vocab: {word}")
    return V[word_to_idx[word]]

def nearest_word(vec, topk=1, exclude=None):
    """Return the ``topk`` vocabulary words most similar to ``vec`` by cosine.

    Args:
        vec: query vector with the same dimensionality as the rows of ``Vn``.
        topk: maximum number of results. Fewer are returned when not enough
            candidates remain after exclusion; ``topk <= 0`` yields ``[]``.
        exclude: word, or iterable of words, that must never be returned.
            Words missing from the vocabulary are ignored.

    Returns:
        List of ``(word, cosine)`` tuples, best match first.
    """
    v = np.asarray(vec, dtype=np.float32)
    v = v / max(np.linalg.norm(v), 1e-12)
    sims = Vn @ v

    # A bare string would otherwise be iterated character by character.
    if isinstance(exclude, str):
        exclude = [exclude]

    # Remove excluded rows from the candidate set. Merely scoring them -inf
    # would still return them whenever topk exceeds the number of other words.
    keep = np.ones(sims.shape[0], dtype=bool)
    for ex in exclude or ():
        j = word_to_idx.get(ex)
        if j is not None:
            keep[j] = False

    candidates = np.flatnonzero(keep)
    k = min(int(topk), candidates.size)
    if k <= 0:
        return []

    scores = sims[candidates]
    if k < candidates.size:
        # Select the k best in linear time; they are ordered just below.
        top = np.argpartition(-scores, k - 1)[:k]
    else:
        top = np.arange(candidates.size)
    top = top[np.argsort(-scores[top], kind="stable")]
    return [(words_all[candidates[i]], float(scores[i])) for i in top]

def orthogonal_word(word_a, word_b, rel_tol=1e-8):
    """Find the word closest to the cross product of two word vectors.

    Args:
        word_a, word_b: vocabulary words whose 3-D vectors are crossed.
        rel_tol: the pair is treated as parallel when ``|a x b|`` is at most
            ``rel_tol * |a| * |b|``.

    Returns:
        ``(word, cosine)`` for the nearest word other than the two inputs, or
        ``(None, 0.0)`` when the inputs are (nearly) parallel or no other word
        is available.

    Raises:
        KeyError: propagated from ``vector_for`` for an unknown word.
    """
    # Work in float64 so the products of small float32 components do not underflow.
    a = np.asarray(vector_for(word_a), dtype=np.float64)
    b = np.asarray(vector_for(word_b), dtype=np.float64)
    c = np.cross(a, b)

    # Compare against the input magnitudes: an absolute test would call any
    # pair of small-magnitude vectors "parallel".
    c_norm = float(np.linalg.norm(c))
    scale = float(np.linalg.norm(a)) * float(np.linalg.norm(b))
    if c_norm <= rel_tol * scale:
        return None, 0.0

    # Pass a unit vector; cosine ranking does not depend on its length.
    hits = nearest_word(c / c_norm, topk=1, exclude=[word_a, word_b])
    if not hits:
        return None, 0.0
    return hits[0]

def angle_between_words(word_a, word_b):
    """Return the angle between two word vectors and its cosine.

    Smaller angle => higher semantic similarity; larger angle => lower
    similarity/opposition. Practical bands: <30 deg strong, 30-60 deg
    moderate, 60-120 deg weak/neutral, >120 deg dissimilar/opposed.

    Returns:
        ``(angle_in_degrees, cosine)``. A zero-length vector gives cosine 0
        (90 degrees) rather than NaN.

    Raises:
        KeyError: propagated from ``vector_for`` for an unknown word.
    """
    a, b = vector_for(word_a), vector_for(word_b)
    # Clamp the norms the same way nearest_word does, so a zero vector does
    # not turn the result into NaN.
    an = a / max(np.linalg.norm(a), 1e-12)
    bn = b / max(np.linalg.norm(b), 1e-12)
    # Clip to [-1, 1]: rounding can push the dot product slightly outside the
    # domain of acos.
    cosv = float(np.clip(np.dot(an, bn), -1.0, 1.0))
    ang = math.degrees(math.acos(cosv))
    return ang, cosv

# --- Quick demos
# Small vocabularies use their first eight words; larger ones a random draw of eight.
if len(words_all) > 200:
    sample = random.sample(words_all, 8)
else:
    sample = words_all[:8]
print("Sample words:", sample)

# Nearest neighbours of the first sampled word.
w0 = sample[0]
hits = nearest_word(vector_for(w0), topk=5, exclude=[w0])
print(f"\nNearest to '{w0}' (excluding itself):")
for w, s in hits:
    print(f"  {w:20s} cosine={s:.4f}")

# Word closest to the cross product of two sampled pairs.
for a, b in ((sample[0], sample[1]), (sample[2], sample[3])):
    w, s = orthogonal_word(a, b)
    if w is not None:
        print(f"\nOrthogonal to ({a}, {b}) → {w} (cos={s:.4f})")
    else:
        print(f"\nOrthogonal to ({a}, {b}): nearly parallel")

# Angle between the first two sampled words.
a, b = sample[:2]
ang, cosv = angle_between_words(a, b)
print(f"\nAngle between '{a}' and '{b}': {ang:.2f}° (cos={cosv:.4f})")

print("\nDataFrame shape:", df.shape)
df.head(10)

Sample words: ['Dakar', 'Paris', 'Belize', 'Tunis', 'Vientiane', 'Warsaw', 'Spain', 'Macedonia']

Nearest to 'Dakar' (excluding itself):
  Banjul               cosine=0.9991
  Harare               cosine=0.9987
  Kampala              cosine=0.9971
  Nairobi              cosine=0.9965
  Lusaka               cosine=0.9964

Orthogonal to (Dakar, Paris) → Belize (cos=0.9998)

Orthogonal to (Belize, Tunis) → Lisbon (cos=0.9989)

Angle between 'Dakar' and 'Paris': 91.87° (cos=-0.0326)

DataFrame shape: (243, 4)


Unnamed: 0,word,x,y,z
0,country,0.746037,-0.387964,-0.482691
1,city,0.102492,0.140384,-1.189891
2,China,0.831055,-0.1295,-0.312599
3,Iraq,0.656337,-0.177791,-0.338381
4,oil,0.658345,-0.432464,-0.621213
5,town,0.378076,-0.080444,-1.053469
6,Canada,1.191961,-0.407709,-0.741816
7,London,-0.297986,0.253387,-1.151863
8,England,1.103978,-0.110078,-0.893429
9,Australia,1.049533,-0.452267,-0.835772


* Using cosine (unit-normalized vectors) makes results scale-invariant and stable across pairs.
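* After PCA→3D, angles still carry meaning, but only as a rough proxy: three components keep just part of the structure of the original 300-D space, so unrelated words can end up almost collinear (note the ≈0.99 cosines above). Use the full-dimensional vectors when precise similarities matter.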
* Results depend on the embedding quality and domain; noisy or sparse vocab lowers reliability.
