# Install package

In [1]:
%pip -q install spacy==3.7.5 spacy-lookups-data
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


# Import

In [2]:
import spacy, numpy as np, re, os, zipfile, urllib.request
from pathlib import Path

nlp = spacy.load("en_core_web_md")  # including vectors 300d
TOKEN_RE = re.compile(r"[A-Za-z]+(?:'[A-Za-z]+)?")
STOP = nlp.Defaults.stop_words

def normalize(text: str):
    toks = [t.lower() for t in TOKEN_RE.findall(text)]
    return [t for t in toks if t not in STOP and len(t) > 2]

def doc_vector_spacy(text: str):
    doc = nlp(" ".join(normalize(text)))
    vecs = [t.vector for t in doc if t.has_vector]
    if not vecs:
        return np.zeros(nlp.vocab.vectors_length, dtype=np.float32)
    return np.vstack(vecs).mean(axis=0).astype(np.float32)

def cosine_sim(a,b,eps=1e-9):
    a = a/(np.linalg.norm(a)+eps); b = b/(np.linalg.norm(b)+eps)
    return float(a@b)

print("Ready ✓ | vector dim:", nlp.vocab.vectors_length)


Ready ✓ | vector dim: 300


# Download and extract dataset

In [3]:
DATA_URL = "https://s3.amazonaws.com/video.udacity-data.com/topher/2019/January/5c4147f9_data/data.zip"
DATA_DIR = Path("/content/data")
ZIP_PATH = DATA_DIR / "data.zip"

DATA_DIR.mkdir(parents=True, exist_ok=True)
if not ZIP_PATH.exists():
    print("Downloading dataset...")
    urllib.request.urlretrieve(DATA_URL, ZIP_PATH)
else:
    print("Found cached zip.")

with zipfile.ZipFile(ZIP_PATH, "r") as zf:
    zf.extractall(DATA_DIR)

for root, _, files in os.walk(DATA_DIR):
    print(Path(root).relative_to(DATA_DIR), "->", len(files), "files")


Downloading dataset...
. -> 1 files
__MACOSX -> 1 files
__MACOSX/data -> 102 files
data -> 103 files


# Peek files

In [4]:
def peek_some_txt(base, n=3):
    base = Path(base)
    texts = sorted(base.rglob("*.txt"))
    print("Found", len(texts), "txt files")
    for p in texts[:n]:
        s = p.read_text(encoding="utf-8", errors="ignore")[:400].replace("\n"," ")
        print("\n---", p, "---\n", s[:400], "...")

peek_some_txt(DATA_DIR, n=3)


Found 199 txt files

--- /content/data/__MACOSX/data/._g0pA_taska.txt ---
     Mac OS X            	   2                                               ATTR             F                     F  com.apple.quarantine q/0001;5c13fb82;Google\x20Chrome;E3D29904-6090-44D1-96D7-AA41117F14B4  ...

--- /content/data/__MACOSX/data/._g0pA_taskb.txt ---
     Mac OS X            	   2                                               ATTR             F                     F  com.apple.quarantine q/0001;5c13fb82;Google\x20Chrome;E3D29904-6090-44D1-96D7-AA41117F14B4  ...

--- /content/data/__MACOSX/data/._g0pA_taskc.txt ---
     Mac OS X            	   2                                               ATTR             F                     F  com.apple.quarantine q/0001;5c13fb82;Google\x20Chrome;E3D29904-6090-44D1-96D7-AA41117F14B4  ...


# Extract sourrces and queries

In [5]:
BASE = DATA_DIR
CAND_SRC = ["source", "sources", "original", "orig"]
CAND_QRY = ["plagiarism", "plagiarized", "suspicious", "queries", "student"]

def find_subdirs(base, names):
    return [base/n for n in names if (base/n).exists()]

SRC_DIRS = find_subdirs(BASE, CAND_SRC)
QRY_DIRS = find_subdirs(BASE, CAND_QRY)

def read_txts(dir_list):
    items = []
    for d in dir_list:
        for p in sorted(Path(d).rglob("*.txt")):
            try:
                items.append((str(p), Path(p).read_text(encoding="utf-8", errors="ignore")))
            except:
                pass
    return items

src_items = read_txts(SRC_DIRS) or read_txts([BASE])  # fallback
qry_items = read_txts(QRY_DIRS)

# if no queries → split 80/20 from source for demo
if not qry_items:
    split = int(0.8 * len(src_items))
    qry_items = src_items[split:]
    src_items = src_items[:split]
    print("(Fallback split) sources:", len(src_items), "| queries:", len(qry_items))
else:
    print("sources:", len(src_items), "| queries:", len(qry_items))


(Fallback split) sources: 159 | queries: 40


# DocSim

In [6]:
class DocSimSpaCy:
    def __init__(self):
        self.ids, self.vecs = [], None
        self.texts = {}

    def add_corpus(self, items):
        vec_list = []
        for doc_id, text in items:
            v = doc_vector_spacy(text)
            self.ids.append(doc_id)
            self.texts[doc_id] = text
            vec_list.append(v)
        self.vecs = np.vstack(vec_list) if vec_list else None
        print("Corpus:", len(self.ids), "| dim:", 0 if self.vecs is None else self.vecs.shape[1])

    def most_similar(self, query_text, topn=5):
        if self.vecs is None: return []
        qv = doc_vector_spacy(query_text)
        V = self.vecs / (np.linalg.norm(self.vecs, axis=1, keepdims=True)+1e-9)
        q = qv / (np.linalg.norm(qv)+1e-9)
        sims = (V @ q).astype(float)
        idx = np.argsort(-sims)[:topn]
        return [{"doc_id": self.ids[i], "similarity": float(sims[i])} for i in idx]

docs = DocSimSpaCy()
docs.add_corpus(src_items)


Corpus: 159 | dim: 300


# Top-K

In [11]:
TOPK = 5
THRESH = 0.75

for q_id, q_text in qry_items[:5]:  # print first 5 queries
    print("\nQuery:", q_id)
    for r in docs.most_similar(q_text, topn=TOPK):
        mark = " ⛳" if r["similarity"] >= THRESH else ""
        print(f"  -> {r['doc_id']} | sim={r['similarity']:.3f}{mark}")



Query: /content/data/data/g3pA_taska.txt
  -> /content/data/data/g1pA_taska.txt | sim=0.969 ⛳
  -> /content/data/data/g1pD_taska.txt | sim=0.965 ⛳
  -> /content/data/data/g2pC_taska.txt | sim=0.962 ⛳
  -> /content/data/data/g0pC_taska.txt | sim=0.959 ⛳
  -> /content/data/data/g2pE_taska.txt | sim=0.957 ⛳

Query: /content/data/data/g3pA_taskb.txt
  -> /content/data/data/g2pA_taskb.txt | sim=0.989 ⛳
  -> /content/data/data/g2pC_taskb.txt | sim=0.980 ⛳
  -> /content/data/data/g1pB_taskb.txt | sim=0.980 ⛳
  -> /content/data/data/g0pA_taskb.txt | sim=0.979 ⛳
  -> /content/data/data/g2pB_taskb.txt | sim=0.977 ⛳

Query: /content/data/data/g3pA_taskc.txt
  -> /content/data/data/g0pB_taskc.txt | sim=0.996 ⛳
  -> /content/data/data/g2pA_taskc.txt | sim=0.995 ⛳
  -> /content/data/data/g1pB_taskc.txt | sim=0.992 ⛳
  -> /content/data/data/g0pA_taskc.txt | sim=0.991 ⛳
  -> /content/data/data/g0pD_taskc.txt | sim=0.990 ⛳

Query: /content/data/data/g3pA_taskd.txt
  -> /content/data/data/g2pB_taskd.tx

# Calculate accuracy

In [12]:
import re
def extract_id(path):
    s = Path(path).stem
    m = re.findall(r"\d+", s)
    return m[0] if m else s.lower()

id2src = {}
for doc_id, _ in src_items:
    id2src.setdefault(extract_id(doc_id), []).append(doc_id)

correct = total = 0
for q_id, q_text in qry_items:
    sims = docs.most_similar(q_text, topn=1)
    if not sims: continue
    total += 1
    if extract_id(q_id) in id2src and sims[0]["doc_id"] in id2src[extract_id(q_id)]:
        correct += 1
print("Approx. Top-1 Match Accuracy:", f"{correct}/{total}" if total else "N/A")


Approx. Top-1 Match Accuracy: 0/40


# Export csv

In [9]:
import pandas as pd

rows = []
for q_id, q_text in qry_items:
    sims = docs.most_similar(q_text, topn=5)
    row = {"query": q_id}
    for i, r in enumerate(sims, 1):
        row[f"rank{i}_doc"] = r["doc_id"]
        row[f"rank{i}_sim"] = r["similarity"]
    rows.append(row)

df = pd.DataFrame(rows).sort_values("query")
out_path = "/content/plagiarism_results_spacy.csv"
df.to_csv(out_path, index=False)
print("Saved:", out_path)
df.head(10)


Saved: /content/plagiarism_results_spacy.csv


Unnamed: 0,query,rank1_doc,rank1_sim,rank2_doc,rank2_sim,rank3_doc,rank3_sim,rank4_doc,rank4_sim,rank5_doc,rank5_sim
0,/content/data/data/g3pA_taska.txt,/content/data/data/g1pA_taska.txt,0.969265,/content/data/data/g1pD_taska.txt,0.965443,/content/data/data/g2pC_taska.txt,0.961606,/content/data/data/g0pC_taska.txt,0.959104,/content/data/data/g2pE_taska.txt,0.95717
1,/content/data/data/g3pA_taskb.txt,/content/data/data/g2pA_taskb.txt,0.988727,/content/data/data/g2pC_taskb.txt,0.980321,/content/data/data/g1pB_taskb.txt,0.979656,/content/data/data/g0pA_taskb.txt,0.97879,/content/data/data/g2pB_taskb.txt,0.977009
2,/content/data/data/g3pA_taskc.txt,/content/data/data/g0pB_taskc.txt,0.996321,/content/data/data/g2pA_taskc.txt,0.994757,/content/data/data/g1pB_taskc.txt,0.991706,/content/data/data/g0pA_taskc.txt,0.991423,/content/data/data/g0pD_taskc.txt,0.990199
3,/content/data/data/g3pA_taskd.txt,/content/data/data/g2pB_taskd.txt,0.997184,/content/data/data/g0pC_taskd.txt,0.9942,/content/data/data/g0pB_taskd.txt,0.989654,/content/data/data/g1pA_taskd.txt,0.98841,/content/data/data/g1pB_taskd.txt,0.978408
4,/content/data/data/g3pA_taske.txt,/content/data/data/g1pD_taske.txt,0.976437,/content/data/data/g2pE_taske.txt,0.973385,/content/data/data/g2pC_taske.txt,0.973071,/content/data/data/g2pA_taske.txt,0.97163,/content/data/data/g1pB_taske.txt,0.970268
5,/content/data/data/g3pB_taska.txt,/content/data/data/g0pC_taska.txt,0.96546,/content/data/data/g1pA_taska.txt,0.965259,/content/data/data/g2pC_taska.txt,0.959672,/content/data/data/g1pD_taska.txt,0.959376,/content/data/data/g2pE_taska.txt,0.957758
6,/content/data/data/g3pB_taskb.txt,/content/data/data/g2pB_taskb.txt,0.97809,/content/data/data/g0pA_taskb.txt,0.970148,/content/data/data/g0pB_taskb.txt,0.969855,/content/data/data/g2pA_taskb.txt,0.965742,/content/data/data/g0pE_taskb.txt,0.963196
7,/content/data/data/g3pB_taskc.txt,/content/data/data/g0pB_taskc.txt,0.984724,/content/data/data/g2pB_taskc.txt,0.983115,/content/data/data/g2pA_taskc.txt,0.982793,/content/data/data/g1pB_taskc.txt,0.980698,/content/data/data/g0pD_taskc.txt,0.979569
8,/content/data/data/g3pB_taskd.txt,/content/data/data/g2pB_taskd.txt,0.954838,/content/data/data/g0pB_taskd.txt,0.950695,/content/data/data/g1pB_taskd.txt,0.948814,/content/data/data/g0pC_taskd.txt,0.948807,/content/data/data/g1pA_taskd.txt,0.944381
9,/content/data/data/g3pB_taske.txt,/content/data/data/g1pB_taske.txt,0.986712,/content/data/data/g0pE_taske.txt,0.986342,/content/data/data/g2pB_taske.txt,0.985281,/content/data/data/g0pC_taske.txt,0.98493,/content/data/data/g0pB_taske.txt,0.975147
