In [2]:
import pandas as pd
import spacy
from tqdm import tqdm

# Download the small spaCy English pipeline (en_core_web_sm). This only installs
# the package; it still has to be loaded with spacy.load(...) before use.
!python -m spacy download en_core_web_sm

# Register tqdm's progress-bar methods (e.g. Series.progress_apply) on pandas objects
tqdm.pandas()


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
# Read the deduplicated dataset into the global `df` used by the cells below.
# NOTE(review): absolute, machine-specific path — this cell cannot run on another machine as written.
df = pd.read_csv("/Users/elifakdeniz/Desktop/Thesis_New/Notebooks/Jupyter_notebook/Future Engineering/sentiment/2_final_dataset_deduplicated.csv")  # Adjust path if needed


In [12]:
# Download the medium English pipeline (en_core_web_md) that later cells load via spacy.load.
!python -m spacy download en_core_web_md


Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [13]:
import spacy
# Load the medium English pipeline into the global `nlp`; no components are disabled here.
nlp = spacy.load("en_core_web_md")  # or "en_core_web_sm"


In [14]:
import spacy
import pandas as pd

# Medium English pipeline with NER and the lemmatizer disabled.
nlp = spacy.load("en_core_web_md", disable=["ner","lemmatizer"])

# Sentence boundaries have to come from an active parser, senter, or sentencizer.
# Add the rule-based sentencizer only when none of the three is in the pipeline.
if {"parser", "senter", "sentencizer"}.isdisjoint(nlp.pipe_names):
    nlp.add_pipe("sentencizer")

def compute_syntactic_features(text):
    """Compute average sentence length and average parse-tree depth for one text.

    The text is parsed with the module-level spaCy pipeline ``nlp``.

    Parameters
    ----------
    text : Any
        Raw text. A value that is not a string, or that is empty after
        stripping whitespace, yields zeros without being parsed.

    Returns
    -------
    pandas.Series
        Two floats indexed by ``"avg_sent_len"`` (mean number of tokens per
        sentence, punctuation tokens included) and ``"avg_tree_depth"`` (mean
        over sentences of the deepest non-punctuation token, measured in head
        hops up to the sentence root).

    Notes
    -----
    Any exception raised while parsing is reported through ``warnings.warn``
    and the row falls back to zeros. A batch ``apply`` therefore keeps running,
    but failed rows are no longer silently indistinguishable from empty ones.
    """
    import warnings

    feature_names = ["avg_sent_len", "avg_tree_depth"]

    def zeros():
        # Single place that builds the "nothing to measure" result.
        return pd.Series([0.0, 0.0], index=feature_names)

    def token_depth(tok):
        # Number of head hops from `tok` to the token that is its own head (the root).
        d = 0
        while tok.head is not tok:
            tok = tok.head
            d += 1
        return d

    # Guard: ensure a clean, non-empty string
    if not isinstance(text, str):
        return zeros()
    text = text.strip()
    if not text:
        return zeros()

    try:
        doc = nlp(text)

        # If sentence boundaries are missing, treat the whole doc as one "sentence"
        sents = list(doc.sents)
        if not sents:
            sents = [doc]

        sentence_lengths = [len(sent) for sent in sents]

        # Longest ancestor chain per sentence, ignoring pure punctuation tokens;
        # a sentence made only of punctuation contributes depth 0.
        tree_depths = [
            max((token_depth(t) for t in sent if t.dep_ != "punct"), default=0)
            for sent in sents
        ]

        # `sents` always holds at least one element here, so the divisions are safe.
        avg_sent_len = sum(sentence_lengths) / len(sentence_lengths)
        avg_tree_depth = sum(tree_depths) / len(tree_depths)

        return pd.Series([avg_sent_len, avg_tree_depth], index=feature_names)
    except Exception as exc:
        # Best-effort fallback is kept, but the failure is surfaced so that
        # all-zero feature columns can be traced back to their cause.
        warnings.warn(
            f"compute_syntactic_features: parsing failed "
            f"({type(exc).__name__}: {exc}); returning zeros. "
            f"Text starts with: {text[:80]!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return zeros()


In [30]:
# === Syntactic Features: One-File Final ===
import re, time, json, numpy as np, pandas as pd
from tqdm import tqdm
import spacy
from pathlib import Path

# -------------------- CONFIG --------------------
# Everything a reader is expected to tune for this cell lives in this block.
TEXT_COL = "description"                  # raw/lightly cleaned text (with punctuation + stopwords)
BATCH_SIZE = 64                           # try 64–128; lower if RAM is limited
N_PROCESS = 1                             # start with 1 (macOS safe); try 2–4 later
OUT_PATH = Path("final_with_syntactic_features.parquet")  # relative path: written to the working directory
SAVE_CSV_TOO = False                      # set True if you also want a CSV
# ------------------------------------------------

# 0) Ensure column exists & not-null strings
# NOTE: this overwrites df[TEXT_COL] in place (missing values become ""), so the
# original nulls are no longer recoverable from `df` after this cell runs.
assert TEXT_COL in df.columns, f"Column '{TEXT_COL}' not found in df"
df[TEXT_COL] = df[TEXT_COL].fillna("").astype(str)

# 1) Load spaCy model with parser enabled (install first: python -m spacy download en_core_web_md)
# Rebinds the global `nlp`; NER and the lemmatizer are disabled.
nlp = spacy.load("en_core_web_md", disable=["ner","lemmatizer"])
# safeguard for long docs: allow at least 2,000,000 characters, or 1.2x the
# longest text in the column when that is larger
max_chars = df[TEXT_COL].str.len().max()
nlp.max_length = max(2_000_000, int(max_chars * 1.2))
print("Pipeline:", nlp.pipe_names, "| nlp.max_length:", nlp.max_length)

# 2) Punctuation presence check (parsing needs it)
def has_punct(s: str) -> bool:
    """Tell whether `s` contains at least one of the characters . ! ? ; : , ( ) -

    Falsy input (empty string, None) is treated as an empty string and
    therefore reports False.
    """
    candidate = s if s else ""
    found = re.search(r"[.!?;:,()\-]", candidate)
    return found is not None

# has_punct returns a bool per row, so the mean is the fraction of rows that
# contain at least one punctuation character.
punct_ratio = df[TEXT_COL].map(has_punct).mean()
print(f"Rows with punctuation in '{TEXT_COL}': {punct_ratio:.2%}")
# Abort when fewer than 30% of rows carry punctuation, instead of parsing text
# that looks pre-stripped.
if punct_ratio < 0.30:
    raise ValueError(
        f"'{TEXT_COL}' appears too stripped for parsing (punctuation in only {punct_ratio:.1%} of rows). "
        "Use a raw column that preserves punctuation + stopwords."
    )

# 3) Feature helpers
# Dependency labels that features_from_doc counts towards its clause_count value.
CLAUSE_DEPS = {"advcl","ccomp","xcomp","acl","relcl","csubj","csubjpass"}

def token_depth(tok):
    """Return how many head hops separate `tok` from the token that is its own head.

    A token whose ``head`` is itself has depth 0.
    """
    hops = 0
    node = tok
    while True:
        parent = node.head
        if parent is node:
            return hops
        node = parent
        hops += 1

def features_from_doc(doc):
    """Summarise one parsed document as a 6-tuple of syntactic measurements.

    Returned values, in order:
      1. mean number of tokens per sentence
      2. mean over sentences of the deepest non-punctuation token (via token_depth)
      3. mean absolute distance between each token and its head, 0 for a token
         that is its own head (stored downstream as the ``avg_le`` column)
      4. number of tokens whose dependency label is in CLAUSE_DEPS
      5. number of sentences
      6. number of tokens in the document

    When the document exposes no sentences, the whole document is treated as
    a single sentence.
    """
    sentences = list(doc.sents)
    if not sentences:
        sentences = [doc]

    lengths = [len(span) for span in sentences]

    # Dependency length: distance from every token to its head.
    head_offsets = []
    for tok in doc:
        head_offsets.append(0 if tok.head is tok else abs(tok.i - tok.head.i))
    mean_head_offset = float(np.mean(head_offsets)) if head_offsets else 0.0

    # Per-sentence maximum depth (punctuation ignored) and running clause tally.
    max_depths = []
    n_clauses = 0
    for span in sentences:
        span_depths = [token_depth(tok) for tok in span if tok.dep_ != "punct"]
        max_depths.append(max(span_depths) if span_depths else 0)
        n_clauses += sum(1 for tok in span if tok.dep_ in CLAUSE_DEPS)

    mean_length = (sum(lengths) / len(lengths)) if lengths else 0.0
    mean_depth = (sum(max_depths) / len(max_depths)) if max_depths else 0.0

    return (
        mean_length,
        mean_depth,
        mean_head_offset,
        n_clauses,
        len(sentences),
        len(doc),
    )

# 4) Smoke test on first row (quick sanity): parse one document and time it
t0 = time.time()
doc0 = nlp(df.iloc[0][TEXT_COL])
print("Smoke test — sentences in first row:", sum(1 for _ in doc0.sents), "| time:", round(time.time()-t0, 2), "s")

# 5) Batch parse entire column
# nlp.pipe yields docs in input order, which is what lets the feature rows be
# re-attached to df.index below.
texts = df[TEXT_COL].tolist()
rows = [
    features_from_doc(doc)
    for doc in tqdm(nlp.pipe(texts, batch_size=BATCH_SIZE, n_process=N_PROCESS), total=len(texts))
]

cols = ["avg_sent_length","avg_tree_depth","avg_le","clause_count","n_sents","n_tokens"]
feats = pd.DataFrame(rows, columns=cols, index=df.index)

# 6) Join back + quick QA
# Drop feature columns left over from an earlier run (or from a reloaded CSV that
# already contains them) before joining: DataFrame.join raises ValueError on
# overlapping column names, which otherwise makes this cell impossible to re-run.
df = df.drop(columns=cols, errors="ignore").join(feats)
print(df[cols].describe())
# avg_sent_length * n_sents should reproduce the token count of each row
print("Approx tokens check (first 10):", (df["avg_sent_length"]*df["n_sents"]).head(10).round(1).to_list())

# 7) Save with metadata describing how the features were produced
meta = {
    "spacy_version": spacy.__version__,
    "model": "en_core_web_md",
    "pipe": nlp.pipe_names,
    "batch_size": BATCH_SIZE,
    "n_process": N_PROCESS,
    "text_col": TEXT_COL,
    "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
}
df.attrs["syntactic_meta"] = json.dumps(meta)
df.to_parquet(OUT_PATH)
if SAVE_CSV_TOO:
    df.to_csv(OUT_PATH.with_suffix(".csv"), index=False)

print("Saved:", OUT_PATH)
if SAVE_CSV_TOO:
    print("Saved:", OUT_PATH.with_suffix(".csv"))


Pipeline: ['tok2vec', 'tagger', 'parser', 'attribute_ruler'] | nlp.max_length: 2000000
Rows with punctuation in 'description': 100.00%
Smoke test — sentences in first row: 18 | time: 0.16 s


  0%|                                               | 0/21063 [1:21:27<?, ?it/s]


KeyboardInterrupt: 

In [7]:
# Write the current `df` to a CSV in the working directory, without the index column.
df.to_csv("final_dataset_with_syntactic_new_features.csv", index=False)


In [9]:
import pandas as pd

# Load the updated dataset with syntactic features (replaces the global `df`)
# NOTE(review): absolute, machine-specific path, while the earlier to_csv call used a
# relative path — confirm this is the same file that was written.
df = pd.read_csv("/Users/elifakdeniz/Desktop/Thesis_New/Notebooks/Jupyter_notebook/Future Engineering/final_dataset_with_syntactic_new_features.csv")

# Display the first few rows
# NOTE(review): in the saved output, avg_sent_length and avg_tree_depth are 0 for every
# displayed row — verify the features were actually computed before relying on this file.
df.head()


Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,min_salary,...,description_clean,anger,disgust,fear,joy,neutral,sadness,surprise,avg_sent_length,avg_tree_depth
0,921716,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,20.0,HOURLY,"Princeton, NJ",2774458.0,20.0,17.0,...,job descriptiona leading real estate firm in n...,0.007679,0.00065,0.0019,0.911083,0.053122,0.002938,0.022628,0,0
1,91700727,Downtown Raleigh Alliance,Economic Development and Planning Intern,Job summary:The Economic Development & Plannin...,20.0,HOURLY,"Raleigh, NC",1481176.0,9.0,14.0,...,job summarythe economic development planning i...,0.063967,0.001047,0.016772,0.584575,0.154706,0.125784,0.05315,0,0
2,103254301,Raw Cereal,Producer,Company DescriptionRaw Cereal is a creative de...,300000.0,YEARLY,United States,81942316.0,7.0,60000.0,...,company descriptionraw cereal is a creative de...,0.01072,0.000936,0.009489,0.178955,0.573286,0.016077,0.210537,0,0
3,9615617,"Glastender, Inc.",Inside Customer Service Associate,Glastender Inc. is a family-owned manufacturer...,,,"Saginaw, MI",1194336.0,4.0,,...,glastender inc is a familyowned manufacturer o...,0.014659,0.000577,0.009076,0.817529,0.106623,0.005407,0.046128,0,0
4,111513530,United Methodists of Greater New Jersey,"Content Writer, Communications","Application opening date: April 24, 2024\nTitl...",,,Greater Philadelphia,4028816.0,10.0,,...,application opening date april title content w...,0.019976,0.000699,0.005662,0.809778,0.129329,0.008757,0.025797,0,0


In [10]:
import pandas as pd

# Uses the `df` already present in the kernel; nothing is loaded in this cell.

# Select the identifying columns plus the two syntactic features for inspection
selected_columns = df[["company_name", "industry", "description", "avg_sent_length", "avg_tree_depth"]]

# Display the first 10 rows
selected_columns.head(10)


Unnamed: 0,company_name,industry,description,avg_sent_length,avg_tree_depth
0,Corcoran Sawyer Smith,Real Estate,Job descriptionA leading real estate firm in N...,0,0
1,Downtown Raleigh Alliance,Non-profit Organizations,Job summary:The Economic Development & Plannin...,0,0
2,Raw Cereal,Design Services,Company DescriptionRaw Cereal is a creative de...,0,0
3,"Glastender, Inc.",Food and Beverage Services,Glastender Inc. is a family-owned manufacturer...,0,0
4,United Methodists of Greater New Jersey,Religious Institutions,"Application opening date: April 24, 2024\nTitl...",0,0
5,Shannon Waltchack,Real Estate,WORK @ SWShannon Waltchack (SW) is seeking a C...,0,0
6,Premier Family Clinic,Hospitals and Health Care,We are seeking a qualified Physician Assistant...,0,0
7,GOYT,Software Development,Job Description:GOYT is seeking a skilled and ...,0,0
8,Revesco Properties,Real Estate,About Revesco Properties:Revesco Properties is...,0,0
9,ADEPT HRM Solutions,Human Resources Services,Job Summary: We are seeking a skilled Producti...,0,0


In [18]:
import spacy
import numpy as np
from tqdm import tqdm
# Register tqdm's progress_apply on pandas objects (used when compute_le_nnd is applied)
tqdm.pandas()

# Rebinds the global `nlp` to the small English pipeline; earlier cells loaded en_core_web_md.
nlp = spacy.load("en_core_web_sm")

def compute_le_nnd(text):
    """Compute two per-document syntactic averages from a spaCy parse.

    The text is parsed with the module-level spaCy pipeline ``nlp``.

    Parameters
    ----------
    text : Any
        Text to parse. A value that is not a string (e.g. NaN from a pandas
        column) or that is blank yields zeros without being parsed, so one
        bad row cannot abort a long ``apply`` run.

    Returns
    -------
    pandas.Series
        Two unlabeled floats, in this order:

        * ``avg_le``: mean over sentences of the number of non-VERB tokens
          positioned before the sentence root.
        * ``avg_nnd``: mean token distance between a NOUN/PROPN token and its
          head, counted only when the head is a different token that is also
          NOUN/PROPN.

        Each value is 0.0 when nothing contributes to it.
    """
    if not isinstance(text, str) or not text.strip():
        return pd.Series([0.0, 0.0])

    doc = nlp(text)
    nominal_tags = {"NOUN", "PROPN"}
    le_counts = []
    nnd_distances = []

    for sent in doc.sents:
        root = sent.root

        # LE: tokens to the left of the root that are not verbs
        le_counts.append(
            sum(1 for token in sent if token.i < root.i and token.pos_ != "VERB")
        )

        # NND: Nested Noun Distance — only the immediate head is considered
        for token in sent:
            if token.pos_ not in nominal_tags:
                continue
            head = token.head
            if head != token and head.pos_ in nominal_tags:
                nnd_distances.append(abs(token.i - head.i))

    avg_le = float(np.mean(le_counts)) if le_counts else 0.0
    avg_nnd = float(np.mean(nnd_distances)) if nnd_distances else 0.0
    return pd.Series([avg_le, avg_nnd])


In [19]:
# Apply compute_le_nnd to every row (with a tqdm progress bar) and store the two
# results as the new columns avg_le and avg_nnd.
# NOTE(review): this parses `description_clean`, which in the displayed rows is lowercased
# with punctuation removed, while the punctuation check in the batch-parsing cell states
# that parsing needs punctuation — confirm whether `description` was intended here.
df[["avg_le", "avg_nnd"]] = df["description_clean"].progress_apply(compute_le_nnd)


100%|█████████████████████████████████████| 21063/21063 [34:17<00:00, 10.24it/s]


In [21]:
# Save the updated DataFrame (now including avg_le and avg_nnd) to a CSV in the
# working directory, without the index column
df.to_csv("final_dataset_with_le_nnd.csv", index=False)
