# Trait Scoring

## 1) setup and load data 

In [1]:
from pathlib import Path
import re, json, numpy as np, pandas as pd
import matplotlib.pyplot as plt

# Resolve project paths
ROOT = Path.cwd()
while not (ROOT/"pyproject.toml").exists() and ROOT.parent != ROOT:
    ROOT = ROOT.parent
DATA    = ROOT / "data" / "interim"
REPORTS = ROOT / "notebooks" / "reports"
FIGS    = REPORTS / "figures"
for p in [REPORTS, FIGS]: p.mkdir(parents=True, exist_ok=True)

FEATURES = DATA / "features.parquet"
TARGET   = "productivity_pct"

df = pd.read_parquet(FEATURES).sort_values("date").reset_index(drop=True)

def clean_text(s):
    if not isinstance(s, str): return ""
    s = s.replace("\n"," ").strip()
    return re.sub(r"\s+"," ", s)

ref = df.get("reflection", pd.Series([""]*len(df))).apply(clean_text)


## 2) Sentiment & subjectivity (VADER + TextBlob)

In [2]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon', quiet=True)
from textblob import TextBlob

sia = SentimentIntensityAnalyzer()
df["refl_sent_compound"] = ref.apply(lambda t: sia.polarity_scores(t)["compound"])
df["refl_subjectivity"]  = ref.apply(lambda t: TextBlob(t).sentiment.subjectivity)

# simple stats (for README/notes)
refl_stats = {
    "entries": int((ref.str.len()>0).sum()),
    "avg_chars": int(ref.str.len().replace(0,np.nan).mean() or 0),
    "avg_sentiment": float(df["refl_sent_compound"].mean()),
    "avg_subjectivity": float(df["refl_subjectivity"].mean()),
}
refl_stats


{'entries': 72,
 'avg_chars': 500,
 'avg_sentiment': 0.45698749999999994,
 'avg_subjectivity': 0.5060165258755894}

## 3) Keyword families → normalized “rates”

In [3]:
FAMS = {
    "planning":   r"\b(plan|planned|schedule|prioriti[sz]e|milestone|deadline|timebox)\b",
    "blockers":   r"\b(blocker|stuck|couldn'?t|issue|problem|delay|overwhelm|fatigue)\b",
    "learning":   r"\b(learn|reading|paper|tutorial|experiment|tried|prototype|debug)\b",
    "collab":     r"\b(team|sync|met|stakeholder|review|paired|helped|discuss)\b",
    "focus":      r"\b(deep work|focus(ed)?|uninterrupted|flow|single[- ]task)\b",
    "wins":       r"\b(done|finished|shipped|delivered|accomplish|achieved|progress)\b",
}
def count_regex(text, pat): return int(len(re.findall(pat, text.lower())))

length = ref.str.len().replace(0, np.nan)
for k,pat in FAMS.items():
    cnt = ref.apply(lambda t: count_regex(t, pat))
    df[f"refl_{k}_count"] = cnt
    # rate per char, clipped
    df[f"refl_{k}_rate"]  = (cnt/length).fillna(0).clip(0, 0.05)


## 4) Mini topics (unsupervised; optional if few entries)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

docs = ref[ref.str.len()>0]
topic_labels = []
if len(docs) >= 10:
    vec = TfidfVectorizer(min_df=2, max_df=0.9, ngram_range=(1,2), stop_words="english")
    X = vec.fit_transform(docs)
    k = min(6, max(2, int(np.sqrt(X.shape[0])//2)))
    nmf = NMF(n_components=k, random_state=42, init="nndsvd", max_iter=500)
    W = nmf.fit_transform(X); H = nmf.components_
    terms = np.array(vec.get_feature_names_out())
    # label topics
    for i in range(k):
        top = H[i].argsort()[-5:][::-1]
        topic_labels.append("topic_"+str(i)+":"+", ".join(terms[top[:3]]))
    # attach weights to df
    W_df = pd.DataFrame(0.0, index=df.index, columns=[f"refl_topic_{i}" for i in range(k)])
    W_df.loc[docs.index, :] = W
    df = pd.concat([df, W_df], axis=1)

topic_labels


['topic_0:work, good, productive',
 'topic_1:day, productive day, moving',
 'topic_2:follow, plan, didn',
 'topic_3:better, better tomorrow, help']

## 5) Text-derived traits (0–1) + save

In [5]:
def clip01(x): 
    x = float(x) if np.isfinite(x) else 0.0
    return float(np.clip(x, 0, 1))

reflect_freq   = ((ref.str.len()>0).mean())                          # 0..1
reflect_len    = (ref.str.len().replace(0,np.nan).mean() or 0)/400.0 # ~400 chars → ~1.0
pos_tone       = (df["refl_sent_compound"].mean() + 1)/2              # [-1,1]→[0,1]

traits_from_text = {
    "reflection_discipline": clip01(0.6*reflect_freq + 0.4*reflect_len),
    "resilience":            clip01( pos_tone * (1 - np.tanh(4*df["refl_blockers_rate"].mean())) ),
    "planning_habit":        clip01(df["refl_planning_rate"].mean() * 20),
    "curiosity_learning":    clip01(df["refl_learning_rate"].mean() * 20),
    "communication_collab":  clip01(df["refl_collab_rate"].mean() * 20),
    "focus_signals":         clip01(df["refl_focus_rate"].mean() * 20),
    "delivery_orientation":  clip01(df["refl_wins_rate"].mean() * 20),
}
pd.Series(traits_from_text).to_csv(REPORTS/"05_reflection_traits.csv")
traits_from_text


{'reflection_discipline': 1.0,
 'resilience': 0.7277770593096825,
 'planning_habit': 0.024520694114345584,
 'curiosity_learning': 0.0012864621515077466,
 'communication_collab': 0.0011453445866209705,
 'focus_signals': 0.009538412983710812,
 'delivery_orientation': 0.01614833044294438}

## 6) Load quantitative profile and combine 

In [6]:
# If you already saved a quant profile earlier as 05_profile_traits.csv
quant_path = REPORTS/"05_profile_traits.csv"
quant = pd.read_csv(quant_path, index_col=0).squeeze("columns").to_dict() if quant_path.exists() else {}

# Hybrid weighting (feel free to tweak)
w_quant, w_text = 0.6, 0.4

keys = sorted(set(quant) | set(traits_from_text))
hybrid = {}
for k in keys:
    qv = float(quant.get(k, 0.0)); tv = float(traits_from_text.get(k, 0.0))
    if k in quant and k in traits_from_text:
        hybrid[k] = float(np.clip(w_quant*qv + w_text*tv, 0, 1))
    else:
        hybrid[k] = float(np.clip(qv or tv, 0, 1))

pd.Series(hybrid).to_csv(REPORTS/"05_profile_traits_hybrid.csv")
print("Saved:", REPORTS/"05_profile_traits_hybrid.csv")
hybrid


Saved: /Users/deo/UMKC_phd/project_related/job_related/cte-project/notebooks/reports/05_profile_traits_hybrid.csv


{'communication_collab': 0.0011453445866209705,
 'curiosity_learning': 0.0012864621515077466,
 'delivery_orientation': 0.01614833044294438,
 'focus_signals': 0.009538412983710812,
 'planning_habit': 0.024520694114345584,
 'reflection_discipline': 1.0,
 'resilience': 0.7277770593096825}

## 7) JD → traits (keyword lexicon) + cosine match

In [7]:
TRAIT_LEXICON = {
    "focus": ["focus","deep work","concentration","attention"],
    "reliability": ["reliable","consistent","ownership","dependable","on-time"],
    "initiative": ["initiative","proactive","self-starter","ownership","drive"],
    "communication": ["communication","present","explain","stakeholder","collaborate","influence"],
    "adaptability": ["adaptable","flexible","ambiguity","fast-paced","changing"],
    "curiosity": ["curious","question","explore","insight","experimentation"],
    "impact": ["impact","value","outcome","deliver","business"],
    "teamwork": ["team","cross-functional","partner","collaborate"],
    "independence": ["independent","autonomous","self-driven","ownership"],
    "planning": ["plan","prioritize","roadmap","deadline","timebox"],
    "resilience": ["resilience","grit","perseverance","handle pressure"],
    "learning_mindset": ["learn","growth","improve","iterate","feedback"],
}

def jd_to_traits(jd_text: str, lexicon: dict) -> dict:
    t = jd_text.lower()
    scores = {k:0.0 for k in lexicon}
    for trait, kws in lexicon.items():
        s = 0
        for kw in kws:
            s += len(re.findall(rf"\b{re.escape(kw.lower())}\b", t))
        scores[trait] = float(s)
    mx = max(scores.values()) if scores else 1.0
    if mx > 0:
        for k in scores: scores[k] /= mx
    return scores

# Paste a real JD here when testing:
JD_TEXT = """
We seek a proactive data scientist who communicates clearly with stakeholders,
shows ownership, prioritizes well, and delivers business impact in a fast-paced environment.
"""

jd_traits = jd_to_traits(JD_TEXT, TRAIT_LEXICON)

def cosine(a: dict, b: dict) -> float:
    keys = sorted(set(a)&set(b))
    va = np.array([a[k] for k in keys], float)
    vb = np.array([b[k] for k in keys], float)
    na, nb = np.linalg.norm(va), np.linalg.norm(vb)
    return float(np.dot(va, vb)/(na*nb)) if na*nb>0 else 0.0

match = cosine(hybrid, jd_traits)
print(f"Trait cosine match: {match:.3f}")
pd.Series(jd_traits).to_csv(REPORTS/"05_jd_traits_demo.csv")


Trait cosine match: 0.000


## 8) Plots: JD vs You + contribution

In [8]:
# Bars: JD (top traits) vs You
pairs = sorted(jd_traits.items(), key=lambda kv: kv[1], reverse=True)[:10]
traits = [k for k,_ in pairs]
jdv = [jd_traits[k] for k in traits]
mev = [hybrid.get(k,0) for k in traits]

plt.figure(figsize=(8,5))
idx = np.arange(len(traits)); w=0.4
plt.bar(idx-w/2, jdv, width=w, label="JD")
plt.bar(idx+w/2, mev, width=w, label="You")
plt.xticks(idx, traits, rotation=30, ha="right")
plt.ylim(0,1)
plt.title(f"Trait match (cosine={match:.2f}) — Hybrid profile")
plt.legend()
out = FIGS/"05_trait_match_bars_hybrid.png"
plt.tight_layout(); plt.savefig(out, dpi=150); plt.close(); print("Wrote:", out)

# Contribution = elementwise product
contrib = [(t, hybrid.get(t,0)*jd_traits.get(t,0)) for t in traits]
contrib.sort(key=lambda x: x[1], reverse=True)
plt.figure(figsize=(7,4.5))
plt.barh([t for t,_ in contrib[::-1]], [v for _,v in contrib[::-1]])
plt.xlabel("Contribution (my_score × jd_weight)")
plt.title("Per-trait contribution to match (Top JD traits)")
out = FIGS/"05_trait_match_contrib_hybrid.png"
plt.tight_layout(); plt.savefig(out, dpi=150); plt.close(); print("Wrote:", out)


Wrote: /Users/deo/UMKC_phd/project_related/job_related/cte-project/notebooks/reports/figures/05_trait_match_bars_hybrid.png
Wrote: /Users/deo/UMKC_phd/project_related/job_related/cte-project/notebooks/reports/figures/05_trait_match_contrib_hybrid.png


## 9) Optional GenAI verdict 

In [9]:
FIT_SCHEMA = {
  "type":"object",
  "properties":{
    "overall_fit":{"type":"string","enum":["Strong fit","Possible fit","Borderline","Not a fit"]},
    "match_score":{"type":"number"},
    "top_strengths":{"type":"array","items":{"type":"string"}},
    "top_gaps":{"type":"array","items":{"type":"string"}},
    "evidence":{"type":"array","items":{"type":"string"}},
    "suggested_actions":{"type":"array","items":{"type":"string"}}
  },
  "required":["overall_fit","match_score","top_strengths","top_gaps","evidence","suggested_actions"]
}

def call_llm(system: str, user: str, model="gpt-4.1-mini"):
    # TODO: replace with your provider call; must return text
    raise RuntimeError("Wire your LLM provider here.")

# pick a few short quotes as evidence (optional)
examples = []
pos = df.sort_values("refl_sent_compound", ascending=False)
neg = df.sort_values("refl_sent_compound", ascending=True)
for i,row in pos.head(3).iterrows():
    t = clean_text(row.get("reflection",""))
    if t: examples.append(f"POS[{str(row['date'])[:10]}]: {t[:140]}")
for i,row in neg.head(2).iterrows():
    t = clean_text(row.get("reflection",""))
    if t: examples.append(f"CHALLENGE[{str(row['date'])[:10]}]: {t[:140]}")
evidence_quotes = examples[:5]

system_msg = (
 "You are a precise hiring analyst. Use ONLY the numeric trait vectors and match score. "
 "Return STRICT JSON conforming to the given schema. Be concise and concrete."
)
user_msg = f"""
Hybrid profile (0-1): {json.dumps(hybrid, indent=2)}
JD trait weights (0-1): {json.dumps(jd_traits, indent=2)}
cosine_match: {match:.3f}
Evidence quotes (optional): {json.dumps(evidence_quotes, indent=2)}
Schema: {json.dumps(FIT_SCHEMA, indent=2)}
"""

def judge_fit(system_msg, user_msg):
    try:
        text = call_llm(system_msg, user_msg)
        try:
            return json.loads(text)
        except Exception:
            import re
            m = re.search(r"\{.*\}", text, re.S)
            return json.loads(m.group(0)) if m else {}
    except RuntimeError:
        # Fallback numeric verdict (works without API)
        top_gaps = sorted(jd_traits, key=lambda k: jd_traits[k]-hybrid.get(k,0), reverse=True)[:3]
        top_strengths = sorted(jd_traits, key=lambda k: hybrid.get(k,0)*jd_traits[k], reverse=True)[:5]
        overall = "Strong fit" if match>=0.7 else "Possible fit" if match>=0.55 else "Borderline" if match>=0.45 else "Not a fit"
        return {
          "overall_fit": overall,
          "match_score": round(match,3),
          "top_strengths": top_strengths,
          "top_gaps": top_gaps,
          "evidence": [f"{k}: mine={hybrid.get(k,0):.2f}, jd={jd_traits[k]:.2f}" for k in top_strengths],
          "suggested_actions": [f"Raise '{g}' via targeted habits/projects over 2–4 weeks." for g in top_gaps]
        }

verdict = judge_fit(system_msg, user_msg)
with open(REPORTS/"05_trait_fit_verdict_hybrid.json","w") as f:
    json.dump(verdict, f, indent=2)
print("Saved verdict:", REPORTS/"05_trait_fit_verdict_hybrid.json")
verdict


Saved verdict: /Users/deo/UMKC_phd/project_related/job_related/cte-project/notebooks/reports/05_trait_fit_verdict_hybrid.json


{'overall_fit': 'Not a fit',
 'match_score': 0.0,
 'top_strengths': ['focus',
  'reliability',
  'initiative',
  'communication',
  'adaptability'],
 'top_gaps': ['initiative', 'impact', 'reliability'],
 'evidence': ['focus: mine=0.00, jd=0.00',
  'reliability: mine=0.00, jd=0.50',
  'initiative: mine=0.00, jd=1.00',
  'communication: mine=0.00, jd=0.00',
  'adaptability: mine=0.00, jd=0.50'],
 'suggested_actions': ["Raise 'initiative' via targeted habits/projects over 2–4 weeks.",
  "Raise 'impact' via targeted habits/projects over 2–4 weeks.",
  "Raise 'reliability' via targeted habits/projects over 2–4 weeks."]}