In [2]:
"""
Kaggle Data Cleaner - Fixed Version
Also handles corrupt JSON files.
"""

import json
import re
import pandas as pd
from pathlib import Path
from datetime import datetime

# Root folder of the raw input data; the Kaggle files are read from DATA_DIR / "kaggle".
DATA_DIR = Path("data")
# Folder that receives the cleaned JSON output files.
CLEAN_DIR = Path("data/cleaned")
# Created as soon as this code runs (parents included); a no-op when it already exists.
CLEAN_DIR.mkdir(parents=True, exist_ok=True)

def clean_text(text):
    """Normalise a raw string for storage.

    Every run of whitespace is collapsed to one space, then any character
    that is not a word character, whitespace or one of ``. , - ( ) / +`` is
    dropped, and finally leading/trailing whitespace is stripped.

    Returns:
        The cleaned string, or ``""`` when ``text`` is falsy or not a ``str``.
    """
    if not (text and isinstance(text, str)):
        return ""
    single_spaced = re.sub(r'\s+', ' ', text)
    kept = re.sub(r'[^\w\s\.,\-\(\)\/\+]', '', single_spaced)
    return kept.strip()

# Skill vocabulary, in the order matches are reported.
_KAGGLE_SKILLS = (
    "python", "java", "javascript", "typescript", "c++", "c#", "sql",
    "react", "angular", "vue", "nodejs", "django", "flask", "fastapi",
    "machine learning", "deep learning", "nlp", "tensorflow", "pytorch",
    "scikit-learn", "pandas", "numpy", "keras",
    "aws", "azure", "gcp", "docker", "kubernetes", "git",
    "postgresql", "mongodb", "mysql", "redis", "elasticsearch",
    "html", "css", "rest api", "graphql", "microservices",
    "data science", "data analysis", "computer vision",
    "linux", "agile", "scrum", "ci/cd", "jenkins",
)

# One compiled pattern per skill. The look-arounds require that the skill is
# not glued to another word character, so "java" no longer fires inside
# "javascript", nor "sql" inside "mysql", nor "git" inside "digital".
_KAGGLE_SKILL_PATTERNS = tuple(
    (skill, re.compile(r"(?<!\w)" + re.escape(skill) + r"(?!\w)"))
    for skill in _KAGGLE_SKILLS
)


def extract_skills(text):
    """Return the known skills mentioned in ``text``.

    Matching is case-insensitive and whole-token: a skill only counts when it
    is not directly preceded or followed by a word character. Whitespace in
    the input is normalised first so multi-word skills such as
    "machine learning" are still found when split across a line break.

    Args:
        text: Free text (resume or job description). Falsy input yields ``[]``.

    Returns:
        Skill names in vocabulary order, each at most once.
    """
    if not text:
        return []
    # Lower-case once and collapse all whitespace runs to single spaces.
    haystack = " ".join(text.lower().split())
    return [
        skill
        for skill, pattern in _KAGGLE_SKILL_PATTERNS
        # The cheap substring test skips the regex for most skills.
        if skill in haystack and pattern.search(haystack)
    ]

def extract_experience_years(text):
    """Pull a years-of-experience figure out of free text.

    The lower-cased text is tried against a fixed list of phrasings
    ("N years of experience", "N years experience", "experience of N years",
    "N yrs exp"), in that priority order; the first phrasing that matches
    anywhere in the text wins.

    Returns:
        The matched number as an ``int``, or ``None`` when ``text`` is falsy
        or no phrasing matches.
    """
    if not text:
        return None
    lowered = text.lower()
    ordered_patterns = (
        r'(\d+)\+?\s*years?\s*of\s*experience',
        r'(\d+)\+?\s*years?\s*experience',
        r'experience\s*of\s*(\d+)\+?\s*years?',
        r'(\d+)\+?\s*yr[s]?\s*exp',
    )
    # Lazy generator: stops searching as soon as one pattern hits.
    hits = (re.search(pattern, lowered) for pattern in ordered_patterns)
    return next((int(hit.group(1)) for hit in hits if hit), None)

def _first_value(record, keys):
    """Return the first usable value of ``record`` under any of ``keys`` as a string.

    ``None``, float NaN (what pandas yields for an empty CSV cell) and blank
    strings count as missing, so they never become the literal text "None" or
    "nan". Returns ``""`` when no key holds a usable value.
    """
    for key in keys:
        value = record.get(key)
        if value is None:
            continue
        if isinstance(value, float) and value != value:  # NaN is the only float unequal to itself
            continue
        if isinstance(value, str) and not value.strip():
            continue
        return str(value)
    return ""


def _as_records(data):
    """Coerce decoded JSON into a list of dict records, dropping anything else."""
    if isinstance(data, dict):
        return [data]
    if isinstance(data, list):
        return [item for item in data if isinstance(item, dict)]
    return []


def clean_kaggle_only():
    """Clean the Kaggle resume and job-description datasets and persist them.

    Steps:
      1. Load resumes from "Resume Screening.json", falling back to the CSV of
         the same name when the JSON is missing or unreadable.
      2. Load job descriptions from "job_descriptions.csv", falling back to the
         JSON of the same name when the CSV is missing or unreadable.
      3. Build one cleaned record per row and write them all to
         ``CLEAN_DIR / "kaggle_clean.json"``.
      4. Replace the Kaggle records inside ``CLEAN_DIR / "all_data_clean.json"``
         with the fresh ones, keeping records from other sources.

    Load failures are reported on stdout and the affected dataset is skipped;
    they do not abort the run.

    Returns:
        The list of cleaned Kaggle records (resumes first, then job
        descriptions).
    """
    print("=== Cleaning Kaggle Data (Fixed) ===")
    all_records = []
    kaggle_dir = DATA_DIR / "kaggle"

    # ---- Resume Screening ----
    resume_json = kaggle_dir / "Resume Screening.json"
    resume_csv = kaggle_dir / "Resume Screening.csv"

    resumes = []
    resumes_loaded = False
    if resume_json.exists():
        try:
            with open(resume_json, encoding="utf-8") as f:
                resumes = _as_records(json.load(f))
        except Exception as e:
            print("  Resume JSON corrupt, CSV se load kar raha hoon...")
            print(f"  JSON error: {e}")
        else:
            resumes_loaded = True
            print(f"  Resume JSON loaded: {len(resumes)} records")

    if not resumes_loaded:
        # Reached when the JSON is absent or unreadable; the CSV may be absent too.
        if resume_csv.exists():
            try:
                df = pd.read_csv(resume_csv, on_bad_lines="skip")
                resumes = df.to_dict(orient="records")
            except Exception as e:
                print(f"  CSV error: {e}")
            else:
                print(f"  Resume CSV loaded: {len(resumes)} records")
        else:
            print("  Resume file nahi mili!")

    for r in resumes:
        text = _first_value(r, ("resume", "Resume", "resume_text", "text"))
        category = _first_value(r, ("category", "Category", "label"))
        rec = {
            "id": f"kaggle_resume_{len(all_records)}",
            "source": "kaggle_resume",
            "type": "resume",
            "category": clean_text(category),
            "text": clean_text(text),
            "skills": extract_skills(text),
            "experience_years": extract_experience_years(text),
            "cleaned_at": datetime.now().isoformat(),
        }
        # Drop rows whose text is empty after cleaning.
        if rec["text"]:
            all_records.append(rec)

    # ---- Job Descriptions ---- (CSV is preferred; JSON is only a fallback)
    jd_csv = kaggle_dir / "job_descriptions.csv"
    jd_json = kaggle_dir / "job_descriptions.json"

    jds = []
    jds_loaded = False
    if jd_csv.exists():
        try:
            # Large file: read it in chunks of 5000 rows.
            print("  Job Descriptions CSV loading (chunks mein)...")
            chunks = pd.read_csv(jd_csv, on_bad_lines="skip", chunksize=5000, encoding="utf-8")
            df_jd = pd.concat(chunks, ignore_index=True)
            jds = df_jd.to_dict(orient="records")
        except Exception as e:
            print(f"  CSV error: {e}")
        else:
            jds_loaded = True
            print(f"  Job Descriptions loaded: {len(jds)} records")

    if not jds_loaded and jd_json.exists():
        try:
            with open(jd_json, encoding="utf-8") as f:
                jds = _as_records(json.load(f))
        except Exception as e:
            print(f"  JSON corrupt: {e}")
        else:
            print(f"  Job Descriptions JSON loaded: {len(jds)} records")

    for r in jds:
        title = _first_value(r, ("title", "job_title", "position", "Job Title"))
        desc = _first_value(r, ("description", "job_description", "requirements", "Job Description"))
        company = _first_value(r, ("company", "company_name", "Company"))

        rec = {
            # The counter continues from the resume records, so ids stay unique
            # across both record types.
            "id": f"kaggle_jd_{len(all_records)}",
            "source": "kaggle_jd",
            "type": "job_description",
            "title": clean_text(title),
            "company": clean_text(company),
            "description": clean_text(desc),
            "skills_required": extract_skills(f"{title} {desc}"),
            "experience_years": extract_experience_years(desc),
            "cleaned_at": datetime.now().isoformat(),
        }
        if rec["title"] or rec["description"]:
            all_records.append(rec)

    # Save
    out_path = CLEAN_DIR / "kaggle_clean.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(all_records, f, indent=2, ensure_ascii=False)

    print(f"""
  Kaggle cleaning done!
  Total records: {len(all_records)}
  Saved: {out_path}
    """)

    # Update all_data_clean.json
    all_clean = CLEAN_DIR / "all_data_clean.json"
    existing = []
    if all_clean.exists():
        try:
            with open(all_clean, encoding="utf-8") as f:
                previous = json.load(f)
        except Exception as e:
            # Best effort: the merged file is rebuilt from the Kaggle records alone.
            print(f"  all_data_clean.json unreadable ({e}); rebuilding it")
        else:
            if isinstance(previous, list):
                # Remove old Kaggle records; tolerate non-dict entries and
                # a missing or None "source".
                existing = [
                    r for r in previous
                    if not (
                        isinstance(r, dict)
                        and str(r.get("source") or "").startswith("kaggle")
                    )
                ]
            else:
                print("  all_data_clean.json is not a list; rebuilding it")

    combined = existing + all_records
    with open(all_clean, "w", encoding="utf-8") as f:
        json.dump(combined, f, indent=2, ensure_ascii=False)

    print(f"  all_data_clean.json updated: {len(combined)} total records")
    return all_records

# Run the Kaggle cleaning pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    clean_kaggle_only()

=== Cleaning Kaggle Data (Fixed) ===
  Resume JSON loaded: 962 records
  JSON corrupt: Expecting ',' delimiter: line 9812792 column 257 (char 596467371)

  Kaggle cleaning done!
  Total records: 962
  Saved: data\cleaned\kaggle_clean.json
    
  all_data_clean.json updated: 962 total records


In [3]:

import json
from pathlib import Path

# Report how many top-level entries each cleaned JSON file contains.
for f in Path('data/cleaned').glob('*.json'):
    # The cleaners write these files as UTF-8 with ensure_ascii=False, so read
    # them back as UTF-8 explicitly; the platform default (e.g. cp1252 on
    # Windows) can fail or mis-decode non-ASCII text.
    with open(f, encoding='utf-8') as fp:
        d = json.load(fp)
    print(f'{f.name}: {len(d)} records')


all_data_clean.json: 962 records
kaggle_clean.json: 962 records


In [5]:
"""
Indeed + LinkedIn Data Cleaner
"""

import json
import re
from pathlib import Path

# Folder that receives the cleaned per-source files and the merged all_data_clean.json.
CLEAN_DIR = Path("data/cleaned")
# Created as soon as this code runs (parents included); a no-op when it already exists.
CLEAN_DIR.mkdir(parents=True, exist_ok=True)

def clean_text(text):
    """Return ``text`` with whitespace collapsed and unwanted symbols removed.

    Only word characters, whitespace and ``. , - ( ) / +`` survive; the result
    is stripped at both ends. Falsy or non-``str`` input gives ``""``.
    """
    if text and isinstance(text, str):
        squeezed = re.sub(r'\s+', ' ', text)
        return re.sub(r'[^\w\s\.,\-\(\)\/\+]', '', squeezed).strip()
    return ""

# Skill vocabulary, in the order matches are reported.
_SCRAPED_SKILLS = (
    "python", "java", "javascript", "sql", "react", "nodejs",
    "django", "flask", "machine learning", "deep learning",
    "tensorflow", "pytorch", "aws", "azure", "docker",
    "kubernetes", "git", "mongodb", "mysql", "html", "css",
    "data science", "linux", "typescript", "fastapi", "numpy",
    "pandas", "scikit-learn", "gcp", "postgresql", "redis",
)

# One compiled pattern per skill. The look-arounds reject hits glued to another
# word character, so "java" does not fire inside "javascript", nor "sql"
# inside "postgresql".
_SCRAPED_SKILL_PATTERNS = tuple(
    (skill, re.compile(r"(?<!\w)" + re.escape(skill) + r"(?!\w)"))
    for skill in _SCRAPED_SKILLS
)


def extract_skills(text):
    """Return the known skills mentioned in ``text``.

    Matching is case-insensitive and whole-token. Whitespace is normalised
    first so multi-word skills are found even when split by a line break.

    Args:
        text: Free text (job title plus description). Falsy input yields ``[]``.

    Returns:
        Skill names in vocabulary order, each at most once.
    """
    if not text:
        return []
    # Lower-case once (the original did it once per skill) and collapse whitespace.
    haystack = " ".join(text.lower().split())
    return [
        skill
        for skill, pattern in _SCRAPED_SKILL_PATTERNS
        # The cheap substring test skips the regex for most skills.
        if skill in haystack and pattern.search(haystack)
    ]

def clean_source(folder, source_name):
    """Clean and de-duplicate the scraped job postings found under ``folder``.

    Every ``*.json`` file below ``folder`` is read (recursively), except files
    whose name contains "all_". A file may hold one record or a list of
    records. Records without a title are dropped, and only the first record
    per (title, company) pair, compared case-insensitively, is kept. The
    result is written to ``CLEAN_DIR / f"{source_name}_clean.json"``.

    Args:
        folder: Directory to scan for raw JSON files.
        source_name: Label stored in each record's "source" field and used
            for record ids and the output file name.

    Returns:
        The list of cleaned, unique records.
    """
    records = []
    # Sorted so ids and the surviving duplicate are the same on every run;
    # rglob itself yields files in no particular order.
    for path in sorted(Path(folder).rglob("*.json")):
        if "all_" in path.name:
            continue
        try:
            # Context manager so the handle is closed even when parsing fails.
            with open(path, encoding="utf-8") as fh:
                data = json.load(fh)
        except (OSError, ValueError) as e:
            print(f"  Skip {path.name}: {e}")
            continue
        records.extend(data if isinstance(data, list) else [data])

    cleaned = []
    seen = set()
    for r in records:
        # A stray string or number in the JSON has no fields to read.
        if not isinstance(r, dict):
            continue
        title = clean_text(r.get("title", ""))
        if not title:
            continue
        company = clean_text(r.get("company", ""))
        # A tuple key cannot collide the way "a_b" + "c" and "a" + "b_c" would.
        key = (title.lower(), company.lower())
        if key in seen:
            continue
        seen.add(key)
        desc = clean_text(r.get("description", "") or r.get("summary", ""))
        cleaned.append({
            "id": f"{source_name}_{len(cleaned)}",
            "source": source_name,
            "title": title,
            "company": company,
            "location": clean_text(r.get("location", "")),
            "description": desc,
            "skills_required": extract_skills(f"{title} {desc}"),
            "scraped_at": r.get("scraped_at", ""),
        })

    out = CLEAN_DIR / f"{source_name}_clean.json"
    with open(out, "w", encoding="utf-8") as f:
        json.dump(cleaned, f, indent=2, ensure_ascii=False)
    print(f"{source_name}: {len(records)} raw -> {len(cleaned)} unique saved")
    return cleaned

# Clean both scraped sources; each call also writes its own <source>_clean.json.
indeed_data   = clean_source("data/indeed",   "indeed")
linkedin_data = clean_source("data/linkedin", "linkedin")

# Merge with the Kaggle records, when that file exists.
kaggle_path = CLEAN_DIR / "kaggle_clean.json"
kaggle_data = []
if kaggle_path.exists():
    # Context manager so the file handle is closed instead of leaked.
    with open(kaggle_path, encoding="utf-8") as f:
        kaggle_data = json.load(f)

# The merged file is rewritten from scratch: Kaggle first, then Indeed, then LinkedIn.
all_data = kaggle_data + indeed_data + linkedin_data
with open(CLEAN_DIR / "all_data_clean.json", "w", encoding="utf-8") as f:
    json.dump(all_data, f, indent=2, ensure_ascii=False)

print(f"\nTotal merged: {len(all_data)} records")
print(f"  Kaggle  : {len(kaggle_data)}")
print(f"  Indeed  : {len(indeed_data)}")
print(f"  LinkedIn: {len(linkedin_data)}")
print("\nSaved: data/cleaned/all_data_clean.json")

indeed: 210 raw -> 198 unique saved
linkedin: 306 raw -> 27 unique saved

Total merged: 1187 records
  Kaggle  : 962
  Indeed  : 198
  LinkedIn: 27

Saved: data/cleaned/all_data_clean.json
