In [None]:
"""
# 🎯 Notebook 02 : Extraction Simple de Compétences

**Objectif** : Extraire les compétences techniques et les soft skills depuis un CV

**Méthode** :
1. Charger le texte du CV
2. Utiliser spaCy pour le traitement NLP
3. Matching par mots-clés avec une base de référence
"""

In [23]:
# Imports
import json
import re
import sys
from collections import Counter
from datetime import date
from pathlib import Path

# Make the project root importable. This assumes the kernel's working
# directory is the notebooks/ folder, so the root is its parent.
notebooks_dir = Path().absolute()
project_root = notebooks_dir.parent
sys.path.insert(0, str(project_root))

# Echo both locations so a wrong working directory is spotted right away.
print(f"üìÅ Projet : {project_root}")
print(f"üìÇ Notebooks : {Path().absolute()}")

# Import spaCy; when it is missing, print the install command instead of raising.
try:
    import spacy
except ImportError:
    print("‚ùå spaCy non install√©. Ex√©cutez : pip install spacy")
else:
    print("‚úÖ spaCy import√©")

üìÅ Projet : c:\Users\rober\OneDrive\Bureau\PFE
üìÇ Notebooks : c:\Users\rober\OneDrive\Bureau\PFE\notebooks
‚úÖ spaCy import√©


In [24]:
# Load the small English spaCy pipeline. If the model is not installed
# (spacy.load raises OSError), print the download command instead of raising.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("‚ùå Mod√®le non trouv√©. Ex√©cutez :")
    print("   python -m spacy download en_core_web_sm")
else:
    print("‚úÖ Mod√®le spaCy charg√© : en_core_web_sm")

‚úÖ Mod√®le spaCy charg√© : en_core_web_sm


In [40]:
# Read the plain-text CV from the current working directory. Per the hint
# printed below, the file is expected to come from 01_cv_parser.ipynb.
cv_text_path = Path().absolute() / "cv_text_pdfplumber.txt"

if cv_text_path.exists():
    cv_text = cv_text_path.read_text(encoding='utf-8')

    # NOTE(review): the preview prints the top of the CV, which contains
    # personal contact details; clear the cell output before sharing.
    separator = "-" * 60
    print(f"‚úÖ CV charg√© : {len(cv_text)} caract√®res")
    print(f"\nüìù Aper√ßu (200 premiers caract√®res):")
    print(separator)
    print(cv_text[:200])
    print(separator)
else:
    print(f"‚ùå Fichier non trouv√© : {cv_text_path}")
    print("‚ö†Ô∏è  Ex√©cutez d'abord le notebook 01_cv_parser.ipynb")

‚úÖ CV charg√© : 3195 caract√®res

üìù Aper√ßu (200 premiers caract√®res):
------------------------------------------------------------
+33 6 04 02 12 20
ROBERT UNG
Robert.ung@telecom-sudparis.eu
Engineering student apprentice
C++, node.js, C# .net
Corbeil-Essonnes 91100, France
Portfolio website : https://robert-ung.netlify.app
PROFI
------------------------------------------------------------


In [32]:
# The reference skill lists are stored as JSON under <project_root>/data.
skills_db_path = project_root / "data" / "skills_reference.json"

if skills_db_path.exists():
    skills_database = json.loads(skills_db_path.read_text(encoding='utf-8'))

    # Report the size of each reference list ('technical_skills', 'soft_skills').
    print(f"‚úÖ Base de comp√©tences charg√©e")
    print(f"   ‚Ä¢ Comp√©tences techniques : {len(skills_database['technical_skills'])}")
    print(f"   ‚Ä¢ Soft skills : {len(skills_database['soft_skills'])}")
else:
    print(f"‚ùå Fichier non trouv√© : {skills_db_path}")
    print("‚ö†Ô∏è  Cr√©ez le fichier data/skills_reference.json")

‚úÖ Base de comp√©tences charg√©e
   ‚Ä¢ Comp√©tences techniques : 171
   ‚Ä¢ Soft skills : 39


In [41]:
# Run the spaCy pipeline over the whole CV text.
print("\nüîç Traitement du CV avec spaCy...")
doc = nlp(cv_text)

# Summary counts: tokens, sentences and named entities found by the pipeline.
token_count = len(doc)
sentence_count = len(list(doc.sents))
entity_count = len(doc.ents)

print(f"‚úÖ Traitement termin√©")
print(f"   ‚Ä¢ Tokens (mots) : {token_count}")
print(f"   ‚Ä¢ Phrases : {sentence_count}")
print(f"   ‚Ä¢ Entit√©s nomm√©es : {entity_count}")

# Show the first five entities with their labels as a quick sanity check.
print(f"\nüìç Entit√©s d√©tect√©es par spaCy (5 premi√®res) :")
first_entities = list(doc.ents)[:5]
for entity in first_entities:
    print(f"   ‚Ä¢ {entity.text:20s} ‚Üí {entity.label_:10s}")


üîç Traitement du CV avec spaCy...
‚úÖ Traitement termin√©
   ‚Ä¢ Tokens (mots) : 579
   ‚Ä¢ Phrases : 11
   ‚Ä¢ Entit√©s nomm√©es : 64

üìç Entit√©s d√©tect√©es par spaCy (5 premi√®res) :
   ‚Ä¢ 6 04                 ‚Üí DATE      
   ‚Ä¢ 12 20                ‚Üí DATE      
   ‚Ä¢ ROBERT               ‚Üí PERSON    
   ‚Ä¢ C++, node.js         ‚Üí ORG       
   ‚Ä¢ Corbeil-Essonnes 91100 ‚Üí ORG       


In [42]:
# Fonction d'extraction simple
def extract_skills_simple(text, skills_list):
    """
    Return the reference skills that are mentioned in ``text``.

    Matching is case-insensitive and uses the same boundary rule for every
    skill, whether or not it contains symbols (``c++``, ``c#``, ``.net``,
    ``node.js``):

    * the character before the match must not be a word character, so
      ``.net`` is not found inside ``asp.net`` and ``git`` is not found
      inside ``digit``;
    * the match must not be followed by a word character, ``++`` or ``#``,
      so ``c`` is not reported just because the text contains ``c++`` or
      ``c#``, and ``java`` is not found inside ``javascript``;
    * whitespace inside a multi-word skill matches any run of whitespace,
      so ``machine learning`` is still found when the extracted text wraps
      the line between the two words.

    Args:
        text: Text to search (e.g. the raw text of a CV). ``None`` or an
            empty string yields an empty result.
        skills_list: Iterable of skill names. Blank and non-string entries
            are ignored.

    Returns:
        Sorted list of the matching skills, spelled as in ``skills_list``
        and without duplicates.
    """
    if not text:
        return []

    text_lower = text.lower()
    found_skills = set()

    for skill in skills_list:
        if not isinstance(skill, str):
            continue

        # Split on whitespace so each word is escaped separately and the gaps
        # can match spaces, tabs or line breaks alike.
        words = skill.lower().split()
        if not words:
            # A blank skill would build a pattern that matches any text.
            continue

        body = r'\s+'.join(re.escape(word) for word in words)
        # Lookarounds (rather than \b) also work when the skill itself starts
        # or ends with a symbol such as '.', '+' or '#'.
        pattern = r'(?<!\w)' + body + r'(?!\w|\+\+|#)'

        if re.search(pattern, text_lower):
            found_skills.add(skill)

    return sorted(found_skills)

# Technical skills: match the CV text against the technical reference list.
print("\nüîç Extraction des comp√©tences techniques...")
technical_reference = skills_database['technical_skills']
technical_skills_found = extract_skills_simple(cv_text, technical_reference)
print(f"‚úÖ {len(technical_skills_found)} comp√©tences techniques trouv√©es")

# Soft skills: same matcher, applied to the soft-skill reference list.
print("\nüîç Extraction des soft skills...")
soft_reference = skills_database['soft_skills']
soft_skills_found = extract_skills_simple(cv_text, soft_reference)
print(f"‚úÖ {len(soft_skills_found)} soft skills trouv√©es")


üîç Extraction des comp√©tences techniques...
‚úÖ 24 comp√©tences techniques trouv√©es

üîç Extraction des soft skills...
‚úÖ 5 soft skills trouv√©es


In [43]:
# Print a report of the extracted skills: banner, one section per category,
# then the grand total.
banner = "="*60
print("\n" + banner)
print("üéØ COMP√âTENCES EXTRAITES")
print(banner)

# Each section is a (heading, skills) pair rendered by the same loop.
sections = (
    (f"\nüîß COMP√âTENCES TECHNIQUES ({len(technical_skills_found)})", technical_skills_found),
    (f"\nüíº SOFT SKILLS ({len(soft_skills_found)})", soft_skills_found),
)
for heading, section_skills in sections:
    print(heading)
    print("-"*60)
    for skill in section_skills:
        print(f"  ‚Ä¢ {skill}")

total_skills = len(technical_skills_found) + len(soft_skills_found)
print(f"\nüìä TOTAL : {total_skills} comp√©tences")


üéØ COMP√âTENCES EXTRAITES

üîß COMP√âTENCES TECHNIQUES (24)
------------------------------------------------------------
  ‚Ä¢ .net
  ‚Ä¢ artificial intelligence
  ‚Ä¢ big data
  ‚Ä¢ c
  ‚Ä¢ c#
  ‚Ä¢ c++
  ‚Ä¢ data analysis
  ‚Ä¢ data science
  ‚Ä¢ deep learning
  ‚Ä¢ docker
  ‚Ä¢ excel
  ‚Ä¢ git
  ‚Ä¢ java
  ‚Ä¢ jupyter
  ‚Ä¢ linux
  ‚Ä¢ machine learning
  ‚Ä¢ matplotlib
  ‚Ä¢ node.js
  ‚Ä¢ numpy
  ‚Ä¢ pandas
  ‚Ä¢ power bi
  ‚Ä¢ python
  ‚Ä¢ scikit-learn
  ‚Ä¢ sql

üíº SOFT SKILLS (5)
------------------------------------------------------------
  ‚Ä¢ adaptability
  ‚Ä¢ communication
  ‚Ä¢ problem-solving
  ‚Ä¢ reliability
  ‚Ä¢ teamwork

üìä TOTAL : 29 comp√©tences


In [44]:
# Collect the extraction results together with run metadata.
results = {
    "cv_file": "RESUME ROBERT UNG.pdf",
    # Stamp the actual run date; a hard-coded date goes stale on every re-run.
    "extraction_date": date.today().isoformat(),
    "method": "spaCy + keyword matching",
    "technical_skills": technical_skills_found,
    "soft_skills": soft_skills_found,
    "total_skills": len(technical_skills_found) + len(soft_skills_found)
}

# Write the results as JSON under <project_root>/outputs, creating the
# directory (and any missing parents) on first run.
output_dir = project_root / "outputs"
output_dir.mkdir(parents=True, exist_ok=True)

output_path = output_dir / "extracted_skills_simple.json"

# ensure_ascii=False keeps accented characters readable in the output file.
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

print(f"\nüíæ R√©sultats sauvegard√©s : {output_path}")
print(f"üìä Taille : {output_path.stat().st_size / 1024:.1f} KB")

print("\n‚úÖ Extraction termin√©e avec succ√®s !")


üíæ R√©sultats sauvegard√©s : c:\Users\rober\OneDrive\Bureau\PFE\outputs\extracted_skills_simple.json
üìä Taille : 0.7 KB

‚úÖ Extraction termin√©e avec succ√®s !
