In [23]:
import json
import csv
import requests
import re
from datetime import datetime
import spacy
import spacy.cli
import importlib

# === Vérifie et charge le modèle spaCy ===
try:
    nlp = spacy.load("fr_core_news_lg")
except OSError:
    print("🔁 Modèle 'fr_core_news_lg' non trouvé. Téléchargement...")
    spacy.cli.download("fr_core_news_lg")
    importlib.invalidate_caches()
    nlp = spacy.load("fr_core_news_lg")

# === CONFIGURATION ===
OLLAMA_URL = "http://localhost:11434/api/chat"
MODEL = "llama3"
INPUT_JSON = "result.json"
OUTPUT_CSV = "resultats_cv_test.csv"

# === Fonctions utilitaires ===
def ask_ollama(prompt):
    payload = {
        "model": MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "stream": False
    }
    try:
        response = requests.post(OLLAMA_URL, json=payload)
        response.raise_for_status()
        return response.json()["message"]["content"]
    except Exception as e:
        print(f"❌ Erreur Ollama : {e}")
        return ""

def clean_name(nom_complet):
    if not nom_complet:
        return ""
    nom_complet = re.sub(r"\b(dr|mr|mme|mrs|m)\b[\.]?", "", nom_complet, flags=re.IGNORECASE)
    return re.sub(r"\s+", " ", nom_complet.strip())

def estimate_years_experience(from_year, to_year=2025):
    try:
        return max(0, int(to_year) - int(from_year))
    except:
        return 0

def chunk_text(text, max_chars=2000):
    return [text[i:i+max_chars] for i in range(0, len(text), max_chars)]

def extract_from_chunk(chunk):
    prompt = f"""
Voici un extrait de CV :

====================
{chunk}
====================

Retourne uniquement ce JSON strict :
{{
  "nom_complet": "",
  "domaine_expertise": "",
  "date_diplome_principal": "",
  "annees_experience": 0
}}

⚠️ Si une information est absente dans ce chunk, laisse-la vide ou à 0. Aucune explication.
"""
    response = ask_ollama(prompt)
    cleaned_response = re.sub(r"//.*", "", response)
    try:
        json_match = re.search(r'\{[\s\S]*?\}', cleaned_response)
        data = json.loads(json_match.group()) if json_match else {}
        if "date_diplome_principal" in data and data["date_diplome_principal"]:
            year_match = re.search(r"\b(19|20)\d{2}\b", data["date_diplome_principal"])
            if year_match:
                year = year_match.group()
                data["date_diplome_principal"] = year
                data["annees_experience"] = estimate_years_experience(year)
        return data
    except Exception as e:
        print(f"⚠️ Erreur parsing chunk JSON : {e}")
        print(f"🔎 Chunk brut :\n{response}")
        return {}

def extract_full_name(text):
    doc = nlp(text)
    persons = [ent.text.strip() for ent in doc.ents if ent.label_ == "PER"]
    if not persons:
        return ""
    return clean_name(" ".join(persons[:2]))  # max deux entités

def extract_domain(text):
    match = re.search(r"(expert|spécialiste|consultant|responsable) en ([^\n\.\:]+)", text, re.IGNORECASE)
    return match.group(2).strip() if match else ""

def find_earliest_diploma_year(text):
    match_section = re.search(r"5\. *Dipl[oô]mes *:(.*?)(6\. *Connaissances linguistiques *:|7\. *Affiliation|$)",
                              text, re.DOTALL | re.IGNORECASE)
    if not match_section:
        return None
    diplomas_text = match_section.group(1)

    # Cherche les années suivies OU précédées de mots-clés
    pattern = r"(\b(19|20)\d{2}\b).{0,50}(doctorat|mast[èe]re|licence|ing[ée]nieur|master)|" \
              r"(doctorat|mast[èe]re|licence|ing[ée]nieur|master).{0,50}(\b(19|20)\d{2}\b)"

    years = []
    for match in re.finditer(pattern, diplomas_text, re.IGNORECASE):
        for group in match.groups():
            if group and re.match(r"\b(19|20)\d{2}\b", group):
                years.append(int(group))
    return min(years) if years else None

def merge_infos(chunks_data, original_text):
    final = {
        "nom_complet": "",
        "domaine_expertise": "",
        "date_diplome_principal": "",
        "annees_experience": 0
    }

    for data in chunks_data:
        for key in final:
            if key not in ("date_diplome_principal", "annees_experience") and not final[key] and data.get(key):
                final[key] = data[key]

    if not final["nom_complet"]:
        final["nom_complet"] = extract_full_name(original_text)

    if not final["domaine_expertise"]:
        final["domaine_expertise"] = extract_domain(original_text)

    earliest_year = find_earliest_diploma_year(original_text)
    if earliest_year:
        final["date_diplome_principal"] = str(earliest_year)
        final["annees_experience"] = estimate_years_experience(earliest_year)
    else:
        final["annees_experience"] = estimate_years_experience(final.get("date_diplome_principal", 0))

    final["nom_complet"] = clean_name(final["nom_complet"])
    return final

def save_to_csv(data_list, path):
    if not data_list:
        print("⚠️ Aucun résultat à sauvegarder.")
        return
    fieldnames = ["nom_complet", "domaine_expertise", "date_diplome_principal", "annees_experience", "fichier"]
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data_list)
    print(f"✅ Résultats enregistrés dans {path}")

# === MAIN ===
if __name__ == "__main__":
    with open(INPUT_JSON, "r", encoding="utf-8") as f:
        docs = json.load(f)

    results = []
    for doc in docs[:1]:  # traiter tous les CV si besoin
        print(f"🔍 Traitement de : {doc['filename']}")
        text = doc["text"]
        chunks = chunk_text(text)
        extracted_chunks = [extract_from_chunk(c) for c in chunks]
        merged = merge_infos(extracted_chunks, text)
        merged["fichier"] = doc["filename"]
        results.append(merged)

    save_to_csv(results, OUTPUT_CSV)


🔍 Traitement de : CV AHMED DIOUF DIRIEH DIBAD AVRIL 2024 VF.docx
✅ Résultats enregistrés dans resultats_cv_test.csv


In [None]:
import re

text = """
Expert en stratégie d’affaires et projets de transformation
numérique
Name Dr Ahmed-Diouf DIRIEH DIBAD
Nationalité/Résidence Djiboutien, Republic of Djibouti
Autre nationalité Canadien
1. Nom de Famille : DIRIEH DIBAD
2. Prénoms : AHMED DIOUF
3. Date de naissance 28 septembre 1981
4. Nationalité : Djiboutienne
5. Diplômes :
Institution Diplôme
2012 Doctorat en informatique, université de Rouen, France
2005 Master 2 en Système d’Information & Réseaux, université de
Tours, France
6. Connaissances linguistiques : Indiquer vos connaissances sur une échelle de 1 à 5 (1 - excellent ; 5 -
rudimentaire)
Langue Lu Parlé Ecrit
Français Bilingue Bilingue Bilingue
Anglais Fonctionnel Fonctionnel Fonctionnel
"""

def find_earliest_diploma_year(text):
    # Extraire section Diplômes (même code)
    match_section = re.search(r"5\. *Dipl[oô]mes *:(.*?)(6\. *Connaissances linguistiques *:|7\. *Affiliation|$)", text, re.DOTALL | re.IGNORECASE)
    if not match_section:
        print("⚠️ Section diplômes non trouvée.")
        return None
    diplomas_text = match_section.group(1)
    print(f"--- Section diplômes ---\n{diplomas_text}\n--- Fin section diplômes ---")

    # Regex : chercher année suivie de mot-clé diplôme dans un rayon de 50 caractères
    diplome_keywords = r"(doctorat|mast[èe]re|licence|ing[ée]nieur|master)"
    pattern = rf"(\b(19|20)\d{{2}}\b).{{0,50}}{diplome_keywords}"
    years = []
    for match in re.finditer(pattern, diplomas_text, re.IGNORECASE):
        year = match.group(1)
        print(f"Année trouvée dans diplômes : {year}")
        try:
            y = int(year)
            years.append(y)
        except:
            pass
    if years:
        print(f"Années trouvées : {years}, année la plus ancienne : {min(years)}")
        return min(years)
    print("⚠️ Aucune année valide trouvée dans diplômes.")
    return None



🔍 Traitement de : CV AHMED DIOUF DIRIEH DIBAD AVRIL 2024 VF.docx
🔍 Traitement de : CV Anis JENHANI_EU_fr_Expert SIG.pdf
🔍 Traitement de : CV FR BM Ahmed Faresse_Consultant international Mars25.docx
🔍 Traitement de : CV Habib BEN ALI .docx
🔍 Traitement de : CV HmidaKarboul_V2.docx
✅ Résultats enregistrés dans resultats_cv_test.csv


In [29]:
import os
import re
import json
import csv
import shutil
import requests
import spacy
import spacy.cli
import importlib
import pdfplumber
import pytesseract
from PIL import Image
from docx2pdf import convert
from flask import Flask, request, render_template, send_from_directory
from werkzeug.utils import secure_filename

# === Initialisation ===
app = Flask(__name__)
UPLOAD_FOLDER = "uploads"
TEMP_FOLDER = "temp"
RESULT_CSV = "resultats_cv_web.csv"
ALLOWED_EXTENSIONS = {"pdf", "docx"}
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(TEMP_FOLDER, exist_ok=True)

# === spaCy init ===
try:
    nlp = spacy.load("fr_core_news_lg")
except OSError:
    spacy.cli.download("fr_core_news_lg")
    importlib.invalidate_caches()
    nlp = spacy.load("fr_core_news_lg")

# === Utilitaires ===
def allowed_file(filename):
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS

def convert_docx_to_pdf(docx_path, output_dir):
    try:
        os.makedirs(output_dir, exist_ok=True)
        convert(docx_path, output_dir)
        pdf_name = os.path.splitext(os.path.basename(docx_path))[0] + ".pdf"
        return os.path.join(output_dir, pdf_name)
    except Exception as e:
        print(f"Erreur de conversion : {e}")
        return None

def extract_text_from_pdf(pdf_path):
    extracted_text = ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if text and len(text.strip()) > 20:
                    extracted_text += text + "\n"
                else:
                    image = page.to_image(resolution=300).original
                    ocr_text = pytesseract.image_to_string(image, lang='eng+fra')
                    extracted_text += ocr_text + "\n"
    except Exception as e:
        print(f"Erreur PDF {pdf_path} : {e}")
    return extracted_text.strip() or None

def clean_name(nom_complet):
    nom_complet = re.sub(r"\b(dr|mr|mme|mrs|m)\b[\.]?", "", nom_complet, flags=re.IGNORECASE)
    return re.sub(r"\s+", " ", nom_complet.strip())

def estimate_years_experience(from_year, to_year=2025):
    try:
        return max(0, int(to_year) - int(from_year))
    except:
        return 0

def extract_full_name(text):
    doc = nlp(text)
    persons = [ent.text.strip() for ent in doc.ents if ent.label_ == "PER"]
    return clean_name(" ".join(persons[:2])) if persons else ""

def extract_domain(text):
    match = re.search(r"(expert|sp[ée]cialiste|consultant|responsable) en ([^\n\.:]+)", text, re.IGNORECASE)
    return match.group(2).strip() if match else ""

def find_earliest_year(text):
    years = [int(y) for y in re.findall(r"\b(19|20)\d{2}\b", text)]
    return min(years) if years else None

def extract_cv_info(text):
    return {
        "nom_complet": extract_full_name(text),
        "domaine_expertise": extract_domain(text),
        "date_diplome_principal": str(find_earliest_year(text) or ""),
        "annees_experience": estimate_years_experience(find_earliest_year(text)),
    }

def save_to_csv(data_list, csv_file):
    file_exists = os.path.isfile(csv_file)
    with open(csv_file, "a", newline="", encoding="utf-8") as f:
        fieldnames = ["nom_complet", "domaine_expertise", "date_diplome_principal", "annees_experience", "fichier"]
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if not file_exists:
            writer.writeheader()
        writer.writerows(data_list)

# === Routes Flask ===
@app.route("/", methods=["GET", "POST"])
def upload():
    if request.method == "POST":
        file = request.files.get("cv")
        if file and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            filepath = os.path.join(UPLOAD_FOLDER, filename)
            file.save(filepath)

            if filename.lower().endswith(".docx"):
                pdf_path = convert_docx_to_pdf(filepath, TEMP_FOLDER)
            else:
                pdf_path = filepath

            text = extract_text_from_pdf(pdf_path)
            if text:
                data = extract_cv_info(text)
                data["fichier"] = f"<a href='/cv/{filename}' target='_blank'>{filename}</a>"
                save_to_csv([data], RESULT_CSV)
                return render_template("result.html", info=data)
            return "Erreur d'extraction du texte."
    return render_template("upload.html")

@app.route("/cv/<path:filename>")
def serve_cv(filename):
    return send_from_directory(UPLOAD_FOLDER, filename)

# === Lancer le serveur ===
if __name__ == "__main__":
    app.run(debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with stat


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [28]:
pip install pdfplumber pytesseract pillow python-docx docx2pdf flask


Collecting flask
  Using cached flask-3.1.1-py3-none-any.whl.metadata (3.0 kB)
Collecting blinker>=1.9.0 (from flask)
  Using cached blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting itsdangerous>=2.2.0 (from flask)
  Using cached itsdangerous-2.2.0-py3-none-any.whl.metadata (1.9 kB)
Collecting werkzeug>=3.1.0 (from flask)
  Using cached werkzeug-3.1.3-py3-none-any.whl.metadata (3.7 kB)
Using cached flask-3.1.1-py3-none-any.whl (103 kB)
Using cached blinker-1.9.0-py3-none-any.whl (8.5 kB)
Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
Using cached werkzeug-3.1.3-py3-none-any.whl (224 kB)
Installing collected packages: werkzeug, itsdangerous, blinker, flask
Successfully installed blinker-1.9.0 flask-3.1.1 itsdangerous-2.2.0 werkzeug-3.1.3
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
