# **1. Installation des dépendances**


In [1]:
!pip install -U spacy transformers datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

# **2. Téléversement et extraction des données annotées**

In [2]:
import os, io, zipfile, json
from google.colab import files

# Téléversement
print("Veuillez téléverser le fichier ZIP contenant vos CVs annotés...")
uploaded = files.upload()

if not uploaded:
    raise ValueError("Aucun fichier n'a été téléversé")

zip_filename = next(iter(uploaded.keys()))
print(f"Fichier {zip_filename} reçu, extraction en cours...")

# Extraction
with zipfile.ZipFile(io.BytesIO(uploaded[zip_filename]), 'r') as zip_ref:
    zip_ref.extractall('cv_data')


Veuillez téléverser le fichier ZIP contenant vos CVs annotés...


Saving CV_annot.zip to CV_annot.zip
Fichier CV_annot.zip reçu, extraction en cours...


 # **3. Conversion au format spaCy (.spacy)**

## **3.1 Fonction de conversion**

In [14]:
def convert_to_spacy_format(data):
    converted = []
    for text, ann in data["annotations"]:
        entities = []
        for start, end, label in ann["entities"]:
            entities.append((start, end, label))
        converted.append((text, {"entities": entities}))
    return converted

## **3.2 Création du fichier train.spacy**

In [15]:
from spacy.tokens import DocBin
import spacy

nlp = spacy.blank("en")
doc_bin = DocBin()
all_train_data = []
json_count = 0

print("\nChargement des fichiers JSON...")
for root, dirs, files_in_dir in os.walk('cv_data'):
    for file in files_in_dir:
        if file.endswith('.json'):
            file_path = os.path.join(root, file)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                    examples = convert_to_spacy_format(data)
                    for text, annot in examples:
                        doc = nlp.make_doc(text)
                        ents = []
                        for start, end, label in annot["entities"]:
                            span = doc.char_span(start, end, label=label)
                            if span:
                                ents.append(span)
                        doc.ents = ents
                        doc_bin.add(doc)
                    json_count += 1
                    print(f" {file} - {len(examples)} exemples")
            except Exception as e:
                print(f"Erreur avec {file}: {str(e)}")

output_path = "train.spacy"
doc_bin.to_disk(output_path)
print(f"\n {json_count} fichiers traités et sauvegardés dans '{output_path}'")


Chargement des fichiers JSON...
 cv5.json - 1 exemples
 cv44.json - 31 exemples
 cv33.json - 40 exemples
 cv53.json - 44 exemples
 cv25.json - 43 exemples
 cv28.json - 40 exemples
 cv12.json - 1 exemples
 cv47.json - 70 exemples
 cv26.json - 43 exemples
 cv49.json - 53 exemples
 cv13.json - 1 exemples
 cv16.json - 1 exemples
 cv46.json - 67 exemples
 cv15.json - 1 exemples
 cv11.json - 1 exemples
 cv7.json - 1 exemples
 cv51.json - 48 exemples
 cv29.json - 63 exemples
 cv4.json - 1 exemples
 cv42.json - 33 exemples
 cv35.json - 65 exemples
 cv8.json - 1 exemples
 cv14.json - 1 exemples
 cv43.json - 31 exemples
 cv2.json - 1 exemples
 cv27.json - 58 exemples
 cv6.json - 1 exemples
 cv34.json - 40 exemples
 cv1.json - 1 exemples
 cv45.json - 69 exemples
 cv19.json - 1 exemples
 cv40.json - 26 exemples
 cv31.json - 37 exemples
 cv54.json - 31 exemples
 cv39.json - 64 exemples
 cv55.json - 34 exemples
 cv20.json - 1 exemples
 cv21.json - 44 exemples
 cv48.json - 74 exemples
 cv41.json - 3

## **3.3. Split en train / dev / test**

In [22]:
docs = list(doc_bin.get_docs(nlp.vocab))
n = len(docs)
train_end = int(0.8 * n)
dev_end = int(0.9 * n)

train_docbin = DocBin()
dev_docbin = DocBin()
test_docbin = DocBin()

for i, doc in enumerate(docs):
    if i < train_end:
        train_docbin.add(doc)
    elif i < dev_end:
        dev_docbin.add(doc)
    else:
        test_docbin.add(doc)

train_docbin.to_disk("train.spacy")
dev_docbin.to_disk("dev.spacy")
test_docbin.to_disk("test.spacy")

print(f" Dataset split : {train_end} train, {dev_end - train_end} dev, {n - dev_end} test")

 Dataset split : 1224 train, 153 dev, 153 test


#  **4. Initialisation et configuration du pipeline spaCy**

## **4.1 Génération du fichier de configuration**

In [23]:
!python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency --force

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


## **4.2 Modification manuelle pour utiliser BERT**

In [24]:
with open("config.cfg", "r") as f:
    config_text = f.read()

print('\n'.join(config_text.splitlines()[:30]))

# Remplacer la ligne du modèle par BERT
config_text = config_text.replace(
    'name = "roberta-base"',
    'name = "bert-base-cased"'
)

# Réécrire le fichier
with open("config.cfg", "w") as f:
    f.write(config_text)

[paths]
train = null
dev = null
vectors = null
init_tok2vec = null

[system]
gpu_allocator = null
seed = 0

[nlp]
lang = "en"
pipeline = ["tok2vec","ner"]
batch_size = 1000
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
vectors = {"@vectors":"spacy.Vectors.v1"}

[components]

[components.ner]
factory = "ner"
incorrect_spans_key = null
moves = null
scorer = {"@scorers":"spacy.ner_scorer.v1"}
update_with_oracle_cut_size = 100



# **5. Entraînement et évaluation du modèle**

## **5.1. Entraînement du modèle**

In [25]:
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     52.18    0.00    0.00    0.00    0.00
  1     200       5748.12   3374.12   25.61   38.89   19.09    0.26
  3     400       4342.94   2616.29   50.78   59.04   44.55    0.51
  6     600        609.59   1639.68   62.89   72.62   55.45    0.63
  9     800       1349.12   1238.63   62.03   75.32   52.73    0.62
 12    1000       1667.10   1005.78   62.89   72.62   55.45    0.63
 16    1200        444.77    779.77   62.57   81.16   50.91    0.63
 20    1400        633.21    658.63   60.61   68.18   54.55    0.61
 25    1600        632.85    570.14   62.43   74.68   53.64    0.62
 31    1800        388.40    538.22   62.50   

## **5.2. Évaluation du modèle sur le test set**

In [26]:
!python -m spacy evaluate output/model-best ./test.spacy --output metrics.json

[38;5;4mℹ Using CPU[0m
[1m

TOK     100.00
NER P   64.81 
NER R   56.91 
NER F   60.61 
SPEED   17961 

[1m

                       P        R        F
DIPLOMA           100.00    72.73    84.21
SKILLS             61.40    66.04    63.64
WORK_EXPERIENCE    71.43    76.92    74.07
OTHERS              0.00     0.00     0.00
LANGUAGES          90.00    90.00    90.00
EMAIL              60.00    75.00    66.67
ADDRESS             0.00     0.00     0.00
FIRST NAME         50.00    20.00    28.57
LAST NAME           0.00     0.00     0.00
PHONE               0.00     0.00     0.00
GENDER            100.00   100.00   100.00
BIRTH DATE        100.00   100.00   100.00

[38;5;2m✔ Saved results to metrics.json[0m


## **5.3. Affichage des métriques de performance**

In [27]:
import json

with open("metrics.json", "r", encoding="utf-8") as f:
    metrics = json.load(f)
    print("Évaluation finale du modèle :")
    print(f" - Précision (Precision) : {metrics['ents_p']:.2f}")
    print(f" - Rappel (Recall)       : {metrics['ents_r']:.2f}")
    print(f" - F-score (F1)          : {metrics['ents_f']:.2f}")

Évaluation finale du modèle :
 - Précision (Precision) : 0.65
 - Rappel (Recall)       : 0.57
 - F-score (F1)          : 0.61


# **6. Test sur un nouveau CV**

## **6.1 Téléversement d’un nouveau CV**

In [32]:
from google.colab import files

uploaded = files.upload()
# Récupérer le nom du fichier téléversé
filename = next(iter(uploaded))

Saving test.txt to test (1).txt


## **6.2 Analyse par le modèle**

In [33]:
import pandas as pd
import spacy

nlp = spacy.load("output/model-best")


# Lire le contenu du fichier texte
with open(filename, 'r', encoding='utf-8') as f:
    text = f.read()

# Analyser le texte avec le modèle NER entraîné
doc = nlp(text)

# Extraire les entités dans un tableau
entities = [{
    "Entité": ent.text,
    "Type": ent.label_
} for ent in doc.ents]

# Afficher dans un DataFrame
df = pd.DataFrame(entities)
df

Unnamed: 0,Entité,Type
0,Morgan,FIRST NAME
1,Maxwell,LAST NAME
2,"March 5, 1990",BIRTH DATE
3,Female,GENDER
4,maxwell.morgan@example.com,EMAIL
5,+1 555-123-4567,PHONE
6,"Maple Street, Boston",BIRTH DATE
7,Python,SKILLS
8,Java,SKILLS
9,C++,SKILLS
