In [2]:
import re
import sentencepiece as spm
import pandas as pd

In [3]:
# Path to the pipe-separated CSV file to load
csv_path = '../jorf_2023.csv'

# Column labels are passed through `names=`, so pandas treats the first line
# of the file as data rather than as a header row.
jorf_columns = ["ID texte", "ID article", "Nature", "N° article", "N° alinéa", "Contenu"]
df = pd.read_csv(csv_path, sep='|', names=jorf_columns)

# Show the DataFrame
print(df)


                    ID texte            ID article  Nature N° article  \
0       JORFTEXT000048734585                   NaN       0        NaN   
1       JORFTEXT000048734585  JORFVERS000048734585       0        NaN   
2       JORFTEXT000048734585  JORFARTI000048734586       1          1   
3       JORFTEXT000048734585  JORFARTI000048734586       1          1   
4       JORFTEXT000048734585  JORFARTI000048734586       1          1   
...                      ...                   ...     ...        ...   
454296  JORFTEXT000046851183  JORFVERS000046851183       0        NaN   
454297  JORFTEXT000046851183  JORFARTI000046851184       1        NaN   
454298  JORFTEXT000046851186                   NaN       0        NaN   
454299  JORFTEXT000046851186  JORFVERS000046851186       0        NaN   
454300  JORFTEXT000046851186  JORFARTI000046851187       1        NaN   

        N° alinéa                                            Contenu  
0               0                     fr/lr/loi/2023

In [9]:
# Read the lines of a text file that will be tokenized
def extract_text_from_file(file_path, encoding='utf-8'):
    """Return the content of ``file_path`` as a list of lines.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that the result does not depend on the platform's locale encoding
            (the corpus contains non-ASCII characters such as « and »).

    Returns:
        list[str]: One entry per line, each keeping its trailing newline
        (this is what ``readlines()`` produces).
    """
    with open(file_path, 'r', encoding=encoding) as f:
        return f.readlines()


# Extract everything that follows the first « (up to the end of that line)
def extract_text_inside(text):
    """Return the text located after the first « guillemet.

    The capture starts right after the first « and runs to the end of the
    line; it is NOT cut at the closing », so any » (and whatever follows it
    on the same line) is part of the result.

    Args:
        text: The string to search. Non-string values (for example the
            NaN/None that pandas uses for empty cells) are accepted and
            treated as "no match".

    Returns:
        str | None: The captured text, or None when ``text`` is not a string,
        contains no «, or has nothing after « on that line.
    """
    # Empty CSV cells arrive as float NaN; re.search would raise TypeError.
    if not isinstance(text, str):
        return None
    match_inside = re.search(r'«(.+)', text)
    return match_inside.group(1) if match_inside else None

# Extract the part of the text that precedes the first «
def extract_text_outside(text):
    """Return the leading part of ``text`` that comes before the first «.

    When ``text`` contains no « at all, the whole string is returned.

    Args:
        text: The string to search. Non-string values (for example the
            NaN/None that pandas uses for empty cells) are accepted and
            treated as "no match".

    Returns:
        str | None: The prefix before the first «, or None when ``text`` is
        not a string, is empty, or starts directly with «.
    """
    # Empty CSV cells arrive as float NaN; re.search would raise TypeError.
    if not isinstance(text, str):
        return None
    match_outside = re.search(r'^[^«]+', text)
    return match_outside.group(0) if match_outside else None

def extracted_data_to_text_file(df, fun1, fun2, output_prefix="../jorf_2023"):
    """Split the 'Contenu' column with two extractors and dump three text files.

    Side effects:
        * Adds (or overwrites) the columns 'Law' and 'Not_Law' on ``df``
          in place, holding ``fun1`` / ``fun2`` applied to 'Contenu'.
        * Writes ``<output_prefix>.txt`` (raw 'Contenu'),
          ``<output_prefix>_Law.txt`` ('Law') and
          ``<output_prefix>_Not_Law.txt`` ('Not_Law'), one value per line,
          UTF-8 encoded, without a trailing newline.

    Args:
        df: DataFrame with a 'Contenu' column.
        fun1: Callable applied to each 'Contenu' value to build 'Law'.
        fun2: Callable applied to each 'Contenu' value to build 'Not_Law'.
        output_prefix: Common path prefix of the three output files. The
            default reproduces the original hard-coded paths.

    Returns:
        None
    """
    df['Law'] = df['Contenu'].apply(fun1)
    df['Not_Law'] = df['Contenu'].apply(fun2)

    targets = (
        ('Contenu', f"{output_prefix}.txt"),
        # What pertains to the Law
        ('Law', f"{output_prefix}_Law.txt"),
        # What does NOT pertain to the Law
        ('Not_Law', f"{output_prefix}_Not_Law.txt"),
    )
    for column, path in targets:
        # Missing values are skipped for every column: str.join() would raise
        # on NaN/None, and the extractors return None when nothing matches.
        lines = df[column].dropna().astype(str).to_list()
        # Explicit UTF-8 so the result does not depend on the platform locale.
        with open(path, "w", encoding="utf-8") as f:
            f.write("\n".join(lines))


# Write the text in a file, one line per entry
def write_text(text, output_file, encoding='utf-8'):
    """Write every string of ``text`` to ``output_file``, one per line.

    Args:
        text: Iterable of strings; a newline is appended to each of them.
        output_file: Path of the file to create (overwritten if it exists).
        encoding: Text encoding of the output. Defaults to UTF-8 so that the
            result does not depend on the platform's locale encoding.

    Returns:
        None
    """
    with open(output_file, 'w', encoding=encoding) as f:
        f.writelines(line + '\n' for line in text)

# Split every line into pieces using the given sentencepiece processor
def tokenize_text(text, sp):
    """Encode each line of ``text`` with ``sp.EncodeAsPieces``.

    Args:
        text: Iterable of strings to tokenize.
        sp: Object exposing ``EncodeAsPieces(line)`` (a loaded
            SentencePieceProcessor in this notebook).

    Returns:
        list: One ``EncodeAsPieces`` result per input line, in input order.
    """
    pieces_per_line = []
    for line in text:
        pieces_per_line.append(sp.EncodeAsPieces(line))
    return pieces_per_line


# Write the tokenized text to a file, one space-separated line per input line
def write_tokenized_text(tokenized_text, output_file, encoding='utf-8'):
    """Write tokenized lines to ``output_file``.

    Each entry of ``tokenized_text`` is joined with single spaces and written
    on its own line. The output is plain space-separated text; no CSV quoting
    or escaping is applied, whatever extension ``output_file`` carries.

    Args:
        tokenized_text: Iterable of sequences of string pieces.
        output_file: Path of the file to create (overwritten if it exists).
        encoding: Text encoding of the output. Defaults to UTF-8: the pieces
            produced by sentencepiece contain the non-ASCII marker "▁", which
            cannot be encoded by several platform-default encodings.

    Returns:
        None
    """
    with open(output_file, 'w', encoding=encoding) as f:
        f.writelines(' '.join(line) + '\n' for line in tokenized_text)



In [7]:
# Adds the 'Law' / 'Not_Law' columns to df (in place) and writes the three
# text corpora ../jorf_2023.txt, ../jorf_2023_Law.txt and
# ../jorf_2023_Not_Law.txt that the training cells below read.
extracted_data_to_text_file(df, extract_text_inside, extract_text_outside)

In [8]:
# General tokenization model: one model is trained on the full corpus for
# each candidate vocabulary size.
candidate_vocab_sizes = (100, 1000, 10000)
for n in candidate_vocab_sizes:
    spm.SentencePieceTrainer.train(
        input='../jorf_2023.txt',
        model_prefix=f'../models/{n}_tokens',
        vocab_size=n,
    )

A vocabulary of 100 tokens is not enough, while 10,000 is excessive.
We will stay at 1,000 tokens for the specialized models.

In [10]:
# Specialized in Law tokenization model:
# trained only on ../jorf_2023_Law.txt, with a vocabulary of 1000 pieces.
spm.SentencePieceTrainer.train(
    input='../jorf_2023_Law.txt',
    # Plain string literal: the prefix has no placeholder, so no f-string is needed.
    model_prefix='../models/1000_tokens_Law',
    vocab_size=1000,
)

In [11]:
# Specialized in Not_Law tokenization model:
# trained only on ../jorf_2023_Not_Law.txt, with a vocabulary of 1000 pieces.
spm.SentencePieceTrainer.train(
    input='../jorf_2023_Not_Law.txt',
    # Plain string literal: the prefix has no placeholder, so no f-string is needed.
    model_prefix='../models/1000_tokens_Not_Law',
    vocab_size=1000,
)

In [12]:
# Load the three 1000-token models, then tokenize each text file with its
# matching model. The results stay in memory; nothing is written in this cell.
sp_general = spm.SentencePieceProcessor()
sp_law = spm.SentencePieceProcessor()
sp_not_law = spm.SentencePieceProcessor()
for processor, model_path in (
    (sp_general, '../models/1000_tokens.model'),
    (sp_law, '../models/1000_tokens_Law.model'),
    (sp_not_law, '../models/1000_tokens_Not_Law.model'),
):
    processor.Load(model_path)

general_tokenized = tokenize_text(
    extract_text_from_file('../jorf_2023.txt'),
    sp_general,
)
law_tokenized = tokenize_text(
    extract_text_from_file('../jorf_2023_Law.txt'),
    sp_law,
)
not_law_tokenized = tokenize_text(
    extract_text_from_file('../jorf_2023_Not_Law.txt'),
    sp_not_law,
)

In [13]:
# Destination of each tokenized corpus
output_file_general_tokenized = '../jorf_2023_general_tokenized.csv'
output_file_law_tokenized = '../jorf_2023_law_tokenized.csv'
output_file_not_law_tokenized = '../jorf_2023_not_law_tokenized.csv'

# Write the corpora in the same order as before: general, Law, Not_Law
for tokens, destination in (
    (general_tokenized, output_file_general_tokenized),
    (law_tokenized, output_file_law_tokenized),
    (not_law_tokenized, output_file_not_law_tokenized),
):
    write_tokenized_text(tokens, destination)