In [None]:
!pip install sentencepiece



In [None]:
!pip install transformers



In [None]:
# reading in the data via the Kaggle API
# mount your Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# install Kaggle
! pip install kaggle



In [None]:
!mkdir ~/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [None]:
#read in your Kaggle credentials from Google Drive
!cp /content/drive/MyDrive/Kaggle/kaggle.json ~/.kaggle/kaggle.json

In [None]:
# download the dataset from the competition page
! kaggle competitions download -c detecting-french-texts-difficulty-level-2023
from zipfile import ZipFile
with ZipFile('detecting-french-texts-difficulty-level-2023.zip','r') as zip:
  zip.extractall(path="")

detecting-french-texts-difficulty-level-2023.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
import requests
from bs4 import BeautifulSoup
import nltk
import pandas as pd

nltk.download('punkt')

def clean_text(text):
    """Strip layout artefacts from a piece of scraped text.

    Every newline, every tab and every ``>>`` marker is deleted (replaced by
    the empty string, not by a space), in that order.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    # Order matters: a ``>>`` that only appears once the newline/tab between
    # two ``>`` has been deleted is removed as well.
    for artefact in ('\n', '\t', '>>'):
        text = text.replace(artefact, '')
    return text

def scrape_articles(urls_grades, language='french', timeout=30):
    """Scrape web pages and turn them into labelled sentences.

    For every ``(url, grade)`` pair the page is downloaded, the text of all
    ``<p>`` elements is cleaned with ``clean_text`` and joined with spaces,
    the result is split into sentences, and each sentence is labelled with
    ``grade``.

    Args:
        urls_grades: iterable of ``(url, grade)`` pairs.
        language: Punkt model used by ``nltk.sent_tokenize``. The scraped
            articles are French, so the default is ``'french'`` (NLTK itself
            defaults to English, which mis-splits French abbreviations).
        timeout: seconds to wait for each HTTP request before giving up.

    Returns:
        A DataFrame with the columns ``sentence`` and ``difficulty``; it is
        empty when ``urls_grades`` yields no pair.

    Raises:
        requests.HTTPError: if a page answers with an HTTP error status.
        requests.RequestException: on connection problems or timeouts.
    """
    columns = ['sentence', 'difficulty']
    all_dataframes = []

    for url, grade in urls_grades:
        # Without a timeout a stalled server would hang the notebook forever.
        response = requests.get(url, timeout=timeout)
        # Fail loudly rather than label the text of an error page with `grade`.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract article content
        paragraphs = soup.find_all('p')
        article_text = ' '.join([clean_text(p.get_text()) for p in paragraphs])

        # Tokenize the article text into sentences
        sentences = nltk.sent_tokenize(article_text, language=language)

        # One row per sentence, all carrying the grade of the source page
        all_dataframes.append(
            pd.DataFrame({'sentence': sentences, 'difficulty': [grade] * len(sentences)})
        )

    # pd.concat raises ValueError on an empty list; return an empty frame instead.
    if not all_dataframes:
        return pd.DataFrame(columns=columns)

    # Concatenate all DataFrames into a single dataset
    return pd.concat(all_dataframes, ignore_index=True)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Extra training material: (article URL, CEFR grade applied to every sentence of that page)
# NOTE(review): the C2 entry is the Mediapart landing page with ad-tracking
# parameters, not an article; the cookie-banner sentences visible at the end of
# the combined training frame appear to come from it. Confirm this is intended.
urls_grades = [
    ("https://www.letemps.ch/monde/europe/a-un-an-de-la-reouverture-de-notre-dame-la-fleche-de-la-cathedrale-plombe-la-fete", "C1"),
    ("https://histoiresdouces.fr/histoire-dans-l-arbre", "A2"),
    ("https://www.meshistoiresdusoir.fr/h/le-soldat-courageux/", "B1"),
    ("http://www.isabelle-et-ses-amis.com/fr/divers-ados/la-vengeance-de-zaoar.html", "B2"),
    ("https://www.mediapart.fr/?at_medium=sl&at_campaign=Marque&at_platform=google&at_creation=Mediapart-Large&at_variant=&at_network=[search]&at_term=mediapart&gclid=Cj0KCQiAsburBhCIARIsAExmsu7fI-BN4zFyli8HX3TFDFbs24RUirpTHXH8cHjlKlBQw1_3bXpVkvQaAjRAEALw_wcB", "C2"),
]

add_data = scrape_articles(urls_grades)
# second cleaning pass, now applied to the individual sentences
add_data['sentence'] = add_data['sentence'].apply(clean_text)

#

In [None]:
# read in your training data
import pandas as pd
import numpy as np

training = pd.read_csv('training_data.csv', index_col = 'id')
test = pd.read_csv('unlabelled_test_data.csv')
training = pd.concat([training, add_data], axis=0, ignore_index=True)


In [None]:
# dropna() and drop_duplicates() return new frames; their results were being
# discarded, so no row was ever removed. Assign the cleaned frame back and
# renumber the rows so the index stays contiguous.
training = training.dropna().drop_duplicates().reset_index(drop=True)
training.head()


Unnamed: 0,sentence,difficulty
0,Les coûts kilométriques réels peuvent diverger...,C1
1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,Le test de niveau en français est sur le site ...,A1
3,Est-ce que ton mari est aussi de Boston?,A1
4,"Dans les écoles de commerce, dans les couloirs...",B1
...,...,...
5497,Voici les différents cookies et technologies s...,C2
5498,Voici les outils tiers compris dans cette caté...,C2
5499,Mais nous faisons la promotion de nos contenus...,C2
5500,"Pour cela, nous utilisons les technologies mis...",C2


The first step before training and testing is to encode the `difficulty` column. We use `LabelEncoder` to add a new column, `encoded_diff`, in which A1 = 0, A2 = 1, …, C2 = 5.


In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
label_encoder = LabelEncoder()
training['encoded_diff'] = label_encoder.fit_transform(training['difficulty'])


In [None]:
%%capture
!python -m spacy download fr_core_news_lg


Load the French spaCy model, then parse our sentences with spaCy and tokenize them.

In [None]:
import numpy as np
import spacy
sp = spacy.load('fr_core_news_lg')
spacy_stopwords = spacy.lang.fr.stop_words.STOP_WORDS



In [None]:
#function that tokenize, takes out stopwords, and counts token in df
def tokenize_stop_words_count(df):
  """Parse every sentence with spaCy and add token-based columns to ``df``.

  Columns written (``df`` is modified in place and also returned):
    sentence_sp          result of calling the global pipeline ``sp`` on the sentence
    tokens               text of every token of that result
    tokens_no_stop       tokens whose lower-cased text is not in ``spacy_stopwords``
    token_count_no_stop  length of ``tokens_no_stop``
    token_count          length of ``tokens``
  """
  def token_texts(doc):
    return [token.text for token in doc]

  def without_stopwords(tokens):
    # punctuation is kept: only stop words are filtered out
    return [token for token in tokens if token.lower() not in spacy_stopwords]

  df['sentence_sp'] = df['sentence'].apply(sp)
  df['tokens'] = df['sentence_sp'].apply(token_texts)
  df['tokens_no_stop'] = df['tokens'].apply(without_stopwords)
  df['token_count_no_stop'] = df['tokens_no_stop'].apply(len)
  df['token_count'] = df['tokens'].apply(len)
  return df

In [None]:
training = tokenize_stop_words_count(training)

In [None]:
def count_verbs_nouns_adj(df):
  """Count verbs, nouns, adjectives and adverbs in every parsed sentence.

  Reads the ``sentence_sp`` column (the already-parsed spaCy documents written
  by ``tokenize_stop_words_count``) and writes one integer column per
  part of speech: ``nb_verbs``, ``nb_nouns``, ``nb_adj`` and ``nb_adv``.

  Args:
    df: DataFrame with a ``sentence_sp`` column of iterables whose items
      expose a ``pos_`` attribute.

  Returns:
    The same DataFrame, modified in place.
  """
  pos_columns = {
    'nb_verbs': 'VERB',
    'nb_nouns': 'NOUN',
    'nb_adj': 'ADJ',
    'nb_adv': 'ADV',
  }
  for column, tag in pos_columns.items():
    # The documents are already tagged, so iterate them directly. Passing them
    # through the pipeline again (`sp(x)`) re-ran the whole model four times
    # per sentence for no benefit.
    df[column] = df['sentence_sp'].apply(
      lambda doc, tag=tag: sum(1 for token in doc if token.pos_ == tag)
    )
  return df

In [None]:
training = count_verbs_nouns_adj(training)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def _tfidf_sentence_score(df, ngram_range, score_column):
  """Shared implementation of the two public TF-IDF scoring functions.

  A ``TfidfVectorizer`` is fitted on ``df['sentence']`` itself, the TF-IDF
  weight of every vocabulary entry is summed over the whole corpus, and each
  sentence is scored with the sum of the corpus totals of its words.

  Args:
    df: DataFrame with a ``sentence`` column of strings.
    ngram_range: ``ngram_range`` forwarded to ``TfidfVectorizer``.
    score_column: name of the column that receives the score.

  Returns:
    ``df``, modified in place (columns ``words`` and ``score_column``).
  """
  corpus = df['sentence'].tolist()
  tfidf = TfidfVectorizer(ngram_range=ngram_range, stop_words=list(spacy_stopwords))
  features = tfidf.fit_transform(corpus)
  # Sum each vocabulary column on the sparse matrix. Materialising the dense
  # documents x vocabulary table only to sum its columns wastes memory,
  # especially once bigrams are part of the vocabulary.
  column_totals = np.asarray(features.sum(axis=0)).ravel()
  word_freq = pd.Series(column_totals, index=tfidf.get_feature_names_out()).to_dict()
  # NOTE(review): words come from a plain whitespace split, so a word with
  # punctuation attached ("commerce,") never matches the vocabulary, and no
  # bigram is ever looked up even when ngram_range includes them. Kept as is
  # so the feature values stay comparable; confirm whether this is intended.
  df['words'] = df['sentence'].apply(lambda x: x.lower().split())
  df[score_column] = df['words'].apply(lambda words: sum(word_freq.get(word, 0) for word in words))
  return df


def tfidf_sentence_unigram(df):
  """Add ``tfidf_score_unigram``: sentence score from a unigram TF-IDF model.

  The vectorizer is fitted on the frame it receives, so scores of different
  frames (e.g. training vs. test) are based on different corpus statistics.
  """
  return _tfidf_sentence_score(df, (1, 1), 'tfidf_score_unigram')


def tfidf_sentence_1_2_grams(df):
  """Add ``tfidf_score_1_2gram``: sentence score from a 1-2-gram TF-IDF model.

  Same procedure as ``tfidf_sentence_unigram`` with ``ngram_range=(1, 2)``.
  """
  return _tfidf_sentence_score(df, (1, 2), 'tfidf_score_1_2gram')


In [None]:
training = tfidf_sentence_unigram(training)
training = tfidf_sentence_1_2_grams(training)



In [None]:
test = tokenize_stop_words_count(test)

In [None]:
test = count_verbs_nouns_adj(test)

In [None]:
test = tfidf_sentence_unigram(test)
test = tfidf_sentence_1_2_grams(test)




In [None]:
from transformers import CamembertConfig, CamembertModel, CamembertTokenizer, CamembertForSequenceClassification
import torch


model_name = 'camembert-base'  # You can choose a different model if needed
camembert_model = CamembertModel.from_pretrained(model_name)
# The SentencePiece vocabulary (sentencepiece.bpe.model) is downloaded with the
# checkpoint. The former `sentencepiece_model=` argument pointed at a
# machine-specific site-packages directory rather than a model file, and is not
# a tokenizer parameter, so it is dropped.
tokenizer = CamembertTokenizer.from_pretrained(model_name, revision="main")


sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
import torch
import pandas as pd
import numpy as np
from transformers import BertModel, BertTokenizer

def get_bert_embedding(sentence, model, tokenizer):
    """Embed ``sentence`` as the mean of the model's last hidden states.

    Args:
        sentence: text handed to ``tokenizer``.
        model: callable that accepts the tokenizer output as keyword arguments
            and returns an object with a ``last_hidden_state`` tensor.
        tokenizer: callable producing the model inputs as PyTorch tensors.

    Returns:
        NumPy array holding ``last_hidden_state`` averaged over dimension 1.
    """
    encoded = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
    # inference only: no gradient bookkeeping is needed
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    sentence_vector = hidden_states.mean(dim=1)
    return sentence_vector.numpy()

In [None]:
def max_pool_embeddings(embeddings):
    """Reduce ``embeddings`` to the element-wise maximum along axis 0.

    For an array with a single row this simply drops the leading axis.
    """
    pooled = np.amax(embeddings, axis=0)
    return pooled

In [None]:
training['camembert_embedding'] = training['sentence'].apply(lambda x: get_bert_embedding(x, camembert_model, tokenizer))


In [None]:
test['camembert_embedding'] = test['sentence'].apply(lambda x: get_bert_embedding(x, camembert_model, tokenizer))

In [None]:
# Reduce every stored embedding along its first axis; the function is passed
# directly, no wrapping lambda is needed.
training['cam_pooled_embedding'] = training['camembert_embedding'].apply(max_pool_embeddings)
test['cam_pooled_embedding'] = test['camembert_embedding'].apply(max_pool_embeddings)


In [None]:
training.head()
training.dropna()

Unnamed: 0,sentence,difficulty,encoded_diff,sentence_sp,tokens,tokens_no_stop,token_count_no_stop,token_count,nb_verbs,nb_nouns,nb_adj,nb_adv,words,tfidf_score_unigram,tfidf_score_1_2gram,camembert_embedding,cam_pooled_embedding
0,Les coûts kilométriques réels peuvent diverger...,C1,4,"(Les, coûts, kilométriques, réels, peuvent, di...","[Les, coûts, kilométriques, réels, peuvent, di...","[coûts, kilométriques, réels, diverger, sensib...",27,45,4,14,3,2,"[les, coûts, kilométriques, réels, peuvent, di...",28.467464,19.426037,"[[0.04393054, -0.012339459, 0.03234782, 0.1047...","[0.04393054, -0.012339459, 0.03234782, 0.10470..."
1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1,0,"(Le, bleu, ,, c', est, ma, couleur, préférée, ...","[Le, bleu, ,, c', est, ma, couleur, préférée, ...","[bleu, ,, couleur, préférée, aime, vert, !]",7,16,1,3,1,2,"[le, bleu,, c'est, ma, couleur, préférée, mais...",6.346735,4.307671,"[[0.031562533, -0.064943954, -0.062135864, 0.0...","[0.031562533, -0.064943954, -0.062135864, 0.05..."
2,Le test de niveau en français est sur le site ...,A1,0,"(Le, test, de, niveau, en, français, est, sur,...","[Le, test, de, niveau, en, français, est, sur,...","[test, niveau, français, site, Internet, école...",7,15,1,4,1,0,"[le, test, de, niveau, en, français, est, sur,...",37.878158,25.101195,"[[0.017271552, -0.05883773, -0.020093849, -0.0...","[0.017271552, -0.05883773, -0.020093849, -0.04..."
3,Est-ce que ton mari est aussi de Boston?,A1,0,"(Est, -ce, que, ton, mari, est, aussi, de, Bos...","[Est, -ce, que, ton, mari, est, aussi, de, Bos...","[-ce, mari, Boston, ?]",4,10,0,2,0,1,"[est-ce, que, ton, mari, est, aussi, de, boston?]",6.505797,4.420138,"[[0.0041115624, -0.07472146, -0.021084685, 0.0...","[0.0041115624, -0.07472146, -0.021084685, 0.01..."
4,"Dans les écoles de commerce, dans les couloirs...",B1,2,"(Dans, les, écoles, de, commerce, ,, dans, les...","[Dans, les, écoles, de, commerce, ,, dans, les...","[écoles, commerce, ,, couloirs, places, financ...",24,42,4,10,2,1,"[dans, les, écoles, de, commerce,, dans, les, ...",97.445890,66.224032,"[[0.025271175, 0.009502788, -0.07489673, 0.161...","[0.025271175, 0.009502788, -0.07489673, 0.1611..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5497,Voici les différents cookies et technologies s...,C2,5,"(Voici, les, différents, cookies, et, technolo...","[Voici, les, différents, cookies, et, technolo...","[cookies, technologies, similaires, catégorie,...",25,48,7,13,4,0,"[voici, les, différents, cookies, et, technolo...",40.911849,27.928791,"[[0.06006707, -0.040940583, -0.015274796, 0.03...","[0.06006707, -0.040940583, -0.015274796, 0.039..."
5498,Voici les outils tiers compris dans cette caté...,C2,5,"(Voici, les, outils, tiers, compris, dans, cet...","[Voici, les, outils, tiers, compris, dans, cet...","[outils, tiers, catégorie, , :, Reciblage, pu...",10,22,3,4,2,2,"[voici, les, outils, tiers, compris, dans, cet...",11.272136,7.905061,"[[0.041568007, -0.080894, -0.058108833, 0.0222...","[0.041568007, -0.080894, -0.058108833, 0.02222..."
5499,Mais nous faisons la promotion de nos contenus...,C2,5,"(Mais, nous, faisons, la, promotion, de, nos, ...","[Mais, nous, faisons, la, promotion, de, nos, ...","[faisons, promotion, contenus, services, sites...",8,18,1,5,2,0,"[mais, nous, faisons, la, promotion, de, nos, ...",12.976926,8.813108,"[[0.073869385, 0.068457544, -0.05252191, 0.063...","[0.073869385, 0.068457544, -0.05252191, 0.0630..."
5500,"Pour cela, nous utilisons les technologies mis...",C2,5,"(Pour, cela, ,, nous, utilisons, les, technolo...","[Pour, cela, ,, nous, utilisons, les, technolo...","[,, utilisons, technologies, mises, dispositio...",8,15,2,3,1,0,"[pour, cela,, nous, utilisons, les, technologi...",11.311183,7.664863,"[[0.033559702, 0.103856005, -0.037337154, 0.03...","[0.033559702, 0.103856005, -0.037337154, 0.035..."


In [None]:
test.head()
test.dropna()

Unnamed: 0,id,sentence,sentence_sp,tokens,tokens_no_stop,token_count_no_stop,token_count,nb_verbs,nb_nouns,nb_adj,nb_adv,words,tfidf_score_unigram,tfidf_score_1_2gram,camembert_embedding,cam_pooled_embedding
0,0,Nous dûmes nous excuser des propos que nous eû...,"(Nous, dûmes, nous, excuser, des, propos, que,...","[Nous, dûmes, nous, excuser, des, propos, que,...","[dûmes, excuser, propos, eûmes, prononcés]",5,10,3,1,0,0,"[nous, dûmes, nous, excuser, des, propos, que,...",3.192917,2.331769,"[[0.047271907, 0.030453198, -0.022989253, 0.10...","[0.047271907, 0.030453198, -0.022989253, 0.109..."
1,1,Vous ne pouvez pas savoir le plaisir que j'ai ...,"(Vous, ne, pouvez, pas, savoir, le, plaisir, q...","[Vous, ne, pouvez, pas, savoir, le, plaisir, q...","[pouvez, savoir, plaisir, recevoir, bonne, nou...",7,16,4,1,2,2,"[vous, ne, pouvez, pas, savoir, le, plaisir, q...",12.660056,8.902162,"[[-0.0065091653, -0.11096406, -0.0380778, 0.06...","[-0.0065091653, -0.11096406, -0.0380778, 0.065..."
2,2,"Et, paradoxalement, boire froid n'est pas la b...","(Et, ,, paradoxalement, ,, boire, froid, n', e...","[Et, ,, paradoxalement, ,, boire, froid, n', e...","[,, paradoxalement, ,, boire, froid, bonne, pa...",8,13,1,1,2,3,"[et,, paradoxalement,, boire, froid, n'est, pa...",8.244312,5.840872,"[[-0.035896078, -0.04058774, 0.08143554, 0.039...","[-0.035896078, -0.04058774, 0.08143554, 0.0398..."
3,3,"Ce n'est pas étonnant, car c'est une saison my...","(Ce, n', est, pas, étonnant, ,, car, c', est, ...","[Ce, n', est, pas, étonnant, ,, car, c', est, ...","[étonnant, ,, saison, mystérieuse]",4,12,0,1,2,2,"[ce, n'est, pas, étonnant,, car, c'est, une, s...",1.943283,1.444063,"[[0.034818918, -0.037731417, -0.055650946, 0.0...","[0.034818918, -0.037731417, -0.055650946, 0.08..."
4,4,"Le corps de Golo lui-même, d'une essence aussi...","(Le, corps, de, Golo, lui-même, ,, d', une, es...","[Le, corps, de, Golo, lui-même, ,, d', une, es...","[corps, Golo, ,, essence, surnaturelle, montur...",40,83,9,13,5,8,"[le, corps, de, golo, lui-même,, d'une, essenc...",15.581025,10.953801,"[[0.011139873, 0.057788055, -0.029189765, 0.12...","[0.011139873, 0.057788055, -0.029189765, 0.120..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,1195,C'est un phénomène qui trouve une accélération...,"(C', est, un, phénomène, qui, trouve, une, acc...","[C', est, un, phénomène, qui, trouve, une, acc...","[phénomène, trouve, accélération, formidable, ...",11,26,3,7,1,0,"[c'est, un, phénomène, qui, trouve, une, accél...",8.477968,5.939945,"[[0.038071394, -0.027568962, -0.014385092, 0.0...","[0.038071394, -0.027568962, -0.014385092, 0.08..."
1196,1196,Je vais parler au serveur et voir si on peut d...,"(Je, vais, parler, au, serveur, et, voir, si, ...","[Je, vais, parler, au, serveur, et, voir, si, ...","[serveur, voir, déplacer, tables, .]",5,14,5,2,0,0,"[je, vais, parler, au, serveur, et, voir, si, ...",5.018744,3.563436,"[[0.04553629, 0.030349812, -0.06302876, 0.0106...","[0.04553629, 0.030349812, -0.06302876, 0.01066..."
1197,1197,Il n'était pas comme tant de gens qui par pare...,"(Il, n', était, pas, comme, tant, de, gens, qu...","[Il, n', était, pas, comme, tant, de, gens, qu...","[paresse, sentiment, résigné, obligation, crée...",40,88,12,15,6,4,"[il, n'était, pas, comme, tant, de, gens, qui,...",18.140234,12.651893,"[[0.0074568326, -0.028050965, -0.0016393242, 0...","[0.0074568326, -0.028050965, -0.0016393242, 0...."
1198,1198,Ils deviennent dangereux pour notre économie.,"(Ils, deviennent, dangereux, pour, notre, écon...","[Ils, deviennent, dangereux, pour, notre, écon...","[deviennent, dangereux, économie, .]",4,7,1,1,1,0,"[ils, deviennent, dangereux, pour, notre, écon...",2.230427,1.623497,"[[0.031044463, 0.06897067, -0.041632712, -0.00...","[0.031044463, 0.06897067, -0.041632712, -0.003..."


In [None]:
# Persist the engineered features.
# NOTE(review): CSV stores object columns (spaCy documents, token lists,
# embedding arrays) as their string representation, so they will not come back
# as objects/arrays from pd.read_csv without extra parsing — confirm the
# downstream notebook handles this.
training.to_csv('new_training.csv', index=False)
test.to_csv('new_test.csv', index=False)

In [None]:
from google.colab import files
files.download('new_training.csv')
files.download('new_test.csv' )

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>