In [1]:

# reading in the data via the Kaggle API

# mount your Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:

# install Kaggle
! pip install kaggle



In [3]:
!mkdir ~/.kaggle

In [4]:
#read in your Kaggle credentials from Google Drive
!cp /content/drive/MyDrive/Kaggle/kaggle.json ~/.kaggle/kaggle.json

In [5]:
# download the dataset from the competition page
! kaggle competitions download -c detecting-french-texts-difficulty-level-2023
from zipfile import ZipFile
with ZipFile('detecting-french-texts-difficulty-level-2023.zip','r') as zip:
  zip.extractall(path="")

Downloading detecting-french-texts-difficulty-level-2023.zip to /content
  0% 0.00/303k [00:00<?, ?B/s]
100% 303k/303k [00:00<00:00, 73.4MB/s]


In [6]:
# read in your training data
import pandas as pd
import numpy as np

training = pd.read_csv('training_data.csv', index_col = 'id')

In [7]:
training.head()
training.dropna()
training.drop_duplicates()


Unnamed: 0_level_0,sentence,difficulty
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Les coûts kilométriques réels peuvent diverger...,C1
1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,Le test de niveau en français est sur le site ...,A1
3,Est-ce que ton mari est aussi de Boston?,A1
4,"Dans les écoles de commerce, dans les couloirs...",B1
...,...,...
4795,"C'est pourquoi, il décida de remplacer les hab...",B2
4796,Il avait une de ces pâleurs splendides qui don...,C1
4797,"Et le premier samedi de chaque mois, venez ren...",A2
4798,Les coûts liés à la journalisation n'étant pas...,C2


The first thing to do in order to train/test the data is to encode the column difficulty. We used labelencoder to have a new column with A1 = 0 ... C2 =5


In [8]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
label_encoder = LabelEncoder()
training['encoded_diff'] = label_encoder.fit_transform(training['difficulty'])


In [9]:
%%capture
!python -m spacy download fr_core_news_lg


Transform our sentences to spacy + tokenize

In [10]:
import numpy as np
import spacy
sp = spacy.load('fr_core_news_lg')
spacy_stopwords = spacy.lang.fr.stop_words.STOP_WORDS



In [11]:
#function that tokenize, takes out stopwords, and counts token in df
def tokenize_stop_words_count(df):
  df['sentence'] = df['sentence'].apply(sp)
  df['tokens'] = df['sentence'].apply(lambda doc: [token.text for token in doc])
  df['tokens_no_stop'] = df['tokens'].apply(lambda tokens: [token for token in tokens if token.lower() not in spacy_stopwords])
  df['token_count'] = df['tokens_no_stop'].apply(len)
  return df

In [12]:
training = tokenize_stop_words_count(training)

In [13]:
def count_verbs_nouns_adj(df):
  df['nb_verbs'] = df['sentence'].apply(lambda x: sum(1 for token in sp(x) if token.pos_ == 'VERB'))
  df['nb_nouns'] = df['sentence'].apply(lambda x: sum(1 for token in sp(x) if token.pos_ == 'NOUN'))
  df['nb_adj'] = df['sentence'].apply(lambda x: sum(1 for token in sp(x) if token.pos_ == 'ADJ'))
  return df

In [14]:
training = count_verbs_nouns_adj(training)

In [15]:
training.head()

Unnamed: 0_level_0,sentence,difficulty,encoded_diff,tokens,tokens_no_stop,token_count,nb_verbs,nb_nouns,nb_adj
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,"(Les, coûts, kilométriques, réels, peuvent, di...",C1,4,"[Les, coûts, kilométriques, réels, peuvent, di...","[coûts, kilométriques, réels, diverger, sensib...",27,4,14,3
1,"(Le, bleu, ,, c', est, ma, couleur, préférée, ...",A1,0,"[Le, bleu, ,, c', est, ma, couleur, préférée, ...","[bleu, ,, couleur, préférée, aime, vert, !]",7,1,3,1
2,"(Le, test, de, niveau, en, français, est, sur,...",A1,0,"[Le, test, de, niveau, en, français, est, sur,...","[test, niveau, français, site, Internet, école...",7,1,4,1
3,"(Est, -ce, que, ton, mari, est, aussi, de, Bos...",A1,0,"[Est, -ce, que, ton, mari, est, aussi, de, Bos...","[-ce, mari, Boston, ?]",4,0,2,0
4,"(Dans, les, écoles, de, commerce, ,, dans, les...",B1,2,"[Dans, les, écoles, de, commerce, ,, dans, les...","[écoles, commerce, ,, couloirs, places, financ...",24,4,10,2


Try a LogisticRegression with y = the level and x = the nb of tokens

In [16]:
y = training['encoded_diff']
X = training[['token_count','nb_verbs', 'nb_adj', 'nb_nouns']]

Split into train/test

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)


Reshape X_train because need 2D array

In [18]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(max_depth = 5)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


Check accuracy of predicted values

In [19]:
# Accuracy on the test set
print('Accuracy of Logistic regression classifier on test set: {:.2f}'
     .format(model.score(X_test, y_test)))

# Accuracy on the training set
print('Accuracy of Logistic regression classifier on training set: {:.2f}'
     .format(model.score(X_train, y_train)))

Accuracy of Logistic regression classifier on test set: 0.36
Accuracy of Logistic regression classifier on training set: 0.38


Not really good accuracy


In [20]:
from sklearn.metrics import accuracy_score
accuracy_test = accuracy_score(y_test, y_pred)

print(f'Accurary of Logistic regression classifier on test set: {accuracy_test :.2f}')

Accurary of Logistic regression classifier on test set: 0.36


In [21]:
test = pd.read_csv('unlabelled_test_data.csv')

In [22]:
test = tokenize_stop_words_count(test)

In [23]:
test = count_verbs_nouns_adj(test)

In [24]:
test.head()

Unnamed: 0,id,sentence,tokens,tokens_no_stop,token_count,nb_verbs,nb_nouns,nb_adj
0,0,"(Nous, dûmes, nous, excuser, des, propos, que,...","[Nous, dûmes, nous, excuser, des, propos, que,...","[dûmes, excuser, propos, eûmes, prononcés]",5,3,1,0
1,1,"(Vous, ne, pouvez, pas, savoir, le, plaisir, q...","[Vous, ne, pouvez, pas, savoir, le, plaisir, q...","[pouvez, savoir, plaisir, recevoir, bonne, nou...",7,4,1,2
2,2,"(Et, ,, paradoxalement, ,, boire, froid, n', e...","[Et, ,, paradoxalement, ,, boire, froid, n', e...","[,, paradoxalement, ,, boire, froid, bonne, pa...",8,1,1,2
3,3,"(Ce, n', est, pas, étonnant, ,, car, c', est, ...","[Ce, n', est, pas, étonnant, ,, car, c', est, ...","[étonnant, ,, saison, mystérieuse]",4,0,1,2
4,4,"(Le, corps, de, Golo, lui-même, ,, d', une, es...","[Le, corps, de, Golo, lui-même, ,, d', une, es...","[corps, Golo, ,, essence, surnaturelle, montur...",40,9,13,5


In [25]:
X_to_predict = test[['token_count','nb_verbs', 'nb_adj', 'nb_nouns']]

In [26]:
test['difficulty encoded'] = model.predict(X_to_predict)

In [27]:
#function that cleans and return the final dataset to upload on kaggle
def prep_final_pred(df):
  number_to_level = {
    0: 'A1',
    1: 'A2',
    2: 'B1',
    3: 'B2',
    4: 'C1',
    5: 'C2'
}

  df['difficulty'] = df['difficulty encoded'].map(number_to_level)
  columns_to_keep = ['id', 'difficulty']
  final_df =  df[columns_to_keep]
  final_df.set_index('id', inplace=True)


  return final_df


In [28]:
final_test = prep_final_pred(test)

In [29]:
final_test.head()

Unnamed: 0_level_0,difficulty
id,Unnamed: 1_level_1
0,B1
1,A1
2,B2
3,A1
4,C2


In [30]:
final_test.to_csv('submission.csv')


In [31]:
sample = pd.read_csv('sample_submission.csv')
sample.head()

Unnamed: 0,id,difficulty
0,0,A1
1,1,A1
2,2,A1
3,3,A1
4,4,A1


In [33]:
! kaggle competitions submit -c detecting-french-texts-difficulty-level-2023 -f submission.csv -m "Sample submission"


100% 8.30k/8.30k [00:00<00:00, 18.7kB/s]
Successfully submitted to Detecting the difficulty level of French texts

1)Compter le nb de token

> Bloc en retrait


2)Lemmatizer + créer une colonne qui compte cmb de mots ont été lemmatisé dans la phrase
3)