In [None]:

# Read in the data via the Kaggle API.
# This notebook targets Google Colab: the Kaggle credentials file is copied
# from Google Drive in a later cell, so Drive has to be mounted first.

# Mount Google Drive at /content/drive; force_remount=True asks Colab to
# re-mount it even if it is already mounted from an earlier run of this cell.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:

# install Kaggle
! pip install kaggle



In [None]:
!mkdir ~/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [None]:
#read in your Kaggle credentials from Google Drive
!cp /content/drive/MyDrive/Kaggle/kaggle.json ~/.kaggle/kaggle.json

In [None]:
# download the dataset from the competition page
! kaggle competitions download -c detecting-french-texts-difficulty-level-2023
from zipfile import ZipFile
with ZipFile('detecting-french-texts-difficulty-level-2023.zip','r') as zip:
  zip.extractall(path="")

detecting-french-texts-difficulty-level-2023.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
# Read in the training data.
# The CSV is expected in the working directory (it is extracted there from the
# competition archive). The 'id' column is used as the DataFrame index.
import pandas as pd
import numpy as np

training = pd.read_csv('training_data.csv', index_col = 'id')

In [None]:
# Remove rows with missing values and exact duplicate rows.
# dropna() and drop_duplicates() return new DataFrames, so the result must be
# assigned back; calling them without assignment leaves `training` unchanged.
# Re-running this cell is safe: cleaning an already-clean frame is a no-op.
training = training.dropna().drop_duplicates()

# Preview the cleaned data.
training.head()


Unnamed: 0_level_0,sentence,difficulty
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Les coûts kilométriques réels peuvent diverger...,C1
1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,Le test de niveau en français est sur le site ...,A1
3,Est-ce que ton mari est aussi de Boston?,A1
4,"Dans les écoles de commerce, dans les couloirs...",B1
...,...,...
4795,"C'est pourquoi, il décida de remplacer les hab...",B2
4796,Il avait une de ces pâleurs splendides qui don...,C1
4797,"Et le premier samedi de chaque mois, venez ren...",A2
4798,Les coûts liés à la journalisation n'étant pas...,C2


Before training and evaluating a model, the `difficulty` column has to be encoded as numbers. We use `LabelEncoder` to create a new column, `encoded_diff`, where A1 = 0, A2 = 1, ..., C2 = 5.


In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Learn the label -> integer mapping from the difficulty column, then store the
# integer codes next to the original labels (A1 -> 0, ..., C2 -> 5).
# The fitted encoder is kept in `label_encoder` for later use.
label_encoder = LabelEncoder().fit(training['difficulty'])
training['encoded_diff'] = label_encoder.transform(training['difficulty'])


In [None]:
%%capture
# Download spaCy's large French pipeline; %%capture hides the installer output.
# NOTE(review): no cell visible in this notebook loads this model afterwards --
# confirm it is still needed.
!python -m spacy download fr_core_news_lg


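Set up tokenization: import NLTK's word tokenizer and load the French stop-word list (the spaCy model downloaded above is not used in this step).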

In [None]:
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used below: the 'punkt' tokenizer models and the
# stop-word lists. The download order matches the original cell.
for resource in ('punkt', 'stopwords'):
  nltk.download(resource)

# French stop words as a set, for fast membership checks during filtering.
stop_words = set(stopwords.words('french'))






[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def tokenize_stop_words_count(df):
  """Tokenize each sentence, drop French stop words and count what is left.

  Three columns are added to `df` in place:
    - 'tokens': the output of `word_tokenize(sentence, language='french')`
    - 'no_stop_words': the tokens whose lower-cased form is not in `stop_words`
    - 'token_count': the length of 'no_stop_words' (punctuation tokens that
      survive the filter are counted too)

  Parameters:
    df: DataFrame with a 'sentence' column of strings.

  Returns:
    The same DataFrame object, with the three columns added.
  """
  def _split(sentence):
    return word_tokenize(sentence, language='french')

  def _strip_stop_words(tokens):
    kept = []
    for token in tokens:
      # Comparison is case-insensitive; the token itself keeps its casing.
      if token.lower() in stop_words:
        continue
      kept.append(token)
    return kept

  df['tokens'] = df['sentence'].apply(_split)
  df['no_stop_words'] = df['tokens'].apply(_strip_stop_words)
  df['token_count'] = df['no_stop_words'].apply(len)
  return df

In [None]:
# Add the 'tokens', 'no_stop_words' and 'token_count' columns to the training set.
training = tokenize_stop_words_count(training)

In [None]:
# Preview the first rows to check the newly added token columns.
training.head()

Unnamed: 0_level_0,sentence,difficulty,encoded_diff,tokens,token_count,no_stop_words
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Les coûts kilométriques réels peuvent diverger...,C1,4,"[Les, coûts, kilométriques, réels, peuvent, di...",29,"[coûts, kilométriques, réels, peuvent, diverge..."
1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1,0,"[Le, bleu, ,, c'est, ma, couleur, préférée, ma...",8,"[bleu, ,, c'est, couleur, préférée, n'aime, ve..."
2,Le test de niveau en français est sur le site ...,A1,0,"[Le, test, de, niveau, en, français, est, sur,...",7,"[test, niveau, français, site, Internet, l'éco..."
3,Est-ce que ton mari est aussi de Boston?,A1,0,"[Est-ce, que, ton, mari, est, aussi, de, Bosto...",5,"[Est-ce, mari, aussi, Boston, ?]"
4,"Dans les écoles de commerce, dans les couloirs...",B1,2,"[Dans, les, écoles, de, commerce, ,, dans, les...",24,"[écoles, commerce, ,, couloirs, places, financ..."


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Re-join the stop-word-free tokens into one string per sentence, since the
# vectorizer is fed text documents rather than token lists.
training['preprocessed_text'] = training['no_stop_words'].apply(' '.join)

# Unigram bag-of-words counts over the preprocessed sentences.
# NOTE(review): `X` is reassigned to the token-count feature in a later cell,
# so this matrix is not what the logistic regression below is trained on.
count_vectorizer = CountVectorizer(ngram_range=(1, 1))
X = count_vectorizer.fit_transform(training['preprocessed_text'])


  (0, 3455)	1
  (0, 7459)	1
  (0, 11586)	1
  (0, 9661)	1
  (0, 4041)	1
  (0, 12037)	1
  (0, 13563)	1
  (0, 8467)	1
  (0, 5669)	1
  (0, 8464)	1
  (0, 13228)	1
  (0, 13524)	1
  (0, 12868)	2
  (0, 8976)	1
  (0, 11052)	1
  (0, 6903)	1
  (0, 13525)	1
  (0, 13087)	1
  (0, 7629)	1
  (0, 5646)	1
  (0, 13170)	1
  (0, 5139)	1
  (1, 1705)	1
  (1, 5123)	1
  (1, 3383)	1
  :	:
  (4798, 12744)	1
  (4798, 8606)	1
  (4798, 11210)	1
  (4798, 7173)	1
  (4798, 14398)	2
  (4798, 12758)	1
  (4798, 14520)	1
  (4798, 2107)	1
  (4798, 7691)	1
  (4798, 8903)	1
  (4798, 6864)	1
  (4798, 7374)	1
  (4798, 7370)	1
  (4798, 422)	1
  (4798, 6777)	1
  (4798, 9628)	1
  (4799, 8138)	1
  (4799, 13147)	1
  (4799, 11203)	1
  (4799, 9647)	1
  (4799, 10787)	1
  (4799, 11787)	1
  (4799, 13545)	1
  (4799, 6318)	1
  (4799, 14446)	1


Try a logistic regression with y = the encoded difficulty level and X = the number of tokens left after stop-word removal.

In [None]:
# Feature matrix: only the number of non-stop-word tokens. The double brackets
# keep X as a one-column DataFrame (2-D) instead of a 1-D Series.
X = training[['token_count']]

# Target: the integer-encoded difficulty level.
y = training['encoded_diff']

Split into train/test

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# shuffled split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.2,
  shuffle=True,
  random_state=42,
)


Fit the model. `X_train` is already a 2-D array (a one-column DataFrame), as scikit-learn requires, so no reshape is needed.

In [None]:
from sklearn.linear_model import LogisticRegression

# L2-penalised logistic regression fitted on the training split; fit() returns
# the estimator itself, so construction and fitting are chained.
model = LogisticRegression(
  max_iter=1000,
  solver='lbfgs',
  penalty='l2',
).fit(X_train, y_train)

# Predicted difficulty codes for the held-out rows.
y_pred = model.predict(X_test)


Check accuracy of predicted values

In [None]:
# Score the fitted model on both splits first, then report the two numbers.
test_accuracy = model.score(X_test, y_test)
train_accuracy = model.score(X_train, y_train)

# Accuracy on the test set
print('Accuracy of Logistic regression classifier on test set: {:.2f}'
     .format(test_accuracy))

# Accuracy on the training set
print('Accuracy of Logistic regression classifier on training set: {:.2f}'
     .format(train_accuracy))

The accuracy is not very good: token count alone is a weak predictor of difficulty.


In [None]:
from sklearn.metrics import accuracy_score

# Cross-check of the test accuracy, computed from the stored predictions
# (y_pred) instead of calling model.score again.
accuracy_test = accuracy_score(y_test, y_pred)

# Message spelling fixed ("Accurary" -> "Accuracy").
print(f'Accuracy of Logistic regression classifier on test set: {accuracy_test:.2f}')

In [None]:
# Load the unlabelled competition sentences. Unlike the training set, 'id' is
# left as a regular column here because prep_final_pred selects it by name.
test = pd.read_csv('unlabelled_test_data.csv')

In [None]:
# Apply the same tokenisation / stop-word removal that was used on the training set.
test = tokenize_stop_words_count(test)

In [None]:
# Preview the first rows of the processed test set.
test.head()

In [None]:
# Same single feature the model was fitted on; the double brackets keep it 2-D.
X_to_predict = test[['token_count']]

In [None]:
# Predicted integer difficulty codes; prep_final_pred maps them back to A1..C2.
test['difficulty encoded'] = model.predict(X_to_predict)

In [None]:
def prep_final_pred(df):
  """Build the Kaggle submission table from a frame of predicted codes.

  The integer codes in 'difficulty encoded' are translated to CEFR labels
  (0 -> 'A1', 1 -> 'A2', 2 -> 'B1', 3 -> 'B2', 4 -> 'C1', 5 -> 'C2') and
  written to a 'difficulty' column on `df` itself. Codes outside 0-5 have
  no mapping and end up as NaN.

  Parameters:
    df: DataFrame with an 'id' column and a 'difficulty encoded' column.

  Returns:
    A new DataFrame indexed by 'id' whose only column is 'difficulty'.
  """
  # The position in this tuple is the integer code of the level.
  levels = ('A1', 'A2', 'B1', 'B2', 'C1', 'C2')
  code_to_level = dict(enumerate(levels))

  df['difficulty'] = df['difficulty encoded'].map(code_to_level)

  # Keep only what the submission needs and move 'id' into the index.
  return df[['id', 'difficulty']].set_index('id')


In [None]:
# Turn the predicted codes into the id -> difficulty table used for submission.
final_test = prep_final_pred(test)

In [None]:
# Preview the submission table before writing it to disk.
final_test.head()

In [None]:
# Write the submission file. final_test is indexed by 'id', so the index is
# written out as the first CSV column.
final_test.to_csv('submission.csv')


In [None]:
# Load the provided sample submission -- presumably to compare its layout with
# submission.csv before uploading.
sample = pd.read_csv('sample_submission.csv')
sample.head()

In [None]:
#! kaggle competitions submit -c detecting-french-texts-difficulty-level-2023 -f submission.csv -m "Sample submission"
