In [None]:
# Reading in the data via the Kaggle API.
# Step 1: mount Google Drive at /content/drive; a later cell copies the Kaggle
# credentials file (kaggle.json) from there.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# install Kaggle
! pip install kaggle



In [None]:
!mkdir ~/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [None]:
#read in your Kaggle credentials from Google Drive
!cp /content/drive/MyDrive/Kaggle/kaggle.json ~/.kaggle/kaggle.json

In [None]:
# download the dataset from the competition page
! kaggle competitions download -c detecting-french-texts-difficulty-level-2023
from zipfile import ZipFile
with ZipFile('detecting-french-texts-difficulty-level-2023.zip','r') as zip:
  zip.extractall(path="")

detecting-french-texts-difficulty-level-2023.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
!pip install requests pdfplumber

In [None]:
#DO ALL NECESSARY IMPORTS
import ast
import difflib
import io

import numpy as np
import pandas as pd
import pdfplumber
import requests
import spacy
import torch
from google.colab import files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from transformers import BertModel, BertTokenizer
from transformers import CamembertConfig, CamembertModel, CamembertTokenizer, CamembertTokenizer, CamembertForSequenceClassification


In [None]:
# Read in the training data and the unlabelled test data.
# The training frame uses the 'id' column as its index; the test frame is read
# with the default index, so 'id' (if present) stays a regular column there.

training = pd.read_csv('training_data.csv', index_col = 'id')
test = pd.read_csv('unlabelled_test_data.csv')


In [None]:
# dropna() and drop_duplicates() return new frames, so the result has to be
# assigned back for the cleaning to take effect.
training = training.dropna().drop_duplicates()
# Last expression of the cell, so the preview is actually displayed.
training.head()


Unnamed: 0_level_0,sentence,difficulty
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Les coûts kilométriques réels peuvent diverger...,C1
1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,Le test de niveau en français est sur le site ...,A1
3,Est-ce que ton mari est aussi de Boston?,A1
4,"Dans les écoles de commerce, dans les couloirs...",B1
...,...,...
4795,"C'est pourquoi, il décida de remplacer les hab...",B2
4796,Il avait une de ces pâleurs splendides qui don...,C1
4797,"Et le premier samedi de chaque mois, venez ren...",A2
4798,Les coûts liés à la journalisation n'étant pas...,C2


Before we can train and evaluate a model, we first need to encode the `difficulty` column. We use scikit-learn's `LabelEncoder` to add a new column, `encoded_diff`, in which the levels are mapped to integers: A1 = 0, ..., C2 = 5.


In [None]:
# Fit a LabelEncoder on the difficulty labels and store the resulting integer
# codes in a new 'encoded_diff' column. The fitted encoder is kept in
# `label_encoder`.
label_encoder = LabelEncoder()
training['encoded_diff'] = label_encoder.fit_transform(training['difficulty'])


In [None]:
%%capture
# Download the large French spaCy pipeline; the %%capture cell magic above
# suppresses the download output.
!python -m spacy download fr_core_news_lg


In [None]:

# Load the French pipeline once (reused by the feature functions below) and
# keep a reference to spaCy's built-in French stop-word collection.
sp = spacy.load('fr_core_news_lg')
spacy_stopwords = spacy.lang.fr.stop_words.STOP_WORDS



In [None]:
# Function that tokenizes each sentence, removes stop words, and counts tokens
def tokenize_stop_words_count(df):
  """Add spaCy docs, token lists and token counts to ``df``.

  Reads the 'sentence' column and writes, in this order, the columns
  'sentence_sp' (result of calling ``sp`` on the sentence), 'tokens' (token
  texts), 'tokens_no_stop' (token texts whose lowercase form is not in
  ``spacy_stopwords``), 'token_count_no_stop' and 'token_count'.

  The frame is modified in place and also returned.
  """
  def token_texts(doc):
    # Text of every token in one parsed sentence.
    return [tok.text for tok in doc]

  def without_stop_words(texts):
    # Stop-word membership is tested on the lowercased token text.
    return [text for text in texts if text.lower() not in spacy_stopwords]

  df['sentence_sp'] = df['sentence'].apply(sp)
  df['tokens'] = df['sentence_sp'].apply(token_texts)
  df['tokens_no_stop'] = df['tokens'].apply(without_stop_words)
  df['token_count_no_stop'] = df['tokens_no_stop'].apply(len)
  df['token_count'] = df['tokens'].apply(len)
  return df

# Function that counts selected parts of speech
def count_verbs_nouns_adj(df):
  """Add per-sentence counts of verbs, nouns, adjectives and adverbs to ``df``.

  Reads the 'sentence_sp' column, whose entries are iterated as sequences of
  tokens exposing a ``pos_`` attribute, and writes the columns 'nb_verbs',
  'nb_nouns', 'nb_adj' and 'nb_adv' (in that order).

  The entries of 'sentence_sp' are used as they are: they are not passed
  through the spaCy pipeline again, which would repeat the full parse once
  per output column.

  The frame is modified in place and also returned.
  """
  pos_columns = {
    'nb_verbs': 'VERB',
    'nb_nouns': 'NOUN',
    'nb_adj': 'ADJ',
    'nb_adv': 'ADV',
  }
  for column, pos_tag in pos_columns.items():
    # pos_tag is bound as a default argument so each lambda keeps its own tag.
    df[column] = df['sentence_sp'].apply(
      lambda doc, pos_tag=pos_tag: sum(1 for token in doc if token.pos_ == pos_tag)
    )
  return df

# Functions that compute a TF-IDF based score for each sentence

def _tfidf_sentence_score(df, ngram_range, score_column):
  """Shared implementation for the TF-IDF sentence scores.

  Fits a ``TfidfVectorizer`` (with ``spacy_stopwords`` as stop words) on the
  'sentence' column, sums every term's TF-IDF weight over the whole corpus,
  and scores each sentence as the sum of those corpus-level weights over the
  terms the vectorizer extracts from that sentence.

  Terms are obtained with the vectorizer's own analyzer. It applies the same
  lowercasing, tokenization, stop-word removal and n-gram construction that
  was used to build the vocabulary, so words with attached punctuation still
  match and, for ``ngram_range=(1, 2)``, bigrams contribute to the score.

  Writes the columns 'words' (lowercased whitespace-split sentence) and
  ``score_column``; the frame is modified in place and also returned.
  """
  corpus = df['sentence'].tolist()
  tfidf = TfidfVectorizer(ngram_range=ngram_range, stop_words=list(spacy_stopwords))
  features = tfidf.fit_transform(corpus)
  # Column sums are taken directly on the sparse matrix; converting it to a
  # dense rows-by-vocabulary array first is unnecessary and memory-hungry.
  term_totals = np.asarray(features.sum(axis=0)).ravel()
  term_weight = dict(zip(tfidf.get_feature_names_out(), term_totals))
  analyzer = tfidf.build_analyzer()
  df['words'] = df['sentence'].apply(lambda x: x.lower().split())
  df[score_column] = [
    sum(term_weight.get(term, 0) for term in analyzer(sentence))
    for sentence in corpus
  ]
  return df


def tfidf_sentence_unigram(df):
  """Add 'tfidf_score_unigram' (and 'words') to ``df`` using unigram terms.

  See ``_tfidf_sentence_score`` for how the score is computed. Returns the
  same frame, modified in place.
  """
  return _tfidf_sentence_score(df, (1, 1), 'tfidf_score_unigram')


def tfidf_sentence_bigram(df):
  """Add 'tfidf_score_bigram' (and 'words') to ``df`` using unigrams and bigrams.

  See ``_tfidf_sentence_score`` for how the score is computed. Returns the
  same frame, modified in place.
  """
  return _tfidf_sentence_score(df, (1, 2), 'tfidf_score_bigram')


In [None]:
def find_first_french_word(df):
    """Return the index label of the first 'v' -> 'a' transition in column 0.

    Walks the first column of ``df`` top to bottom and returns the index label
    of the first row whose value starts with 'a' while the previous row's
    value starts with 'v'. Returns ``None`` when there is no such row.

    NOTE(review): presumably this locates the point where the glossary starts
    a second alphabetical list -- confirm against the source PDF.

    The first column is selected by position with ``iloc``, so the function
    does not rely on integer keys being treated as positions on a
    label-indexed row. Empty strings are tolerated and never match.
    """
    previous_word = 'a'
    for index, word in df.iloc[:, 0].items():
        # Slicing with [:1] yields '' for an empty string instead of raising.
        if word[:1] == 'a' and previous_word[:1] == 'v':
            return index
        previous_word = word
    return None
def find_cognates(word_list, french_cognates, similarity_threshold=0.90):
    """Count (cognate, token) pairs whose similarity exceeds the threshold.

    Parameters
    ----------
    word_list : list of str, or str
        Tokens of one sentence. A string is parsed with ``ast.literal_eval``
        and is expected to evaluate to such a list.
    french_cognates : iterable of str
        Reference cognate words.
    similarity_threshold : float
        A pair is counted when its ``difflib.SequenceMatcher`` ratio is
        strictly greater than this value.

    Returns
    -------
    int
        Number of matching pairs. One token can be counted several times if
        it is close to several cognates.

    The count is the same as comparing every pair with a fresh
    ``SequenceMatcher(None, french_word, token)``. For speed, one matcher is
    reused, with the token as the cached second sequence, and the cheap upper
    bounds ``real_quick_ratio()`` and ``quick_ratio()`` are checked before the
    expensive ``ratio()``.
    """
    if isinstance(word_list, str):
        actual_list = ast.literal_eval(word_list)
    else:
        actual_list = word_list
    # Materialize once: the cognates are scanned again for every token.
    cognate_words = list(french_cognates)
    matcher = difflib.SequenceMatcher(None)
    matches = 0
    for token in actual_list:
        matcher.set_seq2(token)
        for french_word in cognate_words:
            matcher.set_seq1(french_word)
            # real_quick_ratio() >= quick_ratio() >= ratio(), so a pair that
            # fails an earlier bound cannot pass the exact test.
            if (matcher.real_quick_ratio() > similarity_threshold
                    and matcher.quick_ratio() > similarity_threshold
                    and matcher.ratio() > similarity_threshold):
                matches += 1
    return matches
def cognates_similarities(df):
    """Add a 'cognate_count' column to ``df`` and return the frame.

    For each row, the 'tokens' entry is passed to ``find_cognates`` together
    with the module-level ``french_cognates`` list, and the resulting count is
    stored in 'cognate_count'.

    The frame is modified in place and also returned, so the function can be
    used as ``df = cognates_similarities(df)``.
    """
    df['cognate_count'] = df['tokens'].apply(lambda x: find_cognates(x, french_cognates))
    return df


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
pdf_url = 'https://docs.steinhardt.nyu.edu/pdfs/metrocenter/xr1/glossaries/ELA/GlossaryCognatesFrenchUpdated5-5-2014.pdf'

# Use requests to get the content of the PDF file from the URL.
# A timeout keeps the cell from hanging forever if the server does not answer.
response = requests.get(pdf_url, timeout=60)
response.raise_for_status()

cognates = []

# Use BytesIO to open the PDF from the content in memory
with io.BytesIO(response.content) as open_pdf_file:
    with pdfplumber.open(open_pdf_file) as pdf:
        for page in pdf.pages:
            # Guard against pages for which no text could be extracted.
            text = page.extract_text() or ''
            for line in text.split('\n'):
                parts = [part for part in line.split(' ') if part]
                # Only lines made of exactly four space-separated fields are
                # kept; the 2nd and 4th fields form the word pair.
                if len(parts) == 4:
                    cognates.append((parts[1], parts[3]))

cognates = pd.DataFrame(cognates, columns = ['1', '2'])
# Keep only pairs whose two words start with the same letter (case-insensitive).
same_initial = cognates['1'].str[0].str.lower() == cognates['2'].str[0].str.lower()
cognates = cognates[same_initial]
first_french = find_first_french_word(cognates)
if first_french is None:
    raise ValueError("Could not locate the 'v' -> 'a' boundary row in the cognates table")
# The slice is positional, which is what an integer slice on this Series does.
# NOTE(review): first_french is an index label and the offset of 14 is
# unexplained -- confirm that this is the intended starting row.
french_cognates = cognates['1'].iloc[first_french - 14:].tolist()

In [None]:
# Add the spaCy docs, token lists and token counts to the training set.
training = tokenize_stop_words_count(training)

In [None]:
# Add the verb / noun / adjective / adverb counts to the training set.
training = count_verbs_nouns_adj(training)

In [None]:
# Add the TF-IDF sentence scores to the training set. The (1, 2)-gram variant
# is defined in this notebook as tfidf_sentence_bigram.
training = tfidf_sentence_unigram(training)
training = tfidf_sentence_bigram(training)



In [None]:
# Add the cognate_count feature to the training set.
training = cognates_similarities(training)

In [None]:
# Add the spaCy docs, token lists and token counts to the test set.
test = tokenize_stop_words_count(test)

In [None]:
# Add the verb / noun / adjective / adverb counts to the test set.
test = count_verbs_nouns_adj(test)

In [None]:
# Add the TF-IDF sentence scores to the test set. The (1, 2)-gram variant is
# defined in this notebook as tfidf_sentence_bigram.
test = tfidf_sentence_unigram(test)
test = tfidf_sentence_bigram(test)



In [None]:
# Add the cognate_count feature to the test set.
test = cognates_similarities(test)

In [None]:
# Write both feature tables to CSV and trigger a browser download of each file.
# NOTE(review): training was read with index_col='id', so index=False leaves
# the id out of final_training.csv -- confirm that this is intended.
training.to_csv('final_training.csv', index=False)
test.to_csv('final_test.csv', index=False)
files.download('final_training.csv')
files.download('final_test.csv' )

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>