In [None]:
# reading in the data via the Kaggle API
# mount your Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# install Kaggle
! pip install kaggle



In [None]:
!mkdir ~/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [None]:
#read in your Kaggle credentials from Google Drive
!cp /content/drive/MyDrive/Kaggle/kaggle.json ~/.kaggle/kaggle.json

In [None]:
# download the dataset from the competition page
! kaggle competitions download -c detecting-french-texts-difficulty-level-2023
from zipfile import ZipFile
with ZipFile('detecting-french-texts-difficulty-level-2023.zip','r') as zip:
  zip.extractall(path="")

detecting-french-texts-difficulty-level-2023.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
# read in your training data
import pandas as pd
import numpy as np

training = pd.read_csv('training_data.csv', index_col = 'id')

In [None]:
training.head()
training.dropna()
training.drop_duplicates()


Unnamed: 0_level_0,sentence,difficulty
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Les coûts kilométriques réels peuvent diverger...,C1
1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,Le test de niveau en français est sur le site ...,A1
3,Est-ce que ton mari est aussi de Boston?,A1
4,"Dans les écoles de commerce, dans les couloirs...",B1
...,...,...
4795,"C'est pourquoi, il décida de remplacer les hab...",B2
4796,Il avait une de ces pâleurs splendides qui don...,C1
4797,"Et le premier samedi de chaque mois, venez ren...",A2
4798,Les coûts liés à la journalisation n'étant pas...,C2


The first thing to do in order to train/test the data is to encode the column difficulty. We used labelencoder to have a new column with A1 = 0 ... C2 =5


In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
label_encoder = LabelEncoder()
training['encoded_diff'] = label_encoder.fit_transform(training['difficulty'])


In [None]:
%%capture
!python -m spacy download fr_core_news_lg


Transform our sentences to spacy + tokenize

In [None]:
import numpy as np
import spacy
sp = spacy.load('fr_core_news_lg')
spacy_stopwords = spacy.lang.fr.stop_words.STOP_WORDS



In [None]:
#function that tokenize, takes out stopwords, and counts token in df
def tokenize_stop_words_count(df):
  df['sentence_sp'] = df['sentence'].apply(sp)
  df['tokens'] = df['sentence_sp'].apply(lambda doc: [token.text for token in doc])
  df['tokens_no_stop'] = df['tokens'].apply(lambda tokens: [token for token in tokens if token.lower() not in spacy_stopwords])
  df['token_count'] = df['tokens_no_stop'].apply(len)
  return df

In [None]:
training = tokenize_stop_words_count(training)

In [None]:
def count_verbs_nouns_adj(df):
  df['nb_verbs'] = df['sentence_sp'].apply(lambda x: sum(1 for token in sp(x) if token.pos_ == 'VERB'))
  df['nb_nouns'] = df['sentence_sp'].apply(lambda x: sum(1 for token in sp(x) if token.pos_ == 'NOUN'))
  df['nb_adj'] = df['sentence_sp'].apply(lambda x: sum(1 for token in sp(x) if token.pos_ == 'ADJ'))
  df['nb_adv'] = df['sentence_sp'].apply(lambda x: sum(1 for token in sp(x) if token.pos_ == 'ADV'))
  return df

In [None]:
training = count_verbs_nouns_adj(training)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_sentence_unigram(df):
  corpus = df['sentence'].tolist()
  tfidf = TfidfVectorizer(ngram_range=(1, 1), stop_words=list(spacy_stopwords))
  features = tfidf.fit_transform(corpus)
  results = pd.DataFrame(
    features.todense(),
    columns=tfidf.get_feature_names_out(),
    )
  word_freq = results.sum().sort_values(ascending=False)
  df['words'] = df['sentence'].apply(lambda x: x.lower().split())
  df['tfidf_score_unigram'] = df['words'].apply(lambda words: sum(word_freq.get(word, 0) for word in words))
  return df

def tfidf_sentence_1_2_grams(df):
  corpus = df['sentence'].tolist()
  tfidf = TfidfVectorizer(ngram_range=(1, 2), stop_words=list(spacy_stopwords))
  features = tfidf.fit_transform(corpus)
  results = pd.DataFrame(
    features.todense(),
    columns=tfidf.get_feature_names_out(),
    )
  word_freq = results.sum().sort_values(ascending=False)
  df['words'] = df['sentence'].apply(lambda x: x.lower().split())
  df['tfidf_score_bigram'] = df['words'].apply(lambda words: sum(word_freq.get(word, 0) for word in words))
  return df


In [None]:
training = tfidf_sentence_unigram(training)
training = tfidf_sentence_1_2_grams(training)



In [None]:
import torch
import pandas as pd
import numpy as np

def get_roberta_embedding(sentence, model, tokenizer):
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
    return embeddings

In [None]:
training['roberta_embedding'] = training['tokens'].apply(lambda x: get_roberta_embedding(x, roberta_model, tokenizer))


In [None]:
training['pooled_embedding'] = training['roberta_embedding'].apply(lambda vec: np.mean(vec, axis=0))


In [None]:
training.head()

Unnamed: 0_level_0,sentence,difficulty,encoded_diff,sentence_sp,tokens,tokens_no_stop,token_count,nb_verbs,nb_nouns,nb_adj,nb_adv,words,tfidf_score_unigram,tfidf_score_bigram
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,Les coûts kilométriques réels peuvent diverger...,C1,4,"(Les, coûts, kilométriques, réels, peuvent, di...","[Les, coûts, kilométriques, réels, peuvent, di...","[coûts, kilométriques, réels, diverger, sensib...",27,4,14,3,2,"[les, coûts, kilométriques, réels, peuvent, di...",27.043962,18.477813
1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1,0,"(Le, bleu, ,, c', est, ma, couleur, préférée, ...","[Le, bleu, ,, c', est, ma, couleur, préférée, ...","[bleu, ,, couleur, préférée, aime, vert, !]",7,1,3,1,2,"[le, bleu,, c'est, ma, couleur, préférée, mais...",6.295061,4.28307
2,Le test de niveau en français est sur le site ...,A1,0,"(Le, test, de, niveau, en, français, est, sur,...","[Le, test, de, niveau, en, français, est, sur,...","[test, niveau, français, site, Internet, école...",7,1,4,1,0,"[le, test, de, niveau, en, français, est, sur,...",35.379919,23.504444
3,Est-ce que ton mari est aussi de Boston?,A1,0,"(Est, -ce, que, ton, mari, est, aussi, de, Bos...","[Est, -ce, que, ton, mari, est, aussi, de, Bos...","[-ce, mari, Boston, ?]",4,0,2,0,1,"[est-ce, que, ton, mari, est, aussi, de, boston?]",6.326896,4.309433
4,"Dans les écoles de commerce, dans les couloirs...",B1,2,"(Dans, les, écoles, de, commerce, ,, dans, les...","[Dans, les, écoles, de, commerce, ,, dans, les...","[écoles, commerce, ,, couloirs, places, financ...",24,4,10,2,1,"[dans, les, écoles, de, commerce,, dans, les, ...",88.671934,60.137653


In [None]:
test = pd.read_csv('unlabelled_test_data.csv')

In [None]:
test = tokenize_stop_words_count(test)

In [None]:
test = count_verbs_nouns_adj(test)

In [None]:
test = tfidf_sentence_unigram(test)
test = tfidf_sentence_1_2_grams(test)



In [None]:
test['roberta_embedding'] = test['tokens'].apply(lambda x: get_roberta_embedding(x, roberta_model, tokenizer))
test['pooled_embedding'] = test['roberta_embedding'].apply(lambda vec: np.mean(vec, axis=0))


In [None]:
test.head()

Unnamed: 0,id,sentence,sentence_sp,tokens,tokens_no_stop,token_count,nb_verbs,nb_nouns,nb_adj,nb_adv,words,tfidf_score_unigram,tfidf_score_bigram
0,0,Nous dûmes nous excuser des propos que nous eû...,"(Nous, dûmes, nous, excuser, des, propos, que,...","[Nous, dûmes, nous, excuser, des, propos, que,...","[dûmes, excuser, propos, eûmes, prononcés]",5,3,1,0,0,"[nous, dûmes, nous, excuser, des, propos, que,...",3.192917,2.331769
1,1,Vous ne pouvez pas savoir le plaisir que j'ai ...,"(Vous, ne, pouvez, pas, savoir, le, plaisir, q...","[Vous, ne, pouvez, pas, savoir, le, plaisir, q...","[pouvez, savoir, plaisir, recevoir, bonne, nou...",7,4,1,2,2,"[vous, ne, pouvez, pas, savoir, le, plaisir, q...",12.660056,8.902162
2,2,"Et, paradoxalement, boire froid n'est pas la b...","(Et, ,, paradoxalement, ,, boire, froid, n', e...","[Et, ,, paradoxalement, ,, boire, froid, n', e...","[,, paradoxalement, ,, boire, froid, bonne, pa...",8,1,1,2,3,"[et,, paradoxalement,, boire, froid, n'est, pa...",8.244312,5.840872
3,3,"Ce n'est pas étonnant, car c'est une saison my...","(Ce, n', est, pas, étonnant, ,, car, c', est, ...","[Ce, n', est, pas, étonnant, ,, car, c', est, ...","[étonnant, ,, saison, mystérieuse]",4,0,1,2,2,"[ce, n'est, pas, étonnant,, car, c'est, une, s...",1.943283,1.444063
4,4,"Le corps de Golo lui-même, d'une essence aussi...","(Le, corps, de, Golo, lui-même, ,, d', une, es...","[Le, corps, de, Golo, lui-même, ,, d', une, es...","[corps, Golo, ,, essence, surnaturelle, montur...",40,9,13,5,8,"[le, corps, de, golo, lui-même,, d'une, essenc...",15.581025,10.953801


In [None]:
training.to_csv('new_training.csv', index=False)
test.to_csv('new_test.csv', index=False)


In [None]:
from google.colab import files
files.download('new_training.csv')
files.download('new_test.csv' )

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import requests
from requests.auth import HTTPBasicAuth
import json
"""

# Personal Access Token (replace with your own token)
github_token = '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'


# GitHub repository information
repo_owner = 'eperroud'
repo_name = 'DataSceinceProject'
training_file_path = 'data/new_training.csv'  # Specify the path where you want to save the training file
test_file_path = 'data/new_test.csv'  # Specify the path where you want to save the test file


# Function to upload a file to GitHub
def upload_to_github(file_path, content, commit_message):
    # Create the GitHub API URL
    url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{file_path}'

    # Prepare headers with the authorization token
    headers = {
        'Authorization': f'token {github_token}',
    }

    # Prepare the data payload for the API request
    payload = {
        'message': commit_message,
        'content': content,
    }

    # Make a PUT request to create or update the file
    response = requests.put(url, headers=headers, json=payload)

    # Check if the file was successfully uploaded
    if response.status_code == 201:
        print(f'File {file_path} uploaded successfully.')
    else:
        print(f'Error uploading file {file_path}. Status code: {response.status_code}, Message: {response.text}')

# Read the content of the training CSV file
with open('new_training.csv', 'r') as file:
    training_content = file.read()

# Upload the training CSV file to GitHub
upload_to_github(training_file_path, training_content, 'Update existing new_training.csv')

# Read the content of the test CSV file
with open('new_test.csv', 'r') as file:
    test_content = file.read()

# Upload the test CSV file to GitHub
upload_to_github(test_file_path, test_content, 'Update existing new_test.csv')
"""

"\n\n# Personal Access Token (replace with your own token)\ngithub_token = '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'\n\n\n# GitHub repository information\nrepo_owner = 'eperroud'\nrepo_name = 'DataSceinceProject'\ntraining_file_path = 'data/new_training.csv'  # Specify the path where you want to save the training file\ntest_file_path = 'data/new_test.csv'  # Specify the path where you want to save the test file\n\n\n# Function to upload a file to GitHub\ndef upload_to_github(file_path, content, commit_message):\n    # Create the GitHub API URL\n    url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{file_path}'\n\n    # Prepare headers with the authorization token\n    headers = {\n        'Authorization': f'token {github_token}',\n    }\n\n    # Prepare the data payload for the API request\n    payload = {\n        'message': commit_message,\n        'content': content,\n    }\n\n    # Make a PUT request to create or update the file\n    response = requests.put(ur