In [None]:
%%capture
!pip install sentencepiece
!pip install transformers

In [None]:
#DO ALL NECESSARY IMPORTS
from google.colab import files
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import CamembertConfig, CamembertModel, CamembertTokenizer, CamembertTokenizer, CamembertForSequenceClassification
from transformers import BertModel, BertTokenizer
import torch


In [None]:
# Sentence embeddings are produced BERT-style, using the French CamemBERT
# checkpoint as the encoder.
model_name = 'camembert-base'
camembert_model = CamembertModel.from_pretrained(model_name)
# The tokenizer files (including the SentencePiece model) are fetched from the
# same checkpoint. The former `sentencepiece_model=<dist-packages dir>` keyword
# was dropped: it is not a CamembertTokenizer argument and it hard-coded an
# absolute, Python-version-specific directory.
tokenizer = CamembertTokenizer.from_pretrained(model_name, revision="main")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:

def get_bert_embedding(sentence, model, tokenizer):
    """Return mean-pooled last-layer embeddings for one sentence or a batch.

    Args:
        sentence: A string, or a list of strings, handed to ``tokenizer``.
        model: Callable accepting the tokenizer output as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor indexed as
            (batch, tokens, hidden).
        tokenizer: Callable returning a mapping of tensors that includes
            ``attention_mask`` (1 for real tokens, 0 for padding).

    Returns:
        A NumPy array with one row per input sentence and one column per
        hidden dimension.
    """
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Average over real tokens only. A plain mean over the token axis would
    # also average the padding positions whenever a batch of sentences with
    # different lengths is passed. For a single sentence the mask is all ones,
    # so this is the ordinary mean.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    token_sum = (hidden * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    embeddings = (token_sum / token_count).numpy()
    return embeddings

def max_pool_embeddings(embeddings):
    """Collapse the first axis of ``embeddings`` by taking the element-wise maximum.

    For a 2-D input of shape (rows, dims) the result has shape (dims,); with a
    single row this simply returns that row as a 1-D array.
    """
    pooled = np.asarray(embeddings).max(axis=0)
    return pooled

def final_camembert(df):
    """Attach CamemBERT embedding columns to ``df`` and return it.

    Reads the 'sentence' column and writes two new columns on the same frame:
    'camembert_embedding' (output of get_bert_embedding, using the notebook-level
    ``camembert_model`` and ``tokenizer``) and 'cam_pooled_embedding' (that
    output passed through max_pool_embeddings). The frame is modified in place.
    """
    def _embed(text):
        return get_bert_embedding(text, camembert_model, tokenizer)

    raw_embeddings = df['sentence'].apply(_embed)
    df['camembert_embedding'] = raw_embeddings
    df['cam_pooled_embedding'] = df['camembert_embedding'].apply(max_pool_embeddings)
    return df

In [None]:
import pandas as pd

# Both CSVs are read from a fixed commit of the project repository, so the
# inputs stay the same from run to run.
training = pd.read_csv('https://github.com/eperroud/DataScienceProject/raw/3686315a1ffa6d20da666d2c373ac1569c22c9d1/data/final_training.csv')
test = pd.read_csv('https://github.com/eperroud/DataScienceProject/raw/3686315a1ffa6d20da666d2c373ac1569c22c9d1/data/final_test.csv')

In [None]:
# Add the 'camembert_embedding' and 'cam_pooled_embedding' columns to the
# training frame (one model forward pass per sentence).
training = final_camembert(training)

In [None]:
# Same embedding columns for the unlabeled test frame, so it can be fed to the
# model trained on the training frame.
test = final_camembert(test)

In [None]:
# Sanity check: the two embedding columns should now appear next to the
# existing feature columns.
test.head()

Unnamed: 0,id,sentence,sentence_sp,tokens,tokens_no_stop,token_count_no_stop,token_count,nb_verbs,nb_nouns,nb_adj,nb_adv,words,tfidf_score_unigram,tfidf_score_bigram,camembert_embedding,cam_pooled_embedding
0,0,Nous dûmes nous excuser des propos que nous eû...,Nous dûmes nous excuser des propos que nous eû...,"['Nous', 'dûmes', 'nous', 'excuser', 'des', 'p...","['dûmes', 'excuser', 'propos', 'eûmes', 'prono...",5,10,3,1,0,0,"['nous', 'dûmes', 'nous', 'excuser', 'des', 'p...",3.192917,2.331769,"[[0.047271907, 0.030453198, -0.022989253, 0.10...","[0.047271907, 0.030453198, -0.022989253, 0.109..."
1,1,Vous ne pouvez pas savoir le plaisir que j'ai ...,Vous ne pouvez pas savoir le plaisir que j'ai ...,"['Vous', 'ne', 'pouvez', 'pas', 'savoir', 'le'...","['pouvez', 'savoir', 'plaisir', 'recevoir', 'b...",7,16,4,1,2,2,"['vous', 'ne', 'pouvez', 'pas', 'savoir', 'le'...",12.660056,8.902162,"[[-0.0065091653, -0.11096406, -0.0380778, 0.06...","[-0.0065091653, -0.11096406, -0.0380778, 0.065..."
2,2,"Et, paradoxalement, boire froid n'est pas la b...","Et, paradoxalement, boire froid n'est pas la b...","['Et', ',', 'paradoxalement', ',', 'boire', 'f...","[',', 'paradoxalement', ',', 'boire', 'froid',...",8,13,1,1,2,3,"['et,', 'paradoxalement,', 'boire', 'froid', ""...",8.244312,5.840872,"[[-0.035896078, -0.04058774, 0.08143554, 0.039...","[-0.035896078, -0.04058774, 0.08143554, 0.0398..."
3,3,"Ce n'est pas étonnant, car c'est une saison my...","Ce n'est pas étonnant, car c'est une saison my...","['Ce', ""n'"", 'est', 'pas', 'étonnant', ',', 'c...","['étonnant', ',', 'saison', 'mystérieuse']",4,12,0,1,2,2,"['ce', ""n'est"", 'pas', 'étonnant,', 'car', ""c'...",1.943283,1.444063,"[[0.034818918, -0.037731417, -0.055650946, 0.0...","[0.034818918, -0.037731417, -0.055650946, 0.08..."
4,4,"Le corps de Golo lui-même, d'une essence aussi...","Le corps de Golo lui-même, d'une essence aussi...","['Le', 'corps', 'de', 'Golo', 'lui-même', ',',...","['corps', 'Golo', ',', 'essence', 'surnaturell...",40,83,9,13,5,8,"['le', 'corps', 'de', 'golo', 'lui-même,', ""d'...",15.581025,10.953801,"[[0.011139873, 0.057788055, -0.029189765, 0.12...","[0.011139873, 0.057788055, -0.029189765, 0.120..."


In [None]:
from sklearn.model_selection import train_test_split

# Hand-crafted features that are combined with the pooled CamemBERT embedding.
pred_features = ['token_count_no_stop', 'tfidf_score_bigram', 'nb_nouns', 'nb_adj', 'nb_verbs']

# One column per embedding dimension. Reusing the training index guarantees the
# row-wise concat below lines up even if `training` does not have a default
# 0..n-1 index.
X_cam = pd.DataFrame(training['cam_pooled_embedding'].tolist(), index=training.index)
X = pd.concat([training[pred_features], X_cam], axis=1)
# The embedding columns are named with integers; convert every name to str so
# the frame does not mix str and int column names when passed to scikit-learn.
X.columns = X.columns.astype(str)

# 1-D target. A single-column DataFrame (training[['encoded_diff']]) makes
# scikit-learn emit a "column-vector y was passed" DataConversionWarning on fit.
y = training['encoded_diff']

# 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)


In [None]:
%%capture
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

RF = RandomForestClassifier(random_state=42)
parameters= {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
}

#perform grid search to find the best parameters
grid_search = GridSearchCV(RF, parameters, cv=5, scoring='accuracy')

grid_search.fit(X_train, y_train)


In [None]:
# Rebind RF from the unfitted template to the fitted model that the grid
# search selected as best.
RF = grid_search.best_estimator_


In [None]:
# Predictions on the hold-out split, used for the precision/recall/F1 report.
y_pred_RF = RF.predict(X_test)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# "Test set" here is the 20% hold-out produced by train_test_split, not the
# unlabeled Kaggle test file.
# Accuracy on the hold-out set
print('Accuracy of Random Forest classifier on test set: {:.2f}'
     .format(RF.score(X_test, y_test)))

# Accuracy on the training set; a large gap to the hold-out accuracy is a sign
# of overfitting.
print('Accuracy of Random Forest classifier on training set: {:.2f}'
     .format(RF.score(X_train, y_train)))

# 'weighted' averages the per-class scores weighted by class support.
average_type = 'weighted'

precision = precision_score(y_test, y_pred_RF, average=average_type)
recall = recall_score(y_test, y_pred_RF, average=average_type)
f1 = f1_score(y_test, y_pred_RF, average=average_type)

print('The precision is: {:0.3f}'.format(precision))
print('The recall is: {:0.3f}'.format(recall))
print('The F1 score is: {:0.3f}'.format(f1))

Accuracy of Random Forest regression classifier on test set: 0.50
Accuracy of Random Forest regression classifier on training set: 1.00
The precision is: 0.499
The recall is: 0.503
The F1 score is: 0.499


In [None]:
# Build the prediction matrix with the same column layout as the training
# matrix X: hand-crafted features first, then one column per embedding
# dimension. Reusing test.index keeps the concat row-aligned even if `test`
# does not have a default 0..n-1 index.
X_cam_pred = pd.DataFrame(test['cam_pooled_embedding'].tolist(), index=test.index)
X_pred = pd.concat([test[pred_features], X_cam_pred], axis=1)
X_pred.columns = X_pred.columns.astype(str)
# Store the encoded class predicted by the tuned random forest on the test frame.
test['difficulty encoded'] = RF.predict(X_pred)

In [None]:
# Quick look at the first encoded predictions (integer class codes).
test['difficulty encoded'].head()

0    5
1    2
2    2
3    1
4    5
Name: difficulty encoded, dtype: int64

In [None]:
def prep_final_pred(df):
  """Clean the predictions and return the final dataset to upload to Kaggle.

  Args:
    df: DataFrame with an 'id' column and a 'difficulty encoded' column holding
      integer class codes 0-5.

  Returns:
    A new DataFrame indexed by 'id' with a single 'difficulty' column holding
    the level label ('A1' ... 'C2'). Codes outside 0-5 map to NaN.

  Side effect:
    A 'difficulty' column is also added to ``df`` itself.
  """
  number_to_level = {
    0: 'A1',
    1: 'A2',
    2: 'B1',
    3: 'B2',
    4: 'C1',
    5: 'C2',
  }

  df['difficulty'] = df['difficulty encoded'].map(number_to_level)
  columns_to_keep = ['id', 'difficulty']
  # set_index without inplace returns a fresh frame; calling it with
  # inplace=True on a selection of df is the chained-assignment pattern that
  # pandas warns about.
  final_df = df[columns_to_keep].set_index('id')

  return final_df


In [None]:
# Submission frame: indexed by 'id', single 'difficulty' column.
final_test = prep_final_pred(test)

In [None]:
# Written to the working directory; the 'id' index becomes the first CSV column.
final_test.to_csv('submission.csv')

In [None]:
# Check the submission layout before uploading.
final_test.head()

Unnamed: 0_level_0,difficulty
id,Unnamed: 1_level_1
0,C2
1,B1
2,B1
3,A2
4,C2


In [None]:
! pip install kaggle



In [None]:
# Mount Google Drive so the Kaggle API token stored there can be copied into
# the runtime. force_remount=True is passed so the mount is redone on re-runs.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [None]:
!mkdir ~/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [None]:
!cp /content/drive/MyDrive/Kaggle/kaggle.json ~/.kaggle/kaggle.json

In [None]:
# Upload submission.csv to the competition with the Kaggle CLI; -m sets the
# description attached to the submission.
! kaggle competitions submit -c detecting-french-texts-difficulty-level-2023 -f submission.csv -m "Sample submission"


100% 8.30k/8.30k [00:00<00:00, 32.4kB/s]
Successfully submitted to Detecting the difficulty level of French texts