In [1]:
# reading in the data via the Kaggle API
# mount your Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# install Kaggle
! pip install kaggle



In [3]:
!mkdir ~/.kaggle

In [4]:
#read in your Kaggle credentials from Google Drive
!cp /content/drive/MyDrive/Kaggle/kaggle.json ~/.kaggle/kaggle.json

In [5]:
# download the dataset from the competition page
! kaggle competitions download -c detecting-french-texts-difficulty-level-2023
from zipfile import ZipFile
with ZipFile('detecting-french-texts-difficulty-level-2023.zip','r') as zip:
  zip.extractall(path="")

Downloading detecting-french-texts-difficulty-level-2023.zip to /content
  0% 0.00/303k [00:00<?, ?B/s]
100% 303k/303k [00:00<00:00, 112MB/s]


In [6]:
# read in your training data
import pandas as pd
import numpy as np

training = pd.read_csv('training_data.csv', index_col = 'id')

In [7]:
training.head()
training.dropna()
training.drop_duplicates()


Unnamed: 0_level_0,sentence,difficulty
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Les coûts kilométriques réels peuvent diverger...,C1
1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,Le test de niveau en français est sur le site ...,A1
3,Est-ce que ton mari est aussi de Boston?,A1
4,"Dans les écoles de commerce, dans les couloirs...",B1
...,...,...
4795,"C'est pourquoi, il décida de remplacer les hab...",B2
4796,Il avait une de ces pâleurs splendides qui don...,C1
4797,"Et le premier samedi de chaque mois, venez ren...",A2
4798,Les coûts liés à la journalisation n'étant pas...,C2


The first thing to do in order to train/test the data is to encode the column difficulty. We used labelencoder to have a new column with A1 = 0 ... C2 =5


In [8]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
label_encoder = LabelEncoder()
training['encoded_diff'] = label_encoder.fit_transform(training['difficulty'])


In [9]:
%%capture
!python -m spacy download fr_core_news_lg


Transform our sentences to spacy + tokenize

In [10]:
import numpy as np
import spacy
sp = spacy.load('fr_core_news_lg')
spacy_stopwords = spacy.lang.fr.stop_words.STOP_WORDS



In [11]:
#function that tokenize, takes out stopwords, and counts token in df
def tokenize_stop_words_count(df):
  df['sentence'] = df['sentence'].apply(sp)
  df['tokens'] = df['sentence'].apply(lambda doc: [token.text for token in doc])
  df['tokens_no_stop'] = df['tokens'].apply(lambda tokens: [token for token in tokens if token.lower() not in spacy_stopwords])
  df['token_count'] = df['tokens_no_stop'].apply(len)
  return df

In [12]:
training = tokenize_stop_words_count(training)

In [13]:
def count_verbs_nouns_adj(df):
  df['nb_verbs'] = df['sentence'].apply(lambda x: sum(1 for token in sp(x) if token.pos_ == 'VERB'))
  df['nb_nouns'] = df['sentence'].apply(lambda x: sum(1 for token in sp(x) if token.pos_ == 'NOUN'))
  df['nb_adj'] = df['sentence'].apply(lambda x: sum(1 for token in sp(x) if token.pos_ == 'ADJ'))
  df['nb_adj'] = df['sentence'].apply(lambda x: sum(1 for token in sp(x) if token.pos_ == 'ADV'))
  return df

In [14]:
training = count_verbs_nouns_adj(training)

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf(df):
  tfidf = TfidfVectorizer()
  tfidf_matrix = tfidf.fit_transform(df['sentence'].apply(lambda x: sp(x).text))
  tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())
  df['tfidf_score'] = tfidf_df.sum(axis=1)
  return df


In [21]:
training = tfidf(training)

In [30]:
training.head()

Unnamed: 0_level_0,sentence,difficulty,encoded_diff,tokens,tokens_no_stop,token_count,nb_verbs,nb_nouns,nb_adj,tfidf_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,"(Les, coûts, kilométriques, réels, peuvent, di...",C1,4,"[Les, coûts, kilométriques, réels, peuvent, di...","[coûts, kilométriques, réels, diverger, sensib...",27,4,14,2,5.105955
1,"(Le, bleu, ,, c', est, ma, couleur, préférée, ...",A1,0,"[Le, bleu, ,, c', est, ma, couleur, préférée, ...","[bleu, ,, couleur, préférée, aime, vert, !]",7,1,3,2,3.13313
2,"(Le, test, de, niveau, en, français, est, sur,...",A1,0,"[Le, test, de, niveau, en, français, est, sur,...","[test, niveau, français, site, Internet, école...",7,1,4,0,3.123805
3,"(Est, -ce, que, ton, mari, est, aussi, de, Bos...",A1,0,"[Est, -ce, que, ton, mari, est, aussi, de, Bos...","[-ce, mari, Boston, ?]",4,0,2,1,2.611847
4,"(Dans, les, écoles, de, commerce, ,, dans, les...",B1,2,"[Dans, les, écoles, de, commerce, ,, dans, les...","[écoles, commerce, ,, couloirs, places, financ...",24,4,10,1,5.017778


In [23]:
test = pd.read_csv('unlabelled_test_data.csv')

In [24]:
test = tokenize_stop_words_count(test)

In [25]:
test = count_verbs_nouns_adj(test)

In [26]:
test = tfidf(test)

In [27]:
test.head()

Unnamed: 0,id,sentence,tokens,tokens_no_stop,token_count,nb_verbs,nb_nouns,nb_adj,tfidf_score
0,0,"(Nous, dûmes, nous, excuser, des, propos, que,...","[Nous, dûmes, nous, excuser, des, propos, que,...","[dûmes, excuser, propos, eûmes, prononcés]",5,3,1,0,2.62043
1,1,"(Vous, ne, pouvez, pas, savoir, le, plaisir, q...","[Vous, ne, pouvez, pas, savoir, le, plaisir, q...","[pouvez, savoir, plaisir, recevoir, bonne, nou...",7,4,1,2,3.529864
2,2,"(Et, ,, paradoxalement, ,, boire, froid, n', e...","[Et, ,, paradoxalement, ,, boire, froid, n', e...","[,, paradoxalement, ,, boire, froid, bonne, pa...",8,1,1,3,2.738606
3,3,"(Ce, n', est, pas, étonnant, ,, car, c', est, ...","[Ce, n', est, pas, étonnant, ,, car, c', est, ...","[étonnant, ,, saison, mystérieuse]",4,0,1,2,2.670108
4,4,"(Le, corps, de, Golo, lui-même, ,, d', une, es...","[Le, corps, de, Golo, lui-même, ,, d', une, es...","[corps, Golo, ,, essence, surnaturelle, montur...",40,9,13,8,7.260068


In [28]:
training.to_csv('new_training.csv', index=False)
test.to_csv('new_test.csv', index=False)


In [43]:
from google.colab import files
files.download('new_training.csv')
files.download('new_test.csv' )

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [41]:
import requests
from requests.auth import HTTPBasicAuth
import json
"""

# Personal Access Token (replace with your own token)
github_token = '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'


# GitHub repository information
repo_owner = 'eperroud'
repo_name = 'DataSceinceProject'
training_file_path = 'data/new_training.csv'  # Specify the path where you want to save the training file
test_file_path = 'data/new_test.csv'  # Specify the path where you want to save the test file


# Function to upload a file to GitHub
def upload_to_github(file_path, content, commit_message):
    # Create the GitHub API URL
    url = f'https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{file_path}'

    # Prepare headers with the authorization token
    headers = {
        'Authorization': f'token {github_token}',
    }

    # Prepare the data payload for the API request
    payload = {
        'message': commit_message,
        'content': content,
    }

    # Make a PUT request to create or update the file
    response = requests.put(url, headers=headers, json=payload)

    # Check if the file was successfully uploaded
    if response.status_code == 201:
        print(f'File {file_path} uploaded successfully.')
    else:
        print(f'Error uploading file {file_path}. Status code: {response.status_code}, Message: {response.text}')

# Read the content of the training CSV file
with open('new_training.csv', 'r') as file:
    training_content = file.read()

# Upload the training CSV file to GitHub
upload_to_github(training_file_path, training_content, 'Update existing new_training.csv')

# Read the content of the test CSV file
with open('new_test.csv', 'r') as file:
    test_content = file.read()

# Upload the test CSV file to GitHub
upload_to_github(test_file_path, test_content, 'Update existing new_test.csv')
"""

Error uploading file data/new_training.csv. Status code: 404, Message: {"message":"Not Found","documentation_url":"https://docs.github.com/rest/repos/contents#create-or-update-file-contents"}
Error uploading file data/new_test.csv. Status code: 404, Message: {"message":"Not Found","documentation_url":"https://docs.github.com/rest/repos/contents#create-or-update-file-contents"}
