In [1]:
# Import the libraries
import pandas as pd
import numpy as np
import spacy.cli

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.svm import SVC



In [2]:
# Download the French language model
spacy.cli.download("fr_core_news_lg")
nlp = spacy.load("fr_core_news_lg")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_lg')


In [3]:
# Read the training data
df = pd.read_csv("training_data.csv")
df.info

<bound method DataFrame.info of         id                                           sentence difficulty
0        0  Les coûts kilométriques réels peuvent diverger...         C1
1        1  Le bleu, c'est ma couleur préférée mais je n'a...         A1
2        2  Le test de niveau en français est sur le site ...         A1
3        3           Est-ce que ton mari est aussi de Boston?         A1
4        4  Dans les écoles de commerce, dans les couloirs...         B1
...    ...                                                ...        ...
4795  4795  C'est pourquoi, il décida de remplacer les hab...         B2
4796  4796  Il avait une de ces pâleurs splendides qui don...         C1
4797  4797  Et le premier samedi de chaque mois, venez ren...         A2
4798  4798  Les coûts liés à la journalisation n'étant pas...         C2
4799  4799  Sur le sable, la mer haletait de toute la resp...         C2

[4800 rows x 3 columns]>

In [4]:
# Read the unlabelled data
df_pred = pd.read_csv('unlabelled_test_data.csv')
df_pred.info

<bound method DataFrame.info of         id                                           sentence
0        0  Nous dûmes nous excuser des propos que nous eû...
1        1  Vous ne pouvez pas savoir le plaisir que j'ai ...
2        2  Et, paradoxalement, boire froid n'est pas la b...
3        3  Ce n'est pas étonnant, car c'est une saison my...
4        4  Le corps de Golo lui-même, d'une essence aussi...
...    ...                                                ...
1195  1195  C'est un phénomène qui trouve une accélération...
1196  1196  Je vais parler au serveur et voir si on peut d...
1197  1197  Il n'était pas comme tant de gens qui par pare...
1198  1198      Ils deviennent dangereux pour notre économie.
1199  1199  Son succès a généré beaucoup de réactions néga...

[1200 rows x 2 columns]>

In [5]:
# Check the submission format
df_submission = pd.read_csv('sample_submission.csv')
df_submission.info

<bound method DataFrame.info of         id difficulty
0        0         A1
1        1         A1
2        2         A1
3        3         A1
4        4         A1
...    ...        ...
1195  1195         A1
1196  1196         A1
1197  1197         A1
1198  1198         A1
1199  1199         A1

[1200 rows x 2 columns]>

In [6]:
# Apply the nlp sentence vectorizer 
with nlp.disable_pipes():
    sentence_vectors = np.array([nlp(sentence).vector for sentence in df.sentence])
    
sentence_vectors.shape

(4800, 300)

In [7]:
# Select features
y = df.difficulty 

# Split train test data 
X_train_vec, X_test_vec, y_train_vec, y_test_vec = train_test_split(sentence_vectors, y, test_size=0.2, random_state=0, stratify=y)

In [8]:
# Check the length of the X_train_vec
len(X_train_vec)

3840

In [9]:
# Have a look at the sentence vectors
X_train_vec

array([[ 0.86313105, -0.34950605, -0.744026  , ..., -0.31258205,
         1.4116739 , -0.18395697],
       [-0.9572263 ,  0.7255361 , -2.9996173 , ...,  1.3372338 ,
        -2.0699434 ,  0.80701214],
       [-0.00980117, -0.9344252 , -0.66158444, ..., -0.6897755 ,
         1.2421126 ,  0.6886996 ],
       ...,
       [ 0.11343437,  0.21153995, -2.1969724 , ...,  0.17389652,
        -2.536263  ,  2.0801914 ],
       [ 0.9807474 , -2.097571  , -1.0858219 , ..., -1.2354175 ,
         1.6772972 , -1.1694974 ],
       [ 0.4412396 , -1.3123204 , -2.287636  , ..., -0.5434421 ,
        -0.08971963,  0.71144223]], dtype=float32)

In [10]:
y_train_vec

183     C2
90      A2
1128    C2
2336    B1
4398    A1
        ..
3983    A2
1870    A1
394     B2
3244    B2
411     C2
Name: difficulty, Length: 3840, dtype: object

In [11]:
# Try out linear SVC, set dual=False to speed up training
svc = LinearSVC(random_state=0, dual=False, max_iter=10000)
svc.fit(X_train_vec, y_train_vec)
print(f"Accuracy: {svc.score(X_test_vec, y_test_vec):.4f}")

Accuracy: 0.4844


In [12]:
# Use SVC instead of line SVC 
svclassifier = SVC(kernel='linear', degree=8, gamma="auto")
svclassifier.fit(X_train_vec, y_train_vec)
print(f"Accuracy: {svclassifier.score(X_test_vec, y_test_vec):.4f}")

Accuracy: 0.4854


In [None]:
# Use rbf as kernel to see if it is getting any better
svclassifier = SVC(kernel='rbf', degree=8, gamma="auto")
svclassifier.fit(X_train_vec, y_train_vec)
print(f"Accuracy: {svclassifier.score(X_test_vec, y_test_vec):.4f}")

Accuracy: 0.4896


In [None]:
# Continue tuning based on research on the internet, more trial and error
svclassifier = SVC(C=10, kernel='rbf', degree=3, gamma=0.001)
svclassifier.fit(X_train_vec, y_train_vec)
print(f"Accuracy: {svclassifier.score(X_test_vec, y_test_vec):.4f}")

Accuracy: 0.5031


In [13]:
# Use all availabe data to train the model
svclassifier = SVC(C=10, kernel='rbf', degree=3, gamma=0.001)
svclassifier.fit(sentence_vectors, y)
print(f"Accuracy: {svclassifier.score(X_test_vec, y_test_vec):.4f}")

Accuracy: 0.7625


In [14]:
# Vectorize the unlabelled data for predictions
with nlp.disable_pipes():
    pred_sentence_vectors = np.array([nlp(sentence).vector for sentence in df_pred.sentence])
    
pred_sentence_vectors.shape

(1200, 300)

In [15]:
# Use the SVC classifier for predcition, e.g. the one tuned first, then trained with all available data
pred_difficulty = svclassifier.predict(pred_sentence_vectors)

In [16]:
# Have a look at the predictions
pred_difficulty

array(['C2', 'B1', 'B1', ..., 'C2', 'B2', 'B2'], dtype=object)

In [17]:
# Replace the dummy data with predictions
df_submission.difficulty = pred_difficulty
# Write to a csv file
df_submission.to_csv("submission_22_12_word_embedding.csv", index=False)