In [1]:
# Install and update spaCy
!pip install -U spacy

# Download the French language model
!python -m spacy download fr

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
2022-12-22 11:42:11.352454: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'fr' are deprecated. Please use the
full pipeline package name 'fr_core_news_sm' instead.[0m
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fr-core-news-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.4.0/fr_core_news_sm-3.4.0-py3-none-any.whl (16.3 MB)
[K     |████████████████████████████████| 16.3 MB 213 kB/s 
Installing collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-3.4.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')


In [2]:
# Import the libriaries
import pandas as pd
import numpy as np
import spacy

from spacy import displacy
from spacy.lang.fr.stop_words import STOP_WORDS
from spacy.lang.fr import French

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

import string



In [3]:
# Load the French language
sp = spacy.load('fr_core_news_sm')

In [5]:
# Read the training data
df = pd.read_csv("training_data.csv")
df.info

<bound method DataFrame.info of         id                                           sentence difficulty
0        0  Les coûts kilométriques réels peuvent diverger...         C1
1        1  Le bleu, c'est ma couleur préférée mais je n'a...         A1
2        2  Le test de niveau en français est sur le site ...         A1
3        3           Est-ce que ton mari est aussi de Boston?         A1
4        4  Dans les écoles de commerce, dans les couloirs...         B1
...    ...                                                ...        ...
4795  4795  C'est pourquoi, il décida de remplacer les hab...         B2
4796  4796  Il avait une de ces pâleurs splendides qui don...         C1
4797  4797  Et le premier samedi de chaque mois, venez ren...         A2
4798  4798  Les coûts liés à la journalisation n'étant pas...         C2
4799  4799  Sur le sable, la mer haletait de toute la resp...         C2

[4800 rows x 3 columns]>

In [6]:
# Select features
X = df.sentence 
y = df.difficulty 

# Split train test data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

In [7]:
# Exam the train test data split
X_test

3749    Communiquer et penser dans tant de langages im...
2003    Il est également connu pour ses publicités tél...
4401    Et cela ne changera pas, tant qu'aucune des él...
1339    S'il fait beau, l'après-midi vous pouvez aller...
2585    Edgar, étincelant de furie, dominait tous les ...
                              ...                        
849     Or Skype refuse de donner aux autorités polici...
2784    Les robots humanoïdes n'en sont qu'à leurs bal...
1078    Les bruits de klaxon viennent entrecouper les ...
1564    Mais si, ça me fait plaisir d'aller à Bâle ave...
1907    Le chat a voulu s'asseoir dans le fauteuil, ma...
Name: sentence, Length: 960, dtype: object

In [8]:
# Exam the train test data split
y_test

3749    C2
2003    C1
4401    C2
1339    A1
2585    C1
        ..
849     C1
2784    C2
1078    C1
1564    B1
1907    A1
Name: difficulty, Length: 960, dtype: object

In [9]:
# Read the unlabelled data
df_pred = pd.read_csv('unlabelled_test_data.csv')
df_pred.info

<bound method DataFrame.info of         id                                           sentence
0        0  Nous dûmes nous excuser des propos que nous eû...
1        1  Vous ne pouvez pas savoir le plaisir que j'ai ...
2        2  Et, paradoxalement, boire froid n'est pas la b...
3        3  Ce n'est pas étonnant, car c'est une saison my...
4        4  Le corps de Golo lui-même, d'une essence aussi...
...    ...                                                ...
1195  1195  C'est un phénomène qui trouve une accélération...
1196  1196  Je vais parler au serveur et voir si on peut d...
1197  1197  Il n'était pas comme tant de gens qui par pare...
1198  1198      Ils deviennent dangereux pour notre économie.
1199  1199  Son succès a généré beaucoup de réactions néga...

[1200 rows x 2 columns]>

In [10]:
# Check the submission format
df_submission = pd.read_csv('sample_submission.csv')
df_submission.info

<bound method DataFrame.info of         id difficulty
0        0         A1
1        1         A1
2        2         A1
3        3         A1
4        4         A1
...    ...        ...
1195  1195         A1
1196  1196         A1
1197  1197         A1
1198  1198         A1
1199  1199         A1

[1200 rows x 2 columns]>

In [11]:
# Set the random seed to 0 as instructed
np.random.seed = 0

In [12]:
# Calculate the base rate
base_rate = round(df.difficulty.value_counts()[0]/len(df.difficulty), 4)
base_rate

0.1694

In [13]:
# Create the tokenizer function
def spacy_tokenizer(sentence):

    # Collect all punctuations   
    punctuations = string.punctuation
    # Collect all stop words
    stop_words = spacy.lang.fr.stop_words.STOP_WORDS
    
    # Create token objects
    mytokens = sp(sentence)

    # Lemmatize each token and convert each token into lowercase
    mytokens = [ word.lemma_.lower().strip() for word in mytokens ]   
    # Remove stop words and punctuations
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # Return the preprocessed list of tokens
    return mytokens

In [14]:
# Test the tokenizer with a random sentence 
spacy_tokenizer(df.sentence[25])

['-ce',
 'pouvoir',
 'bien',
 'poser',
 'problème',
 'petit',
 'livret',
 'pourtant',
 'attractif',
 '23',
 '4',
 'pourcent',
 'détenteur',
 'arriver',
 'bout']

In [15]:
# Evaluate the model
def evaluate(true, pred):
    precision = precision_score(true, pred, average='weighted')
    recall = recall_score(true, pred, average='weighted')
    f1 = f1_score(true, pred, average='weighted')
    
    print(f"Confusion matrix:\n{confusion_matrix(true, pred)}")
    print(f"Accuracy score:\n{accuracy_score(true, pred):.4f}")
    print(f"Classification report:\n\tPrecision: {precision:.4f}\n\tRecall: {recall:.4f}\n\tF1_Score: {f1:.4f}")

In [16]:
# Create a list of configs
def configs():

    configs = list()
    
    # Define all configs
    ngram_range = [(1,1), (1,2), (1,3), (2,2), (2,3), (3,3)]
    min_df = [1]
    max_df = [1.0]
    analyzer=['word', 'char']
    
    # Create config instances
    for n in ngram_range:
        for i in min_df:
            for j in max_df:
              for a in analyzer:
                    cfg = [n, i, j, a]
                    configs.append(cfg)
    return configs

configs = configs()

In [17]:
# Find the best config for the specific classifer
def tune_configs(classifier):

    # Define a list of results
    result = []
    
    for config in configs:

        # Assign the config to the vectorizer 
        tfidf_vector = TfidfVectorizer(tokenizer=spacy_tokenizer, ngram_range=config[0], min_df=config[1], max_df=config[2], analyzer=config[3])

        # Create a pipeline
        pipe = Pipeline([('vectorizer', tfidf_vector), ('classifier', classifier)])
        # Fit the model on training data
        pipe.fit(X_train, y_train)

        # Evalute the model on test data using the config
        print("Config: ", config)
        y_pred = pipe.predict(X_test)
        evaluate(y_test, y_pred)
        
        print("-----------------------")

In [18]:
# Find the best config and tune the hyperparamters for the specific classifer
def tune_configs_hyperpm(classifier, grid):

    # Define a list of result
    result = []

    # Create the grid search for the classifer using the given grid
    classifier_cv = GridSearchCV(estimator=classifier, param_grid=grid, n_jobs=-1, verbose=1, scoring = "accuracy") 
    
    for config in configs:

        # Assign the config to the vectorizer 
        tfidf_vector = TfidfVectorizer(tokenizer=spacy_tokenizer, ngram_range=config[0], min_df=config[1], max_df=config[2], analyzer=config[3])

        # Create a pipeline
        pipe = Pipeline([('vectorizer', tfidf_vector), ('classifier', classifier_cv)])
        # Fit the model on training data
        pipe.fit(X_train, y_train)

        # Tune the hyperparameters using the config
        print("Config: ", config)
        print("Hyperparameters:", classifier_cv.best_params_)
        print("Train Score:", round(classifier_cv.best_score_, 4))
        print("-----------------------")

Logistic regression as classifer

---



In [19]:
# Define the vectorizer
tfidf_vector = TfidfVectorizer(tokenizer=spacy_tokenizer)
# Define the classifier
log_reg = LogisticRegression()

# Create a pipeline
pipe = Pipeline([('vectorizer', tfidf_vector), ('classifier', log_reg)])
# Fit the model on training data
pipe.fit(X_train, y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function spacy_tokenizer at 0x7fa6e834d790>)),
                ('classifier', LogisticRegression())])

In [20]:
# Use the model for prection on test data
y_pred = pipe.predict(X_test)
# Evaluate the prediction
evaluate(y_test, y_pred)

Confusion matrix:
[[90 37 15  9  7  5]
 [37 58 30 14 11  9]
 [28 36 39 29 13 14]
 [ 6  8 20 67 20 37]
 [ 9  3 10 25 69 44]
 [ 5  7 11 16 29 93]]
Accuracy score:
0.4333
Classification report:
	Precision: 0.4268
	Recall: 0.4333
	F1_Score: 0.4280


In [None]:
# Find the best config for the specific classifer
result = tune_configs(log_reg)

In [21]:
# Apply the best config: [(1, 3), 1, 1.0, 'char']
tfidf_vector = TfidfVectorizer(tokenizer=spacy_tokenizer, ngram_range=(1, 3), min_df=1, max_df=1.0, analyzer='char')

pipe = Pipeline([('vectorizer', tfidf_vector), ('classifier', log_reg)])
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
evaluate(y_test, y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Confusion matrix:
[[114  23  17   4   4   1]
 [ 42  65  41   5   2   4]
 [ 20  35  63  21   6  14]
 [  7   5   9  59  43  35]
 [  5   3   8  37  60  47]
 [  5  10   4  24  32  86]]
Accuracy score:
0.4656
Classification report:
	Precision: 0.4600
	Recall: 0.4656
	F1_Score: 0.4609


In [22]:
# Use the model to predict the unlabelled test data
y_pred_unlabelled = pipe.predict(df_pred.sentence)
len(y_pred_unlabelled)

1200

In [23]:
# Replace the dummy data with predictions
df_submission.difficulty = y_pred_unlabelled
df_submission

Unnamed: 0,id,difficulty
0,0,C2
1,1,B1
2,2,A1
3,3,B1
4,4,C2
...,...,...
1195,1195,B1
1196,1196,A2
1197,1197,C2
1198,1198,B2


In [24]:
# Write to a csv file
df_submission.to_csv("submission_22_12_log_reg.csv", index=False)

In [None]:
# It looks that the model is having trouble to detect among A2, B1, B2, C1, check out some random samples of B2 difficulty 
y_test[y_test == "B2"].sample(3)

4671    B2
4710    B2
1270    B2
Name: difficulty, dtype: object

In [None]:
# Have a look at the model's predictions
tmp = [df.sentence[4671], df.sentence[4710], df.sentence[1270]]
y_pred = pipe.predict(tmp)
y_pred

array(['C1', 'C1', 'B2'], dtype=object)

KNN as classifer

---



In [None]:
# Use the same vectorizer, but this time use knn as classifier
tfidf_vector = TfidfVectorizer(tokenizer=spacy_tokenizer)
knn = KNeighborsClassifier(n_neighbors=5)

pipe = Pipeline([('vectorizer', tfidf_vector), ('classifier', knn)])
pipe.fit(X_train, y_train)

In [None]:
y_pred = pipe.predict(X_test)
evaluate(y_test, y_pred)

In [None]:
# Create the parameter grid in terms of n_neighbors, p, weights
grid = {'n_neighbors':np.arange(1,5), 'p':np.arange(1,3), 'weights':['uniform','distance']}

# Find the best config and tune the hyperparamters for the specific classifer
result = tune_configs_hyperpm(knn, grid)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
CONFIG:  [(1, 1), 1, 1.0, 'word']
Hyperparameters: {'n_neighbors': 2, 'p': 1, 'weights': 'distance'}
Train Score: 0.1966
-----------------------
Fitting 5 folds for each of 16 candidates, totalling 80 fits
CONFIG:  [(1, 1), 1, 1.0, 'char']
Hyperparameters: {'n_neighbors': 4, 'p': 1, 'weights': 'uniform'}
Train Score: 0.3492
-----------------------
Fitting 5 folds for each of 16 candidates, totalling 80 fits
CONFIG:  [(1, 2), 1, 1.0, 'word']
Hyperparameters: {'n_neighbors': 2, 'p': 2, 'weights': 'distance'}
Train Score: 0.187
-----------------------
Fitting 5 folds for each of 16 candidates, totalling 80 fits
CONFIG:  [(1, 2), 1, 1.0, 'char']
Hyperparameters: {'n_neighbors': 4, 'p': 1, 'weights': 'distance'}
Train Score: 0.3909
-----------------------
Fitting 5 folds for each of 16 candidates, totalling 80 fits
CONFIG:  [(1, 3), 1, 1.0, 'word']
Hyperparameters: {'n_neighbors': 2, 'p': 2, 'weights': 'distance'}
Train Score: 0.1

In [None]:
# Apply the best config and the tuned hyperparameters
# CONFIG: [(3, 3), 1, 1.0, 'char']
# Hyperparameters: {'n_neighbors': 1, 'p': 2, 'weights': 'uniform'}
tfidf_vector = TfidfVectorizer(tokenizer=spacy_tokenizer, ngram_range=(3, 3), min_df=1, max_df=1.0, analyzer='char')
knn = KNeighborsClassifier(n_neighbors=1, p=2, weights='uniform')

pipe = Pipeline([('vectorizer', tfidf_vector), ('classifier', knn)])
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
evaluate(y_test, y_pred)

CONFUSION MATRIX:
[[112  27  14   6   1   3]
 [ 55  64  27   4   3   6]
 [ 57  33  39  12   7  11]
 [ 23  10  20  54  17  34]
 [ 12   9   9  20  48  62]
 [  9   8   9  12  20 103]]
ACCURACY SCORE:
0.4375
CLASSIFICATION REPORT:
	Precision: 0.4404
	Recall: 0.4375
	F1_Score: 0.4235


Decision tree as classifer

---



In [None]:
# Use the same vectorizer, but this time use decision tree as classifier
tfidf_vector = TfidfVectorizer(tokenizer=spacy_tokenizer)
tree = DecisionTreeClassifier()

pipe = Pipeline([('vectorizer', tfidf_vector), ('classifier', tree)])
pipe.fit(X_train, y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(tokenizer=<function spacy_tokenizer at 0x7f1c15f6aca0>)),
                ('classifier', DecisionTreeClassifier())])

In [None]:
# Get the depth of the tree
tree.get_depth()

864

In [None]:
y_pred = pipe.predict(X_test)
evaluate(y_test, y_pred)

Confusion matrix:
[[91 45 22  3  2  0]
 [49 61 26 12  8  3]
 [32 40 42 21 12 12]
 [36 18 14 42 23 25]
 [23 16 26 22 38 35]
 [22 14 19 23 42 41]]
Accuracy score:
0.3281
Classification report:
	Precision: 0.3260
	Recall: 0.3281
	F1_Score: 0.3200


In [None]:
# Create the parameter grid based on the known depth of the tree
grid = {'max_depth': [550, 650, 600, 750, 800]}

# Find the best config and tune the hyperparamters for the specific classifer
result = tune_configs_hyperpm(tree, grid)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Config:  [(1, 1), 1, 1.0, 'word']
Hyperparameters: {'max_depth': 750}
Train Score: 0.3484
-----------------------
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Config:  [(1, 1), 1, 1.0, 'char']
Hyperparameters: {'max_depth': 650}
Train Score: 0.3289
-----------------------
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Config:  [(1, 2), 1, 1.0, 'word']
Hyperparameters: {'max_depth': 650}
Train Score: 0.3328
-----------------------
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Config:  [(1, 2), 1, 1.0, 'char']
Hyperparameters: {'max_depth': 750}
Train Score: 0.3232
-----------------------
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Config:  [(1, 3), 1, 1.0, 'word']
Hyperparameters: {'max_depth': 550}
Train Score: 0.3396
-----------------------
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Config:  [(1, 3), 1, 1.0, 'char']
Hyperparameters: {'max_depth': 650}


In [None]:
# Apply the best config and the tuned hyperparameters
# Config: [(1, 1), 1, 1.0, 'word']
# Hyperparameters: {'max_depth': 750}
tfidf_vector = TfidfVectorizer(tokenizer=spacy_tokenizer, ngram_range=(1, 1), min_df=1, max_df=1.0, analyzer='word')
tree = DecisionTreeClassifier(max_depth=750)

pipe = Pipeline([('vectorizer', tfidf_vector), ('classifier', tree)])
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
evaluate(y_test, y_pred)

Confusion matrix:
[[91 40 23  3  4  2]
 [51 62 26 11  8  1]
 [40 38 45 16 12  8]
 [36 19 15 38 25 25]
 [27 15 22 26 42 28]
 [25 17 18 26 40 35]]
Accuracy score:
0.3260
Classification report:
	Precision: 0.3259
	Recall: 0.3260
	F1_Score: 0.3167


Random forest as classifier

---



In [None]:
# Use the same vectorizer, but this time use random forest as classifier
tfidf_vector = TfidfVectorizer(tokenizer=spacy_tokenizer, ngram_range=(1, 3), min_df=1, max_df=1.0, analyzer='char')
forest = RandomForestClassifier()

pipe = Pipeline([('vectorizer', tfidf_vector), ('classifier', forest)])
pipe.fit(X_train, y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(analyzer='char', ngram_range=(1, 3),
                                 tokenizer=<function spacy_tokenizer at 0x7f1c15f6aca0>)),
                ('classifier', RandomForestClassifier())])

In [None]:
y_pred = pipe.predict(X_test)
evaluate(y_test, y_pred)

Confusion matrix:
[[120  21  18   3   1   0]
 [ 50  61  35   3   5   5]
 [ 23  38  60  22  12   4]
 [  9  10  18  65  32  24]
 [  6   9  13  35  49  48]
 [ 11   8  15  22  49  56]]
Accuracy score:
0.4281
Classification report:
	Precision: 0.4193
	Recall: 0.4281
	F1_Score: 0.4207


In [None]:
# Find the best config for the specific classifer
result = tune_configs(forest)

Config:  [(1, 1), 1, 1.0, 'word']
Confusion matrix:
[[123  24  12   2   2   0]
 [ 77  54  23   2   2   1]
 [ 55  40  45  13   3   3]
 [ 45  13  21  42  16  21]
 [ 39  11  15  22  40  33]
 [ 26  14  19  14  22  66]]
Accuracy score:
0.3854
Classification report:
	Precision: 0.4102
	Recall: 0.3854
	F1_Score: 0.3733
-----------------------
Config:  [(1, 1), 1, 1.0, 'char']
Confusion matrix:
[[107  32  17   4   2   1]
 [ 48  64  36   6   3   2]
 [ 22  50  53  13   7  14]
 [  5   7  28  55  32  31]
 [  8   2  18  41  49  42]
 [  9   8  15  25  39  65]]
Accuracy score:
0.4094
Classification report:
	Precision: 0.4039
	Recall: 0.4094
	F1_Score: 0.4049
-----------------------
Config:  [(1, 2), 1, 1.0, 'word']
Confusion matrix:
[[140  15   8   0   0   0]
 [106  34  17   0   1   1]
 [ 90  37  24   5   1   2]
 [ 68  14  28  30   7  11]
 [ 65  13  20  22  23  17]
 [ 52  11  24  15  13  46]]
Accuracy score:
0.3094
Classification report:
	Precision: 0.3778
	Recall: 0.3094
	F1_Score: 0.2829
----------

In [None]:
# Apply the best config: [(2, 3), 1, 1.0, 'char']
tfidf_vector = TfidfVectorizer(tokenizer=spacy_tokenizer, ngram_range=(2, 3), min_df=1, max_df=1.0, analyzer='char')

pipe = Pipeline([('vectorizer', tfidf_vector), ('classifier', forest)])
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
evaluate(y_test, y_pred)

Confusion matrix:
[[122  22  13   3   3   0]
 [ 57  62  26   5   3   6]
 [ 29  45  47  23  10   5]
 [  8   9  22  64  26  29]
 [ 12   5  10  39  48  46]
 [  7  14   9  27  37  67]]
Accuracy score:
0.4271
Classification report:
	Precision: 0.4167
	Recall: 0.4271
	F1_Score: 0.4169
