## PARIS SUBSIDIES: NLP PROCESSING
Adapted from https://medium.com/analytics-vidhya/building-a-text-classifier-with-spacy-3-0-dd16e9979a

### LIBRARIES IMPORT

In [1]:
# Standard library
import sys
import time
from pathlib import Path

# General data libraries
import numpy as np
import pandas as pd

# Sklearn
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split

# Spacy
import spacy
from spacy.tokens import DocBin

# Progress bars
from tqdm.auto import tqdm

### CUSTOM FUNCTIONS

In [2]:
def confusion_matrix_metrics(y_true, y_pred, pos_label):
    """
    Compute the headline metrics of a binary classification.

    y_true and y_pred are the true and predicted labels; pos_label is the
    label treated as the positive class by f1, recall and precision.
    Returns the tuple (accuracy, f1, recall, precision).
    """
    # accuracy does not depend on which class is considered positive
    accuracy = accuracy_score(y_true, y_pred)
    # the three remaining scorers are called with the exact same arguments
    f1, recall, precision = [
        scorer(y_true, y_pred, pos_label=pos_label)
        for scorer in (f1_score, recall_score, precision_score)
    ]
    return accuracy, f1, recall, precision

In [33]:
def make_docs(X, y, nlp=None):
    """
    Turn texts and their labels into spaCy documents carrying textcat labels.

    Parameters
    ----------
    X : array-like exposing .tolist()
        The texts.
    y : array-like exposing .tolist()
        The labels, paired with X by position. 'yes' marks the positive
        category; any other value is encoded as 'no'.
    nlp : spaCy pipeline, optional
        Pipeline used to build the documents. When omitted,
        'fr_core_news_sm' is loaded, exactly as before. Passing an already
        loaded pipeline avoids reloading the model on every call.

    Returns
    -------
    docs : list
        One spaCy document per text, with doc.cats['yes'] and
        doc.cats['no'] set to 1/0 according to the label.
    data : tuple
        The (text, label) pairs the documents were built from.
    """
    data = tuple(zip(X.tolist(), y.tolist()))
    # Show one sample pair as a sanity check. Guarded so that an input with
    # fewer than two rows no longer raises IndexError.
    if len(data) > 1:
        print(data[1])

    if nlp is None:
        nlp = spacy.load('fr_core_news_sm')

    docs = []
    # nlp.pipe(texts) is way faster than running nlp(text) for each text.
    # as_tuples=True lets us pass (text, context) pairs: the first item is
    # treated as text, the second one (the label) is returned as it is.
    for doc, label in tqdm(nlp.pipe(data, as_tuples=True), total=len(data)):
        # we need to set the (text)cat(egory) for each document
        is_granted = label == 'yes'
        doc.cats['yes'] = int(is_granted)
        doc.cats['no'] = int(not is_granted)
        docs.append(doc)

    return docs, data

### DATA IMPORT

In [3]:
# Load the NLP-preprocessed subsidies table; the cells below read its
# `objet_dossier_nlp` (text) and `subsidy_granted` (label) columns.
data = pd.read_feather('../00_DataFiles/05_NLPScoring/ParisSubsidies_NLPPreprocessed.feather')

### TRAIN / TEST DATA PREPARATION

In [4]:
# The preprocessed text is the model input, the grant decision is the label
X, y = data.objet_dossier_nlp, data.subsidy_granted
# Hold out 20% of the rows as test data;
# the fixed random_state gives the same split on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [36]:
# DocBin.to_disk opens the target file directly and does not create missing
# folders, so make sure the output directory exists (fresh checkout / re-run).
Path("./textcat_data").mkdir(parents=True, exist_ok=True)

# Make docs and save as binary file for train data
train_docs, train_data = make_docs(X_train, y_train)
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk("./textcat_data/textcat_train.spacy")

# Make docs and save as binary file for test data
# NOTE(review): this split is passed to `spacy train` as --paths.dev and is
# scored again in the "Test data" section, so those scores are not computed
# on data fully unseen during training — confirm this is intended.
test_docs, test_data = make_docs(X_test, y_test)
doc_bin = DocBin(docs=test_docs)
doc_bin.to_disk("./textcat_data/textcat_valid.spacy")

('jeunesse aller fille prevention delinquanc pour adolescent risque demargin', 'yes')


  0%|          | 0/63968 [00:00<?, ?it/s]

('developpement medalliance lieu ressourc pour mineur accompagne', 'yes')


  0%|          | 0/15993 [00:00<?, ?it/s]

### MODEL TRAINING
Base config file to be created from here: https://spacy.io/usage/training#quickstart

In [37]:
# Init config file
!python3 -m spacy init fill-config ./textcat_base_config.cfg ./textcat_config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
textcat_config.cfg
You can now add your data and train your pipeline:
python -m spacy train textcat_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [38]:
# Train model
!python3 -m spacy train textcat_config.cfg --verbose --output ./textcat_output --paths.train textcat_data/textcat_train.spacy --paths.dev textcat_data/textcat_valid.spacy

[2022-04-20 07:57:57,201] [DEBUG] Config overrides from CLI: ['paths.train', 'paths.dev']
[38;5;4mℹ Saving to output directory: textcat_output[0m
[38;5;4mℹ Using CPU[0m
[1m
[2022-04-20 07:57:59,064] [INFO] Set up nlp object from config
[2022-04-20 07:57:59,067] [DEBUG] Loading corpus from path: textcat_data/textcat_valid.spacy
[2022-04-20 07:57:59,067] [DEBUG] Loading corpus from path: textcat_data/textcat_train.spacy
[2022-04-20 07:57:59,067] [INFO] Pipeline: ['textcat']
[2022-04-20 07:57:59,069] [INFO] Created vocabulary
[2022-04-20 07:57:59,070] [INFO] Finished initializing nlp object
[2022-04-20 07:58:16,611] [INFO] Initialized pipeline components: ['textcat']
[38;5;2m✔ Initialized pipeline[0m
[1m
[2022-04-20 07:58:16,618] [DEBUG] Loading corpus from path: textcat_data/textcat_valid.spacy
[2022-04-20 07:58:16,618] [DEBUG] Loading corpus from path: textcat_data/textcat_train.spacy
[38;5;4mℹ Pipeline: ['textcat'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS T

### MODEL SCORING

#### Model loading

In [5]:
# Load the `model-best` pipeline from the training output directory
# (./textcat_output is the --output folder of the `spacy train` command).
nlp_textcat = spacy.load('./textcat_output/model-best')

#### Train data

In [6]:
# Score every train text with the model's value for the 'yes' category
df_train = pd.concat([X_train, y_train], axis=1)
df_train['nlp_scoring'] = df_train.objet_dossier_nlp.apply(lambda x: nlp_textcat(x).cats['yes'])

# Candidate thresholds from 0.30 to 0.70 by steps of 0.05.
# np.arange accumulates floating-point error with a non-integer step
# (e.g. 0.39999999999999997), so the values are rounded to get exact
# thresholds and exact index labels in the summary table.
thresholds = np.round(np.arange(0.3, 0.71, 0.05), 2)

dct_scoring = dict()

for threshold in thresholds:
    # The predicted column is overwritten on each pass and ends up holding the
    # predictions of the last threshold; the SAVE section drops it by name,
    # so it must keep existing after this loop.
    df_train['subsidy_granted_predicted'] = np.where(df_train.nlp_scoring > threshold, 'yes', 'no')
    dct_scoring[threshold] = confusion_matrix_metrics(df_train.subsidy_granted, df_train.subsidy_granted_predicted, 'yes')

# One row per threshold, one column per metric
pd.DataFrame.from_dict(dct_scoring, orient = 'index', columns = ['accuracy', 'f1', 'recall', 'precision'])

Unnamed: 0,accuracy,f1,recall,precision
0.3,0.614917,0.75953,0.997027,0.613412
0.35,0.621217,0.762307,0.995797,0.617514
0.4,0.633864,0.76754,0.990979,0.626322
0.45,0.655093,0.775442,0.976319,0.643121
0.5,0.694879,0.785243,0.914527,0.687984
0.55,0.697192,0.746976,0.73279,0.761722
0.6,0.638507,0.635383,0.516377,0.82567
0.65,0.575366,0.514504,0.368881,0.850097
0.7,0.531406,0.418988,0.277,0.85962


- f1, accuracy and precision scores all improve slightly as the threshold increases from 0.3 to 0.5; f1 is maximised at 0.5, accuracy peaks just after (0.55) and precision keeps rising up to 0.7
- recall score is close to perfection until 0.45 threshold included, then it starts to drop
- best thresholds on train data = 0.45 or 0.5

#### Test data

In [7]:
# Score every test text with the model's value for the 'yes' category
df_test = pd.concat([X_test, y_test], axis=1)
df_test['nlp_scoring'] = df_test.objet_dossier_nlp.apply(lambda x: nlp_textcat(x).cats['yes'])

# Candidate thresholds from 0.30 to 0.70 by steps of 0.05.
# np.arange accumulates floating-point error with a non-integer step
# (e.g. 0.39999999999999997), so the values are rounded to get exact
# thresholds and exact index labels in the summary table.
thresholds = np.round(np.arange(0.3, 0.71, 0.05), 2)

dct_scoring = dict()

for threshold in thresholds:
    # The predicted column is overwritten on each pass and ends up holding the
    # predictions of the last threshold; the SAVE section drops it by name,
    # so it must keep existing after this loop.
    df_test['subsidy_granted_predicted'] = np.where(df_test.nlp_scoring > threshold, 'yes', 'no')
    dct_scoring[threshold] = confusion_matrix_metrics(df_test.subsidy_granted, df_test.subsidy_granted_predicted, 'yes')

# One row per threshold, one column per metric
pd.DataFrame.from_dict(dct_scoring, orient = 'index', columns = ['accuracy', 'f1', 'recall', 'precision'])

Unnamed: 0,accuracy,f1,recall,precision
0.3,0.614269,0.759108,0.996616,0.613017
0.35,0.621709,0.762391,0.995181,0.617862
0.4,0.631776,0.766133,0.989029,0.625227
0.45,0.643657,0.768681,0.970881,0.636187
0.5,0.657725,0.759807,0.887727,0.66411
0.55,0.656162,0.71235,0.698144,0.727147
0.6,0.61602,0.611894,0.49636,0.797529
0.65,0.566873,0.504471,0.36153,0.834359
0.7,0.529294,0.415437,0.274275,0.856


- f1 scores are very similar between 0.3 and 0.5
- accuracy and precision scores improve slightly as the threshold increases from 0.3 to 0.5; accuracy is maximised at 0.5, while precision keeps rising up to 0.7
- recall score is close to perfection until 0.45 threshold included, then it starts to drop
- best threshold on test data = 0.45


#### Conclusion
We retain a 0.45 threshold for the NLP model predictions.

### SAVE
We add the NLP score to our original (joined) dataset and save the result as new CSV and feather files.

In [8]:
# Load the main (joined) dataframe that will receive the NLP score.
# NOTE(review): the merges below match rows on the index, which assumes this
# file has the same row index as the NLP-preprocessed file — confirm.
data_original = pd.read_feather('../00_DataFiles/03_Joined/ParisSubsidies_Joined.feather')

In [9]:
# Attach the train nlp_scoring to the main dataframe, matching rows on the
# index; every other df_train column is dropped first so only the score is added
data_w_nlpscoring = data_original.merge(
    df_train.drop(columns=['objet_dossier_nlp', 'subsidy_granted', 'subsidy_granted_predicted']),
    how='left',
    left_index=True,
    right_index=True,
)

In [10]:
# Insert nlp_scoring from test data, matching rows on the index.
# Both sides now hold a nlp_scoring column, so the merge suffixes them:
# nlp_scoring_x (left, train scores) and nlp_scoring_y (right, test scores)
data_w_nlpscoring = data_w_nlpscoring.merge(
    df_test.drop(columns=['objet_dossier_nlp', 'subsidy_granted', 'subsidy_granted_predicted']),
    how='left',
    left_index=True,
    right_index=True,
)

In [11]:
# Collapse the two suffixed columns into a single nlp_scoring column:
# keep the train score (_x) when present, otherwise take the test score (_y)
data_w_nlpscoring['nlp_scoring'] = np.where(
    data_w_nlpscoring['nlp_scoring_x'].notna(),
    data_w_nlpscoring['nlp_scoring_x'],
    data_w_nlpscoring['nlp_scoring_y'],
)
# The suffixed columns are no longer needed
data_w_nlpscoring.drop(columns=['nlp_scoring_x', 'nlp_scoring_y'], inplace=True)

In [12]:
# Every row should have received a score from either the train or the test split
data_w_nlpscoring['nlp_scoring'].notna().all()

True

In [13]:
# Save the dataframe enriched with nlp_scoring, in both CSV and feather formats
data_w_nlpscoring.to_csv('../00_DataFiles/05_NLPScoring/ParisSubsidies_NLPScoring.csv')
data_w_nlpscoring.to_feather('../00_DataFiles/05_NLPScoring/ParisSubsidies_NLPScoring.feather')