## PARIS SUBSIDIES: DATA PREPROCESSING
We will use NLP to examine the projects' descriptions

### LIBRARY IMPORTS

In [1]:
# General data libraries
import pandas as pd

# String processing libraries
from unidecode import unidecode
import string

# NLP
import spacy

### CUSTOM FUNCTIONS

In [2]:
# Translation table that deletes every ASCII punctuation character.
# Built once at definition time instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text, min_length=4):
    """Normalise a free-text description before NLP processing.

    Steps, in order:
      1. transliterate to ASCII with ``unidecode`` (accents removed);
      2. delete ASCII punctuation;
      3. drop purely numeric tokens and tokens shorter than ``min_length``;
      4. re-join the remaining tokens with single spaces and lower-case.

    Args:
        text: The raw description. Anything that is not a ``str`` (for
            example ``None`` or ``NaN`` for a missing value) is treated as
            empty text.
        min_length: Minimum number of characters a token needs to be kept.
            The default of 4 matches the previous hard-coded ``len(w) > 3``.

    Returns:
        The cleaned, lower-cased, space-separated string ('' when nothing
        is left or when ``text`` is not a string).
    """
    # Missing values would otherwise raise AttributeError on .translate().
    if not isinstance(text, str):
        return ''

    # Transliterate FIRST: typographic punctuation (curly apostrophes,
    # guillemets, dashes, ellipsis...) is not in string.punctuation, so
    # stripping before transliterating let it through and unidecode then
    # turned it back into ASCII punctuation in the output.
    ascii_text = unidecode(text)

    # NOTE: punctuation is deleted, not replaced by a space, so
    # "adultes-relai" becomes "adultesrelai". Kept as-is so results stay
    # comparable with previously generated data.
    without_punctuation = ascii_text.translate(_PUNCTUATION_TABLE)

    kept_words = [
        word
        for word in without_punctuation.split()
        if not word.isdigit() and len(word) >= min_length
    ]
    return ' '.join(kept_words).lower()

### DATA IMPORT

In [3]:
# Read the joined Paris subsidies table from its Feather file
data = pd.read_feather(
    path='../00_DataFiles/03_Joined/ParisSubsidies_Joined.feather',
)

### PREPROCESSING

In [4]:
# Load the 'fr_core_news_sm' French spaCy pipeline
nlp = spacy.load(name='fr_core_news_sm')

In [5]:
# Clean text: store the normalised descriptions in a new column
data['objet_dossier_nlp'] = data['objet_dossier'].apply(clean_text)

In [6]:
# Tokenisation: run every cleaned description through the spaCy pipeline
data['objet_dossier_nlp'] = data['objet_dossier_nlp'].apply(nlp)

In [7]:
# Stopwords & lemma
# clean_text has already lower-cased the descriptions and removed accents
# with unidecode, so the stop words are transliterated the same way;
# otherwise accented entries could never match the text.
stop_words = {unidecode(word) for word in spacy.lang.fr.stop_words.STOP_WORDS}

# Compare the token *text* with the stop words. A spaCy Token object is not a
# string, so `token not in stop_words` was effectively always True and no stop
# word was ever removed.
data.objet_dossier_nlp = data.objet_dossier_nlp.apply(
    lambda doc: [token.lemma_ for token in doc if token.text not in stop_words]
)

In [8]:
# Join: collapse each list of lemmas back into one space-separated string
data['objet_dossier_nlp'] = data['objet_dossier_nlp'].apply(' '.join)

### SAVE

In [9]:
# Keep only the three columns that make up the NLP dataset
nlp_columns = ['numero_dossier', 'objet_dossier_nlp', 'subsidy_granted']
data_nlp = data[nlp_columns]
data_nlp.head()

Unnamed: 0,numero_dossier,objet_dossier_nlp,subsidy_granted
0,2020_07586,cofinancement ddct post adultesrelai r0041,yes
1,2020_08053,2020dases seple,yes
2,2020_04885,ressourceri ephemere,no
3,2020_05032,mome doperette grotte lascau graf lascard,no
4,2020_02992,fonctionnement cite ours cite falaise,yes


In [10]:
# Write the preprocessed table to disk in both CSV and Feather formats.
# NOTE: to_csv keeps its default index=True, so the CSV gets the row index as
# an extra unnamed first column.
data_nlp.to_csv(
    path_or_buf='../00_DataFiles/05_NLPScoring/ParisSubsidies_NLPPreprocessed.csv',
)
data_nlp.to_feather(
    path='../00_DataFiles/05_NLPScoring/ParisSubsidies_NLPPreprocessed.feather',
)