# Lemmatizing files using spaCy

In this notebook, we will lemmatize our corpus. This needs to be done for each language separately. Lemmatizing is not obligatory for Topic Modeling, but if your lemmatization model works well with your corpus, we recommend it, since this can improve the quality of the topics.<br>  
<i>spaCy</i> is a Python library for natural language processing. See more: https://spacy.io/. 

In [1]:
import warnings
warnings.filterwarnings('ignore')
from cophi_toolbox import preprocessing
import metadata_toolbox.utils as metadata
import pandas as pd
from pathlib import Path
import spacy
import string

In [2]:
# Corpus location and language selection for this run.
data = 'Y:/data/projekte/dispecs/topicModeling'
language = 'fr'  # two-letter language abbreviation
# Careful! The files in this folder will be overwritten, so make a backup :)
path_to_corpus = Path(data) / f'dispecs_{language}_lemmatized'

In [3]:
# Naming scheme used to split each file name into metadata fields, e.g.
# 1716_Le-Spectateur-ou-le-Socrate-moderne_Anonym_Table-des-Matieres_119-1257
pattern = '{year}_{journal}_{author}_{volume}_{issue}_{id}'

# One metadata object per .txt file in the corpus folder
# (presumably a one-row DataFrame each -- confirm against metadata_toolbox).
metadata_per_file = [
    metadata.fname2metadata(str(path), pattern=pattern)
    for path in path_to_corpus.glob('*.txt')
]
meta = pd.concat(metadata_per_file)

In [4]:
# Preview the first five metadata rows.
meta.head()

Unnamed: 0,year,journal,author,volume,issue,id
Y:\data\projekte\dispecs\topicModeling\dispecs_fr_lemmatized_test\1711-1712_Le-Misantrope_Justus-Van-Effen_Vol-1_Nr-001_2948.txt,1711-1712,Le-Misantrope,Justus-Van-Effen,Vol-1,Nr-001,2948


In [5]:
# Number of metadata rows collected from the corpus folder.
meta.shape[0]

1

In [6]:
"""
Write your own dictionaries for lemmatization of special cases.

Each dictionary maps the lemma that should end up in the corpus (key) to the
list of forms that must be replaced by it (values).
The usage of upper and lowercase letters in values is relevant, so be sure to
correct both versions, if needed.
"""
corr_fr = {
    "avoir": ["avois", "avoit", "Avois", "Avoit"],
    "dire": ["disois", "disoit", "Disois", "Disoit"],
    "manière": ["maniere", "Maniere"],
    "pièce": ["piéce", "Piéce"],
    "poète": ["poëte", "Poëte", "Poëtes", "poëtes"],
    "poème": ["poëme", "Poëme"],
    "poésie": ["poësie", "Poësie"],
    "sexe": ["séxe", "Séxe"],
    "moyen": ["moïen", "Moïen"],
    # The key used to be the misspelling "thèatre", which turned the correct
    # form "théâtre" into a wrong lemma. The correct spelling is now the target.
    "théâtre": ["Théâtre", "théatre", "Théatre"],
    "comédie": ["comédien", "Comédien"],
    "tragédie": ["tragedie", "Tragedie"],
    "société": ["societe", "societé", "Societe", "Societé"],
    "feuille": ["feüille", "Feüille"],
}

corr_es = {
    "decir": ["dixo", "decia", "Dixo", "Decia"],
    "ir": ["iba", "Iba"],
    # The key used to be the typo "pacerer"; the infinitive is "parecer".
    "parecer": ["parecia", "Parecia"],
    "poder": ["podia", "Podia"],
    "ser": ["fuesse", "Fuesse"],
    "haber": ["habia", "havia", "Habia", "Havia"],
    "ahora": ["aora", "Aora"],
    "estar": ["estàn", "Estàn"],
    "lujo": ["luxo", "luxar", "Luxo", "Luxar"],
    "razón": ["razon", "razòn", "Razon", "Razòn"],
    "caballero": ["cavallero", "Cavallero"],
    "mujer": ["muger", "mugeres", "Muger", "Mugeres"],
    "vez": ["vèz", "Vèz"],
    "jamás": ["jamas", "Jamas"],
    "demás": ["demas", "demàs", "Demas", "Demàs"],
    "cuidar": ["cuydado", "Cuydado"],
    "posible": ["possible", "Possible"],
    "comedia": ["comediar", "Comedias"],
    "poeta": ["poetas", "Poetas"],
    "mano": ["manir", "Manir"],
    "barba": ["barbar", "Barbar"],
    "idea": ["ideo", "Ideo"],
    "nada": ["nadar", "Nadar"],
}

# Used for languages without special cases.
corr_none = {}

In [7]:
"""
Load the language packages and special lemmatization rules defined in the dictionaries above.

-----> Language packages (have to be installed first, see here: https://spacy.io/usage/models):
French: fr_core_news_lg
Spanish: es_core_news_lg
Italian: it_core_news_lg
English: en_core_web_lg
Portuguese: pt_core_news_lg
German: de_core_news_lg
"""
# language code -> (spaCy package name, correction dictionary, confirmation message)
_LANGUAGE_SETUP = {
    'fr': ('fr_core_news_lg', corr_fr, 'French package loaded. Corrections dictionary for French defined.'),
    'it': ('it_core_news_lg', corr_none, 'Italian package loaded. Correction dictionary is empty.'),
    'es': ('es_core_news_lg', corr_es, 'Spanish package loaded. Corrections dictionary for Spanish defined.'),
    'de': ('de_core_news_lg', corr_none, 'German package loaded. Correction dictionary is empty.'),
    'en': ('en_core_web_lg', corr_none, 'English package loaded. Correction dictionary is empty.'),
    'pt': ('pt_core_news_lg', corr_none, 'Portuguese package loaded. Correction dictionary is empty.'),
}

# Fail loudly on an unsupported code. Without this check, `nlp` and
# `correction_dictionary` would stay undefined (or keep the values from a
# previous run with another language) and the error would surface much later.
if language not in _LANGUAGE_SETUP:
    raise ValueError(
        "Unsupported language " + repr(language) + "; expected one of: "
        + ", ".join(sorted(_LANGUAGE_SETUP))
    )

_model_name, _corrections, _loaded_message = _LANGUAGE_SETUP[language]
nlp = spacy.load(_model_name)
correction_dictionary = _corrections
print(_loaded_message)

French package loaded. Corrections dictionary for French defined.


In [8]:
# Customize tokenizer to not tokenize the paragraph marker ('###')
from spacy.tokenizer import Tokenizer

PARAGRAPH_MARKER = "###"
special_cases = {PARAGRAPH_MARKER: [{"ORTH": PARAGRAPH_MARKER}]}


def custom_tokenizer(nlp):
    """Return a tokenizer for ``nlp`` that keeps the paragraph marker as one token.

    The tokenizer is built from the pipeline's vocabulary and ``special_cases``
    only; no prefix, suffix or infix patterns are passed to it.
    NOTE(review): without those patterns the text is presumably split on
    whitespace only -- confirm against the spaCy Tokenizer documentation.
    """
    marker_aware_tokenizer = Tokenizer(nlp.vocab, rules=special_cases)
    return marker_aware_tokenizer


nlp.tokenizer = custom_tokenizer(nlp)

In [38]:
# Lemmatization of all text files in the corpus. The files will be overwritten.
# Running this cell twice lemmatizes already lemmatized text, so keep a backup.

# Characters deleted from the raw text before it is handed to spaCy.
# The backslash is spelled out as '\\': the previous literal relied on the
# invalid escape sequence '\“', which also produced a backslash, so the set of
# deleted characters is unchanged.
# NOTE(review): the closing quote '”' is not in this set -- confirm whether
# that is intended.
PUNCTUATION_TO_DELETE = str.maketrans('', '', '.,;!?$:¡¿()"\\“')

# Apostrophes (typographic and straight) inside a lemma are replaced with a space.
APOSTROPHES_TO_SPACE = str.maketrans('’\'', '  ')


def build_correction_lookup(corrections):
    """Invert ``{lemma: [variants]}`` into ``{variant: lemma}`` for direct lookup.

    Variants are lowercased because lemmas are lowercased before they are
    looked up; otherwise the capitalised variants could never match. If a
    variant is listed under several lemmas, the first one in the dictionary wins.
    """
    lookup = {}
    for canonical, variants in corrections.items():
        for variant in variants:
            lookup.setdefault(variant.lower(), canonical)
    return lookup


def lemmatize_text(text, nlp, lookup):
    """Return ``text`` as a space-separated string of corrected, lowercased lemmas.

    text   -- raw document text
    nlp    -- callable returning an iterable of tokens with a ``lemma_`` attribute
    lookup -- mapping from lowercased variant to the lemma that replaces it
    """
    cleaned = text.translate(PUNCTUATION_TO_DELETE)
    lemmas = []
    for token in nlp(cleaned):
        lemma = token.lemma_.translate(APOSTROPHES_TO_SPACE).lower()
        # Replace known mis-lemmatized or historical forms; keep everything else.
        lemmas.append(lookup.get(lemma, lemma))
    return ' '.join(lemmas)


correction_lookup = build_correction_lookup(correction_dictionary)

# sorted() takes a snapshot of the file list before any file is rewritten.
for file in sorted(path_to_corpus.glob('*.txt')):
    original = file.read_text(encoding='utf-8')
    lemma_doc = lemmatize_text(original, nlp, correction_lookup)
    file.write_text(lemma_doc, encoding='utf-8')
