# Lemmatizing strings using spaCy (and NLTK)!

Import the libraries...

In [118]:
import os
import time

import nltk
import pandas as pd
import spacy

from file import File


Load the text files...

In [119]:
wd = os.getcwd()

files = os.listdir("./texts")

file_paths = [os.path.join(wd, "texts", file) for file in files]

# Read every text completely into memory.
# - The `with` block closes each file handle as soon as its content is read,
#   instead of leaving that to the garbage collector.
# - The explicit encoding keeps the result independent of the platform's
#   default locale. Plain "utf-8" keeps a leading byte-order mark in the text;
#   "utf-8-sig" would strip it if that is ever wanted.
# NOTE: despite its name, `dataframes` is a list of raw strings (one per file),
# in the same order as `files`.
dataframes = []
for path in file_paths:
    with open(path, "r", encoding="utf-8") as text:
        dataframes.append(text.read())



Set up spaCy and tokenize the text

In [120]:
# Load the spaCy pipeline once; the tokenizing and lemmatizing cells reuse `nlp`.
spacy_model = "en_core_web_md"
nlp = spacy.load(spacy_model)

In [121]:
# Build one row per text, then time two equivalent ways of running the spaCy
# pipeline over every row. Both assign the same column, so the second run
# simply overwrites the first; only the timings are of interest.
df = pd.DataFrame({"content": dataframes, "file": files})

# Variant 1: list comprehension.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() follows the wall clock and can jump.
start = time.perf_counter()
df['tokenized_content'] = [nlp(text) for text in df.content]
end = time.perf_counter()
print(end-start)
print(df)


# or
# Variant 2: Series.apply.
start = time.perf_counter()
df['tokenized_content'] = df.content.apply(nlp)

end = time.perf_counter()
print(end-start)

63.38513708114624
                                             content  \
0  The Project Gutenberg EBook of The Picture of ...   
1  ﻿\nProject Gutenberg's The Hound of the Basker...   
2  ﻿\nThe Project Gutenberg EBook of Adventures o...   

                                         file  \
0      wilde_-_the_picture_of_dorian_gray.txt   
1   doyle_-_the_hound_of_the_baskervilles.txt   
2  twain_-_adventures_of_huckleberry_finn.txt   

                                   tokenized_content  
0  (The, Project, Gutenberg, EBook, of, The, Pict...  
1  (﻿, \n, Project, Gutenberg, 's, The, Hound, of...  
2  (﻿, \n, The, Project, Gutenberg, EBook, of, Ad...  
50.79260277748108


This also works using NLTK (the token lists look even cleaner, and it is much faster):

In [122]:
# Start again from a fresh frame so the NLTK tokens replace the spaCy ones
# instead of sitting next to them.
df = pd.DataFrame({"content": dataframes, "file": files})

# Fetch the Punkt tokenizer data before tokenizing; when the package is
# already installed this only reports that it is up to date.
nltk.download('punkt')


# perf_counter() is the clock meant for measuring elapsed time.
start = time.perf_counter()

df['tokenized_content'] = [nltk.word_tokenize(text) for text in df.content]
end = time.perf_counter()
print(end-start)


print(df)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/alexanderheinz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


2.3776068687438965
                                             content  \
0  The Project Gutenberg EBook of The Picture of ...   
1  ﻿\nProject Gutenberg's The Hound of the Basker...   
2  ﻿\nThe Project Gutenberg EBook of Adventures o...   

                                         file  \
0      wilde_-_the_picture_of_dorian_gray.txt   
1   doyle_-_the_hound_of_the_baskervilles.txt   
2  twain_-_adventures_of_huckleberry_finn.txt   

                                   tokenized_content  
0  [The, Project, Gutenberg, EBook, of, The, Pict...  
1  [﻿, Project, Gutenberg, 's, The, Hound, of, th...  
2  [﻿, The, Project, Gutenberg, EBook, of, Advent...  


Get the lemmas

In [123]:
def spacy_lemmas(text):
    """Return the spaCy lemmas of ``text`` as a list of strings.

    The lemmas are first joined with single spaces and then split again on
    whitespace, so whitespace-only lemmas (such as newlines) do not show up
    in the result.
    """
    joined = " ".join(token.lemma_ for token in nlp(text))
    return joined.split()


df["lemmatized_content"] = df["content"].apply(spacy_lemmas)

print(df)

                                             content  \
0  The Project Gutenberg EBook of The Picture of ...   
1  ﻿\nProject Gutenberg's The Hound of the Basker...   
2  ﻿\nThe Project Gutenberg EBook of Adventures o...   

                                         file  \
0      wilde_-_the_picture_of_dorian_gray.txt   
1   doyle_-_the_hound_of_the_baskervilles.txt   
2  twain_-_adventures_of_huckleberry_finn.txt   

                                   tokenized_content  \
0  [The, Project, Gutenberg, EBook, of, The, Pict...   
1  [﻿, Project, Gutenberg, 's, The, Hound, of, th...   
2  [﻿, The, Project, Gutenberg, EBook, of, Advent...   

                                  lemmatized_content  
0  [the, Project, Gutenberg, ebook, of, the, Pict...  
1  [﻿, Project, Gutenberg, 's, the, Hound, of, th...  
2  [﻿, the, Project, Gutenberg, ebook, of, Advent...  


Now we have the lemmas as a new column, split into lists just like the tokens.

Next we count the unique lemmas.

In [124]:
# keep unique lemmas
df["unique_lemmatized"] = [set(content) for content in df.lemmatized_content]
# count unique lemmas
df["unique_lemma_count"] = [len(elements) for elements in df.unique_lemmatized]

print(df)

                                             content  \
0  The Project Gutenberg EBook of The Picture of ...   
1  ﻿\nProject Gutenberg's The Hound of the Basker...   
2  ﻿\nThe Project Gutenberg EBook of Adventures o...   

                                         file  \
0      wilde_-_the_picture_of_dorian_gray.txt   
1   doyle_-_the_hound_of_the_baskervilles.txt   
2  twain_-_adventures_of_huckleberry_finn.txt   

                                   tokenized_content  \
0  [The, Project, Gutenberg, EBook, of, The, Pict...   
1  [﻿, Project, Gutenberg, 's, The, Hound, of, th...   
2  [﻿, The, Project, Gutenberg, EBook, of, Advent...   

                                  lemmatized_content  \
0  [the, Project, Gutenberg, ebook, of, the, Pict...   
1  [﻿, Project, Gutenberg, 's, the, Hound, of, th...   
2  [﻿, the, Project, Gutenberg, ebook, of, Advent...   

                                   unique_lemmatized  unique_lemma_count  
0  {asphodel, sharp, an, sickly, don't, doesn't, ... 

It looks like Oscar Wilde and Mark Twain were in close competition, but Wilde wins with 5864 unique lemmas!

## Using NLTK
We can do the same with the nltk package:

In [125]:
# WordNetLemmatizer looks words up in the WordNet corpus, which no earlier
# cell downloads; fetch it here so this cell also runs on a fresh machine.
# quiet=True keeps the download log out of the cell output.
nltk.download('wordnet', quiet=True)

lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize every token of an already tokenized text.

    Args:
        text: iterable of token strings (here: one row of
            ``tokenized_content``).

    Returns:
        A list with one lemma per input token, in the same order.

    Note:
        ``lemmatize`` is called without a part-of-speech argument; per the
        NLTK documentation it then treats every token as a noun, so inflected
        verbs and adjectives are mostly returned unchanged. This may explain
        part of the higher unique-lemma count compared with spaCy.
    """
    return [lemmatizer.lemmatize(w) for w in text]

df["nltk_lemmatized"] = df.tokenized_content.apply(lemmatize_text)

# keep unique lemmas
df["unique_lemmatized_nltk"] = [set(content) for content in df.nltk_lemmatized]
# count unique lemmas
df["unique_lemma_count_nltk"] = [len(elements) for elements in df.unique_lemmatized_nltk]

print(df)

                                             content  \
0  The Project Gutenberg EBook of The Picture of ...   
1  ﻿\nProject Gutenberg's The Hound of the Basker...   
2  ﻿\nThe Project Gutenberg EBook of Adventures o...   

                                         file  \
0      wilde_-_the_picture_of_dorian_gray.txt   
1   doyle_-_the_hound_of_the_baskervilles.txt   
2  twain_-_adventures_of_huckleberry_finn.txt   

                                   tokenized_content  \
0  [The, Project, Gutenberg, EBook, of, The, Pict...   
1  [﻿, Project, Gutenberg, 's, The, Hound, of, th...   
2  [﻿, The, Project, Gutenberg, EBook, of, Advent...   

                                  lemmatized_content  \
0  [the, Project, Gutenberg, ebook, of, the, Pict...   
1  [﻿, Project, Gutenberg, 's, the, Hound, of, th...   
2  [﻿, the, Project, Gutenberg, ebook, of, Advent...   

                                   unique_lemmatized  unique_lemma_count  \
0  {asphodel, sharp, an, sickly, don't, doesn't, ...

In this case, Mark Twain wins with 8173 unique lemmas!

Both packages are easy to use. NLTK seems to be a bit faster and counts more unique lemmas. However, 'you"--tosse" is, for example, a lemma in spaCy, whereas NLTK ignores it, which could be a good sign.