In [2]:
import spacy
import pandas as pd
import nltk
from nltk.stem.porter import *

In [3]:
pd.set_option("display.max_rows", None, "display.max_columns", None)

#  Lemmatizing Keywords

In [64]:
# Read in the CSV to lemmatize the keywords
df_keywords = pd.read_csv("words2stem_lemmatize.csv", sep="\t") # sep="\t" weil in Zeile 147 Kommas in der Zelle stehen

In [65]:
# Remove colons
df_keywords = df_keywords.replace('\:','', regex=True)
# Rename column
df_keywords = df_keywords.rename(columns={'DOI': 'keywords'})

In [66]:
nlp = spacy.load("en_core_web_trf")
# x ist der text der Spalte keywords, nlp(x) erstellt daraus ein Dokument und y repräsentoert jeden Token, welcher mit y.lemma_ lemmatisiert wird 
# " ".join(...) fügt alle gefundenen Lemmata zu einer einzigen durch Leerzeichen getrennten Zeichenfolge zusammen.
df_keywords["lemmatized"] = df_keywords['keywords'].apply(lambda x: " ".join([y.lemma_ for y in nlp(x)]))
# https://stackoverflow.com/questions/62712963/using-spacy-to-lemmatize-a-column-of-parsed-html-text-in-a-pandas-dataframe



In [67]:
# Lowercase letters that were not lowercased during lemmatization
df_keywords["lemmatized"] = df_keywords["lemmatized"].str.lower() 

# Stemming Keywords

In [68]:
stemmer = PorterStemmer()
keywords_stemmed = []

for words in df_keywords['keywords']:
    
    # if keyword consists of more then one words, split them, stem them, an put them back together to one string
    if " " in words:
        words_split = words.split() 
        words_stemmed = [stemmer.stem(x) for x in words_split]
        words_stemmed = ' '.join(map(str, words_stemmed))
        keywords_stemmed.append(words_stemmed)
        
    # if keyword consists of one word just stem it
    else:
        keywords_stemmed.append(stemmer.stem(words))
        
df_keywords["stemmed"] = keywords_stemmed

In [69]:
# Saving the lemmatized terms to CSV
df_keywords.to_csv('keywords_lemmatized_stemmed.csv', index=False)

# SpaCy Rule-based entity recognition

In [17]:
import spacy
from spacy.lang.en import English
from spacy.pipeline import EntityRuler

## Expand species names list

In [71]:
# creates lists for NER model
df_keywords = pd.read_csv("keywords_lemmatized_stemmed.csv")

# remove line 0 to 10 because they are unnecessary
df_keywords = df_keywords.drop(range(0,11))

list_keywords = df_keywords["keywords"].tolist()

df_species = pd.read_csv("species_names_list.csv")
list_species = df_species["species"].tolist()

In [82]:
# lowercase all species names
species_low = [name.lower() for name in list_species]

In [83]:
# separates all double names into single words, lower case them and adds them to list
species_split = []
for name in list_species:
    if " " in name:
        name_split = name.split()
        for x in name_split:
            x = x.strip()
            x = x.lower() 
            species_split.append(x)

# removes duplicates
species_split = list(dict.fromkeys(species_split))

In [84]:
# capitalizes all names
species_cap = [name.capitalize() for name in species_expanded]

In [85]:
# merges all species lists an removes duplicates
species_final = list_species + species_low + species_split + species_cap
species_final = list(dict.fromkeys(species_final))

## Species and Keywords NER Model

In [23]:
# creates the pattern for ner keywords rules
def create_keywords_ner_pattern():
    species = species_final
    keywords = list_keywords
    patterns = []
    for item in species:
        pattern = {
            "label": "SPECIES",
            "pattern": item
        }
        patterns.append(pattern)
    for item in keywords:
        pattern = {
            "label": "KEYWORD",
            "pattern": item
        }
        patterns.append(pattern)
        
    return (patterns)

# create rules for ner keywords model and creates the model
def generate_rules(patterns):
    nlp = English()
    source_nlp = spacy.load("en_core_web_trf")
    nlp.add_pipe("ner", source=source_nlp)
    ruler = EntityRuler(nlp)
    ruler.add_patterns(patterns)
    nlp.to_disk("keywords_ner")

# run the function    
patterns = create_keywords_ner_pattern()
#generate_rules(patterns)


In [25]:
# Konvertiere Spalte zu String, um als nlp-objekt erkannt zu werden
df = pd.read_csv("fulltext_articles.csv")
list_ft = df.loc[:5, "fulltext"].tolist()
string_ft = ' '.join(map(str, list_ft))

In [26]:
# Versuch 2 ERFOLGREICH!
nlp = English()
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns)
doc = nlp(string_ft)
print([(ent.text, ent.label_) for ent in doc.ents])

[('genus', 'KEYWORD'), ('genus', 'KEYWORD'), ('genus', 'KEYWORD'), ('genus', 'KEYWORD'), ('species', 'KEYWORD'), ('genus', 'KEYWORD'), ('genus', 'KEYWORD'), ('genus', 'KEYWORD'), ('species', 'KEYWORD'), ('genus', 'KEYWORD'), ('Aliidiomarina taiwanensis', 'SPECIES'), ('Aliidiomarina haloalkalitolerans', 'SPECIES'), ('Aliidiomarina shirensis', 'SPECIES'), ('Aliidiomarina minuta', 'SPECIES'), ('Aliidiomarina iranensis', 'SPECIES'), ('Aliidiomarina soli', 'SPECIES'), ('Aliidiomarina maris', 'SPECIES'), ('Aliidiomarina celeris', 'SPECIES'), ('Aliidiomarina sedimenti', 'SPECIES'), ('species', 'KEYWORD'), ('genus', 'KEYWORD'), ('species', 'KEYWORD'), ('genus', 'KEYWORD'), ('pH', 'KEYWORD'), ('pH', 'KEYWORD'), ('Escherichia coli', 'SPECIES'), ('species', 'KEYWORD'), ('type', 'KEYWORD'), ('species', 'KEYWORD'), ('genus', 'KEYWORD'), ('species', 'KEYWORD'), ('genus', 'KEYWORD'), ('genus', 'KEYWORD'), ('species', 'KEYWORD'), ('genus', 'KEYWORD'), ('species', 'KEYWORD'), ('genus', 'KEYWORD'), ('sp

In [None]:
# # convert list to string for lemmatizing
#abstract_string = ' '.join(map(str, abstract_list))

In [27]:
for ent in doc.ents[:50]:
    print(ent.text,ent.label_)

genus KEYWORD
genus KEYWORD
genus KEYWORD
genus KEYWORD
species KEYWORD
genus KEYWORD
genus KEYWORD
genus KEYWORD
species KEYWORD
genus KEYWORD
Aliidiomarina taiwanensis SPECIES
Aliidiomarina haloalkalitolerans SPECIES
Aliidiomarina shirensis SPECIES
Aliidiomarina minuta SPECIES
Aliidiomarina iranensis SPECIES
Aliidiomarina soli SPECIES
Aliidiomarina maris SPECIES
Aliidiomarina celeris SPECIES
Aliidiomarina sedimenti SPECIES
species KEYWORD
genus KEYWORD
species KEYWORD
genus KEYWORD
pH KEYWORD
pH KEYWORD
Escherichia coli SPECIES
species KEYWORD
type KEYWORD
species KEYWORD
genus KEYWORD
species KEYWORD
genus KEYWORD
genus KEYWORD
species KEYWORD
genus KEYWORD
species KEYWORD
genus KEYWORD
species KEYWORD
genus KEYWORD
type KEYWORD
species KEYWORD
genus KEYWORD
species KEYWORD
species KEYWORD
species KEYWORD
species KEYWORD
species KEYWORD
species KEYWORD
genus KEYWORD
type KEYWORD


In [28]:
spacy.displacy.render(doc, style='ent',jupyter=True)

# Analysing the articles

In [29]:
pd.options.display.max_rows = 4000
#pd.options.display.max_seq_items = 2000

In [30]:
df = pd.read_csv("fulltext_articles.csv")

In [31]:
#converts df column to list
articles = df["fulltext"].head(5).tolist()

In [32]:
# convertrs list to string
articles = ' '.join([str(elem) for elem in articles])

In [33]:
# erase linebreaks for better nlp results
articles = articles.replace("\n\n", " ").replace("\n", " ").strip()

In [None]:
# Approach No. 2: maybe faster and easier?
df = pd.read_csv("fulltext_articles.csv")
list_ft = df.loc[:5, "fulltext"].tolist()
string_ft = ' '.join(map(str, list_ft))

In [34]:
nlp = spacy.load("en_core_web_trf")

In [35]:
doc = nlp(articles)



## Lemmatizing of doc object

In [36]:
lemmatized_articles = " ".join([token.lemma_ for token in doc])
 
print(lemmatized_articles)
# https://www.geeksforgeeks.org/python-lemmatization-approaches-with-examples/

ANI , average nucleotide identity ; ANIb , average nucleotide identity calculate use blast ; ANIm , average nucleotide identity calculate use MUMmer ; dddh , digital dna – dna hybridization ; dddh , digital dna – dna hybridization . ,the genus Aliidiomarina belong to the family Idiomarinaceae , order Alteromonadales , class Gammaproteobacteria [ 1 ] . this genus have two sister genus name Idiomarina and Pseudidiomarina in the List of prokaryotic name with Standing in Nomenclature . despite the reclassification of the genus Pseudidiomarina into the genus Idiomarina , the specie of both genus could not be distinguish from each other use phenotypic or chemotaxonomic characteristic [ 2 ] , and recently the genus Pseudidiomarina be reinstate follow the result of genome - base analysis [ 3 ] . the genus Aliidiomarina be close to the genus Pseudidiomarina in the phylogenetic tree base on the 16s rrna gene . however , it be distant from both sister genus in the phylogenomic tree base on the 78

## Lemmatizing of specific df column

In [37]:
df["abstract_lemma"] = df['abstract'].head(5).astype(str).apply(lambda x: " ".join([y.lemma_ for y in nlp(x)]))
print(df["abstract_lemma"])

0        a haloalkaliphilic strain ( IM 1326 T ) be iso...
1        through bacterial plant – endophyte extraction...
2        a gram - negative , moderately halophilic bact...
3        a gram - stain - negative , strictly aerobic ,...
4        a strictly anaerobic bacterial strain , design...
                               ...                        
17001                                                  NaN
17002                                                  NaN
17003                                                  NaN
17004                                                  NaN
17005                                                  NaN
Name: abstract_lemma, Length: 17006, dtype: object


## Sentences

In [38]:
#save sentences of text in list, so one can iterate over the sentences
sentences = list(doc.sents)
print(sentences[3])

This genus has two sister genera named Idiomarina and Pseudidiomarina in the List of Prokaryotic names with Standing in Nomenclature.


## Named Entity Recognotion (NER)

In [39]:
# save entities of a sentece in a list
sentence = sentences[42]
ents = list(doc.ents)
print(ents)

[two, 78, ten, 5, 6, 7, 4, 5, 6, Hutong Qagan Lake, the Ordos, Inner Mongolia Autonomous Region, China, July 2017, 39° 12′ 14″ N, 109° 0′ 8″ E, 1250 , 26%, 9.95, 37 , 2 , 15.0 , 0.5 , 6.0 , 4.0 , 0.2 , 2.0 , 0.25 , 0.38 , 0.25 , 0.25 , 1.0 , 0.08 , 56 , 9.5, 1 M NaOH, 15.0 , 37 , 5 , 5 , 37 , 37 , 20 , −80 , Takara, Witon Information Technology Co., Ltd., Beijing, China, 1000, 1503 , 95.8, 95.7, 95.4, 95.3, 95.0, 94.7, 93.6%, IM 1326, 1, less than 70%, 1, lower than, 70 , Biomarker Technologies, Beijing, PR China, Qubit, Invitrogen, Illumina, NovaSeq, 3.13.0, NCBI, the JSpecies Web Server, Meier-Kolthoff et al, 120, the Genome Taxonomy Database, IM 1326, 3.06 , 19, 12, 2.57 , IM 1326, 49.7 mol%, 46.3–52.1 , 48.7 mol%, 1, 68.1 to, 72.6 , 80.7 to, 84.9 , 76 to, 78 , 16.2 to, 22.7 , 2, 72.6 , 83.5 , 78 , 18.4 , 95 , 95 , 70 , 120, 2, 100%, 120, 78, the China General Microbiological Culture Collection Center, the Leibniz Institute German Collection of Microorganisms and Cell Cultures, 5.0 

In [268]:
""" GOAL: List of sentences, which contains species and keywords
if ent"SPECIES" and ent"KEYWORD in sentences:
    print sentence
"""

[1900, Tissier, Two, 17:00, 8th, Bergey, 8th, Bergey, four]


## POS (Part of Speech) tagging

In [46]:
# outputs every POS for every token in the sentence
for token in sentences[3]:
    print(token.text, token.pos_, token.dep_)

This DET det
genus NOUN nsubj
has VERB ROOT
two NUM nummod
sister NOUN compound
genera NOUN dobj
named VERB acl
Idiomarina PROPN oprd
and CCONJ cc
Pseudidiomarina PROPN conj
in ADP prep
the DET det
List PROPN pobj
of ADP prep
Prokaryotic ADJ amod
names NOUN pobj
with ADP prep
Standing PROPN pcomp
in ADP prep
Nomenclature PROPN pobj
. PUNCT punct


## Extracting noun chunks

In [41]:
# finds the noun "species" with additional word
chunks = list(doc.noun_chunks)
for chunk in chunks:
    if "species" in str(chunk):
                       print(chunk)

the species
ten species
These species
a novel species
the most closely related species
the type species
species
a novel species
other species
other species
the type species
its closely related species
the most closely related species
species delineation
other species
a novel species
a novel species
type species
reference species
the reference species
the reference species
the species description
two reference species
all three species
two reference species
the type species
the Aliidiomarina species
a novel species
other species
species
the type species
only one species
a novel species
the type species
species demarcation
the type species
two different species
a novel species
the species descriptions
a novel species
the type species


## Extracting verbs and verb phrases

In [42]:
import textacy

In [43]:
# define a pattern to search for in the document 
patterns = [{"POS": "ADV"}, {"POS": "VERB"}]

In [44]:
verb_phrases = textacy.extract.token_matches(doc, patterns=patterns)

In [45]:
for verb_phrase in verb_phrases:
    print(verb_phrase)

validly published
originally designed
routinely cultured
preliminarily analysed
also supported
closely related
then comparing
validly published
validly published
then isolated
routinely cultivated
closely related
closely related
further verify
otherwise specified
consistently shown
otherwise indicated
