### Installing, Importing and Preprocessing

In [2]:
# Import spacy
import spacy

# Install English language model
!spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ------------------------- -------------- 8.1/12.8 MB 41.8 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 50.2 MB/s eta 0:00:00
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
# Import os to upload documents and metadata
import os

# Load spaCy visualizer
from spacy import displacy

# Import pandas for DataFrame handling
import pandas as pd
# Setting this option to None turns off pandas' chained-assignment warning for the whole session
pd.options.mode.chained_assignment = None  # default='warn'

# Import graphing package
import plotly.express as px

In [4]:
# Load the main dataset (the lecture texts) from nobel.csv in the working directory
nobel_df = pd.read_csv("nobel.csv")
# Preview the first rows
nobel_df.head()

Unnamed: 0,Filename,Document
0,The Nobel Peace Prize Lecture 2013_Organisatio...,"Your Majesties,\nYour Royal Highness,\nDisting..."
1,The Nobel Peace Prize Lecture 2014_Kailash Sat...,(My dear children of the world …)\n\nYour Maje...
2,The Nobel Peace Prize Lecture 2014_Malala Yous...,Bismillah hir rahman ir rahim.\nIn the name of...
3,The Nobel Peace Prize Lecture 2015_National Di...,"His Majesty,\nTheir Highnesses,\nThe respected..."
4,The Nobel Peace Prize Lecture 2016_Juan Manuel...,Your Majesties; Your Royal Highnesses; disting...


In [10]:
# Load the per-lecture metadata from metadata.csv in the working directory
metadata_df = pd.read_csv("metadata.csv")
# Preview the first rows
metadata_df.head()

Unnamed: 0,Filename,Prize Laureate,Year Awarded,Place of Birth/Place of Foundation,Date of Birth/Date of Foundation,Prize Share
0,The Nobel Peace Prize Lecture 2013_Organisatio...,Organisation for the Prohibition of Chemical W...,2013,,1997,1/1
1,The Nobel Peace Prize Lecture 2014_Kailash Sat...,Kailash Satyarthi,2014,India,1954,1/2
2,The Nobel Peace Prize Lecture 2014_Malala Yous...,Malala Yousafzai,2014,Pakistan,1997,1/2
3,The Nobel Peace Prize Lecture 2015_National Di...,National Dialogue Quartet,2015,Tunisia,2013,1/1
4,The Nobel Peace Prize Lecture 2016_Juan Manuel...,Juan Manuel Santos,2016,Colombia,1951,1/1


In [11]:
# Combine metadata and lecture texts into a single DataFrame.
# An inner join on 'Filename' keeps only rows that have a match in both tables.
final_nobel_df = pd.merge(metadata_df, nobel_df, how='inner', on='Filename')

In [12]:
# Display the first rows of the merged DataFrame
final_nobel_df.head()

Unnamed: 0,Filename,Prize Laureate,Year Awarded,Place of Birth/Place of Foundation,Date of Birth/Date of Foundation,Prize Share,Document
0,The Nobel Peace Prize Lecture 2013_Organisatio...,Organisation for the Prohibition of Chemical W...,2013,,1997,1/1,"Your Majesties,\nYour Royal Highness,\nDisting..."
1,The Nobel Peace Prize Lecture 2014_Kailash Sat...,Kailash Satyarthi,2014,India,1954,1/2,(My dear children of the world …)\n\nYour Maje...
2,The Nobel Peace Prize Lecture 2014_Malala Yous...,Malala Yousafzai,2014,Pakistan,1997,1/2,Bismillah hir rahman ir rahim.\nIn the name of...
3,The Nobel Peace Prize Lecture 2015_National Di...,National Dialogue Quartet,2015,Tunisia,2013,1/1,"His Majesty,\nTheir Highnesses,\nThe respected..."
4,The Nobel Peace Prize Lecture 2016_Juan Manuel...,Juan Manuel Santos,2016,Colombia,1951,1/1,Your Majesties; Your Royal Highnesses; disting...


### Text Enrichment with spaCy

In [13]:
# Load the en_core_web_sm pipeline; the later nlp(...) calls in this notebook use this object
nlp = spacy.load('en_core_web_sm')

In [14]:
# Define a short example sentence that mixes quotes and punctuation
sentence = "This is 'an' example? sentence"

# Call the nlp pipeline on the sentence; the result is kept in `doc`
doc = nlp(sentence)

In [15]:
# Wrapper that sends one text through the loaded nlp pipeline
def process_text(text):
    """Run the module-level ``nlp`` pipeline on ``text`` and return its result.

    Args:
        text: the input passed unchanged to ``nlp``.

    Returns:
        Whatever ``nlp(text)`` returns.
    """
    parsed = nlp(text)
    return parsed

In [16]:
# Call process_text on every entry of the "Document" column (one nlp call per lecture)
# and keep the results in a new "Doc" column
final_nobel_df['Doc'] = final_nobel_df['Document'].apply(process_text)

### Tokenization

In [17]:
# Helper that lists the text of each token in a doc object
def get_token(doc):
    """Collect the ``text`` attribute of every token in ``doc``, in iteration order.

    Args:
        doc: an iterable of token objects exposing ``.text``.

    Returns:
        list: one entry per token.
    """
    token_texts = []
    for tok in doc:
        token_texts.append(tok.text)
    return token_texts

In [18]:
# Run the token retrieval function on each entry of the "Doc" column
# and store the resulting lists in a new "Tokens" column
final_nobel_df['Tokens'] = final_nobel_df['Doc'].apply(get_token)
final_nobel_df.head()

Unnamed: 0,Filename,Prize Laureate,Year Awarded,Place of Birth/Place of Foundation,Date of Birth/Date of Foundation,Prize Share,Document,Doc,Tokens
0,The Nobel Peace Prize Lecture 2013_Organisatio...,Organisation for the Prohibition of Chemical W...,2013,,1997,1/1,"Your Majesties,\nYour Royal Highness,\nDisting...","(Your, Majesties, ,, \n, Your, Royal, Highness...","[Your, Majesties, ,, \n, Your, Royal, Highness..."
1,The Nobel Peace Prize Lecture 2014_Kailash Sat...,Kailash Satyarthi,2014,India,1954,1/2,(My dear children of the world …)\n\nYour Maje...,"((, My, dear, children, of, the, world, …, ), ...","[(, My, dear, children, of, the, world, …, ), ..."
2,The Nobel Peace Prize Lecture 2014_Malala Yous...,Malala Yousafzai,2014,Pakistan,1997,1/2,Bismillah hir rahman ir rahim.\nIn the name of...,"(Bismillah, hir, rahman, ir, rahim, ., \n, In,...","[Bismillah, hir, rahman, ir, rahim, ., \n, In,..."
3,The Nobel Peace Prize Lecture 2015_National Di...,National Dialogue Quartet,2015,Tunisia,2013,1/1,"His Majesty,\nTheir Highnesses,\nThe respected...","(His, Majesty, ,, \n, Their, Highnesses, ,, \n...","[His, Majesty, ,, \n, Their, Highnesses, ,, \n..."
4,The Nobel Peace Prize Lecture 2016_Juan Manuel...,Juan Manuel Santos,2016,Colombia,1951,1/1,Your Majesties; Your Royal Highnesses; disting...,"(Your, Majesties, ;, Your, Royal, Highnesses, ...","[Your, Majesties, ;, Your, Royal, Highnesses, ..."


In [19]:
# Copy just the original text and its token list into a separate DataFrame for side-by-side viewing
tokens = final_nobel_df[['Document', 'Tokens']].copy()
tokens.head()

Unnamed: 0,Document,Tokens
0,"Your Majesties,\nYour Royal Highness,\nDisting...","[Your, Majesties, ,, \n, Your, Royal, Highness..."
1,(My dear children of the world …)\n\nYour Maje...,"[(, My, dear, children, of, the, world, …, ), ..."
2,Bismillah hir rahman ir rahim.\nIn the name of...,"[Bismillah, hir, rahman, ir, rahim, ., \n, In,..."
3,"His Majesty,\nTheir Highnesses,\nThe respected...","[His, Majesty, ,, \n, Their, Highnesses, ,, \n..."
4,Your Majesties; Your Royal Highnesses; disting...,"[Your, Majesties, ;, Your, Royal, Highnesses, ..."


### Lemmatization

In [20]:
# Helper that lists the lemma of each token in a doc object
def get_lemma(doc):
    """Collect the ``lemma_`` attribute of every token in ``doc``, in iteration order.

    Args:
        doc: an iterable of token objects exposing ``.lemma_``.

    Returns:
        list: one entry per token.
    """
    lemmas = []
    for tok in doc:
        lemmas.append(tok.lemma_)
    return lemmas

# Run the lemma retrieval function on each entry of the "Doc" column
# and store the resulting lists in a new "Lemmas" column
final_nobel_df['Lemmas'] = final_nobel_df['Doc'].apply(get_lemma)

In [21]:
# Count exact (case-sensitive) matches of 'peace' in each row's list, then total them across rows
peace_in_tokens = final_nobel_df['Tokens'].apply(lambda x: x.count('peace')).sum()
peace_in_lemmas = final_nobel_df['Lemmas'].apply(lambda x: x.count('peace')).sum()

# Report both totals; the printed wording is unchanged
print(f'"Peace" appears in the text tokens column {peace_in_tokens} times.')
print(f'"Peace" appears in the lemmas column {peace_in_lemmas} times.')

"Peace" appears in the text tokens column 151 times.
"Peace" appears in the lemmas column 155 times.


### Part of Speech Tagging

In [22]:
# Define a function to retrieve part-of-speech tags from a doc object
def get_pos(doc):
    """Pair the coarse- and fine-grained part-of-speech tag of every token in ``doc``.

    Args:
        doc: an iterable of token objects exposing ``.pos_`` and ``.tag_``.

    Returns:
        list: one ``(pos_, tag_)`` tuple per token, in iteration order.
    """
    tag_pairs = []
    for tok in doc:
        tag_pairs.append((tok.pos_, tok.tag_))
    return tag_pairs

# Run the part-of-speech retrieval function on each entry of the "Doc" column
# and store the resulting lists in a new "POS" column
final_nobel_df['POS'] = final_nobel_df['Doc'].apply(get_pos)

In [23]:
# Helper that keeps only the tokens tagged as proper nouns
def extract_proper_nouns(doc):
    """Collect the text of every token in ``doc`` whose ``pos_`` equals ``'PROPN'``.

    Args:
        doc: an iterable of token objects exposing ``.text`` and ``.pos_``.

    Returns:
        list: the matching token texts, in iteration order.
    """
    proper_nouns = []
    for tok in doc:
        if tok.pos_ == 'PROPN':
            proper_nouns.append(tok.text)
    return proper_nouns

# Apply extract_proper_nouns to each entry of the "Doc" column
# and store the resulting lists in a new "Proper_Nouns" column
final_nobel_df['Proper_Nouns'] = final_nobel_df['Doc'].apply(extract_proper_nouns)

In [24]:
# Show the proper-noun lists for the rows labelled 5 and 10
final_nobel_df.loc[[5, 10], 'Proper_Nouns'].tolist()

[['Majesties',
  'Nobel',
  'Committee',
  'Esteemed',
  'Nobel',
  'Peace',
  'Prize',
  'International',
  'Campaign',
  'Nuclear',
  'Weapons',
  'Nobel',
  'Committee',
  'Red',
  'Cross',
  'Red',
  'Crescent',
  'UN',
  'Ours',
  'Nobel',
  'Laureate',
  'William',
  'Faulkner',
  'Armageddon',
  'blocs',
  'Cold',
  'War',
  'Cold',
  'War',
  'Fear',
  'Earth',
  'Faulkner',
  'Nobel',
  'ICAN',
  'International',
  'Physicians',
  'Prevention',
  'Nuclear',
  'War',
  'Man',
  'Cold',
  'War',
  'Iraq',
  'Iran',
  'Kashmir',
  'North',
  'Korea',
  'Nobel',
  'Peace',
  'Laureate',
  'Martin',
  'Luther',
  'King',
  'Jr',
  'ICAN',
  'Setsuko',
  'Thurlow',
  'hibakusha',
  'ICAN',
  'UN',
  'Treaty',
  'Prohibition',
  'Nuclear',
  'Weapons',
  'Treaty',
  'Prohibition',
  'Nuclear',
  'Weapons',
  'United',
  'States',
  'Russia',
  'Britain',
  'France',
  'China',
  'India',
  'Pakistan',
  'Armageddon',
  'Israel',
  'North',
  'Korea',
  'ruin',
  'Treaty',
  'Prohibit

### Named Entity Recognition

In [25]:
# Helper that lists the label of each named entity found in a doc object
def extract_named_entities(doc):
    """Collect the ``label_`` attribute of every entity in ``doc.ents``.

    Args:
        doc: an object exposing an iterable ``ents`` attribute whose items have ``.label_``.

    Returns:
        list: one label per entity, in iteration order.
    """
    labels = []
    for entity in doc.ents:
        labels.append(entity.label_)
    return labels

# Apply the function to each entry of the "Doc" column and store the resulting
# lists of entity labels in a new "Named_Entities" column
final_nobel_df['Named_Entities'] = final_nobel_df['Doc'].apply(extract_named_entities)
# Display the new column
final_nobel_df['Named_Entities']

0     [ORG, ORG, ORG, ORG, GPE, DATE, PERSON, ORG, O...
1     [ORG, ORG, PERSON, PERSON, PERSON, LOC, LOC, L...
2     [ORG, PERSON, ORG, DATE, DATE, ORG, ORG, PRODU...
3     [PERSON, ORG, WORK_OF_ART, ORG, ORG, ORG, ORG,...
4     [ORG, ORG, GPE, DATE, NORP, DATE, DATE, DATE, ...
5     [ORG, ORG, DATE, DATE, WORK_OF_ART, CARDINAL, ...
6     [DATE, GPE, GPE, ORG, CARDINAL, GPE, DATE, LOC...
7     [ORG, ORG, ORG, PERSON, LOC, ORG, GPE, DATE, D...
8     [ORG, ORG, PERSON, ORG, ORG, GPE, GPE, NORP, N...
9     [DATE, QUANTITY, GPE, DATE, DATE, GPE, GPE, GP...
10    [ORG, TIME, DATE, WORK_OF_ART, CARDINAL, WORK_...
11    [ORG, GPE, ORG, GPE, GPE, ORG, GPE, PERSON, GP...
12    [ORG, PERSON, ORG, ORG, DATE, WORK_OF_ART, PER...
13    [PERSON, ORG, GPE, DATE, NORP, WORK_OF_ART, OR...
14    [ORG, WORK_OF_ART, DATE, ORG, ORG, NORP, PERSO...
15    [PERSON, ORG, ORG, WORK_OF_ART, WORK_OF_ART, W...
Name: Named_Entities, dtype: object

In [26]:
# Define function to extract the named-entity spans (the tagged text) from doc objects.
# It has its own name so that it does not overwrite extract_named_entities,
# the label extractor defined earlier in this notebook.
def extract_named_entity_spans(doc):
    """Return the entities found in ``doc.ents`` as a list.

    Args:
        doc: an object exposing an iterable ``ents`` attribute (here, the
            objects stored in the "Doc" column).

    Returns:
        list: the entity objects themselves, in iteration order.
    """
    return list(doc.ents)

# Apply function to Doc column and store the resulting entity spans in new column
final_nobel_df['NE_Words'] = final_nobel_df['Doc'].apply(extract_named_entity_spans)
final_nobel_df['NE_Words']

0     [(Your, Royal, Highness), (the, Norwegian, Nob...
1     [(Excellencies), (the, Norwegian, Nobel, Commi...
2     [(Bismillah), (rahman, ir, rahim), (the, Norwe...
3     [(Majesty), (the, Nobel, Committee), (Ladies, ...
4     [(Royal, Highnesses), (the, Norwegian, Nobel, ...
5     [(the, Norwegian, Nobel, Committee), (Esteemed...
6     [(6, October, 1996), (Lemera), (the, Democrati...
7     [(Excellencies), (the, Committee), (The, Nobel...
8     [(the, Norwegian, Nobel, Committee), (Fellow, ...
9     [(April, 10th, ,, 1815), (six, thousand, miles...
10    [(the, Nobel, Committee), (the, morning), (Oct...
11    [(the, Norwegian, Nobel, Committee), (Jamal, K...
12    [(Royal, Majesties), (Royal, Highnesses), (Hon...
13    [(Royal, Highnesses), (the, Norwegian, Nobel, ...
14    [(the, Norwegian, Nobel, Committee), (the, Nob...
15    [(Royal, Highnesses), (Excellencies), (the, No...
Name: NE_Words, dtype: object

In [27]:
# Select the Doc stored at index label 1, i.e. the second row when rows are numbered from 0.
# NOTE(review): use label 0 instead if the first lecture was intended.
doc = final_nobel_df.loc[1, 'Doc']

# Visualize named entity tagging in that single lecture
displacy.render(doc, style='ent', jupyter=True)

In [29]:
# Save the enriched DataFrame to final_nobel_lectures.csv without writing the index
# NOTE(review): the "Doc" and "NE_Words" columns hold spaCy objects; CSV presumably stores
# only their string form, so confirm that is acceptable or drop those columns before export.
final_nobel_df.to_csv('final_nobel_lectures.csv', index=False)
# Preview the first rows with all added columns
final_nobel_df.head()

Unnamed: 0,Filename,Prize Laureate,Year Awarded,Place of Birth/Place of Foundation,Date of Birth/Date of Foundation,Prize Share,Document,Doc,Tokens,Lemmas,POS,Proper_Nouns,Named_Entities,NE_Words
0,The Nobel Peace Prize Lecture 2013_Organisatio...,Organisation for the Prohibition of Chemical W...,2013,,1997,1/1,"Your Majesties,\nYour Royal Highness,\nDisting...","(Your, Majesties, ,, \n, Your, Royal, Highness...","[Your, Majesties, ,, \n, Your, Royal, Highness...","[your, Majesties, ,, \n, your, Royal, Highness...","[(PRON, PRP$), (PROPN, NNPS), (PUNCT, ,), (SPA...","[Majesties, Royal, Highness, Nobel, Committee,...","[ORG, ORG, ORG, ORG, GPE, DATE, PERSON, ORG, O...","[(Your, Royal, Highness), (the, Norwegian, Nob..."
1,The Nobel Peace Prize Lecture 2014_Kailash Sat...,Kailash Satyarthi,2014,India,1954,1/2,(My dear children of the world …)\n\nYour Maje...,"((, My, dear, children, of, the, world, …, ), ...","[(, My, dear, children, of, the, world, …, ), ...","[(, my, dear, child, of, the, world, …, ), \n\...","[(PUNCT, -LRB-), (PRON, PRP$), (ADJ, JJ), (NOU...","[Majesties, Royal, Highnesses, Excellencies, N...","[ORG, ORG, PERSON, PERSON, PERSON, LOC, LOC, L...","[(Excellencies), (the, Norwegian, Nobel, Commi..."
2,The Nobel Peace Prize Lecture 2014_Malala Yous...,Malala Yousafzai,2014,Pakistan,1997,1/2,Bismillah hir rahman ir rahim.\nIn the name of...,"(Bismillah, hir, rahman, ir, rahim, ., \n, In,...","[Bismillah, hir, rahman, ir, rahim, ., \n, In,...","[Bismillah, hir, rahman, ir, rahim, ., \n, in,...","[(PROPN, NNP), (PROPN, NNP), (PROPN, NNP), (PR...","[Bismillah, hir, rahman, ir, rahim, God, Majes...","[ORG, PERSON, ORG, DATE, DATE, ORG, ORG, PRODU...","[(Bismillah), (rahman, ir, rahim), (the, Norwe..."
3,The Nobel Peace Prize Lecture 2015_National Di...,National Dialogue Quartet,2015,Tunisia,2013,1/1,"His Majesty,\nTheir Highnesses,\nThe respected...","(His, Majesty, ,, \n, Their, Highnesses, ,, \n...","[His, Majesty, ,, \n, Their, Highnesses, ,, \n...","[his, Majesty, ,, \n, their, Highnesses, ,, \n...","[(PRON, PRP$), (PROPN, NNP), (PUNCT, ,), (SPAC...","[Majesty, Highnesses, Nobel, Committee, Ladies...","[PERSON, ORG, WORK_OF_ART, ORG, ORG, ORG, ORG,...","[(Majesty), (the, Nobel, Committee), (Ladies, ..."
4,The Nobel Peace Prize Lecture 2016_Juan Manuel...,Juan Manuel Santos,2016,Colombia,1951,1/1,Your Majesties; Your Royal Highnesses; disting...,"(Your, Majesties, ;, Your, Royal, Highnesses, ...","[Your, Majesties, ;, Your, Royal, Highnesses, ...","[your, Majesties, ;, your, Royal, Highnesses, ...","[(PRON, PRP$), (PROPN, NNPS), (PUNCT, :), (PRO...","[Majesties, Royal, Highnesses, Nobel, Committe...","[ORG, ORG, GPE, DATE, NORP, DATE, DATE, DATE, ...","[(Royal, Highnesses), (the, Norwegian, Nobel, ..."
