### Installing, Importing and Preprocessing

In [55]:
# Import spacy
import spacy

# Install English language model
!spacy download en_core_web_sm

# Import os to upload documents and metadata
import os

# Load spaCy visualizer
from spacy import displacy

# Import pandas DataFrame packages
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

# Import graphing package
import plotly.graph_objects as go
import plotly.express as px

'spacy' 不是内部或外部命令，也不是可运行的程序
或批处理文件。


In [56]:
# Create empty lists for file names and contents
texts = []
file_names = []

# Iterate through each file in the folder
for _file_name in os.listdir('C:/Users/idaid/Desktop/The_Beatles/Yellow_Submarine'):
# Look for only text files
    if _file_name.endswith('.txt'):
    # Append contents of each text file to text list
        print(_file_name)
        texts.append(open('C:/Users/idaid/Desktop/The_Beatles/Yellow_Submarine' + '/' + _file_name, 'r', encoding='utf-8').read())
        # Append name of each file to file name list
        file_names.append(_file_name)

All Together Now.txt
All You Need Is Love.txt
Baby You're a Rich Man.txt
Eleanor Rigby.txt
Hey Bulldog.txt
It's All Too Much.txt
Love You To.txt
Lucy in the Sky with Diamonds.txt
Nowhere Man.txt
Only A Northern Song.txt
Sgt. Pepper's Lonely Hearts Club Band.txt
Think for Yourself.txt
When I'm Sixty-Four.txt
With a Little Help From My Friends.txt
Yellow Submarine.txt


In [61]:
# Create dictionary object associating each file name with its text
d = {'Filename':file_names,'Text':texts}

In [62]:
# Turn dictionary into a dataframe
text_df = pd.DataFrame(d)

In [63]:
text_df.head()

Unnamed: 0,Filename,Text
0,All Together Now.txt,"One, two, three, four\nCan I have a little mor..."
1,All You Need Is Love.txt,"(Love, love, love)\n(Love, love, love)\n(Love,..."
2,Baby You're a Rich Man.txt,How does it feel to be\nOne of the beautiful p...
3,Eleanor Rigby.txt,"I'm Eleanor Rigby, I picked up the rice\nIn th..."
4,Hey Bulldog.txt,Sheepdog standing in the rain\nBullfrog doing ...


In [64]:
# Remove extra spaces from texts
text_df['Text'] = paper_df['Text'].str.replace('\s+', ' ', regex=True).str.strip()
text_df['Filename'] = paper_df['Filename'].apply(lambda x: x[:-4])
text_df.head()

Unnamed: 0,Filename,Text
0,All Together Now,"One, two, three, four Can I have a little more..."
1,All You Need Is Love,"(Love, love, love) (Love, love, love) (Love, l..."
2,Baby You're a Rich Man,How does it feel to be One of the beautiful pe...
3,Eleanor Rigby,"I'm Eleanor Rigby, I picked up the rice In the..."
4,Hey Bulldog,Sheepdog standing in the rain Bullfrog doing i...


In [65]:
# Load metadata.
metadata_df = pd.read_csv('C:/Users/idaid/Desktop/The_Beatles/metadata.csv')
metadata_df.head()

Unnamed: 0,Song ID,Song Title,Genre,Writer,Song Duration
0,1,Yellow Submarine,Psychedelic rock,Paul McCartney,2:39
1,2,Hey Bulldog,Hard rock; Psychedelic rock,John Lennon,3:12
2,3,Eleanor Rigby,Baroque pop,Paul McCartney,2:06
3,4,Love You To,Indian rock; Experimental rock; Raga rock,George Harrison,2:58
4,5,All Together Now,Music hall children's music skiffle,All Together Now,2:11


In [68]:
# Remove .txt from title of each song
paper_df['Filename'] = paper_df['Filename'].str.replace('.txt', '', regex=True)

# Rename column from song ID to Title
metadata_df.rename(columns={"Song Title": "Filename"}, inplace=True)

In [69]:
# Merge metadata and text into new DataFrame
# Will only keep rows where both essay and metadata are present
final_song_df = metadata_df.merge(text_df,on='Filename')

In [70]:
# Print DataFrame
final_song_df.head()

Unnamed: 0,Song ID,Filename,Genre,Writer,Song Duration,Text
0,1,Yellow Submarine,Psychedelic rock,Paul McCartney,2:39,In the town where I was born Lived a man who s...
1,2,Hey Bulldog,Hard rock; Psychedelic rock,John Lennon,3:12,Sheepdog standing in the rain Bullfrog doing i...
2,3,Eleanor Rigby,Baroque pop,Paul McCartney,2:06,"I'm Eleanor Rigby, I picked up the rice In the..."
3,4,Love You To,Indian rock; Experimental rock; Raga rock,George Harrison,2:58,"Each day just goes so fast I turn around, it's..."
4,5,All Together Now,Music hall children's music skiffle,All Together Now,2:11,"One, two, three, four Can I have a little more..."


The resulting DataFrame is now ready for analysis.

## Text Enrichment with spaCy

In [75]:
# Load nlp pipeline
nlp = spacy.load('en_core_web_sm')

# Check what functions it performs
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [76]:
#Define example sentence
sentence = "This is 'an' example? sentence"

# Call the nlp model on the sentence
doc = nlp(sentence)

In [77]:
# Loop through each token in doc object
for token in doc:
    # Print text and part of speech for each
    print(token.text, token.pos_)

This PRON
is AUX
' PUNCT
an DET
' PUNCT
example NOUN
? PUNCT
sentence NOUN


In [78]:
# Define a function that runs the nlp pipeline on any given input text
def process_text(text):
    return nlp(text)

In [79]:
# Apply the function to the "Text" column, so that the nlp pipeline is called on each song
final_song_df['Doc'] = final_song_df['Text'].apply(process_text)
final_song_df.head()

Unnamed: 0,Song ID,Filename,Genre,Writer,Song Duration,Text,Doc
0,1,Yellow Submarine,Psychedelic rock,Paul McCartney,2:39,In the town where I was born Lived a man who s...,"(In, the, town, where, I, was, born, Lived, a,..."
1,2,Hey Bulldog,Hard rock; Psychedelic rock,John Lennon,3:12,Sheepdog standing in the rain Bullfrog doing i...,"(Sheepdog, standing, in, the, rain, Bullfrog, ..."
2,3,Eleanor Rigby,Baroque pop,Paul McCartney,2:06,"I'm Eleanor Rigby, I picked up the rice In the...","(I, 'm, Eleanor, Rigby, ,, I, picked, up, the,..."
3,4,Love You To,Indian rock; Experimental rock; Raga rock,George Harrison,2:58,"Each day just goes so fast I turn around, it's...","(Each, day, just, goes, so, fast, I, turn, aro..."
4,5,All Together Now,Music hall children's music skiffle,All Together Now,2:11,"One, two, three, four Can I have a little more...","(One, ,, two, ,, three, ,, four, Can, I, have,..."


### Text Reduction

#### Tokenization

In [80]:
# Define a function to retrieve tokens from a doc object
def get_token(doc):
    return [(token.text) for token in doc]

In [81]:
# Run the token retrieval function on the doc objects in the dataframe
final_song_df['Tokens'] = final_song_df['Doc'].apply(get_token)
final_song_df.head()

Unnamed: 0,Song ID,Filename,Genre,Writer,Song Duration,Text,Doc,Tokens
0,1,Yellow Submarine,Psychedelic rock,Paul McCartney,2:39,In the town where I was born Lived a man who s...,"(In, the, town, where, I, was, born, Lived, a,...","[In, the, town, where, I, was, born, Lived, a,..."
1,2,Hey Bulldog,Hard rock; Psychedelic rock,John Lennon,3:12,Sheepdog standing in the rain Bullfrog doing i...,"(Sheepdog, standing, in, the, rain, Bullfrog, ...","[Sheepdog, standing, in, the, rain, Bullfrog, ..."
2,3,Eleanor Rigby,Baroque pop,Paul McCartney,2:06,"I'm Eleanor Rigby, I picked up the rice In the...","(I, 'm, Eleanor, Rigby, ,, I, picked, up, the,...","[I, 'm, Eleanor, Rigby, ,, I, picked, up, the,..."
3,4,Love You To,Indian rock; Experimental rock; Raga rock,George Harrison,2:58,"Each day just goes so fast I turn around, it's...","(Each, day, just, goes, so, fast, I, turn, aro...","[Each, day, just, goes, so, fast, I, turn, aro..."
4,5,All Together Now,Music hall children's music skiffle,All Together Now,2:11,"One, two, three, four Can I have a little more...","(One, ,, two, ,, three, ,, four, Can, I, have,...","[One, ,, two, ,, three, ,, four, Can, I, have,..."


In [83]:
tokens = final_song_df[['Text', 'Tokens']].copy()
tokens.head()

Unnamed: 0,Text,Tokens
0,In the town where I was born Lived a man who s...,"[In, the, town, where, I, was, born, Lived, a,..."
1,Sheepdog standing in the rain Bullfrog doing i...,"[Sheepdog, standing, in, the, rain, Bullfrog, ..."
2,"I'm Eleanor Rigby, I picked up the rice In the...","[I, 'm, Eleanor, Rigby, ,, I, picked, up, the,..."
3,"Each day just goes so fast I turn around, it's...","[Each, day, just, goes, so, fast, I, turn, aro..."
4,"One, two, three, four Can I have a little more...","[One, ,, two, ,, three, ,, four, Can, I, have,..."


#### Lemmatization

In [85]:
# Define a function to retrieve lemmas from a doc object
def get_lemma(doc):
    return [(token.lemma_) for token in doc]

# Run the lemma retrieval function on the doc objects in the dataframe
final_song_df['Lemmas'] = final_song_df['Doc'].apply(get_lemma)
final_song_df

Unnamed: 0,Song ID,Filename,Genre,Writer,Song Duration,Text,Doc,Tokens,Lemmas
0,1,Yellow Submarine,Psychedelic rock,Paul McCartney,2:39,In the town where I was born Lived a man who s...,"(In, the, town, where, I, was, born, Lived, a,...","[In, the, town, where, I, was, born, Lived, a,...","[in, the, town, where, I, be, bear, live, a, m..."
1,2,Hey Bulldog,Hard rock; Psychedelic rock,John Lennon,3:12,Sheepdog standing in the rain Bullfrog doing i...,"(Sheepdog, standing, in, the, rain, Bullfrog, ...","[Sheepdog, standing, in, the, rain, Bullfrog, ...","[sheepdog, stand, in, the, rain, Bullfrog, do,..."
2,3,Eleanor Rigby,Baroque pop,Paul McCartney,2:06,"I'm Eleanor Rigby, I picked up the rice In the...","(I, 'm, Eleanor, Rigby, ,, I, picked, up, the,...","[I, 'm, Eleanor, Rigby, ,, I, picked, up, the,...","[I, be, Eleanor, Rigby, ,, I, pick, up, the, r..."
3,4,Love You To,Indian rock; Experimental rock; Raga rock,George Harrison,2:58,"Each day just goes so fast I turn around, it's...","(Each, day, just, goes, so, fast, I, turn, aro...","[Each, day, just, goes, so, fast, I, turn, aro...","[each, day, just, go, so, fast, I, turn, aroun..."
4,5,All Together Now,Music hall children's music skiffle,All Together Now,2:11,"One, two, three, four Can I have a little more...","(One, ,, two, ,, three, ,, four, Can, I, have,...","[One, ,, two, ,, three, ,, four, Can, I, have,...","[one, ,, two, ,, three, ,, four, can, I, have,..."
5,6,Lucy in the Sky with Diamonds,Psychedelic pop; Acid rock; Psychedelic rock,John Lennon,3:28,Picture yourself in a boat on a river With tan...,"(Picture, yourself, in, a, boat, on, a, river,...","[Picture, yourself, in, a, boat, on, a, river,...","[picture, yourself, in, a, boat, on, a, river,..."
6,7,Think for Yourself,Rock,George Harrison,2:19,I've got a word or two To say about the things...,"(I, 've, got, a, word, or, two, To, say, about...","[I, 've, got, a, word, or, two, To, say, about...","[I, have, get, a, word, or, two, to, say, abou..."
7,8,Sgt. Pepper's Lonely Hearts Club Band,Psychedelische rock,Paul McCartney,2:03,It was twenty years ago today Sgt. Pepper taug...,"(It, was, twenty, years, ago, today, Sgt, ., P...","[It, was, twenty, years, ago, today, Sgt, ., P...","[it, be, twenty, year, ago, today, Sgt, ., Pep..."
8,9,With a Little Help From My Friends,Pop rock; Psychedelic pop,John Lennon,2:44,What would you think if I sang out of tune Wou...,"(What, would, you, think, if, I, sang, out, of...","[What, would, you, think, if, I, sang, out, of...","[what, would, you, think, if, I, sing, out, of..."
9,13,When I'm Sixty-Four,Pop,Paul McCartney,2:37,"When I get older losing my hair, many years fr...","(When, I, get, older, losing, my, hair, ,, man...","[When, I, get, older, losing, my, hair, ,, man...","[when, I, get, old, lose, my, hair, ,, many, y..."


### Text Annotation

In [86]:
# Define a function to retrieve lemmas from a doc object
def get_pos(doc):
    #Return the coarse- and fine-grained part of speech text for each token in the doc
    return [(token.pos_, token.tag_) for token in doc]

# Define a function to retrieve parts of speech from a doc object
final_song_df['POS'] = final_song_df['Doc'].apply(get_pos)

In [87]:
# Create a list of part of speech tags
list(final_song_df['POS'])

[[('ADP', 'IN'),
  ('DET', 'DT'),
  ('NOUN', 'NN'),
  ('SCONJ', 'WRB'),
  ('PRON', 'PRP'),
  ('AUX', 'VBD'),
  ('VERB', 'VBN'),
  ('VERB', 'VBD'),
  ('DET', 'DT'),
  ('NOUN', 'NN'),
  ('PRON', 'WP'),
  ('VERB', 'VBD'),
  ('ADP', 'IN'),
  ('NOUN', 'NN'),
  ('CCONJ', 'CC'),
  ('PRON', 'PRP'),
  ('VERB', 'VBD'),
  ('PRON', 'PRP'),
  ('ADP', 'IN'),
  ('PRON', 'PRP$'),
  ('NOUN', 'NN'),
  ('ADP', 'IN'),
  ('DET', 'DT'),
  ('NOUN', 'NN'),
  ('ADP', 'IN'),
  ('NOUN', 'NNS'),
  ('ADV', 'RB'),
  ('PRON', 'PRP'),
  ('VERB', 'VBD'),
  ('ADP', 'RP'),
  ('ADP', 'IN'),
  ('DET', 'DT'),
  ('NOUN', 'NN'),
  ('PUNCT', '``'),
  ('PROPN', 'NNP'),
  ('PRON', 'PRP'),
  ('VERB', 'VBD'),
  ('DET', 'DT'),
  ('NOUN', 'NN'),
  ('ADP', 'IN'),
  ('NOUN', 'NN'),
  ('CCONJ', 'CC'),
  ('PRON', 'PRP'),
  ('VERB', 'VBD'),
  ('ADP', 'IN'),
  ('DET', 'DT'),
  ('NOUN', 'NNS'),
  ('ADP', 'IN'),
  ('PRON', 'PRP$'),
  ('ADJ', 'JJ'),
  ('NOUN', 'NN'),
  ('PRON', 'PRP'),
  ('PRON', 'DT'),
  ('VERB', 'VBP'),
  ('ADP', 'IN'),
 

In [88]:
spacy.explain("IN")

'conjunction, subordinating or preposition'

In [90]:
# Define function to extract proper nouns from Doc object
def extract_proper_nouns(doc):
    return [token.text for token in doc if token.pos_ == 'PROPN']

# Apply function to Doc column and store resulting proper nouns in new column
final_song_df['Proper_Nouns'] = final_song_df['Doc'].apply(extract_proper_nouns)
final_song_df

Unnamed: 0,Song ID,Filename,Genre,Writer,Song Duration,Text,Doc,Tokens,Lemmas,POS,Proper_Nouns
0,1,Yellow Submarine,Psychedelic rock,Paul McCartney,2:39,In the town where I was born Lived a man who s...,"(In, the, town, where, I, was, born, Lived, a,...","[In, the, town, where, I, was, born, Lived, a,...","[in, the, town, where, I, be, bear, live, a, m...","[(ADP, IN), (DET, DT), (NOUN, NN), (SCONJ, WRB...","[Til, Yellow, Yellow, Yellow, Yellow, Mr., Boa..."
1,2,Hey Bulldog,Hard rock; Psychedelic rock,John Lennon,3:12,Sheepdog standing in the rain Bullfrog doing i...,"(Sheepdog, standing, in, the, rain, Bullfrog, ...","[Sheepdog, standing, in, the, rain, Bullfrog, ...","[sheepdog, stand, in, the, rain, Bullfrog, do,...","[(NOUN, NN), (VERB, VBG), (ADP, IN), (DET, DT)...","[Bullfrog, Jackknife, Wigwam, Bulldog, Bulldog..."
2,3,Eleanor Rigby,Baroque pop,Paul McCartney,2:06,"I'm Eleanor Rigby, I picked up the rice In the...","(I, 'm, Eleanor, Rigby, ,, I, picked, up, the,...","[I, 'm, Eleanor, Rigby, ,, I, picked, up, the,...","[I, be, Eleanor, Rigby, ,, I, pick, up, the, r...","[(PRON, PRP), (AUX, VBP), (PROPN, NNP), (PROPN...","[Eleanor, Rigby, Eleanor, Rigby, Father, McKen..."
3,4,Love You To,Indian rock; Experimental rock; Raga rock,George Harrison,2:58,"Each day just goes so fast I turn around, it's...","(Each, day, just, goes, so, fast, I, turn, aro...","[Each, day, just, goes, so, fast, I, turn, aro...","[each, day, just, go, so, fast, I, turn, aroun...","[(DET, DT), (NOUN, NN), (ADV, RB), (VERB, VBZ)...",[]
4,5,All Together Now,Music hall children's music skiffle,All Together Now,2:11,"One, two, three, four Can I have a little more...","(One, ,, two, ,, three, ,, four, Can, I, have,...","[One, ,, two, ,, three, ,, four, Can, I, have,...","[one, ,, two, ,, three, ,, four, can, I, have,...","[(NUM, CD), (PUNCT, ,), (NUM, CD), (PUNCT, ,),...","[B, C, D, E, F, G, H, J, bom, Bom, Chop, Bom, ..."
5,6,Lucy in the Sky with Diamonds,Psychedelic pop; Acid rock; Psychedelic rock,John Lennon,3:28,Picture yourself in a boat on a river With tan...,"(Picture, yourself, in, a, boat, on, a, river,...","[Picture, yourself, in, a, boat, on, a, river,...","[picture, yourself, in, a, boat, on, a, river,...","[(VERB, VB), (PRON, PRP), (ADP, IN), (DET, DT)...","[Somebody, Lucy, Lucy, Lucy, Newspaper, Climb,..."
6,7,Think for Yourself,Rock,George Harrison,2:19,I've got a word or two To say about the things...,"(I, 've, got, a, word, or, two, To, say, about...","[I, 've, got, a, word, or, two, To, say, about...","[I, have, get, a, word, or, two, to, say, abou...","[(PRON, PRP), (AUX, VBP), (VERB, VBN), (DET, D...",[]
7,8,Sgt. Pepper's Lonely Hearts Club Band,Psychedelische rock,Paul McCartney,2:03,It was twenty years ago today Sgt. Pepper taug...,"(It, was, twenty, years, ago, today, Sgt, ., P...","[It, was, twenty, years, ago, today, Sgt, ., P...","[it, be, twenty, year, ago, today, Sgt, ., Pep...","[(PRON, PRP), (AUX, VBD), (NUM, CD), (NOUN, NN...","[Sgt, Pepper, Sgt, Pepper, Lonely, Hearts, Clu..."
8,9,With a Little Help From My Friends,Pop rock; Psychedelic pop,John Lennon,2:44,What would you think if I sang out of tune Wou...,"(What, would, you, think, if, I, sang, out, of...","[What, would, you, think, if, I, sang, out, of...","[what, would, you, think, if, I, sing, out, of...","[(PRON, WP), (AUX, MD), (PRON, PRP), (VERB, VB...","[Mmm, Mmm, Mmm, Mmm, Mmm, Mmm]"
9,13,When I'm Sixty-Four,Pop,Paul McCartney,2:37,"When I get older losing my hair, many years fr...","(When, I, get, older, losing, my, hair, ,, man...","[When, I, get, older, losing, my, hair, ,, man...","[when, I, get, old, lose, my, hair, ,, many, y...","[(SCONJ, WRB), (PRON, PRP), (VERB, VBP), (ADJ,...","[Birthday, sunday, Isle, Wight, Vera, Chuck, D..."


#### Named Entity Recognition

In [91]:
# Get all NE labels and assign to variable
labels = nlp.get_pipe("ner").labels

# Print each label and its description
for label in labels:
    print(label + ' : ' + spacy.explain(label))

CARDINAL : Numerals that do not fall under another type
DATE : Absolute or relative dates or periods
EVENT : Named hurricanes, battles, wars, sports events, etc.
FAC : Buildings, airports, highways, bridges, etc.
GPE : Countries, cities, states
LANGUAGE : Any named language
LAW : Named documents made into laws.
LOC : Non-GPE locations, mountain ranges, bodies of water
MONEY : Monetary values, including unit
NORP : Nationalities or religious or political groups
ORDINAL : "first", "second", etc.
ORG : Companies, agencies, institutions, etc.
PERCENT : Percentage, including "%"
PERSON : People, including fictional
PRODUCT : Objects, vehicles, foods, etc. (not services)
QUANTITY : Measurements, as of weight or distance
TIME : Times smaller than a day
WORK_OF_ART : Titles of books, songs, etc.


In [92]:
# Define function to extract named entities from doc objects
def extract_named_entities(doc):
    return [ent.label_ for ent in doc.ents]

# Apply function to Doc column and store resulting named entities in new column
final_song_df['Named_Entities'] = final_song_df['Doc'].apply(extract_named_entities)
final_song_df['Named_Entities']

0     [PERSON, PERSON, PERSON, PERSON, PERSON, PERSO...
1                                 [PERSON, PERSON, ORG]
2     [PERSON, PERSON, PERSON, PERSON, PERSON, ORG, ...
3                             [WORK_OF_ART, DATE, DATE]
4     [CARDINAL, CARDINAL, CARDINAL, CARDINAL, CARDI...
5     [PERSON, PERSON, PERSON, PERSON, PERSON, PERSO...
6                                            [CARDINAL]
7     [DATE, DATE, ORG, TIME, PERSON, ORG, PERSON, O...
8     [PERSON, PERSON, DATE, PERSON, PERSON, ORDINAL...
9     [DATE, DATE, CARDINAL, CARDINAL, DATE, TIME, C...
10    [WORK_OF_ART, ORG, PERSON, PERSON, PERSON, PER...
11                           [PERSON, GPE, ORG, PERSON]
Name: Named_Entities, dtype: object

In [93]:
# Extract the first Doc object
doc = final_song_df['Doc'][1]

# Visualize named entity tagging in a single song
displacy.render(doc, style='ent', jupyter=True)

### Download Enriched Dataset

In [94]:
# Save DataFrame as csv (in Google Drive)
# Use this step only to save  csv to your computer's working directory
final_song_df.to_csv('Yellow_Submarine_with_spaCy_tags.csv')