# Preprocessing and Importing Corpus
## Using the Annual Review of Anthropology 

In [1]:
# Install and import spacy and plotly.
%pip install spaCy
%pip install plotly
%pip install nbformat ==5.1.2


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
zsh:1: =5.1.2 not found
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import spaCy, the NLP library used for every text-processing step below
import spacy

# Download the small English pipeline (shell command; runs each time this cell is executed)
!spacy download en_core_web_sm

# Import os to list the files in the corpus folder
import os

# Load spaCy visualizer
from spacy import displacy

# Import pandas DataFrame packages
import pandas as pd
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook
pd.options.mode.chained_assignment = None  # default='warn'

# Import graphing packages
import plotly.graph_objects as go
import plotly.express as px

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
# Folder that holds the plain-text files of the corpus
_corpus_dir = 'anthroreview_txt_files'

# Parallel lists: file_names[i] is the name of the file whose contents are texts[i]
texts = []
file_names = []

# Iterate through each entry in the folder
for _file_name in os.listdir(_corpus_dir):
    # Skip anything that is not a text file
    if not _file_name.endswith('.txt'):
        continue
    # Read inside a context manager so the file handle is closed as soon as
    # the contents are in memory, instead of being left open
    with open(os.path.join(_corpus_dir, _file_name), 'r', encoding='utf-8') as _text_file:
        texts.append(_text_file.read())
    # Record the name at the same position as its text
    file_names.append(_file_name)

In [4]:
# Pair each column label with its list of values: file names and their texts
filename_dictionary = dict(Filename=file_names, Text=texts)

In [5]:
# Build a DataFrame with one column per dictionary key ('Filename' and 'Text')
reviews_df = pd.DataFrame(filename_dictionary)

In [6]:
# Preview the first rows to check that names and texts were loaded
reviews_df.head()

Unnamed: 0,Filename,Text
0,vol1.2.txt,Conceptual Progress in Physical Anthropology: ...
1,vol52.13.txt,Biocultural Lactation: Integrated Approaches ...
2,vol52.12.txt,Gut Microbial Intersections with Human Ecolog...
3,vol1.3.txt,Studies of Modern Man\\nThis review of studies...
4,vol1.1.txt,Culture as Behavior: Structure and Emergence\n...


In [32]:
# Collapse every run of whitespace (spaces, tabs, newlines) into a single space
# and trim both ends of each paper. The pattern is a raw string so that "\s"
# reaches the regex engine as written instead of being treated as an invalid
# Python string escape.
reviews_df['Text'] = reviews_df['Text'].str.replace(r'\s+', ' ', regex=True).str.strip()
reviews_df.head()

Unnamed: 0,Filename,Text
0,vol1.2,Conceptual Progress in Physical Anthropology: ...
1,vol52.13,Biocultural Lactation: Integrated Approaches t...
2,vol52.12,Gut Microbial Intersections with Human Ecology...
3,vol1.3,Studies of Modern Man\ This review of studies ...
4,vol1.1,Culture as Behavior: Structure and Emergence T...


In [33]:
# Load the article metadata from CSV and preview the first rows.
# Its text_filename column is the key later used to match each row to a text file.
metadata_df = pd.read_csv('anthro_reviews.csv')
metadata_df.head()

Unnamed: 0,title,author,year,text_filename
0,Culture as Behavior: Structure and Emergence,Conrad M. Arensberg,1972,vol1.1
1,Conceptual Progress in Physical Anthropology: ...,Bernard G. Campbell,1972,vol1.2
2,Studies of Modern Man,D. F. Roberts and J. C. Bear,1972,vol1.3
3,Dating Methods,Joseph W. Michels,1972,vol1.4
4,Archaeological Settlement Patterns,Jeffrey R. Parsons,1972,vol1.5


In [34]:
# Remove the ".txt" extension from each file name so it matches the metadata IDs.
# The dot is escaped and the pattern is anchored to the end of the string: an
# unescaped "." matches any character, so '.txt' would also delete sequences
# such as "_txt" or "atxt" anywhere in a name.
reviews_df['Filename'] = reviews_df['Filename'].str.replace(r'\.txt$', '', regex=True)

# Rename the metadata column "text_filename" to "Filename" so that both
# DataFrames share the same key column. The result is reassigned rather than
# mutating the frame in place.
metadata_df = metadata_df.rename(columns={"text_filename": "Filename"})

In [35]:
# Join the metadata (left) with the paper texts (right) on the shared 'Filename' key.
# The default inner join keeps only rows present in both DataFrames.
merge_df = pd.merge(metadata_df, reviews_df, on='Filename')

In [36]:
# Preview the first rows of the merged DataFrame
merge_df.head()

Unnamed: 0,title,author,year,Filename,Text
0,Culture as Behavior: Structure and Emergence,Conrad M. Arensberg,1972,vol1.1,Culture as Behavior: Structure and Emergence T...
1,Conceptual Progress in Physical Anthropology: ...,Bernard G. Campbell,1972,vol1.2,Conceptual Progress in Physical Anthropology: ...
2,Studies of Modern Man,D. F. Roberts and J. C. Bear,1972,vol1.3,Studies of Modern Man\ This review of studies ...
3,Dating Methods,Joseph W. Michels,1972,vol1.4,Dating Methods\ Archaeological dating has unde...
4,Archaeological Settlement Patterns,Jeffrey R. Parsons,1972,vol1.5,Archaeological Settlement Patterns\ This paper...


# Text Enrichment and Tokenization
Based on Corpus Analysis with spaCy by Megan Kane

In [37]:
# Load the en_core_web_sm pipeline; `nlp` is the callable used to process text below
nlp = spacy.load('en_core_web_sm')

# List the names of the pipeline components that will run on each text
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [38]:
def process_text(text):
    """Run the loaded nlp pipeline on ``text`` and return whatever it produces."""
    processed = nlp(text)
    return processed

In [39]:
# Call the nlp pipeline on every entry of the "Text" column and keep the
# result for each paper in a new "Doc" column
merge_df['Doc'] = merge_df['Text'].apply(process_text)

In [40]:
def get_token(doc):
    """Return a list with the ``.text`` of every token in ``doc``, in order."""
    token_texts = []
    for tok in doc:
        token_texts.append(tok.text)
    return token_texts

In [42]:
# Store the token list of each doc object in a new "Tokens" column,
# then preview the DataFrame
merge_df['Tokens'] = merge_df['Doc'].apply(get_token)
merge_df.head()

Unnamed: 0,title,author,year,Filename,Text,Doc,Tokens
0,Culture as Behavior: Structure and Emergence,Conrad M. Arensberg,1972,vol1.1,Culture as Behavior: Structure and Emergence T...,"(Culture, as, Behavior, :, Structure, and, Eme...","[Culture, as, Behavior, :, Structure, and, Eme..."
1,Conceptual Progress in Physical Anthropology: ...,Bernard G. Campbell,1972,vol1.2,Conceptual Progress in Physical Anthropology: ...,"(Conceptual, Progress, in, Physical, Anthropol...","[Conceptual, Progress, in, Physical, Anthropol..."
2,Studies of Modern Man,D. F. Roberts and J. C. Bear,1972,vol1.3,Studies of Modern Man\ This review of studies ...,"(Studies, of, Modern, Man\, This, review, of, ...","[Studies, of, Modern, Man\, This, review, of, ..."
3,Dating Methods,Joseph W. Michels,1972,vol1.4,Dating Methods\ Archaeological dating has unde...,"(Dating, Methods\, Archaeological, dating, has...","[Dating, Methods\, Archaeological, dating, has..."
4,Archaeological Settlement Patterns,Jeffrey R. Parsons,1972,vol1.5,Archaeological Settlement Patterns\ This paper...,"(Archaeological, Settlement, Patterns\, This, ...","[Archaeological, Settlement, Patterns\, This, ..."


#### Lemmatization 

In [43]:
def get_lemma(doc):
    """Return a list with the ``.lemma_`` of every token in ``doc``, in order."""
    lemmas = []
    for tok in doc:
        lemmas.append(tok.lemma_)
    return lemmas

# Store the lemma list of each doc object in a new "Lemmas" column
merge_df['Lemmas'] = merge_df['Doc'].apply(get_lemma)

In [44]:
# Count exact matches of the lowercase string 'anthropology' in each paper's
# token list and lemma list, then total them over all papers.
# list.count compares whole items case-sensitively, so only that exact form is counted.
token_hits = merge_df['Tokens'].apply(lambda tokens: tokens.count('anthropology')).sum()
lemma_hits = merge_df['Lemmas'].apply(lambda lemmas: lemmas.count('anthropology')).sum()

# Interpolate the totals directly instead of concatenating str() pieces
print(f'"Anthropology" appears in the text tokens column {token_hits} times.')
print(f'"Anthropology" appears in the lemmas column {lemma_hits} times.')

"Anthropology" appears in the text tokens column 58 times.
"Anthropology" appears in the lemmas column 62 times.


#### Text Annotation

In [45]:
def get_pos(doc):
    """Return one ``(pos_, tag_)`` pair per token in ``doc``.

    The first item is the coarse-grained part of speech and the second the
    fine-grained tag, both as text.
    """
    pos_pairs = []
    for tok in doc:
        pos_pairs.append((tok.pos_, tok.tag_))
    return pos_pairs

# Run the part-of-speech retrieval function on the doc objects in the dataframe
# and store the resulting (pos, tag) pairs in a new "POS" column
merge_df['POS'] = merge_df['Doc'].apply(get_pos)

In [84]:
# Gather the per-paper part of speech tag lists into one plain Python list
pos_list = [paper_tags for paper_tags in merge_df['POS']]

In [87]:
# Inspect the part of speech tags of the first paper
pos_list[0]

[('NOUN', 'NN'),
 ('ADP', 'IN'),
 ('NOUN', 'NN'),
 ('PUNCT', ':'),
 ('PROPN', 'NNP'),
 ('CCONJ', 'CC'),
 ('PROPN', 'NNP'),
 ('DET', 'DT'),
 ('ADJ', 'JJ'),
 ('NOUN', 'NN'),
 ('ADP', 'IN'),
 ('DET', 'DT'),
 ('NOUN', 'NN'),
 ('CCONJ', 'CC'),
 ('VERB', 'VB'),
 ('NOUN', 'NN'),
 ('ADP', 'IN'),
 ('NOUN', 'NN'),
 ('PRON', 'WDT'),
 ('AUX', 'VBZ'),
 ('PART', 'TO'),
 ('VERB', 'VB'),
 ('DET', 'DT'),
 ('NOUN', 'NN'),
 ('VERB', 'VBZ'),
 ('DET', 'DT'),
 ('NOUN', 'NN'),
 ('ADP', 'IN'),
 ('ADJ', 'JJ'),
 ('NOUN', 'NN'),
 ('ADP', 'IN'),
 ('ADJ', 'JJ'),
 ('ADJ', 'JJ'),
 ('NOUN', 'NNS'),
 ('PUNCT', '.'),
 ('PRON', 'PRP$'),
 ('ADJ', 'JJ'),
 ('NOUN', 'NN'),
 ('ADP', 'IN'),
 ('NOUN', 'NN'),
 ('PUNCT', 'HYPH'),
 ('NOUN', 'NN'),
 ('CCONJ', 'CC'),
 ('PRON', 'PRP$'),
 ('ADJ', 'JJ'),
 ('NOUN', 'NN'),
 ('ADP', 'IN'),
 ('DET', 'DT'),
 ('ADJ', 'JJ'),
 ('NOUN', 'NNS'),
 ('ADP', 'IN'),
 ('ADJ', 'JJ'),
 ('NOUN', 'NN'),
 ('NOUN', 'NN'),
 ('ADP', 'IN'),
 ('PRON', 'PRP$'),
 ('VERB', 'VBG'),
 ('NOUN', 'NN'),
 ('PUNCT', '.')

In [88]:
def extract_proper_nouns(doc):
    """Return the ``.text`` of every token in ``doc`` whose ``pos_`` is 'PROPN'."""
    proper_nouns = []
    for tok in doc:
        if tok.pos_ == 'PROPN':
            proper_nouns.append(tok.text)
    return proper_nouns

# Apply function to Doc column and store each paper's list of proper nouns
# in a new "Proper_Nouns" column
merge_df['Proper_Nouns'] = merge_df['Doc'].apply(extract_proper_nouns)

In [89]:
# Look at the proper nouns of two papers, selected by row label:
# row 1 (a 1972 paper) and row 40 (a 2023 paper)

list(merge_df.loc[[1, 40], 'Proper_Nouns'])

[['Progress',
  'Physical',
  'Anthropology',
  'Fossil',
  'Alfred',
  'North',
  'Whitehead',
  'William',
  'Ockham',
  'essentia',
  'non',
  'sunt',
  'multiplicanda',
  'prater',
  'necessitatem',
  'Whitehead'],
 ['Paradigm',
  'Shift\\',
  'Gupta',
  'Stoolman',
  'American',
  'Anthropological',
  'Association',
  'Gupta',
  'Herb',
  'Lewis',
  'Gupta',
  'American',
  'Anthropological',
  'Association',
  'Communities',
  'Lewis',
  'Native',
  'North',
  'Native',
  'North',
  'Americans',
  'Native',
  'American',
  'Nations',
  'United',
  'States',
  'Native',
  'North',
  'Americans',
  '\\',
  'Native',
  'American',
  'Graves',
  'Protection',
  'Repatriation',
  'Act',
  'NAGPRA',
  'Krmpotich',
  'Phillips']]

In [49]:
# Define function to extract named entity labels from doc objects.
# It has its own name because a later cell defines extract_named_entities to
# return the entity spans themselves; sharing one name made the later
# definition silently replace this one.
def extract_entity_labels(doc):
    """Return the ``label_`` of every entity in ``doc.ents``, in order."""
    return [ent.label_ for ent in doc.ents]

# Apply function to Doc column and store resulting entity labels in new column
merge_df['Named_Entities'] = merge_df['Doc'].apply(extract_entity_labels)
merge_df['Named_Entities']

0                [CARDINAL, CARDINAL, CARDINAL, PERSON]
1     [PERSON, PERSON, PERSON, CARDINAL, CARDINAL, D...
2                                    [DATE, DATE, DATE]
3     [ORG, CARDINAL, CARDINAL, ORG, CARDINAL, ORG, ...
4     [LANGUAGE, CARDINAL, CARDINAL, GPE, GPE, NORP,...
5     [GPE, NORP, NORP, CARDINAL, DATE, ORDINAL, ORD...
6     [ORG, CARDINAL, CARDINAL, PERSON, DATE, ORDINA...
7     [ORG, DATE, DATE, ORDINAL, DATE, DATE, PERSON,...
8     [ORG, PERSON, LAW, FAC, PERSON, CARDINAL, PERS...
9     [ORG, DATE, ORG, CARDINAL, CARDINAL, ORG, CARD...
10    [ORDINAL, ORG, CARDINAL, PERSON, CARDINAL, CAR...
11    [ORG, DATE, CARDINAL, ORG, PERSON, CARDINAL, C...
12    [CARDINAL, ORDINAL, ORDINAL, NORP, PERSON, CAR...
13    [ORG, CARDINAL, DATE, DATE, DATE, PERSON, ORG,...
14         [ORG, CARDINAL, CARDINAL, CARDINAL, ORDINAL]
15    [NORP, NORP, NORP, NORP, NORP, NORP, ORG, CARD...
16    [PERSON, PERSON, PRODUCT, DATE, DATE, LOC, PER...
17                                [NORP, DATE, O

In [50]:
def extract_named_entities(doc):
    """Return the items of ``doc.ents`` (the text spans tagged as named entities) as a list."""
    return list(doc.ents)

# Apply function to Doc column, store each paper's entity spans in a new
# "NE_Words" column, and display that column
merge_df['NE_Words'] = merge_df['Doc'].apply(extract_named_entities)
merge_df['NE_Words']

0                [(one), (one), (Two), (Charles, Snow)]
1     [(Conceptual, Progress), (Fossil, Man\), (Alfr...
2                     [(mid-1969), (mid-1971), (today)]
3     [(Methods\, Archaeological), (3), (One), (Mich...
4     [(English), (two), (one), (America), (England)...
5     [(USSR), (South, American), (Andean), (three),...
6     [(Subsistence), (at, least, one), (three), (Jo...
7     [(Subsistence), (annual), (the, previous, year...
8     [(Boas, 13), (Harris, 40), (Kroeber, 46), (Whi...
9     [(Social, Strategies, and, Social, Relationshi...
10    [(first), (Casagrande, &, Hale), (6), (Werner)...
11    [(Morgan, 's, Systems, of, Consanguinity, and,...
12    [(five), (first), (first), (Aristotelian), (Ar...
13    [(Semantics), (one), (the, late, 1940s), (the,...
14    [(Linguistic, Models), (two), (two), (two), (f...
15    [(American), (Indian), (American), (Indian), (...
16    [(Ruth, Tringham), (Tringham), (the, Neolithic...
17    [(European), (between, 1988, and, 1993), (

In [83]:
# Save a new CSV file with tokens, lemmas, and parts of speech, without the Doc column.
# DataFrame.drop returns a new frame and leaves merge_df untouched, so the
# returned frame is the one that has to be written; calling drop on its own
# line and then saving merge_df would still export the Doc column.
merge_df.drop(columns=['Doc']).to_csv('anthroreviews_with_spaCy.csv')

#### Now that we have saved our spaCy analysis, let's visualize with the new information collected

In [53]:
# Create a new, independent DataFrame for analysis purposes.
# A plain column selection is tracked by pandas as derived from merge_df, so
# assigning to its columns later raises SettingWithCopyWarning; an explicit
# copy makes the new frame safe to modify.
ner_analysis_df = merge_df[['year', 'Named_Entities']].copy()

In [54]:
# Convert named entity lists to space-separated strings so we can count specific entities.
# Values that are already strings are left alone so the cell can be re-run:
# joining an already-joined string would insert a space between every
# character, and every count below would silently become 0.
ner_analysis_df['Named_Entities'] = ner_analysis_df['Named_Entities'].apply(
    lambda labels: labels if isinstance(labels, str) else ' '.join(labels)
)

# Get the number of each type of entity in each paper, including people, locations, and dates
person_counts = ner_analysis_df['Named_Entities'].str.count('PERSON')
loc_counts = ner_analysis_df['Named_Entities'].str.count('LOC')
date_counts = ner_analysis_df['Named_Entities'].str.count('DATE')

In [55]:
# Collect the publication year (as "Volume") and the three per-paper entity
# counts in a single new DataFrame
ner_counts_df = pd.DataFrame({
    'Volume': ner_analysis_df['year'],
    'PERSON_Counts': person_counts,
    'LOC_Counts': loc_counts,
    'DATE_Counts': date_counts,
})

ner_counts_df.head()

Unnamed: 0,Volume,PERSON_Counts,LOC_Counts,DATE_Counts
0,1972,1,0,0
1,1972,4,0,3
2,1972,0,0,3
3,1972,0,0,0
4,1972,1,1,1


In [90]:
# Average each entity count per "Volume" value, round to whole numbers, and
# turn the group index back into a regular column for plotting
average_ner_df = (
    ner_counts_df
    .groupby(['Volume'])
    .mean()
    .round(0)
    .reset_index()
)

# Use plotly to draw grouped bars of the average entity counts for each volume
fig = px.bar(
    average_ner_df,
    x="Volume",
    y=["PERSON_Counts", 'LOC_Counts', "DATE_Counts"],
    title="Average Named Entity Usage in 1972 vs 2023",
    barmode='group',
)
fig.show()

It seems that more dates are used in later volumes than in the first volume, possibly because more literature is cited. The 2023 articles average 7 date entities each, while the 1972 articles averaged only 1. Additionally, spaCy recognized more locations in the 2023 volume, whereas no locations were recognized in the first volume.