### Part 2: Named Entity Recognition (NER) 

In [1]:
import pandas as pd
import nltk
import re

In [2]:
import spacy
from spacy import displacy

In [3]:
import numpy as np
import pycountry as pc
import ast
from ast import literal_eval

In [4]:
from collections import defaultdict

Loaded spacy for NER.

In [6]:
ner = spacy.load("en_core_web_sm")

In [7]:
all_books = pd.read_csv('../librivox_df')

#### Step 1: Filtered all English language books for NER

In [386]:
books_en = all_books[all_books['Language'] == 'English']

Note: Used Transcripts of The Great Gatsby for displacy.

In [9]:
gatsby = books_en['Transcripts'].iloc[147]

In [10]:
gatsby1 = ner(gatsby)

In [418]:
displacy.render(gatsby1,style="ent",jupyter=True)

TypeError: 'NoneType' object is not subscriptable

In [12]:
books_en

Unnamed: 0,ID,Title,Author,Views,Favorites,Date_uploaded,Reviews,Reviews_n,Transcripts,Genre,Language,Runtime,Narrated_by
1,1,The Book of Enoch,Unknown,155927,102.0,"December 9, 2018",['\n I enjoyed this recording very much...,1,"r, please visit Librevox.org. Section 1. Edito...",Religion,English,04:28:56,CJ Plogue
3,3,The Meditations of the Emperor Marcus Aurelius...,Marcus Aurelius,302755,58.0,"January 2, 2018",['\n I recently discovered this gem of ...,1,"ation or to volunteer, please visit Librevox.o...","Classics (Greek & Latin Antiquity), Biography ...",English,04:47:46,LibriVox Volunteers
4,4,Three Things,Ella Wheeler Wilcox,516824,7.0,"July 1, 2018",[],0,"gs there are, eternal in their worth. Love tha...",Multi-version (Weekly and Fortnightly poetry),English,00:12:59,LibriVox Volunteers
6,6,"Biographical Memoir of John Wesley Powell, 183...",William Morris Davis,402442,4.0,"March 2, 2018",[],0,"n or to volunteer, please visit Librevox.org. ...",Biography & Autobiography,English,03:40:51,Melanie Schleeter McCalmont
8,8,True Stories of Crime from the District Attorn...,Arthur Cheney Train,364346,17.0,"August 16, 2018",[],0,"tion or to volunteer, please visit Librevox.or...",True Crime,English,06:49:50,Colleen McMahon
...,...,...,...,...,...,...,...,...,...,...,...,...,...
368,68,Doctor Rabbit and Tom Wildcat,Thomas Clark Hinkle,10073,1.0,"November 9, 2022",[],0,isit Lieberbox.org. Read by Laura. Dr. Rabbit ...,"Animals & Nature, Myths, Legends & Fairy Tales",English,01:22:37,Laurie Banza
370,70,"Dramatic Reading Scene and Story Collection, V...",Various,56821,2.0,"April 30, 2022",[],0,"ng, all Librevox recordings are in the public ...",Dramatic Readings,English,03:17:50,LibriVox Volunteers
372,72,"The Mormon Battalion, Its History and Achievem...",B. H. Roberts,29381,1.0,"November 4, 2022",['\n If you are someone who is really f...,1,"to volunteer, please visit Libravox.org. Read ...","War & Military, History",English,02:51:09,Wayne Cooke
373,73,Dogs and Puppies,Frances Trego Montgomery,32539,2.0,"November 10, 2022",[],0,riVox.org. Read by Prajakta. Docs and Puppets ...,Action & Adventure,English,00:51:56,LibriVox Volunteers


#### Step 1 : Obtained NER labels for all English books.
Focus is on 'GPE', 'LOC', 'NORP', 'PERSON' labels for now.

In [15]:
def nerList(text):
    doc = ner(text)
    d = defaultdict(list)
    all_labels = ['GPE', 'LOC', 'NORP', 'PERSON']
    for word in doc.ents:
        if word.label_ in all_labels:
            d[word.label_].append(word.text)
    for k,v in d.items():
        m = set(i for i in v) #if v.count(i) > 1)
        d[k] = list(m)
    
    return d

In [14]:
books_d = books_en['Transcripts'].apply(nerList)

In [16]:
df = pd.DataFrame(list(books_d))

In [17]:
ner_df = df.replace(np.nan, '[]', regex=True)

In [21]:
ner_df = pd.read_csv('../ner_df')

In [23]:
ner_df.head(20)

Unnamed: 0,NORP,PERSON,GPE,LOC
0,"['Hebrew', 'Jewish', 'Greek', 'Christian', 'po...","['Canon Charles', 'John Herkiness', 'Jubilee',...","['Erimich', 'c.p', 'Jordan', 'Palestine', 'Gal...","['Northern Palestine', 'the Accursed Valley', ..."
1,['Platonist'],"['Alexander the Gremarian', 'Maximus', 'Tuskeu...","['Relious', 'Epilonia', 'Greece', 'Marcus']",[]
2,[],[],[],[]
3,"['Greek', 'Latin']","['William Morris Davis', 'John Wesley Powell',...","['Ohio', 'Malaska', 'Illinois', 'the United St...","['the Genesee Valley', 'Mississippi']"
4,"['Dutch', 'American']","['P. Body', 'Gang', 'forgeri', 'Prentice Parke...","['St. Louis', 'West 38th Street', 'St. Paul', ...",['the Bay window']
5,[],['jaw'],"['Tolstry', 'Nima', 'Bulkka']",['Bulkka']
6,[],"['Keef', 'Kurt Keef', 'Dan', 'Sikamoor Hight',...","['Texas', 'Miss.', 'Boston', 'Baylem', 'Tattle...",[]
7,"['Maltese', 'African']","['Etna', 'Dobel', 'Ambrose Pierce', 'Dale Grow...","['mizzon', 'Malta', 'Naples', 'Italy', 'Chicago']",[]
8,"['Thibian', 'Ejipteri', 'Alizian', 'English', ...","['Nil', 'Patrick Sivil', 'Edward Maud Thompson...","['Nebcad', 'Missiers', 'Nathleris', 'Les', 'Tu...",['Heliopolitan']
9,"['French', 'metropolis']","['Martin', 'Nory', 'Horus', 'Simpson', 'Lannig...","['Foguri', 'Fogretis', 'quay', 'Magna Carta', ...",[]


#### Step 2: Extracting Features from GPE Column.
GPE stands for Geo-Political Entity, and this includes Politically accepted entities, like Country, City, States. Using these features, we determined how many countries are mentioned in the text and which country is mentioned the most.

In [None]:
## Extracting list from inside the string in every row.

In [26]:
gpe = ner_df['GPE'].apply(lambda x: literal_eval(str(x)))

In [28]:
def country_name_check(input_list):
    pc_list = list(pc.countries)
    names = []
    for i in pc_list:
        names.append(i.name)
        ## if any part of x in input list is in names
    
    country = []
    for j in input_list:
        if j in names: #any(js in j for js in pc_list):
            country.append(j)
    return country

In [195]:
# list of all country names appearing in each book
gpe_countries = gpe.apply(country_name_check).rename('countries')
gpe_countries.head(3)

0    [Jordan, Israel]
1            [Greece]
2                  []
Name: countries, dtype: object

In [197]:
gpe_countries_n = gpe_countries.apply(len)

In [198]:
unique = set(x for l in list(gpe_countries) for x in l)   

In [199]:
rem_ = ['the United States Department', 'the United States Forest Service', 'the United States Review', 'State United States of America'
       , 'Dick Chadwick', 'the Jersey Shore', 'Savannah Georgia', 'South Australia', 'Malice', 'Bologna Italy', 'Old Ireland']

In [201]:
## set of all unique country names appearing in all books
unique_countries = unique - set(rem_)
unique_countries

{'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Bahamas',
 'Belgium',
 'Brazil',
 'Canada',
 'China',
 'Denmark',
 'Egypt',
 'Ethiopia',
 'France',
 'Georgia',
 'Germany',
 'Greece',
 'Hong Kong',
 'Hungary',
 'Iceland',
 'India',
 'Ireland',
 'Israel',
 'Italy',
 'Japan',
 'Jordan',
 'Libya',
 'Lithuania',
 'Luxembourg',
 'Malaysia',
 'Malta',
 'Mexico',
 'New Zealand',
 'Norway',
 'Poland',
 'Portugal',
 'Romania',
 'Serbia',
 'South Africa',
 'Spain',
 'Sweden',
 'Switzerland',
 'Turkey',
 'Uganda',
 'United States'}

In [366]:
countries_n = []
for i in gpe_countries:
    c = 0
    for j in i:
        
        if j in unique_countries:
            c+=1
    countries_n.append(c)


In [369]:
countries_n = pd.Series(countries_n)

In [34]:
from urllib.request import urlopen
import json

In [35]:
def get_key(val):
    for key, value in dict(states).items():
        if val in value:
            return key

#### JSON data of all countries, their capitals, states and other details
Using this data, state and capital names that appeared in the books were determined and also, to which countries they belong.

In [36]:
js = 'https://raw.githubusercontent.com/dr5hn/countries-states-cities-database/master/countries%2Bstates.json'
response = urlopen(js)
data_json = json.loads(response.read())

In [37]:
states = defaultdict(list)
for c in data_json:
    for i in c['states']:
        states[c['name']].append(i['name'])

In [40]:
# country from states
def countryFromStates(L):
    countries = []
    for i in L:
        for v in states.values():
            if i in v:
                countries.append(get_key(i))
    return countries

In [102]:
c_states = gpe.apply(countryFromStates).rename('c_states')

In [43]:
capitals = {}
for country in data_json:
    #for k,v in country.items():
    capitals[country['capital']] = country['name']

In [53]:
# country from capital
def countryFromCapitals(L):
    countries = []
    for i in L:
        if i in capitals.keys():
            countries.append(capitals[i])
    return countries

In [98]:
c_caps = gpe.apply(countryFromCapitals).rename('c_caps')

In [115]:
all_countries = c_caps+c_states+gpe_countries

In [121]:
## frequency of all countries mentioned (by name or through capital/states)
all_countries.explode().value_counts()

United States                     133
United Kingdom                     94
France                             62
Austria                            21
Italy                              18
Germany                            17
Israel                             11
China                              11
Spain                              11
Australia                          11
Egypt                              10
Russia                              8
Canada                              8
Belgium                             7
Hungary                             7
Palestinian Territory Occupied      7
Japan                               6
Greece                              6
Poland                              5
Georgia                             4
Switzerland                         4
Jordan                              4
Portugal                            3
India                               3
New Zealand                         3
Luxembourg                          2
Syria       

In [165]:
## getting unique values
def getUnique(L):
    U = []
    for i in L:
        i = list(set(i))
        U.append(i)
    return U

In [176]:
u_per_row = pd.Series(getUnique(all_countries))

In [178]:
## number of times a country appears in each book
u_per_row.explode().value_counts()

United States                     69
United Kingdom                    63
France                            40
Italy                             14
Germany                           14
Austria                           11
China                             11
Egypt                             10
Australia                         10
Spain                              9
Israel                             8
Greece                             6
Palestinian Territory Occupied     6
Poland                             5
Canada                             5
Hungary                            5
Russia                             4
Belgium                            4
Switzerland                        4
Georgia                            4
Jordan                             4
New Zealand                        3
India                              3
Japan                              3
Brazil                             2
South Africa                       2
Ireland                            2
C

In [186]:
## dummy column for books that mention US and books that don't
US_d = u_per_row.isin(['United States']).astype('int')

In [192]:
## number of states mentioned in each book
states_n = c_states.apply(len)

In [193]:
## number of unique countries (including their states and capitals) mentioned in each book
all_countries_n = u_per_row.apply(len)

#### Step3: Sorting the names from PERSON column
The PERSON label contained all entities with names of Characters (with a few words that were assigned this label through error). These were sorted manually into male, female and neutral (for the mistaken words) using excel. (Excel file attached) This was used to determine how many male and female characters each book contained. 

In [206]:
names_sorted = pd.read_excel('../Sorted_names.xlsx')

In [213]:
names_sorted.head(5)

Unnamed: 0,Names,Gender
0,Mill',F
1,Chaucer',M
2,W. Meyer',M
3,Moon',N
4,Bad DaM',N


In [283]:
from string import whitespace

In [296]:
def cleanNames(L):
    New = []
    for i in L:
        k = i.strip(whitespace + '"\'')
        New.append(k)
    return New

In [299]:
names_sorted['clean_names'] = cleanNames(names_sorted['Names'])  #.apply(cleanNames)

In [300]:
female_names = list(names_sorted[names_sorted['Gender'] == 'F']['clean_names'])
male_names = list(names_sorted[names_sorted['Gender'] == 'M']['clean_names'])
neutral_names = list(names_sorted[names_sorted['Gender'] == 'N']['clean_names'])

In [331]:
characters = ner_df['PERSON'].apply(lambda x: literal_eval(str(x)))

In [332]:
characters

0      [Canon Charles, John Herkiness, Jubilee, Moon,...
1      [Alexander the Gremarian, Maximus, Tuskeulum, ...
2                                                     []
3      [William Morris Davis, John Wesley Powell, Cri...
4      [P. Body, Gang, forgeri, Prentice Parker Singl...
                             ...                        
236    [Rose Chickenhouse, Hurry, Laura, Tom Wildcat,...
237    [Ferdinand DeVero, Count Adelaan, Samuel Pickw...
238    [Alexander W. Donafin, Saldo, Cook, Wayne Cook...
239    [Jip Seeds, Buster, Fritz, Jip Seed, Francis T...
240    [Cuberly, Grandpa, Cubrely, Poole, Hortell, cu...
Name: PERSON, Length: 241, dtype: object

In [327]:
# def assignGender(L):
#     G = []
#     for i in L:
#         temp = []
#         for j in i:
#             if j in male_names:
#                 temp.append('M')
#             if j in female_names:
#                 temp.append('F')
#             else:
#                 temp.append('N')
#         G.append(temp)
#     return G

In [345]:
def genderFrequency(L1, L2):
    freq = []
    for i in L1:
        c = 0
        for j in i:
            if j in L2:
                c+=1
        freq.append(c)
    return freq

In [348]:
male_n = pd.Series(genderFrequency(characters,male_names))
female_n = pd.Series(genderFrequency(characters,female_names))
neutral_n = pd.Series(genderFrequency(characters,neutral_names))

In [353]:
Person_df = pd.concat([male_n,female_n,neutral_n],axis=1).rename(columns = {0:'male_n',1:'female_n',2:'neutral_n'})

In [354]:
Person_df

Unnamed: 0,male_n,female_n,neutral_n
0,4,0,3
1,0,0,0
2,0,0,0
3,1,0,0
4,4,2,1
...,...,...,...
236,1,2,1
237,6,1,0
238,7,1,1
239,5,1,0


#### Step4: Get all NER Processed columns

In [410]:
countries_n

0      2
1      1
2      0
3      0
4      0
      ..
236    0
237    0
238    2
239    0
240    0
Length: 241, dtype: int64

In [411]:
all_ner = pd.concat([ner_df, countries_n, states_n,all_countries_n,US_d,Person_df]
                    ,axis =1).rename(columns={'c_states':'states_n',0:'countries_n',1:'all_countries_n', 2:'US_d'})

In [412]:
all_ner

Unnamed: 0,NORP,PERSON,GPE,LOC,countries_n,states_n,all_countries_n,US_d,male_n,female_n,neutral_n
0,"['Hebrew', 'Jewish', 'Greek', 'Christian', 'po...","['Canon Charles', 'John Herkiness', 'Jubilee',...","['Erimich', 'c.p', 'Jordan', 'Palestine', 'Gal...","['Northern Palestine', 'the Accursed Valley', ...",2,1,3,0,4,0,3
1,['Platonist'],"['Alexander the Gremarian', 'Maximus', 'Tuskeu...","['Relious', 'Epilonia', 'Greece', 'Marcus']",[],1,0,1,0,0,0,0
2,[],[],[],[],0,0,0,0,0,0,0
3,"['Greek', 'Latin']","['William Morris Davis', 'John Wesley Powell',...","['Ohio', 'Malaska', 'Illinois', 'the United St...","['the Genesee Valley', 'Mississippi']",0,4,1,0,1,0,0
4,"['Dutch', 'American']","['P. Body', 'Gang', 'forgeri', 'Prentice Parke...","['St. Louis', 'West 38th Street', 'St. Paul', ...",['the Bay window'],0,2,1,0,4,2,1
...,...,...,...,...,...,...,...,...,...,...,...
236,[],"['Rose Chickenhouse', 'Hurry', 'Laura', 'Tom W...",[],[],0,0,0,0,1,2,1
237,['English'],"['Ferdinand DeVero', 'Count Adelaan', 'Samuel ...","['Gowland', 'Librevox', 'Persona', 'Veyola']",[],0,0,0,0,6,1,0
238,"['Greek', 'American', 'Zedafans', 'Persians', ...","['Alexander W. Donafin', 'Saldo', 'Cook', 'Way...","['El Paso', 'Tigris', 'Zedafans', 'Mexico', 'M...","['the Gulf of Mexico', 'the Pacific Ocean', 'M...",2,1,3,0,7,1,1
239,['French'],"['Jip Seeds', 'Buster', 'Fritz', 'Jip Seed', '...",['Puppets'],['Europe'],0,0,0,0,5,1,0


In [413]:
books = books_en.reset_index().drop('ID', axis = 1)

In [414]:
en_books_df = pd.concat([books, all_ner], axis = 1).drop('index', axis = 1)

#### Table to be used for Features Extraction

In [415]:
en_books_df

Unnamed: 0,Title,Author,Views,Favorites,Date_uploaded,Reviews,Reviews_n,Transcripts,Genre,Language,...,PERSON,GPE,LOC,countries_n,states_n,all_countries_n,US_d,male_n,female_n,neutral_n
0,The Book of Enoch,Unknown,155927,102.0,"December 9, 2018",['\n I enjoyed this recording very much...,1,"r, please visit Librevox.org. Section 1. Edito...",Religion,English,...,"['Canon Charles', 'John Herkiness', 'Jubilee',...","['Erimich', 'c.p', 'Jordan', 'Palestine', 'Gal...","['Northern Palestine', 'the Accursed Valley', ...",2,1,3,0,4,0,3
1,The Meditations of the Emperor Marcus Aurelius...,Marcus Aurelius,302755,58.0,"January 2, 2018",['\n I recently discovered this gem of ...,1,"ation or to volunteer, please visit Librevox.o...","Classics (Greek & Latin Antiquity), Biography ...",English,...,"['Alexander the Gremarian', 'Maximus', 'Tuskeu...","['Relious', 'Epilonia', 'Greece', 'Marcus']",[],1,0,1,0,0,0,0
2,Three Things,Ella Wheeler Wilcox,516824,7.0,"July 1, 2018",[],0,"gs there are, eternal in their worth. Love tha...",Multi-version (Weekly and Fortnightly poetry),English,...,[],[],[],0,0,0,0,0,0,0
3,"Biographical Memoir of John Wesley Powell, 183...",William Morris Davis,402442,4.0,"March 2, 2018",[],0,"n or to volunteer, please visit Librevox.org. ...",Biography & Autobiography,English,...,"['William Morris Davis', 'John Wesley Powell',...","['Ohio', 'Malaska', 'Illinois', 'the United St...","['the Genesee Valley', 'Mississippi']",0,4,1,0,1,0,0
4,True Stories of Crime from the District Attorn...,Arthur Cheney Train,364346,17.0,"August 16, 2018",[],0,"tion or to volunteer, please visit Librevox.or...",True Crime,English,...,"['P. Body', 'Gang', 'forgeri', 'Prentice Parke...","['St. Louis', 'West 38th Street', 'St. Paul', ...",['the Bay window'],0,2,1,0,4,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,Doctor Rabbit and Tom Wildcat,Thomas Clark Hinkle,10073,1.0,"November 9, 2022",[],0,isit Lieberbox.org. Read by Laura. Dr. Rabbit ...,"Animals & Nature, Myths, Legends & Fairy Tales",English,...,"['Rose Chickenhouse', 'Hurry', 'Laura', 'Tom W...",[],[],0,0,0,0,1,2,1
237,"Dramatic Reading Scene and Story Collection, V...",Various,56821,2.0,"April 30, 2022",[],0,"ng, all Librevox recordings are in the public ...",Dramatic Readings,English,...,"['Ferdinand DeVero', 'Count Adelaan', 'Samuel ...","['Gowland', 'Librevox', 'Persona', 'Veyola']",[],0,0,0,0,6,1,0
238,"The Mormon Battalion, Its History and Achievem...",B. H. Roberts,29381,1.0,"November 4, 2022",['\n If you are someone who is really f...,1,"to volunteer, please visit Libravox.org. Read ...","War & Military, History",English,...,"['Alexander W. Donafin', 'Saldo', 'Cook', 'Way...","['El Paso', 'Tigris', 'Zedafans', 'Mexico', 'M...","['the Gulf of Mexico', 'the Pacific Ocean', 'M...",2,1,3,0,7,1,1
239,Dogs and Puppies,Frances Trego Montgomery,32539,2.0,"November 10, 2022",[],0,riVox.org. Read by Prajakta. Docs and Puppets ...,Action & Adventure,English,...,"['Jip Seeds', 'Buster', 'Fritz', 'Jip Seed', '...",['Puppets'],['Europe'],0,0,0,0,5,1,0


In [416]:
#en_books_df.to_csv('../en_books_df', index = False, header=True)