In [1]:
# to manipulate dataframes
import pandas as pd

# for natural language processing: named entity recognition
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [2]:
df = pd.read_csv('sample.csv')

In [3]:
# named entity recognition
ARTICLE = nlp(''.join(str(df.iloc[:,0].tolist())))

In [4]:
ARTICLE

['Hulu', 'Netflix', 'Donald Trump', 'Kamala Harris', 'Hilary Clinton', 'Bernie Sanders', 'Joe Biden', 'Texas', 'Coronavirus', 'COVID-19', 'Apple', 'Hulu', 'Netflix', 'Donald Trump', 'Kamala Harris', 'Hilary Clinton', 'Bernie Sanders', 'Joe Biden', 'Texas', 'Coronavirus', 'COVID-19', 'Apple', 'Hulu', 'Netflix', 'Donald Trump', 'Kamala Harris', 'Hilary Clinton', 'Bernie Sanders', 'Joe Biden', 'Texas', 'Coronavirus', 'COVID-19', 'Apple', 'Hulu', 'Netflix', 'Donald Trump', 'Kamala Harris', 'Hilary Clinton', 'Bernie Sanders', 'Joe Biden', 'Texas', 'Coronavirus', 'COVID-19', 'Apple', 'Hulu', 'Netflix', 'Donald Trump', 'Kamala Harris', 'Hilary Clinton', 'Bernie Sanders', 'Joe Biden', 'Texas', 'Coronavirus', 'COVID-19', 'Apple', 'Hulu', 'Netflix', 'Donald Trump', 'Kamala Harris', 'Hilary Clinton', 'Bernie Sanders', 'Joe Biden', 'Texas', 'Coronavirus', 'COVID-19', 'Apple', 'Hulu', 'Netflix', 'Donald Trump', 'Kamala Harris', 'Hilary Clinton', 'Bernie Sanders', 'Joe Biden', 'Texas', 'Coronavirus'

In [5]:
def ner_person(df):
    # person: people, including fictional characters
    person_list = []
    for ent in ARTICLE.ents:
        if ent.label_ == 'PERSON':
            person_list.append(ent.text)
    person = Counter(person_list).most_common(40)
    return person

def ner_group(df):
    # norp: nationalities or religious or political groups
    norp_list = []
    for ent in ARTICLE.ents:
        if ent.label_ == 'NORP':
            norp_list.append(ent.text)
    group = Counter(norp_list).most_common(40)
    return group

def ner_org(df):
    # org: companies, agencies, institutions, etc
    org_list = []
    for ent in ARTICLE.ents:
        if ent.label_ == 'ORG':
            org_list.append(ent.text)
    org = Counter(org_list).most_common(40)
    return org

def ner_geo(df):
    # gpe: countries, cities, states
    geo_list = []
    for ent in ARTICLE.ents:
        if ent.label_ == 'GPE':
            geo_list.append(ent.text)
    geo = Counter(geo_list).most_common(40)
    return geo

def ner_product(df):
    # product: objects, vehicles, foods, etc. (Not services.) 
    product_list = []
    for ent in ARTICLE.ents:
        if ent.label_ == 'PRODUCT':
            product_list.append(ent.text)
    product = Counter(product_list).most_common(40)
    return product

def ner_event(df):
    # event: named hurricanes, battles, wars, sports events, etc 
    event_list = []
    for ent in ARTICLE.ents:
        if ent.label_ == 'EVENT':
            event_list.append(ent.text)
    event = Counter(event_list).most_common(40)
    return event

In [6]:
ner_person(df)

[("Donald Trump'", 306),
 ("Kamala Harris'", 90),
 ("Hilary Clinton'", 90),
 ('Clinton', 72),
 ('Coronavirus', 72),
 ("Elizabeth Warren'", 61)]

In [7]:
ner_group(df)

[('Republicans', 32), ('Democrats', 32)]

In [8]:
ner_org(df)

[('Coronavirus', 90),
 ('Apple', 89),
 ("St. Patricks's Day", 63),
 ("'Harley Davidson'", 58),
 ("Apple Watch'", 43)]

In [9]:
ner_geo(df)

[('Texas', 99), ('Philippines', 73), ('Italy', 20), ('Iran', 9), ('China', 1)]

In [10]:
ner_product(df)

[]

In [11]:
ner_event(df)

[("Black Friday'", 37)]