### Named entity recognition will allow us to identify brand names, public figures, and other marketable entities which would be of importance in a business analytic application.

Make sure to install necessary dependencies.

In [15]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


Imports

In [16]:
import os
import pandas as pd
import spacy

Read filename and contents into a pandas dataframe

In [17]:
data = {'FileName': [], 'Text': []}

for transcript in os.listdir('../DATA'):
    with open(f'../DATA/{transcript}', 'r') as file:
        text = file.read()
        data['FileName'].append(transcript)
        data['Text'].append(text)

df = pd.DataFrame(data)
df.head()

Unnamed: 0,FileName,Text
0,2KEI29IfOp4.txt,You guys welcome back a cheat day Jared has re...
1,--aOisk7Hf8.txt,How low guys it's me. Hello. Welcome back to a...
2,2omuOarg2hE.txt,"Hi guys, I have been real lazy I could do a 15..."
3,2wfWK2Z9A58.txt,"Gentlemen, I'm here today to meet an old frien..."
4,2uaGw1D-X0Y.txt,Is that a little music? No. Is that me is now ...


Use the en_core_web_sm english pipeline and a multitude of entity categories.

Process each row and keep the entities as a dictionary for now in a new column.

In [18]:
entity_recognition = spacy.load('en_core_web_sm')
entity_recognition_categories = ['PERSON', 'ORG', 'GPE', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']
df['Entities'] = ''

for index, row in df.iterrows():
    doc = entity_recognition(row['Text'])
    entities = [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in entity_recognition_categories]
    df.at[index, 'Entities'] = entities

df.head()

Unnamed: 0,FileName,Text,Entities
0,2KEI29IfOp4.txt,You guys welcome back a cheat day Jared has re...,"[(Jared, ORG), (Thrones, ORG), (Atlantis, PROD..."
1,--aOisk7Hf8.txt,How low guys it's me. Hello. Welcome back to a...,"[(Halloween, DATE), (Halloween, DATE), (seven ..."
2,2omuOarg2hE.txt,"Hi guys, I have been real lazy I could do a 15...","[(15 minute, TIME), (Animal Crossing, PERSON),..."
3,2wfWK2Z9A58.txt,"Gentlemen, I'm here today to meet an old frien...","[(today, DATE), (Kevin, PERSON), (One, CARDINA..."
4,2uaGw1D-X0Y.txt,Is that a little music? No. Is that me is now ...,"[(today, DATE), (Wednesday upload day, DATE), ..."


Save the dataframe to a CSV file

In [19]:
df.to_csv('../OUTPUT/Named_Entity_Recognition.csv', index=False)