An example spaCy NLP pipeline on [some Reddit post data](https://www.kaggle.com/datasets/mswarbrickjones/reddit-selfposts).

Download the data and extract into a `/data` folder.

In [1]:
import spacy
from textacy import preprocessing, extract
import pandas as pd
import re
from collections import Counter

In [2]:
from tqdm import tqdm_notebook
from tqdm.autonotebook import tqdm

# Register for Pandas functions
tqdm.pandas()

  from tqdm.autonotebook import tqdm


##### Download en_core_web_sm
python -m spacy download en_core_web_sm

# Read in sample

In [3]:
# Post info
posts = pd.read_csv('data/rspct.tsv', sep='\t')

posts.head()

Unnamed: 0,id,subreddit,title,selftext
0,6d8knd,talesfromtechsupport,Remember your command line switches...,"Hi there, <lb>The usual. Long time lerker, fi..."
1,58mbft,teenmom,"So what was Matt ""addicted"" to?",Did he ever say what his addiction was or is h...
2,8f73s7,Harley,No Club Colors,Funny story. I went to college in Las Vegas. T...
3,6ti6re,ringdoorbell,"Not door bell, but floodlight mount height.",I know this is a sub for the 'Ring Doorbell' b...
4,77sxto,intel,Worried about my 8700k small fft/data stress r...,"Prime95 (regardless of version) and OCCT both,..."


In [4]:
# Subreddit info
subreddit_info = pd.read_csv('data/subreddit_info.csv').set_index(['subreddit'])

subreddit_info.head()

Unnamed: 0_level_0,category_1,category_2,category_3,in_data,reason_for_exclusion
subreddit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
whatsthatbook,advice/question,book,,True,
CasualConversation,advice/question,broad,,False,too_broad
Clairvoyantreadings,advice/question,broad,,False,too_broad
DecidingToBeBetter,advice/question,broad,,False,too_broad
HelpMeFind,advice/question,broad,,False,too_broad


In [5]:
# Join
df = posts.join(subreddit_info, on='subreddit')

df.head()

Unnamed: 0,id,subreddit,title,selftext,category_1,category_2,category_3,in_data,reason_for_exclusion
0,6d8knd,talesfromtechsupport,Remember your command line switches...,"Hi there, <lb>The usual. Long time lerker, fi...",writing/stories,tech support,,True,
1,58mbft,teenmom,"So what was Matt ""addicted"" to?",Did he ever say what his addiction was or is h...,tv_show,teen mom,,True,
2,8f73s7,Harley,No Club Colors,Funny story. I went to college in Las Vegas. T...,autos,harley davidson,,True,
3,6ti6re,ringdoorbell,"Not door bell, but floodlight mount height.",I know this is a sub for the 'Ring Doorbell' b...,hardware/tools,doorbells,,True,
4,77sxto,intel,Worried about my 8700k small fft/data stress r...,"Prime95 (regardless of version) and OCCT both,...",electronics,cpu,intel,True,


In [6]:
# See some category 1s that aren't *too* big
df.groupby('category_1')['id'].count().reset_index().sort_values('id', ascending=False).head(50).tail(20)

Unnamed: 0,category_1,id
5,autos,20000
0,advice/question,18000
1,animals,17000
12,education,17000
31,social_group,16000
25,politics/viewpoint,16000
15,food/drink,15000
8,card_game,15000
34,stem,14000
17,hardware/tools,14000


In [7]:
# Filter to books
df = df[df.category_1 == 'books']

# Reset index for easier subsetting/sampling
df.reset_index(inplace=True)
df.drop('index', axis=1, inplace=True)

# If desired, filter to top 1k for faster testing
if False:
    df = df.head(1000)

df.head()

Unnamed: 0,id,subreddit,title,selftext,category_1,category_2,category_3,in_data,reason_for_exclusion
0,8dasa5,harrypotter,Better idea for Fantastic beast adaptation?,I fell asleep watching the Fantastic Beast mov...,books,harry potter,,True,
1,6gygi1,dresdenfiles,(No Spoilers) Preview of Peace Talks,Could someone please tell me how many chapters...,books,dresden files,,True,
2,5xpvak,Parahumans,Looking for 1-2 players for Worm Campaign,Like the title says im looking for 1-2 more pl...,books,parahumans,,True,
3,7p3vuy,Malazan,Just finished TtH,"I just finished Toll the Hounds, and I dont kn...",books,malazan,,True,
4,5vkojs,Stormlight_Archive,[TWoK]/[WoR]/[NO SPOILERS] Something that has ...,"Here on earth, green/blue eyes are not a domin...",books,cosmere,,True,


# Create a basic spaCy pipeline

In [8]:
# Get all text in column
df['text'] = df['title'] + ' ' + df['selftext']

df.head()

Unnamed: 0,id,subreddit,title,selftext,category_1,category_2,category_3,in_data,reason_for_exclusion,text
0,8dasa5,harrypotter,Better idea for Fantastic beast adaptation?,I fell asleep watching the Fantastic Beast mov...,books,harry potter,,True,,Better idea for Fantastic beast adaptation? I ...
1,6gygi1,dresdenfiles,(No Spoilers) Preview of Peace Talks,Could someone please tell me how many chapters...,books,dresden files,,True,,(No Spoilers) Preview of Peace Talks Could som...
2,5xpvak,Parahumans,Looking for 1-2 players for Worm Campaign,Like the title says im looking for 1-2 more pl...,books,parahumans,,True,,Looking for 1-2 players for Worm Campaign Like...
3,7p3vuy,Malazan,Just finished TtH,"I just finished Toll the Hounds, and I dont kn...",books,malazan,,True,,Just finished TtH I just finished Toll the Hou...
4,5vkojs,Stormlight_Archive,[TWoK]/[WoR]/[NO SPOILERS] Something that has ...,"Here on earth, green/blue eyes are not a domin...",books,cosmere,,True,,[TWoK]/[WoR]/[NO SPOILERS] Something that has ...


In [9]:
# Initialize a pipeline
nlp = spacy.load('en_core_web_sm')

nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7fa0cd9031c0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7fa0cd903340>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7fa0cda33a00>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7fa0cd5a3b40>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7fa0cd4bcb00>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7fa0cda33c30>)]

In [10]:
# Create function to process
def process_text(text):
    # Remove html tags like <lb>
    text = re.sub('<[^<>]*>', '', text)
    
    # Mask emails and phone numbers
    text = preprocessing.replace.emails(text)
    text = preprocessing.replace.phone_numbers(text)
    
    # NLP it
    doc = nlp(text)
    
    return doc

In [11]:
# Function for displayign results
def display_nlp(doc):
    rows = []
    for i, t in enumerate(doc):
        if not t.is_punct:
            row = {'token': i, 'text': t.text, 'lemma_': t.lemma_,
                'is_stop': t.is_stop, 'is_alpha': t.is_alpha, 'pos_': t.pos_,
                'dep_': t.dep_, 'ent_type_': t.ent_type_, 'ent_iob_': t.ent_iob_, 
                'ent_id': t.ent_id_, 'like_email_': t.like_email
            }
            rows.append(row)
    df = pd.DataFrame(rows).set_index('token')
    df.index.name = None
    return df

# Process a single record

In [12]:
# One record
doc = process_text(df.text[0])

doc

Better idea for Fantastic beast adaptation? I fell asleep watching the Fantastic Beast movie so I’m guessing it wasn’t great.  I feel like it would be really well adapted into a TV show (not much knowledge on media rights).  Each chapter would be an episode and use different characters to show the discovery or a crazy moment involving the creature.  

In [13]:
pd.set_option('display.max_rows', None)

display_nlp(doc)

Unnamed: 0,text,lemma_,is_stop,is_alpha,pos_,dep_,ent_type_,ent_iob_,ent_id,like_email_
0,Better,well,False,True,ADJ,amod,,O,,False
1,idea,idea,False,True,NOUN,ROOT,,O,,False
2,for,for,True,True,ADP,prep,,O,,False
3,Fantastic,fantastic,False,True,ADJ,amod,NORP,B,,False
4,beast,beast,False,True,NOUN,compound,,O,,False
5,adaptation,adaptation,False,True,NOUN,pobj,,O,,False
7,I,I,True,True,PRON,nsubj,,O,,False
8,fell,fall,False,True,VERB,ROOT,,O,,False
9,asleep,asleep,False,True,ADJ,advmod,,O,,False
10,watching,watch,False,True,VERB,advcl,,O,,False


In [14]:
pd.set_option('display.max_rows', 60)

# Process all records in a dataframe

### First using tqdm's progress_apply

In [15]:
df['doc_apply'] = df['text'].progress_apply(process_text)

  0%|          | 0/12000 [00:00<?, ?it/s]

In [16]:
# Inspect
df.head()

Unnamed: 0,id,subreddit,title,selftext,category_1,category_2,category_3,in_data,reason_for_exclusion,text,doc_apply
0,8dasa5,harrypotter,Better idea for Fantastic beast adaptation?,I fell asleep watching the Fantastic Beast mov...,books,harry potter,,True,,Better idea for Fantastic beast adaptation? I ...,"(Better, idea, for, Fantastic, beast, adaptati..."
1,6gygi1,dresdenfiles,(No Spoilers) Preview of Peace Talks,Could someone please tell me how many chapters...,books,dresden files,,True,,(No Spoilers) Preview of Peace Talks Could som...,"((, No, Spoilers, ), Preview, of, Peace, Talks..."
2,5xpvak,Parahumans,Looking for 1-2 players for Worm Campaign,Like the title says im looking for 1-2 more pl...,books,parahumans,,True,,Looking for 1-2 players for Worm Campaign Like...,"(Looking, for, 1, -, 2, players, for, Worm, Ca..."
3,7p3vuy,Malazan,Just finished TtH,"I just finished Toll the Hounds, and I dont kn...",books,malazan,,True,,Just finished TtH I just finished Toll the Hou...,"(Just, finished, TtH, I, just, finished, Toll,..."
4,5vkojs,Stormlight_Archive,[TWoK]/[WoR]/[NO SPOILERS] Something that has ...,"Here on earth, green/blue eyes are not a domin...",books,cosmere,,True,,[TWoK]/[WoR]/[NO SPOILERS] Something that has ...,"([, TWoK]/[WoR]/[NO, SPOILERS, ], Something, t..."


In [17]:
# Visualize the tokens for one of them
display_nlp(df.loc[1, 'doc_apply']).head(50)

Unnamed: 0,text,lemma_,is_stop,is_alpha,pos_,dep_,ent_type_,ent_iob_,ent_id,like_email_
1,No,No,True,True,PROPN,compound,,O,,False
2,Spoilers,Spoilers,False,True,PROPN,nmod,,O,,False
4,Preview,Preview,False,True,PROPN,nsubj,,O,,False
5,of,of,True,True,ADP,prep,,O,,False
6,Peace,Peace,False,True,PROPN,compound,,O,,False
7,Talks,Talks,False,True,PROPN,pobj,,O,,False
8,Could,could,True,True,AUX,aux,,O,,False
9,someone,someone,True,True,PRON,nsubj,,O,,False
10,please,please,True,True,INTJ,intj,,O,,False
11,tell,tell,False,True,VERB,ROOT,,O,,False


### Second, using spacy's batch functionality

In [18]:
# Function for doing the textacy preprocessing
def preprocess_text(text):
    # Remove html tags like <lb>
    text = re.sub('<[^<>]*>', '', text)
    
    # Mask emails and phone numbers
    text = preprocessing.replace.emails(text)
    text = preprocessing.replace.phone_numbers(text)
    
    return text

In [19]:
df['doc_spacy_preprocess'] = df['text'].progress_apply(preprocess_text)

  0%|          | 0/12000 [00:00<?, ?it/s]

In [20]:
# Inspect
df.head()

Unnamed: 0,id,subreddit,title,selftext,category_1,category_2,category_3,in_data,reason_for_exclusion,text,doc_apply,doc_spacy_preprocess
0,8dasa5,harrypotter,Better idea for Fantastic beast adaptation?,I fell asleep watching the Fantastic Beast mov...,books,harry potter,,True,,Better idea for Fantastic beast adaptation? I ...,"(Better, idea, for, Fantastic, beast, adaptati...",Better idea for Fantastic beast adaptation? I ...
1,6gygi1,dresdenfiles,(No Spoilers) Preview of Peace Talks,Could someone please tell me how many chapters...,books,dresden files,,True,,(No Spoilers) Preview of Peace Talks Could som...,"((, No, Spoilers, ), Preview, of, Peace, Talks...",(No Spoilers) Preview of Peace Talks Could som...
2,5xpvak,Parahumans,Looking for 1-2 players for Worm Campaign,Like the title says im looking for 1-2 more pl...,books,parahumans,,True,,Looking for 1-2 players for Worm Campaign Like...,"(Looking, for, 1, -, 2, players, for, Worm, Ca...",Looking for 1-2 players for Worm Campaign Like...
3,7p3vuy,Malazan,Just finished TtH,"I just finished Toll the Hounds, and I dont kn...",books,malazan,,True,,Just finished TtH I just finished Toll the Hou...,"(Just, finished, TtH, I, just, finished, Toll,...",Just finished TtH I just finished Toll the Hou...
4,5vkojs,Stormlight_Archive,[TWoK]/[WoR]/[NO SPOILERS] Something that has ...,"Here on earth, green/blue eyes are not a domin...",books,cosmere,,True,,[TWoK]/[WoR]/[NO SPOILERS] Something that has ...,"([, TWoK]/[WoR]/[NO, SPOILERS, ], Something, t...",[TWoK]/[WoR]/[NO SPOILERS] Something that has ...


In [21]:
def process_spacy_batch(nlp, df, input_col, output_col, batch_size=50):
    df[output_col] = None
    
    docs = list()

    for i in tqdm(range(0, len(df), batch_size)):
        loop_docs = nlp.pipe(df[input_col][i:i+batch_size])
        docs += list(loop_docs)

    df[output_col] = docs

In [22]:
process_spacy_batch(nlp, df, 'doc_spacy_preprocess', 'doc_spacy')

  0%|          | 0/240 [00:00<?, ?it/s]

Looks a bit faster, ~33% (roughly 20s versus roughly 30s for 1k rows).  Let's functionize it.

# Extract certain pieces of info from the docs

### First, let's extract all nouns and do a counter

In [23]:
# Function for extract nouns
def extract_nouns(doc):
    patterns = ["POS:NOUN"]
    spans = extract.matches.token_matches(doc, patterns=patterns)
    return [s.lemma_ for s in spans]

In [24]:
# Test on one row
extract_nouns(df.loc[1, 'doc_spacy'])

['chapter',
 'teaser',
 'chapter',
 'atleast',
 'one',
 'other',
 'link',
 'copyright',
 'infringement']

In [25]:
# Apply to all
df['nouns'] = df['doc_spacy'].progress_apply(extract_nouns)

  0%|          | 0/12000 [00:00<?, ?it/s]

In [26]:
df.head(10)

Unnamed: 0,id,subreddit,title,selftext,category_1,category_2,category_3,in_data,reason_for_exclusion,text,doc_apply,doc_spacy_preprocess,doc_spacy,nouns
0,8dasa5,harrypotter,Better idea for Fantastic beast adaptation?,I fell asleep watching the Fantastic Beast mov...,books,harry potter,,True,,Better idea for Fantastic beast adaptation? I ...,"(Better, idea, for, Fantastic, beast, adaptati...",Better idea for Fantastic beast adaptation? I ...,"(Better, idea, for, Fantastic, beast, adaptati...","[idea, beast, adaptation, movie, tv, show, kno..."
1,6gygi1,dresdenfiles,(No Spoilers) Preview of Peace Talks,Could someone please tell me how many chapters...,books,dresden files,,True,,(No Spoilers) Preview of Peace Talks Could som...,"((, No, Spoilers, ), Preview, of, Peace, Talks...",(No Spoilers) Preview of Peace Talks Could som...,"((, No, Spoilers, ), Preview, of, Peace, Talks...","[chapter, teaser, chapter, atleast, one, other..."
2,5xpvak,Parahumans,Looking for 1-2 players for Worm Campaign,Like the title says im looking for 1-2 more pl...,books,parahumans,,True,,Looking for 1-2 players for Worm Campaign Like...,"(Looking, for, 1, -, 2, players, for, Worm, Ca...",Looking for 1-2 players for Worm Campaign Like...,"(Looking, for, 1, -, 2, players, for, Worm, Ca...","[player, title, player, campaign, system, fill..."
3,7p3vuy,Malazan,Just finished TtH,"I just finished Toll the Hounds, and I dont kn...",books,malazan,,True,,Just finished TtH I just finished Toll the Hou...,"(Just, finished, TtH, I, just, finished, Toll,...",Just finished TtH I just finished Toll the Hou...,"(Just, finished, TtH, I, just, finished, Toll,...","[TtH, book, serie, favorite, writing, thing, b..."
4,5vkojs,Stormlight_Archive,[TWoK]/[WoR]/[NO SPOILERS] Something that has ...,"Here on earth, green/blue eyes are not a domin...",books,cosmere,,True,,[TWoK]/[WoR]/[NO SPOILERS] Something that has ...,"([, TWoK]/[WoR]/[NO, SPOILERS, ], Something, t...",[TWoK]/[WoR]/[NO SPOILERS] Something that has ...,"([, TWoK]/[WoR]/[NO, SPOILERS, ], Something, t...","[TWoK]/[WoR]/[NO, spoiler, spoiler, earth, eye..."
5,6vdtnb,Malazan,"I'm newish to the series, and this sub","I just wanted to say hi, and that I really lik...",books,malazan,,True,,"I'm newish to the series, and this sub I just ...","(I, 'm, newish, to, the, series, ,, and, this,...","I'm newish to the series, and this sub I just ...","(I, 'm, newish, to, the, series, ,, and, this,...","[series, sub, place, community, thing, series,..."
6,67blaq,dresdenfiles,So do you guys think the masquerade is going t...,It seems less and less plausible that humanity...,books,dresden files,,True,,So do you guys think the masquerade is going t...,"(So, do, you, guys, think, the, masquerade, is...",So do you guys think the masquerade is going t...,"(So, do, you, guys, think, the, masquerade, is...","[guy, masquerade, series, end, humanity, dark,..."
7,7uhfal,dresdenfiles,First person narrative...,So I've had a dilemma lately. I love the physi...,books,dresden files,,True,,First person narrative... So I've had a dilemm...,"(First, person, narrative, ..., So, I, 've, ha...",First person narrative... So I've had a dilemm...,"(First, person, narrative, ..., So, I, 've, ha...","[person, narrative, dilemma, act, handwriting,..."
8,5gpg4i,harrypotter,Were Dementors once human? Theory,"To give a bit of backstory, Dementors were fir...",books,harry potter,,True,,Were Dementors once human? Theory To give a bi...,"(Were, Dementors, once, human, ?, Theory, To, ...",Were Dementors once human? Theory To give a bi...,"(Were, Dementors, once, human, ?, Theory, To, ...","[theory, bit, backstory, dementor, unveiling, ..."
9,7ca4d7,eroticauthors,Does anybody have any experience writing espio...,"My latest short was a hardcore, mind control l...",books,erotic fiction,,True,,Does anybody have any experience writing espio...,"(Does, anybody, have, any, experience, writing...",Does anybody have any experience writing espio...,"(Does, anybody, have, any, experience, writing...","[experience, espionage, spy, erotica, mind, st..."


In [27]:
# Do a noun counter
noun_counter = Counter()

df['nouns'].progress_map(noun_counter.update)

  0%|          | 0/12000 [00:00<?, ?it/s]

0        None
1        None
2        None
3        None
4        None
         ... 
11995    None
11996    None
11997    None
11998    None
11999    None
Name: nouns, Length: 12000, dtype: object

In [28]:
# Top 20 nouns
noun_counter.most_common(20)

[('book', 12037),
 ('time', 5065),
 ('story', 4182),
 ('series', 3922),
 ('thing', 3729),
 ('character', 3263),
 ('people', 2812),
 ('spoiler', 2795),
 ('power', 2751),
 ('way', 2735),
 ('movie', 2428),
 ('question', 2277),
 ('world', 2128),
 ('year', 1949),
 ('name', 1905),
 ('part', 1748),
 ('lot', 1745),
 ('day', 1629),
 ('chapter', 1606),
 ('one', 1570)]

### Second, let's extract all noun-verb sequences and do a counter

In [29]:
# Function for extract nouns
def extract_noun_verbs(doc):
    patterns = ["POS:NOUN:+ POS:VERB:+", "POS:VERB:+ POS:NOUN:+"]
    spans = extract.matches.token_matches(doc, patterns=patterns)
    return ['_'.join([w.lemma_ for w in s]) for s in spans]

In [30]:
# Test on one row
extract_noun_verbs(df.loc[0, 'doc_spacy'])

['moment_involve']

In [31]:
# Apply to all
df['noun_verbs'] = df['doc_spacy'].progress_apply(extract_noun_verbs)

  0%|          | 0/12000 [00:00<?, ?it/s]

In [32]:
# Do a noun counter
noun_verbs_counter = Counter()

df['noun_verbs'].progress_map(noun_verbs_counter.update)

  0%|          | 0/12000 [00:00<?, ?it/s]

0        None
1        None
2        None
3        None
4        None
         ... 
11995    None
11996    None
11997    None
11998    None
11999    None
Name: noun_verbs, Length: 12000, dtype: object

In [33]:
# Top 20 nouns
noun_verbs_counter.most_common(20)

[('make_sense', 311),
 ('take_place', 202),
 ('guy_think', 186),
 ('page_read', 105),
 ('have_trouble', 99),
 ('read_book', 90),
 ('re_-_read', 82),
 ('re_-', 79),
 ('need_help', 75),
 ('title_say', 70),
 ('people_think', 65),
 ('write_erotica', 64),
 ('book_have', 59),
 ('wizarde_world', 54),
 ('have_access', 49),
 ('question_regard', 47),
 ('do_thing', 43),
 ('-_read', 43),
 ('have_time', 43),
 ('people_say', 42)]

In [34]:
##https://stackoverflow.com/questions/69181078/spacy-how-do-you-add-custom-ner-labels-to-a-pre-trained-model
#
#import spacy
#import random
#from spacy import util
#from spacy.tokens import Doc
#from spacy.training import Example
#from spacy.language import Language
#
#def print_doc_entities(_doc: Doc):
#    if _doc.ents:
#        for _ent in _doc.ents:
#            print(f"     {_ent.text} {_ent.label_}")
#    else:
#        print("     NONE")
#
#def customizing_pipeline_component(nlp: Language):
#    # NOTE: Starting from Spacy 3.0, training via Python API was changed. For information see - https://spacy.io/usage/v3#migrating-training-python
#    train_data = [
#        ('We need to deliver it to Festy.', [(25, 30, 'DISTRICT')]),
#        ('I like red oranges', [])
#    ]
#
#    # Result before training
#    print(f"\nResult BEFORE training:")
#    doc = nlp(u'I need a taxi to Festy.')
#    print_doc_entities(doc)
#
#    # Disable all pipe components except 'ner'
#    disabled_pipes = []
#    for pipe_name in nlp.pipe_names:
#        if pipe_name != 'ner':
#            nlp.disable_pipes(pipe_name)
#            disabled_pipes.append(pipe_name)
#
#    print("   Training ...")
#    optimizer = nlp.create_optimizer()
#    for _ in range(25):
#        random.shuffle(train_data)
#        for raw_text, entity_offsets in train_data:
#            doc = nlp.make_doc(raw_text)
#            example = Example.from_dict(doc, {"entities": entity_offsets})
#            nlp.update([example], sgd=optimizer)
#
#    # Enable all previously disabled pipe components
#    for pipe_name in disabled_pipes:
#        nlp.enable_pipe(pipe_name)
#
#    # Result after training
#    print(f"Result AFTER training:")
#    doc = nlp(u'I need a taxi to Festy.')
#    print_doc_entities(doc)
#
#nlp = spacy.load('en_core_web_sm')
#customizing_pipeline_component(nlp)