An example spaCy NLP pipeline on [some Reddit post data](https://www.kaggle.com/datasets/mswarbrickjones/reddit-selfposts).

Download the data and extract into a `/data` folder.

In [1]:
import spacy
from textacy import preprocessing, extract
import pandas as pd
import re
from collections import Counter

In [2]:
from tqdm import tqdm_notebook
from tqdm.autonotebook import tqdm

# Register for Pandas functions
tqdm.pandas()

  from tqdm.autonotebook import tqdm


##### Download en_core_web_sm
python -m spacy download en_core_web_sm

# Read in sample

In [4]:
# Post info
posts = pd.read_csv('data/rspct.tsv', sep='\t')

posts.head()

Unnamed: 0,id,subreddit,title,selftext
0,6d8knd,talesfromtechsupport,Remember your command line switches...,"Hi there, <lb>The usual. Long time lerker, fi..."
1,58mbft,teenmom,"So what was Matt ""addicted"" to?",Did he ever say what his addiction was or is h...
2,8f73s7,Harley,No Club Colors,Funny story. I went to college in Las Vegas. T...
3,6ti6re,ringdoorbell,"Not door bell, but floodlight mount height.",I know this is a sub for the 'Ring Doorbell' b...
4,77sxto,intel,Worried about my 8700k small fft/data stress r...,"Prime95 (regardless of version) and OCCT both,..."


In [5]:
# Subreddit info
subreddit_info = pd.read_csv('data/subreddit_info.csv').set_index(['subreddit'])

subreddit_info.head()

Unnamed: 0_level_0,category_1,category_2,category_3,in_data,reason_for_exclusion
subreddit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
whatsthatbook,advice/question,book,,True,
CasualConversation,advice/question,broad,,False,too_broad
Clairvoyantreadings,advice/question,broad,,False,too_broad
DecidingToBeBetter,advice/question,broad,,False,too_broad
HelpMeFind,advice/question,broad,,False,too_broad


In [55]:
# Join
df = posts.join(subreddit_info, on='subreddit')

df.head()

Unnamed: 0,id,subreddit,title,selftext,category_1,category_2,category_3,in_data,reason_for_exclusion
0,6d8knd,talesfromtechsupport,Remember your command line switches...,"Hi there, <lb>The usual. Long time lerker, fi...",writing/stories,tech support,,True,
1,58mbft,teenmom,"So what was Matt ""addicted"" to?",Did he ever say what his addiction was or is h...,tv_show,teen mom,,True,
2,8f73s7,Harley,No Club Colors,Funny story. I went to college in Las Vegas. T...,autos,harley davidson,,True,
3,6ti6re,ringdoorbell,"Not door bell, but floodlight mount height.",I know this is a sub for the 'Ring Doorbell' b...,hardware/tools,doorbells,,True,
4,77sxto,intel,Worried about my 8700k small fft/data stress r...,"Prime95 (regardless of version) and OCCT both,...",electronics,cpu,intel,True,


In [56]:
# See some category 1s that aren't *too* big
df.groupby('category_1')['id'].count().reset_index().sort_values('id', ascending=False).head(50).tail(20)

Unnamed: 0,category_1,id
5,autos,20000
0,advice/question,18000
1,animals,17000
12,education,17000
31,social_group,16000
25,politics/viewpoint,16000
15,food/drink,15000
8,card_game,15000
34,stem,14000
17,hardware/tools,14000


In [86]:
# Filter to books
df = df[df.category_1 == 'books']

# Reset index for easier subsetting/sampling
df.reset_index(inplace=True)
df.drop('index', axis=1, inplace=True)

# Filter to top 1k for faster testing
df = df.head(1000)

df.head()

Unnamed: 0,id,subreddit,title,selftext,category_1,category_2,category_3,in_data,reason_for_exclusion,text,doc_apply,doc_spacy_preprocess,doc_spacy
0,8dasa5,harrypotter,Better idea for Fantastic beast adaptation?,I fell asleep watching the Fantastic Beast mov...,books,harry potter,,True,,Better idea for Fantastic beast adaptation? I ...,"(Better, idea, for, Fantastic, beast, adaptati...",Better idea for Fantastic beast adaptation? I ...,"(Better, idea, for, Fantastic, beast, adaptati..."
1,6gygi1,dresdenfiles,(No Spoilers) Preview of Peace Talks,Could someone please tell me how many chapters...,books,dresden files,,True,,(No Spoilers) Preview of Peace Talks Could som...,"((, No, Spoilers, ), Preview, of, Peace, Talks...",(No Spoilers) Preview of Peace Talks Could som...,"((, No, Spoilers, ), Preview, of, Peace, Talks..."
2,5xpvak,Parahumans,Looking for 1-2 players for Worm Campaign,Like the title says im looking for 1-2 more pl...,books,parahumans,,True,,Looking for 1-2 players for Worm Campaign Like...,"(Looking, for, 1, -, 2, players, for, Worm, Ca...",Looking for 1-2 players for Worm Campaign Like...,"(Looking, for, 1, -, 2, players, for, Worm, Ca..."
3,7p3vuy,Malazan,Just finished TtH,"I just finished Toll the Hounds, and I dont kn...",books,malazan,,True,,Just finished TtH I just finished Toll the Hou...,"(Just, finished, TtH, I, just, finished, Toll,...",Just finished TtH I just finished Toll the Hou...,"(Just, finished, TtH, I, just, finished, Toll,..."
4,5vkojs,Stormlight_Archive,[TWoK]/[WoR]/[NO SPOILERS] Something that has ...,"Here on earth, green/blue eyes are not a domin...",books,cosmere,,True,,[TWoK]/[WoR]/[NO SPOILERS] Something that has ...,"([, TWoK]/[WoR]/[NO, SPOILERS, ], Something, t...",[TWoK]/[WoR]/[NO SPOILERS] Something that has ...,"([, TWoK]/[WoR]/[NO, SPOILERS, ], Something, t..."


# Create a basic spaCy pipeline

In [87]:
# Get all text in column
df['text'] = df['title'] + ' ' + df['selftext']

df.head()

Unnamed: 0,id,subreddit,title,selftext,category_1,category_2,category_3,in_data,reason_for_exclusion,text,doc_apply,doc_spacy_preprocess,doc_spacy
0,8dasa5,harrypotter,Better idea for Fantastic beast adaptation?,I fell asleep watching the Fantastic Beast mov...,books,harry potter,,True,,Better idea for Fantastic beast adaptation? I ...,"(Better, idea, for, Fantastic, beast, adaptati...",Better idea for Fantastic beast adaptation? I ...,"(Better, idea, for, Fantastic, beast, adaptati..."
1,6gygi1,dresdenfiles,(No Spoilers) Preview of Peace Talks,Could someone please tell me how many chapters...,books,dresden files,,True,,(No Spoilers) Preview of Peace Talks Could som...,"((, No, Spoilers, ), Preview, of, Peace, Talks...",(No Spoilers) Preview of Peace Talks Could som...,"((, No, Spoilers, ), Preview, of, Peace, Talks..."
2,5xpvak,Parahumans,Looking for 1-2 players for Worm Campaign,Like the title says im looking for 1-2 more pl...,books,parahumans,,True,,Looking for 1-2 players for Worm Campaign Like...,"(Looking, for, 1, -, 2, players, for, Worm, Ca...",Looking for 1-2 players for Worm Campaign Like...,"(Looking, for, 1, -, 2, players, for, Worm, Ca..."
3,7p3vuy,Malazan,Just finished TtH,"I just finished Toll the Hounds, and I dont kn...",books,malazan,,True,,Just finished TtH I just finished Toll the Hou...,"(Just, finished, TtH, I, just, finished, Toll,...",Just finished TtH I just finished Toll the Hou...,"(Just, finished, TtH, I, just, finished, Toll,..."
4,5vkojs,Stormlight_Archive,[TWoK]/[WoR]/[NO SPOILERS] Something that has ...,"Here on earth, green/blue eyes are not a domin...",books,cosmere,,True,,[TWoK]/[WoR]/[NO SPOILERS] Something that has ...,"([, TWoK]/[WoR]/[NO, SPOILERS, ], Something, t...",[TWoK]/[WoR]/[NO SPOILERS] Something that has ...,"([, TWoK]/[WoR]/[NO, SPOILERS, ], Something, t..."


In [88]:
# Initialize a pipeline
nlp = spacy.load('en_core_web_sm')

nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7f875938a140>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7f87591b5b40>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7f87597eb300>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7f8759100540>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7f8759140f40>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7f87597ebca0>)]

In [89]:
# Create function to process
def process_text(text):
    # Remove html tags like <lb>
    text = re.sub('<[^<>]*>', '', text)
    
    # Mask emails and phone numbers
    text = preprocessing.replace.emails(text)
    text = preprocessing.replace.phone_numbers(text)
    
    # NLP it
    doc = nlp(text)
    
    return doc

In [90]:
# Function for displayign results
def display_nlp(doc):
    rows = []
    for i, t in enumerate(doc):
        if not t.is_punct:
            row = {'token': i, 'text': t.text, 'lemma_': t.lemma_,
                'is_stop': t.is_stop, 'is_alpha': t.is_alpha, 'pos_': t.pos_,
                'dep_': t.dep_, 'ent_type_': t.ent_type_, 'ent_iob_': t.ent_iob_, 
                'ent_id': t.ent_id_, 'like_email_': t.like_email
            }
            rows.append(row)
    df = pd.DataFrame(rows).set_index('token')
    df.index.name = None
    return df

# Process a single record

In [91]:
# One record
doc = process_text(df.text[0])

doc

Better idea for Fantastic beast adaptation? I fell asleep watching the Fantastic Beast movie so I’m guessing it wasn’t great.  I feel like it would be really well adapted into a TV show (not much knowledge on media rights).  Each chapter would be an episode and use different characters to show the discovery or a crazy moment involving the creature.  

In [92]:
pd.set_option('display.max_rows', None)

display_nlp(doc)

Unnamed: 0,text,lemma_,is_stop,is_alpha,pos_,dep_,ent_type_,ent_iob_,ent_id,like_email_
0,Better,well,False,True,ADJ,amod,,O,,False
1,idea,idea,False,True,NOUN,ROOT,,O,,False
2,for,for,True,True,ADP,prep,,O,,False
3,Fantastic,fantastic,False,True,ADJ,amod,NORP,B,,False
4,beast,beast,False,True,NOUN,compound,,O,,False
5,adaptation,adaptation,False,True,NOUN,pobj,,O,,False
7,I,I,True,True,PRON,nsubj,,O,,False
8,fell,fall,False,True,VERB,ROOT,,O,,False
9,asleep,asleep,False,True,ADJ,advmod,,O,,False
10,watching,watch,False,True,VERB,advcl,,O,,False


In [93]:
pd.set_option('display.max_rows', 60)

# Process all records in a dataframe

### First using tqdm's progress_apply

In [94]:
df['doc_apply'] = df['text'].progress_apply(process_text)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [95]:
# Inspect
df.head()

Unnamed: 0,id,subreddit,title,selftext,category_1,category_2,category_3,in_data,reason_for_exclusion,text,doc_apply,doc_spacy_preprocess,doc_spacy
0,8dasa5,harrypotter,Better idea for Fantastic beast adaptation?,I fell asleep watching the Fantastic Beast mov...,books,harry potter,,True,,Better idea for Fantastic beast adaptation? I ...,"(Better, idea, for, Fantastic, beast, adaptati...",Better idea for Fantastic beast adaptation? I ...,"(Better, idea, for, Fantastic, beast, adaptati..."
1,6gygi1,dresdenfiles,(No Spoilers) Preview of Peace Talks,Could someone please tell me how many chapters...,books,dresden files,,True,,(No Spoilers) Preview of Peace Talks Could som...,"((, No, Spoilers, ), Preview, of, Peace, Talks...",(No Spoilers) Preview of Peace Talks Could som...,"((, No, Spoilers, ), Preview, of, Peace, Talks..."
2,5xpvak,Parahumans,Looking for 1-2 players for Worm Campaign,Like the title says im looking for 1-2 more pl...,books,parahumans,,True,,Looking for 1-2 players for Worm Campaign Like...,"(Looking, for, 1, -, 2, players, for, Worm, Ca...",Looking for 1-2 players for Worm Campaign Like...,"(Looking, for, 1, -, 2, players, for, Worm, Ca..."
3,7p3vuy,Malazan,Just finished TtH,"I just finished Toll the Hounds, and I dont kn...",books,malazan,,True,,Just finished TtH I just finished Toll the Hou...,"(Just, finished, TtH, I, just, finished, Toll,...",Just finished TtH I just finished Toll the Hou...,"(Just, finished, TtH, I, just, finished, Toll,..."
4,5vkojs,Stormlight_Archive,[TWoK]/[WoR]/[NO SPOILERS] Something that has ...,"Here on earth, green/blue eyes are not a domin...",books,cosmere,,True,,[TWoK]/[WoR]/[NO SPOILERS] Something that has ...,"([, TWoK]/[WoR]/[NO, SPOILERS, ], Something, t...",[TWoK]/[WoR]/[NO SPOILERS] Something that has ...,"([, TWoK]/[WoR]/[NO, SPOILERS, ], Something, t..."


In [97]:
# Visualize the tokens for one of them
display_nlp(df.loc[1, 'doc_apply']).head(50)

Unnamed: 0,text,lemma_,is_stop,is_alpha,pos_,dep_,ent_type_,ent_iob_,ent_id,like_email_
1,No,No,True,True,PROPN,compound,,O,,False
2,Spoilers,Spoilers,False,True,PROPN,nmod,,O,,False
4,Preview,Preview,False,True,PROPN,nsubj,,O,,False
5,of,of,True,True,ADP,prep,,O,,False
6,Peace,Peace,False,True,PROPN,compound,,O,,False
7,Talks,Talks,False,True,PROPN,pobj,,O,,False
8,Could,could,True,True,AUX,aux,,O,,False
9,someone,someone,True,True,PRON,nsubj,,O,,False
10,please,please,True,True,INTJ,intj,,O,,False
11,tell,tell,False,True,VERB,ROOT,,O,,False


### Second, using spacy's batch functionality

In [98]:
# Function for doing the textacy preprocessing
def preprocess_text(text):
    # Remove html tags like <lb>
    text = re.sub('<[^<>]*>', '', text)
    
    # Mask emails and phone numbers
    text = preprocessing.replace.emails(text)
    text = preprocessing.replace.phone_numbers(text)
    
    return text

In [99]:
df['doc_spacy_preprocess'] = df['text'].progress_apply(preprocess_text)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [100]:
# Inspect
df.head()

Unnamed: 0,id,subreddit,title,selftext,category_1,category_2,category_3,in_data,reason_for_exclusion,text,doc_apply,doc_spacy_preprocess,doc_spacy
0,8dasa5,harrypotter,Better idea for Fantastic beast adaptation?,I fell asleep watching the Fantastic Beast mov...,books,harry potter,,True,,Better idea for Fantastic beast adaptation? I ...,"(Better, idea, for, Fantastic, beast, adaptati...",Better idea for Fantastic beast adaptation? I ...,"(Better, idea, for, Fantastic, beast, adaptati..."
1,6gygi1,dresdenfiles,(No Spoilers) Preview of Peace Talks,Could someone please tell me how many chapters...,books,dresden files,,True,,(No Spoilers) Preview of Peace Talks Could som...,"((, No, Spoilers, ), Preview, of, Peace, Talks...",(No Spoilers) Preview of Peace Talks Could som...,"((, No, Spoilers, ), Preview, of, Peace, Talks..."
2,5xpvak,Parahumans,Looking for 1-2 players for Worm Campaign,Like the title says im looking for 1-2 more pl...,books,parahumans,,True,,Looking for 1-2 players for Worm Campaign Like...,"(Looking, for, 1, -, 2, players, for, Worm, Ca...",Looking for 1-2 players for Worm Campaign Like...,"(Looking, for, 1, -, 2, players, for, Worm, Ca..."
3,7p3vuy,Malazan,Just finished TtH,"I just finished Toll the Hounds, and I dont kn...",books,malazan,,True,,Just finished TtH I just finished Toll the Hou...,"(Just, finished, TtH, I, just, finished, Toll,...",Just finished TtH I just finished Toll the Hou...,"(Just, finished, TtH, I, just, finished, Toll,..."
4,5vkojs,Stormlight_Archive,[TWoK]/[WoR]/[NO SPOILERS] Something that has ...,"Here on earth, green/blue eyes are not a domin...",books,cosmere,,True,,[TWoK]/[WoR]/[NO SPOILERS] Something that has ...,"([, TWoK]/[WoR]/[NO, SPOILERS, ], Something, t...",[TWoK]/[WoR]/[NO SPOILERS] Something that has ...,"([, TWoK]/[WoR]/[NO, SPOILERS, ], Something, t..."


In [113]:
def process_spacy_batch(nlp, df, input_col, output_col, batch_size=50):
    #for i in tqdm(range(0, len(posts), batch_size)):
    #    docs = nlp.pipe(df[input_col][i:i+batch_size])
    #    df[output_col][i:i+batch_size] = list(docs)
    df[output_col] = None
    
    docs = list()

    for i in tqdm(range(0, len(df), batch_size)):
        loop_docs = nlp.pipe(df[input_col][i:i+batch_size])
        docs += list(loop_docs)

    df[output_col] = docs

In [115]:
process_spacy_batch(nlp, df, 'doc_spacy_preprocess', 'doc_spacy')

  0%|          | 0/20 [00:00<?, ?it/s]

Looks a bit faster, ~33% (roughly 20s versus roughly 30s for 1k rows).  Let's functionize it.

# Extract certain pieces of info from the docs

### First, let's extract all nouns and do a counter

In [116]:
# Function for extract nouns
def extract_nouns(doc):
    patterns = ["POS:NOUN"]
    spans = extract.matches.token_matches(doc, patterns=patterns)
    return [s.lemma_ for s in spans]

In [117]:
# Test on one row
extract_nouns(df.loc[1, 'doc_spacy'])

['chapter',
 'teaser',
 'chapter',
 'atleast',
 'one',
 'other',
 'link',
 'copyright',
 'infringement']

In [118]:
# Apply to all
df['nouns'] = df['doc_spacy'].progress_apply(extract_nouns)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [119]:
df.head(10)

Unnamed: 0,id,subreddit,title,selftext,category_1,category_2,category_3,in_data,reason_for_exclusion,text,doc_apply,doc_spacy_preprocess,doc_spacy,nouns
0,8dasa5,harrypotter,Better idea for Fantastic beast adaptation?,I fell asleep watching the Fantastic Beast mov...,books,harry potter,,True,,Better idea for Fantastic beast adaptation? I ...,"(Better, idea, for, Fantastic, beast, adaptati...",Better idea for Fantastic beast adaptation? I ...,"(Better, idea, for, Fantastic, beast, adaptati...","[idea, beast, adaptation, movie, tv, show, kno..."
1,6gygi1,dresdenfiles,(No Spoilers) Preview of Peace Talks,Could someone please tell me how many chapters...,books,dresden files,,True,,(No Spoilers) Preview of Peace Talks Could som...,"((, No, Spoilers, ), Preview, of, Peace, Talks...",(No Spoilers) Preview of Peace Talks Could som...,"((, No, Spoilers, ), Preview, of, Peace, Talks...","[chapter, teaser, chapter, atleast, one, other..."
2,5xpvak,Parahumans,Looking for 1-2 players for Worm Campaign,Like the title says im looking for 1-2 more pl...,books,parahumans,,True,,Looking for 1-2 players for Worm Campaign Like...,"(Looking, for, 1, -, 2, players, for, Worm, Ca...",Looking for 1-2 players for Worm Campaign Like...,"(Looking, for, 1, -, 2, players, for, Worm, Ca...","[player, title, player, campaign, system, fill..."
3,7p3vuy,Malazan,Just finished TtH,"I just finished Toll the Hounds, and I dont kn...",books,malazan,,True,,Just finished TtH I just finished Toll the Hou...,"(Just, finished, TtH, I, just, finished, Toll,...",Just finished TtH I just finished Toll the Hou...,"(Just, finished, TtH, I, just, finished, Toll,...","[TtH, book, serie, favorite, writing, thing, b..."
4,5vkojs,Stormlight_Archive,[TWoK]/[WoR]/[NO SPOILERS] Something that has ...,"Here on earth, green/blue eyes are not a domin...",books,cosmere,,True,,[TWoK]/[WoR]/[NO SPOILERS] Something that has ...,"([, TWoK]/[WoR]/[NO, SPOILERS, ], Something, t...",[TWoK]/[WoR]/[NO SPOILERS] Something that has ...,"([, TWoK]/[WoR]/[NO, SPOILERS, ], Something, t...","[TWoK]/[WoR]/[NO, spoiler, spoiler, earth, eye..."
5,6vdtnb,Malazan,"I'm newish to the series, and this sub","I just wanted to say hi, and that I really lik...",books,malazan,,True,,"I'm newish to the series, and this sub I just ...","(I, 'm, newish, to, the, series, ,, and, this,...","I'm newish to the series, and this sub I just ...","(I, 'm, newish, to, the, series, ,, and, this,...","[series, sub, place, community, thing, series,..."
6,67blaq,dresdenfiles,So do you guys think the masquerade is going t...,It seems less and less plausible that humanity...,books,dresden files,,True,,So do you guys think the masquerade is going t...,"(So, do, you, guys, think, the, masquerade, is...",So do you guys think the masquerade is going t...,"(So, do, you, guys, think, the, masquerade, is...","[guy, masquerade, series, end, humanity, dark,..."
7,7uhfal,dresdenfiles,First person narrative...,So I've had a dilemma lately. I love the physi...,books,dresden files,,True,,First person narrative... So I've had a dilemm...,"(First, person, narrative, ..., So, I, 've, ha...",First person narrative... So I've had a dilemm...,"(First, person, narrative, ..., So, I, 've, ha...","[person, narrative, dilemma, act, handwriting,..."
8,5gpg4i,harrypotter,Were Dementors once human? Theory,"To give a bit of backstory, Dementors were fir...",books,harry potter,,True,,Were Dementors once human? Theory To give a bi...,"(Were, Dementors, once, human, ?, Theory, To, ...",Were Dementors once human? Theory To give a bi...,"(Were, Dementors, once, human, ?, Theory, To, ...","[theory, bit, backstory, dementor, unveiling, ..."
9,7ca4d7,eroticauthors,Does anybody have any experience writing espio...,"My latest short was a hardcore, mind control l...",books,erotic fiction,,True,,Does anybody have any experience writing espio...,"(Does, anybody, have, any, experience, writing...",Does anybody have any experience writing espio...,"(Does, anybody, have, any, experience, writing...","[experience, espionage, spy, erotica, mind, st..."


In [120]:
# Do a noun counter
noun_counter = Counter()

df['nouns'].progress_map(noun_counter.update)

  0%|          | 0/1000 [00:00<?, ?it/s]

0      None
1      None
2      None
3      None
4      None
       ... 
995    None
996    None
997    None
998    None
999    None
Name: nouns, Length: 1000, dtype: object

In [121]:
# Top 20 nouns
noun_counter.most_common(20)

[('book', 1085),
 ('time', 445),
 ('thing', 355),
 ('story', 355),
 ('series', 340),
 ('character', 247),
 ('world', 233),
 ('people', 231),
 ('power', 222),
 ('way', 219),
 ('spoiler', 212),
 ('question', 198),
 ('movie', 196),
 ('name', 163),
 ('year', 158),
 ('day', 148),
 ('lot', 144),
 ('point', 138),
 ('chapter', 137),
 ('idea', 136)]

### Second, let's extract all noun-verb sequences and do a counter

In [123]:
# Function for extract nouns
def extract_noun_verbs(doc):
    patterns = ["POS:NOUN:+ POS:VERB:+", "POS:VERB:+ POS:NOUN:+"]
    spans = extract.matches.token_matches(doc, patterns=patterns)
    return ['_'.join([w.lemma_ for w in s]) for s in spans]

In [125]:
# Test on one row
extract_noun_verbs(df.loc[0, 'doc_spacy'])

['moment_involve']

In [126]:
# Apply to all
df['noun_verbs'] = df['doc_spacy'].progress_apply(extract_noun_verbs)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [127]:
# Do a noun counter
noun_verbs_counter = Counter()

df['noun_verbs'].progress_map(noun_verbs_counter.update)

  0%|          | 0/1000 [00:00<?, ?it/s]

0      None
1      None
2      None
3      None
4      None
       ... 
995    None
996    None
997    None
998    None
999    None
Name: noun_verbs, Length: 1000, dtype: object

In [128]:
# Top 20 nouns
noun_verbs_counter.most_common(20)

[('make_sense', 23),
 ('guy_think', 15),
 ('take_place', 14),
 ('have_trouble', 11),
 ('read_book', 10),
 ('page_read', 8),
 ('thing_happen', 7),
 ('lock_chest', 7),
 ('book_have', 7),
 ('have_access', 7),
 ('people_say', 7),
 ('make_thing', 6),
 ('thrice_lock', 6),
 ('book_set', 6),
 ('thing_go', 6),
 ('see_people', 5),
 ('write_erotica', 5),
 ('make_money', 5),
 ('character_seem', 5),
 ('question_regard', 5)]

In [188]:
#https://stackoverflow.com/questions/69181078/spacy-how-do-you-add-custom-ner-labels-to-a-pre-trained-model

import spacy
import random
from spacy import util
from spacy.tokens import Doc
from spacy.training import Example
from spacy.language import Language

def print_doc_entities(_doc: Doc):
    if _doc.ents:
        for _ent in _doc.ents:
            print(f"     {_ent.text} {_ent.label_}")
    else:
        print("     NONE")

def customizing_pipeline_component(nlp: Language):
    # NOTE: Starting from Spacy 3.0, training via Python API was changed. For information see - https://spacy.io/usage/v3#migrating-training-python
    train_data = [
        ('We need to deliver it to Festy.', [(25, 30, 'DISTRICT')]),
        ('I like red oranges', [])
    ]

    # Result before training
    print(f"\nResult BEFORE training:")
    doc = nlp(u'I need a taxi to Festy.')
    print_doc_entities(doc)

    # Disable all pipe components except 'ner'
    disabled_pipes = []
    for pipe_name in nlp.pipe_names:
        if pipe_name != 'ner':
            nlp.disable_pipes(pipe_name)
            disabled_pipes.append(pipe_name)

    print("   Training ...")
    optimizer = nlp.create_optimizer()
    for _ in range(25):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            doc = nlp.make_doc(raw_text)
            example = Example.from_dict(doc, {"entities": entity_offsets})
            nlp.update([example], sgd=optimizer)

    # Enable all previously disabled pipe components
    for pipe_name in disabled_pipes:
        nlp.enable_pipe(pipe_name)

    # Result after training
    print(f"Result AFTER training:")
    doc = nlp(u'I need a taxi to Festy.')
    print_doc_entities(doc)

nlp = spacy.load('en_core_web_sm')
customizing_pipeline_component(nlp)


Result BEFORE training:
     Festy GPE
   Training ...
Result AFTER training:
     Festy DISTRICT


In [22]:
batch_size = 50

for i in range(0, len(posts), batch_size):
    docs = nlp.pipe(posts['selftext'][i:i+batch_size])
    
    for j, doc in enumerate(docs):
        for col, values in extract_nlp(doc)

False

# First, test out examples from a sentiment repo

https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest

In [1]:
import transformers

In [2]:
print(2)

2


In [3]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
#model.save_pretrained(MODEL)
text = "Covid cases are increasing fast!"
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
# # TF
# model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
# model.save_pretrained(MODEL)
# text = "Covid cases are increasing fast!"
# encoded_input = tokenizer(text, return_tensors='tf')
# output = model(encoded_input)
# scores = output[0][0].numpy()
# scores = softmax(scores)
# Print labels and scores
ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    l = config.id2label[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1) negative 0.7236
2) neutral 0.2287
3) positive 0.0477


# Trying ammr/RobertaLarge-Gambling6.2

In [4]:
MODEL = f"ammr/RobertaLarge-Gambling6.2"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)

OSError: ammr/RobertaLarge-Gambling6.2 does not appear to have a file named config.json. Checkout 'https://huggingface.co/ammr/RobertaLarge-Gambling6.2/main' for available files.

In [3]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
#model.save_pretrained(MODEL)
text = "Covid cases are increasing fast!"
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
# # TF
# model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
# model.save_pretrained(MODEL)
# text = "Covid cases are increasing fast!"
# encoded_input = tokenizer(text, return_tensors='tf')
# output = model(encoded_input)
# scores = output[0][0].numpy()
# scores = softmax(scores)
# Print labels and scores
ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    l = config.id2label[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")

RuntimeError: Failed to import transformers.models.auto.tokenization_auto because of the following error (look up to see its traceback):
libssl.so.10: cannot open shared object file: No such file or directory

# Warm-up: numpy

In [8]:
import numpy as np
import math

# Create random input and output data
x = np.linspace(-math.pi, math.pi, 2000)
y = np.sin(x)

# Randomly initialize weights
a = np.random.randn()
b = np.random.randn()
c = np.random.randn()
d = np.random.randn()

learning_rate = 1e-6
for t in range(2000):
    # Forward pass: compute predicted y
    # y = a + b x + c x^2 + d x^3
    y_pred = a + b * x + c * x ** 2 + d * x ** 3

    # Compute and print loss
    loss = np.square(y_pred - y).sum()
    if t % 100 == 99:
        print(t, loss)

    # Backprop to compute gradients of a, b, c, d with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_a = grad_y_pred.sum()
    grad_b = (grad_y_pred * x).sum()
    grad_c = (grad_y_pred * x ** 2).sum()
    grad_d = (grad_y_pred * x ** 3).sum()

    # Update weights
    a -= learning_rate * grad_a
    b -= learning_rate * grad_b
    c -= learning_rate * grad_c
    d -= learning_rate * grad_d

print(f'Result: y = {a} + {b} x + {c} x^2 + {d} x^3')

99 415.03803869054195
199 294.89318640576624
299 210.34355120956133
399 150.82212187163634
499 108.90683215126217
599 79.38105441571744
699 58.5767795388155
799 43.91390394828215
899 33.57690771092599
999 26.287842128834114
1099 21.14686557304085
1199 17.52017966501547
1299 14.961243719728937
1399 13.15536359648386
1499 11.880705767485411
1599 10.980858201652115
1699 10.34551177639144
1799 9.896854740008397
1899 9.579987903168718
1999 9.356170479379067
Result: y = -0.024309887664472608 + 0.8534384488443799 x + 0.004193860140398336 x^2 + -0.09286065221896801 x^3


# PyTorch: Tensors

In [11]:
import torch
import math

dtype = torch.float
device = torch.device("cpu")
#device = torch.device("cuda:0") # Uncomment this to run on GPU

# Create random input and output data
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

# Randomly initialize weights
a = torch.randn((), device=device, dtype=dtype)
b = torch.randn((), device=device, dtype=dtype)
c = torch.randn((), device=device, dtype=dtype)
d = torch.randn((), device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(2000):
    # Forward pass: compute predicted y
    y_pred = a + b * x + c * x ** 2 + d * x ** 3

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum().item()
    if t % 100 == 99:
        print(t, loss)

    # Backprop to compute gradients of a, b, c, d with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_a = grad_y_pred.sum()
    grad_b = (grad_y_pred * x).sum()
    grad_c = (grad_y_pred * x ** 2).sum()
    grad_d = (grad_y_pred * x ** 3).sum()

    # Update weights using gradient descent
    a -= learning_rate * grad_a
    b -= learning_rate * grad_b
    c -= learning_rate * grad_c
    d -= learning_rate * grad_d


print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')

99 3893.76416015625
199 2734.6689453125
299 1922.3388671875
399 1352.7203369140625
499 953.0848388671875
599 672.5684814453125
699 475.5719909667969
799 337.1661682128906
899 239.88427734375
999 171.4793701171875
1099 123.36146545410156
1199 89.50177764892578
1299 65.66727447509766
1399 48.88423538208008
1499 37.06294631958008
1599 28.734058380126953
1699 22.86427116394043
1799 18.726476669311523
1899 15.808906555175781
1999 13.751249313354492
Result: y = 0.07287223637104034 + 0.8431879878044128 x + -0.012571671977639198 x^2 + -0.09140260517597198 x^3


# PyTorch: Tensors and autograd

In [12]:
import torch
import math

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")  # Uncomment this to run on GPU

# Create Tensors to hold input and outputs.
# By default, requires_grad=False, which indicates that we do not need to
# compute gradients with respect to these Tensors during the backward pass.
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

# Create random Tensors for weights. For a third order polynomial, we need
# 4 weights: y = a + b x + c x^2 + d x^3
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
a = torch.randn((), device=device, dtype=dtype, requires_grad=True)
b = torch.randn((), device=device, dtype=dtype, requires_grad=True)
c = torch.randn((), device=device, dtype=dtype, requires_grad=True)
d = torch.randn((), device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(2000):
    # Forward pass: compute predicted y using operations on Tensors.
    y_pred = a + b * x + c * x ** 2 + d * x ** 3

    # Compute and print loss using operations on Tensors.
    # Now loss is a Tensor of shape (1,)
    # loss.item() gets the scalar value held in the loss.
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call a.grad, b.grad. c.grad and d.grad will be Tensors holding
    # the gradient of the loss with respect to a, b, c, d respectively.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but we don't need to track this
    # in autograd.
    with torch.no_grad():
        a -= learning_rate * a.grad
        b -= learning_rate * b.grad
        c -= learning_rate * c.grad
        d -= learning_rate * d.grad

        # Manually zero the gradients after updating weights
        a.grad = None
        b.grad = None
        c.grad = None
        d.grad = None

print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')

99 2132.51904296875
199 1510.9560546875
299 1071.3453369140625
399 760.41064453125
499 540.4814453125
599 384.9165954589844
699 274.87603759765625
799 197.03587341308594
899 141.97201538085938
999 103.01905059814453
1099 75.46246337890625
1199 55.96778106689453
1299 42.17603302001953
1399 32.41878890991211
1499 25.515701293945312
1599 20.631790161132812
1699 17.176414489746094
1799 14.731683731079102
1899 13.00197982788086
1999 11.778152465820312
Result: y = 0.05754031613469124 + 0.8591004014015198 x + -0.009926660917699337 x^2 + -0.09366601705551147 x^3


# PyTorch: Defining new autograd functions

In [13]:
import torch
import math


class LegendrePolynomial3(torch.autograd.Function):
    """
    We can implement our own custom autograd Functions by subclassing
    torch.autograd.Function and implementing the forward and backward passes
    which operate on Tensors.
    """

    @staticmethod
    def forward(ctx, input):
        """
        In the forward pass we receive a Tensor containing the input and return
        a Tensor containing the output. ctx is a context object that can be used
        to stash information for backward computation. You can cache arbitrary
        objects for use in the backward pass using the ctx.save_for_backward method.
        """
        ctx.save_for_backward(input)
        return 0.5 * (5 * input ** 3 - 3 * input)

    @staticmethod
    def backward(ctx, grad_output):
        """
        In the backward pass we receive a Tensor containing the gradient of the loss
        with respect to the output, and we need to compute the gradient of the loss
        with respect to the input.
        """
        input, = ctx.saved_tensors
        return grad_output * 1.5 * (5 * input ** 2 - 1)


dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")  # Uncomment this to run on GPU

# Create Tensors to hold input and outputs.
# By default, requires_grad=False, which indicates that we do not need to
# compute gradients with respect to these Tensors during the backward pass.
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

# Create random Tensors for weights. For this example, we need
# 4 weights: y = a + b * P3(c + d * x), these weights need to be initialized
# not too far from the correct result to ensure convergence.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
a = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True)
b = torch.full((), -1.0, device=device, dtype=dtype, requires_grad=True)
c = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True)
d = torch.full((), 0.3, device=device, dtype=dtype, requires_grad=True)

learning_rate = 5e-6
for t in range(2000):
    # To apply our Function, we use Function.apply method. We alias this as 'P3'.
    P3 = LegendrePolynomial3.apply

    # Forward pass: compute predicted y using operations; we compute
    # P3 using our custom autograd operation.
    y_pred = a + b * P3(c + d * x)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent
    with torch.no_grad():
        a -= learning_rate * a.grad
        b -= learning_rate * b.grad
        c -= learning_rate * c.grad
        d -= learning_rate * d.grad

        # Manually zero the gradients after updating weights
        a.grad = None
        b.grad = None
        c.grad = None
        d.grad = None

print(f'Result: y = {a.item()} + {b.item()} * P3({c.item()} + {d.item()} x)')

99 209.95834350585938
199 144.66018676757812
299 100.70249938964844
399 71.03519439697266
499 50.97850799560547
599 37.403133392333984
699 28.206867218017578
799 21.973188400268555
899 17.7457275390625
999 14.877889633178711
1099 12.931766510009766
1199 11.610918045043945
1299 10.714258193969727
1399 10.10548210144043
1499 9.692106246948242
1599 9.411375045776367
1699 9.220745086669922
1799 9.091285705566406
1899 9.003361701965332
1999 8.943639755249023
Result: y = -5.423830273798558e-09 + -2.208526849746704 * P3(1.3320399228078372e-09 + 0.2554861009120941 x)


# PyTorch: nn

In [14]:
import torch
import math


# Create Tensors to hold input and outputs.
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

# For this example, the output y is a linear function of (x, x^2, x^3), so
# we can consider it as a linear layer neural network. Let's prepare the
# tensor (x, x^2, x^3).
p = torch.tensor([1, 2, 3])
xx = x.unsqueeze(-1).pow(p)

# In the above code, x.unsqueeze(-1) has shape (2000, 1), and p has shape
# (3,), for this case, broadcasting semantics will apply to obtain a tensor
# of shape (2000, 3) 

# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. The Linear Module computes output from input using a
# linear function, and holds internal Tensors for its weight and bias.
# The Flatten layer flatens the output of the linear layer to a 1D tensor,
# to match the shape of `y`.
model = torch.nn.Sequential(
    torch.nn.Linear(3, 1),
    torch.nn.Flatten(0, 1)
)

# The nn package also contains definitions of popular loss functions; in this
# case we will use Mean Squared Error (MSE) as our loss function.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-6
for t in range(2000):

    # Forward pass: compute predicted y by passing x to the model. Module objects
    # override the __call__ operator so you can call them like functions. When
    # doing so you pass a Tensor of input data to the Module and it produces
    # a Tensor of output data.
    y_pred = model(xx)

    # Compute and print loss. We pass Tensors containing the predicted and true
    # values of y, and the loss function returns a Tensor containing the
    # loss.
    loss = loss_fn(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # Zero the gradients before running the backward pass.
    model.zero_grad()

    # Backward pass: compute gradient of the loss with respect to all the learnable
    # parameters of the model. Internally, the parameters of each Module are stored
    # in Tensors with requires_grad=True, so this call will compute gradients for
    # all learnable parameters in the model.
    loss.backward()

    # Update the weights using gradient descent. Each parameter is a Tensor, so
    # we can access its gradients like we did before.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

# You can access the first layer of `model` like accessing the first item of a list
linear_layer = model[0]

# For linear layer, its parameters are stored as `weight` and `bias`.
print(f'Result: y = {linear_layer.bias.item()} + {linear_layer.weight[:, 0].item()} x + {linear_layer.weight[:, 1].item()} x^2 + {linear_layer.weight[:, 2].item()} x^3')

99 489.1260986328125
199 326.472412109375
299 218.90310668945312
399 147.7622833251953
499 100.712890625
599 69.59596252441406
699 49.01630401611328
799 35.405311584472656
899 26.40318489074707
999 20.44918441772461
1099 16.51112937927246
1199 13.906396865844727
1299 12.18358039855957
1399 11.044008255004883
1499 10.290224075317383
1599 9.791631698608398
1699 9.461831092834473
1799 9.243647575378418
1899 9.09931755065918
1999 9.003837585449219
Result: y = 0.0013163856929168105 + 0.8435145616531372 x + -0.00022709897893946618 x^2 + -0.0914490669965744 x^3


# PyTorch: optim

In [15]:
import torch
import math


# Create Tensors to hold input and outputs.
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

# Prepare the input tensor (x, x^2, x^3).
p = torch.tensor([1, 2, 3])
xx = x.unsqueeze(-1).pow(p)

# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
    torch.nn.Linear(3, 1),
    torch.nn.Flatten(0, 1)
)
loss_fn = torch.nn.MSELoss(reduction='sum')

# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use RMSprop; the optim package contains many other
# optimization algorithms. The first argument to the RMSprop constructor tells the
# optimizer which Tensors it should update.
learning_rate = 1e-3
optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)
for t in range(2000):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(xx)

    # Compute and print loss.
    loss = loss_fn(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update (which are the learnable
    # weights of the model). This is because by default, gradients are
    # accumulated in buffers( i.e, not overwritten) whenever .backward()
    # is called. Checkout docs of torch.autograd.backward for more details.
    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to model
    # parameters
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its
    # parameters
    optimizer.step()


linear_layer = model[0]
print(f'Result: y = {linear_layer.bias.item()} + {linear_layer.weight[:, 0].item()} x + {linear_layer.weight[:, 1].item()} x^2 + {linear_layer.weight[:, 2].item()} x^3')

99 25590.3203125
199 10892.1796875
299 3849.25
399 978.232177734375
499 202.438232421875
599 82.48208618164062
699 50.78310775756836
799 30.701852798461914
899 19.048215866088867
999 13.197745323181152
1099 10.427064895629883
1199 9.196508407592773
1299 8.853813171386719
1399 8.817937850952148
1499 8.817169189453125
1599 8.817171096801758
1699 8.927461624145508
1799 9.17581558227539
1899 9.063740730285645
1999 8.94500732421875
Result: y = 0.0007251655333675444 + 0.8562337160110474 x + 0.0007269359775818884 x^2 + -0.09383764863014221 x^3


# PyTorch: Custom nn Modules

In [16]:
import torch
import math


class Polynomial3(torch.nn.Module):
    def __init__(self):
        """
        In the constructor we instantiate four parameters and assign them as
        member parameters.
        """
        super().__init__()
        self.a = torch.nn.Parameter(torch.randn(()))
        self.b = torch.nn.Parameter(torch.randn(()))
        self.c = torch.nn.Parameter(torch.randn(()))
        self.d = torch.nn.Parameter(torch.randn(()))

    def forward(self, x):
        """
        In the forward function we accept a Tensor of input data and we must return
        a Tensor of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Tensors.
        """
        return self.a + self.b * x + self.c * x ** 2 + self.d * x ** 3

    def string(self):
        """
        Just like any class in Python, you can also define custom method on PyTorch modules
        """
        return f'y = {self.a.item()} + {self.b.item()} x + {self.c.item()} x^2 + {self.d.item()} x^3'


# Create Tensors to hold input and outputs.
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

# Construct our model by instantiating the class defined above
model = Polynomial3()

# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters of the nn.Linear
# module which is members of the model.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-6)
for t in range(2000):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss
    loss = criterion(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(f'Result: {model.string()}')

99 3708.881103515625
199 2468.661376953125
299 1644.7138671875
399 1097.1524658203125
499 733.1485595703125
599 491.08563232421875
699 330.0555114746094
799 222.89071655273438
899 151.54428100585938
999 104.0241470336914
1099 72.35916900634766
1199 51.24930953979492
1299 37.16911697387695
1399 27.77261734008789
1499 21.49852752685547
1599 17.30673599243164
1699 14.504561424255371
1799 12.630081176757812
1899 11.375333786010742
1999 10.534866333007812
Result: y = 0.021114975214004517 + 0.8214263319969177 x + -0.0036426838487386703 x^2 + -0.0883072018623352 x^3


# PyTorch: Control Flow + Weight Sharing

In [17]:
import random
import torch
import math


class DynamicNet(torch.nn.Module):
    def __init__(self):
        """
        In the constructor we instantiate five parameters and assign them as members.
        """
        super().__init__()
        self.a = torch.nn.Parameter(torch.randn(()))
        self.b = torch.nn.Parameter(torch.randn(()))
        self.c = torch.nn.Parameter(torch.randn(()))
        self.d = torch.nn.Parameter(torch.randn(()))
        self.e = torch.nn.Parameter(torch.randn(()))

    def forward(self, x):
        """
        For the forward pass of the model, we randomly choose either 4, 5
        and reuse the e parameter to compute the contribution of these orders.

        Since each forward pass builds a dynamic computation graph, we can use normal
        Python control-flow operators like loops or conditional statements when
        defining the forward pass of the model.

        Here we also see that it is perfectly safe to reuse the same parameter many
        times when defining a computational graph.
        """
        y = self.a + self.b * x + self.c * x ** 2 + self.d * x ** 3
        for exp in range(4, random.randint(4, 6)):
            y = y + self.e * x ** exp
        return y

    def string(self):
        """
        Just like any class in Python, you can also define custom method on PyTorch modules
        """
        return f'y = {self.a.item()} + {self.b.item()} x + {self.c.item()} x^2 + {self.d.item()} x^3 + {self.e.item()} x^4 ? + {self.e.item()} x^5 ?'


# Create Tensors to hold input and outputs.
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

# Construct our model by instantiating the class defined above
model = DynamicNet()

# Construct our loss function and an Optimizer. Training this strange model with
# vanilla stochastic gradient descent is tough, so we use momentum
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-8, momentum=0.9)
for t in range(30000):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss
    loss = criterion(y_pred, y)
    if t % 2000 == 1999:
        print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(f'Result: {model.string()}')

1999 2329.285888671875
3999 1092.639892578125
5999 572.0111694335938
7999 257.60223388671875
9999 131.4246368408203
11999 65.56714630126953
13999 34.894676208496094
15999 21.250364303588867
17999 14.811595916748047
19999 11.761395454406738
21999 10.094457626342773
23999 9.536259651184082
25999 9.160627365112305
27999 8.763763427734375
29999 8.690313339233398
Result: y = 0.009464428760111332 + 0.8544765710830688 x + -0.0022061513736844063 x^2 + -0.09323178976774216 x^3 + 9.393560321768746e-05 x^4 ? + 9.393560321768746e-05 x^5 ?
