In [1]:
import re
import spacy
from spacy import displacy
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize
from spacy.symbols import ORTH

2023-02-22 11:15:39.115107: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Sample text that mentions two phone numbers written in different formats.
text1 = '''
Contribute and support independent news channels. Consume a variety of sources of unbiased news.
Verify information from multiple sources before forwarding, Dont assume everything you read is from a trusted source.
Phone numbers to report fraudulent news articles are 18008896656 abd 1-(800)-887-8787
'''

# Raw string: `\d` is not a valid Python string escape, so a plain literal
# triggers an "invalid escape sequence" warning on modern Python versions.
# This pattern only matches a run of 11 consecutive digits, so the
# punctuated number 1-(800)-887-8787 is intentionally not captured here.
pattern = r"\d{11}"
matches = re.findall(pattern, text1)
matches

['18008896656']

In [3]:
# Sample text made of numbered notes; each note starts with a
# "Note <digit> - <title>" header line.
text2 = '''
Note 1 - News consumption
Contribute and support independent news channels. Consume a variety of sources of unbiased news.
Verify information from multiple sources before forwarding, Dont assume everything you read is from a trusted source.
Phone numbers to report fraudulent news articles are 18008896656 abd 1-(800)-887-8787

Note 2 - Social Media Rules
When posting or commenting , keep discourse civil. Disagreements or opinions should be expressed as expressed 
in person, face to face and no special liberty should be taken because you are posting in an offline context.

Note 3 - Online bullying and flagging offensive content.
Make effort to combat online bullying by flagging offensive content to moderators or 
helping those who are bullied by offering supportive comments online. This is inline with what our kids are
taught in schools : Be an upstander not a bystander.

'''
# Raw string so `\d` is passed to the regex engine without an invalid-escape
# warning. The capture group grabs everything after "Note N - " up to the end
# of that line, i.e. the note title.
pattern = r"Note \d - ([^\n]*)"
matches = re.findall(pattern, text2)
matches

['News consumption',
 'Social Media Rules',
 'Online bullying and flagging offensive content.']

In [4]:
# Sample text with fiscal-year/quarter labels (in mixed case) and dollar amounts.
text3='''
The Gross cost of operating medicare for all in fY2021 Q2 was $158.08 million. 
The previous year without the medicare for all option the gross cost in FY2020 Q2 was $693 million
'''

# Step 1: capture only the "<year> Q<n>" part that follows "FY".
# IGNORECASE is needed because the text spells one label as "fY2021".
# Raw strings avoid invalid-escape warnings for `\d` and `\$`.
pattern = r"FY(\d{4} Q[1-4])"
matches = re.findall(pattern, text3,flags=re.IGNORECASE)
print(matches)

# Step 2: additionally skip ahead to the next "$" and capture the amount
# (digits and dots) that follows it, giving (period, amount) tuples.
pattern = r"FY(\d{4} Q[1-4])[^\$]+\$([\d.]+)"
matches = re.findall(pattern, text3,flags=re.IGNORECASE)
print(matches)

['2021 Q2', '2020 Q2']
[('2021 Q2', '158.08'), ('2020 Q2', '693')]


In [5]:
# Compare spaCy with NLTK: first, spaCy's sentence and word segmentation.
nlp = spacy.load('en_core_web_sm')
doc = nlp("Dr.Test visited Japan. Mr.Sam visited Turkey")

# Print each sentence followed by its tokens, one bracketed token per line.
for sentence_span in doc.sents:
    print(sentence_span)
    for tok in sentence_span:
        print("[", tok, "]")

Dr.Test visited Japan.
[ Dr. ]
[ Test ]
[ visited ]
[ Japan ]
[ . ]
Mr.Sam visited Turkey
[ Mr. ]
[ Sam ]
[ visited ]
[ Turkey ]


In [6]:
# NLTK sentence segmentation of the same sample text used with spaCy,
# so the two libraries' results can be compared side by side.
sent_tokenize("Dr.Test visited Japan. Mr.Sam visited Turkey")

['Dr.Test visited Japan.', 'Mr.Sam visited Turkey']

In [7]:
# NLTK word tokenization of the same sample text. In the recorded output
# "Dr.Test" and "Mr.Sam" stay single tokens, whereas spaCy split the title
# from the name.
word_tokenize("Dr.Test visited Japan. Mr.Sam visited Turkey")

['Dr.Test', 'visited', 'Japan', '.', 'Mr.Sam', 'visited', 'Turkey']

In [8]:
# NLP preprocessing
# Typical order: sentence tokenization, word tokenization, then stemming / lemmatization.
# Inspect the loaded language object and the components it runs on each text.
print(type(nlp))
# nlp.pipeline is a list of (name, component) pairs.
print("Pipeline = ", nlp.pipeline)

<class 'spacy.lang.en.English'>
Pipeline =  [('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x7fca1203b760>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x7fca120c80a0>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x7fca1150f740>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x7fca12104e40>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x7fca12112dc0>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x7fca120f2120>)]


In [9]:
# Create a blank English pipeline: it has no processing components
# (the printed pipeline is empty), but it still tokenizes text.
nlp = spacy.blank("en")
print("Pipeline = ", nlp.pipeline)

Pipeline =  []


In [10]:
# Tokenize a short sentence and show lexical flags for two of its tokens.
doc = nlp("I gave 10 $ to Pet shelter.")

# Position 0 is the word "I"; position 2 is the number "10".
for position in (0, 2):
    tok = doc[position]
    print(tok, "Alpha -", tok.is_alpha, "Digit -", tok.is_digit)

I Alpha - True Digit - False
10 Alpha - False Digit - True


In [11]:
# Customizing the tokenizer, step 1: see how the default rules handle the
# slang word "gimme" before any special case is registered.
doc = nlp("gimme more of the fact checked news and common sense opinions")

tokens = []
for tok in doc:
    tokens.append(tok.text)
tokens


['gimme',
 'more',
 'of',
 'the',
 'fact',
 'checked',
 'news',
 'and',
 'common',
 'sense',
 'opinions']

In [12]:
# Customizing the tokenizer, step 2: register a special case so that "gimme"
# is split into the two tokens "gim" and "me".
gimme_pieces = [{ORTH: "gim"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("gimme", gimme_pieces)

# Re-tokenize the same sentence to see the effect of the new rule.
doc = nlp("gimme more of the fact checked news and common sense opinions")
tokens = []
for tok in doc:
    tokens.append(tok.text)
tokens

['gim',
 'me',
 'more',
 'of',
 'the',
 'fact',
 'checked',
 'news',
 'and',
 'common',
 'sense',
 'opinions']

In [13]:
# `nlp` is the blank English pipeline at this point, so nothing has set
# sentence boundaries. Iterating `doc.sents` is expected to fail; the error is
# caught and printed so the notebook still runs top to bottom while showing
# the message that motivates adding a sentencizer.
doc = nlp("Dr.Test visited Japan. Mr.Sam visited Turkey")
try:
    for sent in doc.sents:
        print(sent)
        for word in sent:
            print("[", word, "]")
except ValueError as err:
    # Show the error in the usual "<ExceptionName>: <message>" form.
    print(f"{type(err).__name__}: {err}")

ValueError: [E030] Sentence boundaries unset. You can add the 'sentencizer' component to the pipeline with: `nlp.add_pipe('sentencizer')`. Alternatively, add the dependency parser or sentence recognizer, or set sentence boundaries by setting `doc[i].is_sent_start`.

In [15]:
#NLP add to pipeline
# Add the rule-based sentencizer so that `doc.sents` becomes available.
# The membership check keeps this cell safe to re-run: adding a component
# under a name that is already in the pipeline raises an error.
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")
# Display the component, as the bare add_pipe call did.
nlp.get_pipe("sentencizer")


<spacy.pipeline.sentencizer.Sentencizer at 0x7fca1169b140>

In [16]:
# With the sentencizer in the pipeline, sentence iteration now works.
doc = nlp("Dr.Test visited Japan. Mr.Sam visited Turkey")

# Print each sentence followed by its tokens, one bracketed token per line.
for sentence_span in doc.sents:
    print(sentence_span)
    for tok in sentence_span:
        print("[", tok, "]")

Dr.Test visited Japan.
[ Dr. ]
[ Test ]
[ visited ]
[ Japan ]
[ . ]
Mr.Sam visited Turkey
[ Mr. ]
[ Sam ]
[ visited ]
[ Turkey ]


In [17]:
# Tokenization in another language: a blank Hindi pipeline.
nlp = spacy.blank("hi")
doc = nlp("भैया जी! 3000 ₹ उधार थे वो वापस देदो")

# Show, for every token, whether spaCy flags it as a currency symbol.
for tok in doc:
    currency_flag = tok.is_currency
    print(tok, currency_flag)

भैया False
जी False
! False
3000 False
₹ True
उधार False
थे False
वो False
वापस False
देदो False


In [18]:
# Extract URL-like tokens from a paragraph of text.
nlp = spacy.load('en_core_web_sm')
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

doc = nlp(text)

# Walk the document sentence by sentence and keep only the tokens that
# spaCy's `like_url` flag marks as URLs.
url_tokens = (
    tok
    for sentence_span in doc.sents
    for tok in sentence_span
    if tok.like_url
)
for url_tok in url_tokens:
    print(url_tok)


http://www.data.gov/
http://www.science
http://data.gov.uk/.
http://www3.norc.org/gss+website/
http://www.europeansocialsurvey.org/.


In [22]:
#Extract the money transactions
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"
doc = nlp(transactions)

# Pair every token with the one that follows it. zip() stops at the shorter
# sequence, so the final token is never asked for a successor; indexing
# doc[token.i + 1] directly would raise IndexError whenever the text ends
# with a number-like token.
for amount, unit in zip(doc, doc[1:]):
    # Report a number-like token immediately followed by a currency symbol.
    if amount.like_num and unit.is_currency:
        print(amount.text, unit.text)

two $
500 €


In [23]:
# Check pipeline features: load the small English model and list the names
# of the components it applies to each document, in order.
nlp = spacy.load("en_core_web_sm")
nlp.pipe_names


['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [24]:
# Run the full English pipeline on a short text and inspect its annotations.
doc = nlp("CaptainMontgomery paid 200$  for trip to Italy. He wanted to tour Australia next.")

# Per-token view: surface form, part-of-speech tag and lemma.
for tok in doc:
    pos_tag = tok.pos_
    lemma = tok.lemma_
    print(tok, "|", pos_tag, "|", lemma)

# Blank lines to separate the token table from the entity list.
print("\n\n")

# Named entities found in the text, followed by displaCy's entity rendering.
for entity in doc.ents:
    print(entity)
displacy.render(doc, style="ent")

CaptainMontgomery | PROPN | CaptainMontgomery
paid | VERB | pay
200 | NUM | 200
$ | NUM | $
  | SPACE |  
for | ADP | for
trip | NOUN | trip
to | ADP | to
Italy | PROPN | Italy
. | PUNCT | .
He | PRON | he
wanted | VERB | want
to | PART | to
tour | VERB | tour
Australia | PROPN | Australia
next | ADV | next
. | PUNCT | .



CaptainMontgomery
200$
Italy
Australia


In [25]:
# Named-entity recognition with the small French news model.
nlp = spacy.load("fr_core_news_sm")
doc = nlp("Tesla Inc va racheter Twitter pour $45 milliards de dollars")

# For each entity show its text, its label and spaCy's explanation of the label.
for entity in doc.ents:
    label = entity.label_
    print(entity.text, " | ", label, " | ", spacy.explain(label))

Tesla Inc  |  PER  |  Named person or family.
Twitter  |  MISC  |  Miscellaneous entities, e.g. events, nationalities, products or works of art
