# Assignment 8 # 3rd Article

spaCy is a powerful library for linguistic data processing. It provides a pipeline of processing components: a tokenizer, a part-of-speech tagger, a dependency parser, and a named-entity recognizer.

In [1]:
# One-time setup: uncomment the next line to download the en_core_web_lg model.
#!python -m spacy download en_core_web_lg
import spacy
# Load the en_core_web_lg pipeline; `nlp` is the callable used to parse text in the cells below.
nlp = spacy.load("en_core_web_lg")

# small text

In [2]:
# Parse a short sample sentence and print its tokens, each followed by "| ".
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)
for tok in doc:
    print(tok, end="| ")

My| best| friend| Ryan| Peters| likes| fancy| adventure| games| .| 

# Spacy Attributes

In [3]:
import pandas as pd

def display_nlp(doc, include_punct=False):
    """Build a data frame with one row of linguistic attributes per token.

    Args:
        doc: Iterable of spaCy tokens (typically a parsed ``Doc``).
        include_punct: When True, punctuation tokens are listed as well;
            by default they are skipped.

    Returns:
        A pandas DataFrame indexed by each token's position in ``doc`` with
        the columns text, lemma_, is_stop, is_alpha, pos_, dep_, ent_type_
        and ent_iob_. When no token qualifies (empty or punctuation-only
        input) the frame is empty but still carries those columns.
    """
    columns = ['token', 'text', 'lemma_', 'is_stop', 'is_alpha',
               'pos_', 'dep_', 'ent_type_', 'ent_iob_']
    rows = [
        {'token': i, 'text': t.text, 'lemma_': t.lemma_,
         'is_stop': t.is_stop, 'is_alpha': t.is_alpha,
         'pos_': t.pos_, 'dep_': t.dep_,
         'ent_type_': t.ent_type_, 'ent_iob_': t.ent_iob_}
        for i, t in enumerate(doc)
        if include_punct or not t.is_punct
    ]

    # Naming the columns explicitly keeps 'token' available for set_index even
    # when `rows` is empty; without it an empty input raises KeyError.
    df = pd.DataFrame(rows, columns=columns).set_index('token')
    # Drop the index label so the rendered table has no "token" header row.
    df.index.name = None
    return df
# Show the attribute table for the current `doc`; punctuation is left out by default.
display_nlp(doc)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unnamed: 0,text,lemma_,is_stop,is_alpha,pos_,dep_,ent_type_,ent_iob_
0,My,my,True,True,PRON,poss,,O
1,best,good,False,True,ADJ,amod,,O
2,friend,friend,False,True,NOUN,nsubj,,O
3,Ryan,Ryan,False,True,PROPN,compound,PERSON,B
4,Peters,Peters,False,True,PROPN,appos,PERSON,I
5,likes,like,False,True,VERB,ROOT,,O
6,fancy,fancy,False,True,ADJ,amod,,O
7,adventure,adventure,False,True,NOUN,compound,,O
8,games,game,False,True,NOUN,dobj,,O


# Removing Stop words using Spacy

In [4]:
text = "Dear Ryan, we need to sit down and talk. Regards, Pete"
doc = nlp(text)

# Keep a token only when it is neither a stop word nor punctuation.
non_stop = [tok for tok in doc if not (tok.is_stop or tok.is_punct)]
print(non_stop)

[Dear, Ryan, need, sit, talk, Regards, Pete]


# FINDING NOUNS USING SPACY

In [5]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

# Collect the tokens whose coarse part-of-speech tag is NOUN or PROPN.
nouns = [tok for tok in doc if tok.pos_ in ('NOUN', 'PROPN')]
print(nouns)

[friend, Ryan, Peters, adventure, games]


# Named Entity Recognition

In [6]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

# Print every entity span in doc.ents as "(text, label)", all on one line.
for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")

(Ryan Peters, PERSON) 

In [8]:
# A second sample sentence, parsed the same way as the previous one.
text = "James O'Neill, chairman of World Cargo Inc, lives in San Francisco."
doc = nlp(text)

# Emit each entity span as "(text, label)", space-separated on a single line.
for ent in doc.ents:
    pair = f"({ent.text}, {ent.label_})"
    print(pair, end=" ")

(James O'Neill, PERSON) (World Cargo Inc, ORG) (San Francisco, GPE) 

# Visualize named entities

In [9]:
from spacy import displacy
# Draw the entity annotations of the current `doc`; jupyter=True requests notebook rendering.
displacy.render(doc, style='ent', jupyter=True)

# with a real dataset

In [15]:
pip install python -m spacy download en_core_web_sm

Note: you may need to restart the kernel to use updated packages.



Usage:   
  C:\Users\anush\AppData\Local\Programs\Python\Python312\python.exe -m pip install [options] <requirement specifier> [package-index-options] ...
  C:\Users\anush\AppData\Local\Programs\Python\Python312\python.exe -m pip install [options] -r <requirements file> [package-index-options] ...
  C:\Users\anush\AppData\Local\Programs\Python\Python312\python.exe -m pip install [options] [-e] <vcs project url> ...
  C:\Users\anush\AppData\Local\Programs\Python\Python312\python.exe -m pip install [options] [-e] <local project path> ...
  C:\Users\anush\AppData\Local\Programs\Python\Python312\python.exe -m pip install [options] <archive url/path> ...

no such option: -m


In [1]:
import requests
import re
import spacy
from bs4 import BeautifulSoup

# Function to extract text from a URL
def url_to_string(url, timeout=30):
    """Download a web page and return its text content as a single string.

    The HTML is parsed with BeautifulSoup's 'html.parser'. Every <script>,
    <style> and <aside> element is removed before the text is extracted, and
    each run of newline/tab characters is replaced by one space.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The extracted page text.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with a 4xx/5xx status.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors so an error page is never analysed as if it
    # were the article itself.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')
    # Strip elements whose content is not part of the readable article body.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

# Load the en_core_web_sm pipeline and assign it to `nlp` for the cells below.
nlp = spacy.load('en_core_web_sm')

# Wikipedia page used as the sample article; change the URL to analyse another page.
new_url = "https://en.wikipedia.org/wiki/New_Haven,_Connecticut"
# Fetch the page and reduce it to plain text with url_to_string.
article_text = url_to_string(new_url)

# Run the spaCy pipeline over the article text.
article = nlp(article_text)

# Report how many entity spans the pipeline found in the article.
num_entities = len(article.ents)
print("Number of named entities in the article:", num_entities)


Number of named entities in the article: 4173


# Visualize the named entities in the article

In [2]:
import spacy
from spacy import displacy  # Import displacy module

# Highlight every named entity found in the whole `article` document.
# NOTE(review): the article contains thousands of entities, so this output cell is very large.
displacy.render(article, style='ent', jupyter=True)


# Named-entity types

In [3]:
from collections import Counter

# Collect the label of every entity span, then tally how often each label occurs.
labels = [ent.label_ for ent in article.ents]
label_counts = Counter(labels)

# Show the tally.
print(label_counts)

Counter({'DATE': 978, 'ORG': 970, 'GPE': 874, 'PERSON': 449, 'CARDINAL': 331, 'NORP': 111, 'FAC': 95, 'WORK_OF_ART': 82, 'LOC': 71, 'ORDINAL': 62, 'PERCENT': 40, 'QUANTITY': 31, 'PRODUCT': 30, 'EVENT': 28, 'MONEY': 10, 'TIME': 6, 'LAW': 3, 'LANGUAGE': 2})


# Most frequent named entities

In [4]:
# Surface text of every entity span; the five most frequent are displayed.
items = [ent.text for ent in article.ents]
Counter(items).most_common(5)

[('New Haven', 289),
 ('Connecticut', 99),
 ('Yale', 39),
 ('the Wayback Machine', 38),
 ('first', 34)]

# SENTENCE TO ANALYSE

In [5]:
# Materialise the sentence spans into a list so they can be indexed.
sentences = list(article.sents)
print(sentences[0])

 New Haven, Connecticut - Wikipedia Jump to content Main menu Main menu move to sidebar hide Navigation Main pageContentsCurrent eventsRandom articleAbout WikipediaContact usDonate Contribute HelpLearn to editCommunity portalRecent changesUpload file Search Search Create account Log in Personal tools  Create account Log in Pages for logged out editors learn more ContributionsTalk Contents move to sidebar hide (Top) 1History Toggle History subsection 1.1Pre-colonial foundation as an independent colony 1.2As part of the Connecticut Colony 1.3Post-colonial period and industrialization 1.4Post-industrial era and urban redevelopment 1.5Timeline of notable firsts 2Geography Toggle Geography subsection 2.1Climate 2.2Streetscape 2.3Neighborhoods


In [6]:
# Re-parse the text of the first sentence as its own document and highlight its entities.
displacy.render(nlp(str(sentences[0])), jupyter=True, style='ent')

# TYPE OF WORDS IN SENTENCE

In [17]:
# (surface form, coarse POS tag, lemma) for each token of the first sentence
# that is neither a stop word nor tagged as punctuation.
[(tok.orth_, tok.pos_, tok.lemma_)
 for tok in nlp(str(sentences[0]))
 if not (tok.is_stop or tok.pos_ == 'PUNCT')]

[(' ', 'SPACE', ' '),
 ('Music', 'PROPN', 'Music'),
 ('News', 'PROPN', 'News'),
 ('Billboard', 'PROPN', 'Billboard'),
 ('              ', 'SPACE', '              '),
 ('×', 'VERB', '×')]

# SENTENCE DEPENDENCY TREE

In [18]:
# Parse the first sentence on its own, then draw its dependency tree.
first_sentence_doc = nlp(str(sentences[0]))
displacy.render(first_sentence_doc, style='dep', jupyter=True, options={'distance': 120})