# **Gen-AI Hands-On 1 for Tokenization**

**Imports**

In [29]:
import requests
from bs4 import BeautifulSoup
import nltk
import spacy
from nltk. tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
import textwrap

In [3]:
nlp = spacy.load('en_core_web_sm')

In [34]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

**Function** **Definitions**

In [5]:
def scrape_web_page(url):
  """
  Fetches HTML content from a given url
  Returns the HTML content if successful, otherwise None
  """
  response = requests.get(url)
  if response.status_code == 200:
    return response.text
  else:
    return None


In [21]:
def extract_text_from_html(html_content):
  """
  Extracts and concatenates text from all <p> tags in the html content
  """
  soup = BeautifulSoup(html_content, 'html.parser')
  paragraphs = soup.find_all('p')
  text = ' '.join([para.get_text() for para in paragraphs])
  return text

In [8]:
def tokenize_text(text):
  """
  Tokenizes the text into words and sentences
  """
  words = word_tokenize(text)
  sentences = sent_tokenize(text)
  return words, sentences

In [9]:
def remove_stop_words(words):
  """
  Removes stop words from a list of words
  """
  stop_words = set(stopwords.words('english'))
  return [ word for word in words if word.lower() not in stop_words]

In [10]:
def stem_words(words):
  """
  Stems words using the Porter Stemmer algorithm
  """
  stemmer = PorterStemmer()
  return [stemmer.stem(word) for word in words]

In [11]:
def lemmatize_text(text):
  """
  Lemmatizes words using the spacy library
  """
  doc = nlp(text)
  return [token.lemma_ for token in doc]


In [14]:
def extract_named_entities(text):
  """
  Extracts named entities using the spacy library
  """
  doc = nlp(text)
  return [(ent.text, ent.label_) for ent in doc.ents]

In [16]:
def pos_tag_nltk(words):
  """
  Performs part-of-speech tagging using the nltk library
  """
  return nltk.pos_tag(words)

In [12]:
def pos_tag_spacy(text):
  """
  Performs part-of-speech tagging using the spacy library
  """
  doc = nlp(text)
  return [(token.text, token.pos_) for token in doc]

In [18]:
def word_frequency(words):
  """
  Calculates the frequency of each word in the text
  """
  return Counter(words).most_common(10)


## Main Function

In [25]:
# Main function to perform all the tasks
def perform_nlp_tasks(text):
  """
  Orchestratesall NLP tasks and returns results as a dictionary
  """
  words, sentences = tokenize_text(text)
  filtered_words = remove_stop_words(words)
  stemmed_words = stem_words(filtered_words)
  lemmatized_words = lemmatize_text(text)
  entities = extract_named_entities(text)
  pos_tags_nltk = pos_tag_nltk(filtered_words)
  pos_tags_spacy = pos_tag_spacy(text)
  word_freq = word_frequency(filtered_words)

  return{
    "words" : words,
    "sentences" : sentences,
    "filtered_words" : filtered_words,
    "stemmed_words" : stemmed_words,
    "lemmatized_words" : lemmatized_words,
    "entities" : entities,
    "pos_tags_nltk" : pos_tags_nltk,
    "pos_tags_spacy" : pos_tags_spacy,
    "word_freq" : word_freq
  }

In [35]:
url = "https://en.wikipedia.org/wiki/Natural_language_processing"

html_content = scrape_web_page(url)

if html_content:
  text = extract_text_from_html(html_content)

  wrapped_text = textwrap.fill(text, width=80)

  with open("web_content.txt", "w") as file:
    file.write(wrapped_text)

  #Read the text from the file
  with open("web_content.txt", "r") as file:
    text_from_file = file.read()

  nlp_results = perform_nlp_tasks(text_from_file)

  output = (
      "========== Tokenized Words ==========\n"
      f"{nlp_results['words']}\n\n"
      "========== Sentences ==========\n"
      f"{nlp_results['sentences']}\n\n"
      "========== Filtered Words (Without Stop Words) ==========\n"
      f"{nlp_results['filtered_words']}\n\n"
      "========== Stemmed Words =========\n"
      f"{nlp_results['stemmed_words']}\n\n"
      "========== Lemmatized Words ==========\n"
      f"{nlp_results['lemmatized_words']}\n\n"
      "========== Named Entities ==========\n"
      f"{nlp_results['entities']}\n\n"
      "========== POS Tags (Using NLTK) ==========\n"
      f"{nlp_results['pos_tags_nltk']}\n\n"
      "========== POS Tags (Using SpaCY) ==========\n"
      f"{nlp_results['pos_tags_spacy']}\n\n"
      "========== Word Frequency ==========\n"
      f"{nlp_results['word_freq']}\n"
  )

  print(output)

  #save the output to a file
  with open("nlp_output.txt", "w") as file:
    file.write(output)

else:
  print("Failed to retrieve the content")

['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'subfield', 'of', 'computer', 'science', 'and', 'especially', 'artificial', 'intelligence', '.', 'It', 'is', 'primarily', 'concerned', 'with', 'providing', 'computers', 'with', 'the', 'ability', 'to', 'process', 'data', 'encoded', 'in', 'natural', 'language', 'and', 'is', 'thus', 'closely', 'related', 'to', 'information', 'retrieval', ',', 'knowledge', 'representation', 'and', 'computational', 'linguistics', ',', 'a', 'subfield', 'of', 'linguistics', '.', 'Typically', 'data', 'is', 'collected', 'in', 'text', 'corpora', ',', 'using', 'either', 'rule-based', ',', 'statistical', 'or', 'neural-based', 'approaches', 'in', 'machine', 'learning', 'and', 'deep', 'learning', '.', 'Major', 'tasks', 'in', 'natural', 'language', 'processing', 'are', 'speech', 'recognition', ',', 'text', 'classification', ',', 'natural-', 'language', 'understanding', ',', 'and', 'natural-language', 'generation', '.', 'Natural', 'language', 'processin