In [8]:
import pandas as pd
import re
import string
from bs4 import BeautifulSoup
import nltk
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import spacy

In [9]:
def clean_html(html):
    """Return the visible text of an HTML fragment as one string.

    The markup is parsed with BeautifulSoup's "html.parser". Every
    <style>, <script>, <code> and <a> element is removed together with
    its contents, and the remaining text nodes are whitespace-stripped
    and joined with single spaces.
    """
    document = BeautifulSoup(html, "html.parser")

    # Collect the unwanted elements first, then destroy each one
    # (decompose drops the tag and everything nested inside it).
    unwanted = document.find_all(['style', 'script', 'code', 'a'])
    for element in unwanted:
        element.decompose()

    # stripped_strings yields each non-empty text node without its
    # leading/trailing whitespace
    text_pieces = list(document.stripped_strings)
    return ' '.join(text_pieces)

In [10]:
# Sample HTML fragment used to exercise clean_html: a paragraph holding only a
# link, a plain-text paragraph, and a paragraph with text plus a linked image.
# The literal is not a raw string, so each \n escape below is a real newline.
raw_string = """
<p>
  <a
    href="https://forge.autodesk.com/#step-6-download-the-item"
    rel="nofollow noreferrer"
    >https://forge.autodesk.com/en//#step-6-download-the-item</a
  >
</p>
\n\n
<p>
  I have followed the tutorial and have successfully obtained the contents of
  the file, but where is the file being downloaded. In addition, how do I
  specify the location of where I want to download the file?
</p>
\n\n
<p>
  Result on Postman\n<a
    href="https://i.stack.imgur.com/VrdqP.png"
    rel="nofollow noreferrer"
    ><img
      src="https://i.stack.imgur.com/VrdqP.png"
      alt="enter image description here"
  /></a>
</p>
"""

In [18]:
# first round of cleaning up: strip the HTML once and reuse the stored result
# instead of parsing raw_string a second time just to print it
first_s = clean_html(raw_string)
print(first_s)

I have followed the tutorial and have successfully obtained the contents of
  the file, but where is the file being downloaded. In addition, how do I
  specify the location of where I want to download the file? Result on Postman


In [14]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [15]:
# Load the en_core_web_sm pipeline downloaded above. It is kept as a global
# because clean_string(..., stem='Spacy') calls `nlp` directly.
nlp = spacy.load('en_core_web_sm')

In [16]:
# The 'Spacy' mode of clean_string uses the global `nlp` pipeline, which must
# already be loaded: nlp = spacy.load('en_core_web_sm')

def clean_string(text, stem="None"):
    """Normalize free text into a space-separated string of content words.

    Steps, in order: lowercase, turn line breaks into spaces, strip ASCII
    punctuation, drop English stop words (the NLTK list plus 'hi' and
    'im'), drop every token that contains a digit, then optionally stem
    or lemmatize what is left.

    Args:
        text: The string to clean.
        stem: 'Stem' for NLTK's PorterStemmer, 'Lem' for NLTK's
            WordNetLemmatizer, 'Spacy' for lemmas from the module-level
            `nlp` pipeline. Any other value (including the default
            "None") leaves the tokens unchanged.

    Returns:
        The surviving tokens joined by single spaces ('' if none survive).

    Note:
        NLTK raises LookupError unless the 'stopwords' corpus (and
        'wordnet' for stem='Lem') has been fetched with nltk.download().
    """
    # Make lower
    text = text.lower()

    # Line breaks separate words, so replace them with a space; deleting
    # them outright would fuse "of\nthe" into "ofthe".
    text = re.sub(r'[\r\n]+', ' ', text)

    # Remove punctuation
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)

    # Remove stop words. The text has already lost its punctuation, so the
    # stop list gets the same treatment ("don't" -> "dont"); otherwise the
    # apostrophe entries could never match. A set keeps lookups O(1).
    useless_words = {
        word.translate(translator)
        for word in nltk.corpus.stopwords.words("english")
    }
    useless_words.update(['hi', 'im'])
    text_filtered = [word for word in text.split() if word not in useless_words]

    # Remove numbers: blank out any token containing a digit, then discard
    # the emptied tokens so they do not leave double spaces in the result.
    text_filtered = [
        cleaned
        for cleaned in (re.sub(r'\w*\d\w*', '', w) for w in text_filtered)
        if cleaned
    ]

    # Stem or Lemmatize
    if stem == 'Stem':
        stemmer = PorterStemmer()
        text_stemmed = [stemmer.stem(y) for y in text_filtered]
    elif stem == 'Lem':
        lem = WordNetLemmatizer()
        text_stemmed = [lem.lemmatize(y) for y in text_filtered]
    elif stem == 'Spacy':
        # spaCy re-tokenizes the joined string, so its tokens may not map
        # one-to-one onto text_filtered
        doc = nlp(' '.join(text_filtered))
        text_stemmed = [y.lemma_ for y in doc]
    else:
        text_stemmed = text_filtered

    return ' '.join(text_stemmed)

In [21]:
# second round of cleaning up: lowercase the HTML-free text and strip
# punctuation, stop words and digit-bearing tokens (default stem="None",
# so no stemming or lemmatization is applied)
second_s = clean_string(first_s)
print(second_s)

followed tutorial successfully obtained contents file file downloaded addition specify location want download file result postman


In [22]:
# Store the cleaned text in a new column (e.g. text_clean) rather than
# overwriting the original, in case the punctuation is needed later