In [None]:
## Method 1

In [8]:
import pandas as pd
import re
import string
from bs4 import BeautifulSoup
import nltk
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import spacy

In [9]:
def clean_html(html):

    # parse html content
    soup = BeautifulSoup(html, "html.parser")

    for data in soup(['style', 'script', 'code', 'a']):
        # Remove tags
        data.decompose()

    # return data by retrieving the tag content
    return ' '.join(soup.stripped_strings)

In [10]:
raw_string = """
<p>
  <a
    href="https://forge.autodesk.com/#step-6-download-the-item"
    rel="nofollow noreferrer"
    >https://forge.autodesk.com/en//#step-6-download-the-item</a
  >
</p>
\n\n
<p>
  I have followed the tutorial and have successfully obtained the contents of
  the file, but where is the file being downloaded. In addition, how do I
  specify the location of where I want to download the file?
</p>
\n\n
<p>
  Result on Postman\n<a
    href="https://i.stack.imgur.com/VrdqP.png"
    rel="nofollow noreferrer"
    ><img
      src="https://i.stack.imgur.com/VrdqP.png"
      alt="enter image description here"
  /></a>
</p>
"""

In [18]:
# first round of cleaning up
first_s = clean_html(raw_string)
print(clean_html(raw_string))

I have followed the tutorial and have successfully obtained the contents of
  the file, but where is the file being downloaded. In addition, how do I
  specify the location of where I want to download the file? Result on Postman


In [14]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [15]:
nlp = spacy.load('en_core_web_sm')

In [16]:
# Load spacy
# nlp = spacy.load('en_core_web_sm')

def clean_string(text, stem="None"):

    final_string = ""

    # Make lower
    text = text.lower()

    # Remove line breaks
    # Note: that this line can be augmented and used over
    # to replace any characters with nothing or a space
    text = re.sub(r'\n', '', text)

    # Remove punctuation
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)

    # Remove stop words
    text = text.split()
    useless_words = nltk.corpus.stopwords.words("english")
    useless_words = useless_words + ['hi', 'im']

    text_filtered = [word for word in text if not word in useless_words]

    # Remove numbers
    text_filtered = [re.sub(r'\w*\d\w*', '', w) for w in text_filtered]

    # Stem or Lemmatize
    if stem == 'Stem':
        stemmer = PorterStemmer() 
        text_stemmed = [stemmer.stem(y) for y in text_filtered]
    elif stem == 'Lem':
        lem = WordNetLemmatizer()
        text_stemmed = [lem.lemmatize(y) for y in text_filtered]
    elif stem == 'Spacy':
        text_filtered = nlp(' '.join(text_filtered))
        text_stemmed = [y.lemma_ for y in text_filtered]
    else:
        text_stemmed = text_filtered

    final_string = ' '.join(text_stemmed)

    return final_string

In [21]:
# second round of cleaning up
second_s = clean_string(first_s)
print(second_s)

followed tutorial successfully obtained contents file file downloaded addition specify location want download file result postman


In [22]:
# create a new column like text_clean, 
# in case punctuation is needed

In [23]:
## Method 2

In [24]:
import re
# Regular expressions library for pattern matching
import string # Library for dealing with string operations
import nltk
# Natural Language Toolkit for text processing
nltk.download('stopwords') # Download stopwords list from NLTK
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /Users/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
def preprocess_text(text) :
    # Convert text to lowercase
    text = text.lower ()
    # Remove special characters and digits using regular expressions
    text = re.sub(r'\d+', '', text) # Remove digits
    text = re.sub(r'[^\w\s]', '', text)
    # Remove special characters except
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    return tokens

In [33]:
def remove_stopwords (tokens):
    stop_words = set(stopwords.words ('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return filtered_tokens

In [35]:
def perform_lemmatization(tokens):
    lemmatizer=nltk.WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens

In [41]:
def clean_text(text):
    tokens = preprocess_text(text)
    filtered_tokens = remove_stopwords (tokens)
    lemmatized_tokens = perform_lemmatization(filtered_tokens)
    clean_text = ' '.join(lemmatized_tokens)
    return clean_text

In [46]:
text_data = "Text cleaning is an important step in text analysis. It involve"
cleaned_data = clean_text(text_data)
print(cleaned_data)

text cleaning important step text analysis involve


In [48]:
cleaned_data = clean_text(first_s)
print(cleaned_data)

followed tutorial successfully obtained content file file downloaded addition specify location want download file result postman


In [None]:
## Method 3

In [53]:
import requests
from bs4 import BeautifulSoup
import pickle

In [56]:
# 
services = ['nat_gateway','api_gateway','route53']

In [76]:
text_data = [
['nat_gateway', '503 error', 'connection reset'],
['api_gateway', 'request', 'response'],
['route53', 'routing policy', 'failover', 'latency']
]

In [77]:
# # Pickle files for later use

# get data 
# # Make a new directory to hold the text files
# !mkdir data

for i, c in enumerate(services):
    with open("data/" + c + ".txt", "wb") as file:
        pickle.dump(text_data[i], file)

In [78]:
data = {}
for i, c in enumerate(services):
    with open("data/" + c + ".txt", "rb") as file:
        data[c] = pickle.load(file)

In [79]:
data.keys()

dict_keys(['nat_gateway', 'api_gateway', 'route53'])

In [81]:
data['route53'][0:2]

['route53', 'routing policy']

In [None]:
# clean data 

In [None]:
'''
Common data cleaning steps on all text:

Make text all lower case
Remove punctuation
Remove numerical values
Remove common non-sensical text (/n)
Tokenize text
Remove stop words
More data cleaning steps after tokenization:

Stemming / lemmatization
Parts of speech tagging
Create bi-grams or tri-grams
Deal with typos
And more...
'''

In [82]:
next(iter(data.keys()))

'nat_gateway'

In [83]:
# Notice that our dictionary 
next(iter(data.values()))

['nat_gateway', '503 error', 'connection reset']

In [85]:
# We are going to change this to key: , value: string 
def combine_text(list_of_text):
    '''Takes a list of text and combines them into one large chunk of text.'''
    combined_text = ' '.join(list_of_text)
    return combined_text

In [86]:
data_combined = {key: [combine_text(value)] for (key, value) in data.items()}

In [88]:
# We can either keep it in dictionary format or put it into a pandas dataframe
import pandas as pd
pd.set_option('max_colwidth',150)

data_df = pd.DataFrame.from_dict(data_combined).transpose()
data_df.columns = ['description']
data_df = data_df.sort_index()
data_df

Unnamed: 0,description
api_gateway,api_gateway request response
nat_gateway,nat_gateway 503 error connection reset
route53,route53 routing policy failover latency


In [91]:
# Let's take a look at the transcript for Ali Wong
data_df.description.loc['route53']

'route53 routing policy failover latency'

In [92]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [94]:
# Let's take a look at the updated text
data_clean = pd.DataFrame(data_df.description.apply(round1))
data_clean

Unnamed: 0,description
api_gateway,apigateway request response
nat_gateway,natgateway error connection reset
route53,routing policy failover latency


In [95]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [97]:
# Let's take a look at the updated text
data_clean = pd.DataFrame(data_clean.description.apply(round2))
data_clean

Unnamed: 0,description
api_gateway,apigateway request response
nat_gateway,natgateway error connection reset
route53,routing policy failover latency


In [98]:
# end of text pre-processing step

In [99]:
data_df

Unnamed: 0,description
api_gateway,api_gateway request response
nat_gateway,nat_gateway 503 error connection reset
route53,route53 routing policy failover latency


In [100]:
data_df['service'] = services
data_df

Unnamed: 0,description,service
api_gateway,api_gateway request response,nat_gateway
nat_gateway,nat_gateway 503 error connection reset,api_gateway
route53,route53 routing policy failover latency,route53


In [101]:
# Let's pickle it for later use
data_df.to_pickle("corpus.pkl")

In [103]:
"""
Document-Term Matrix
For many of the techniques we'll be using in future notebooks, 
the text must be tokenized, meaning broken down into smaller pieces. 
The most common tokenization technique is to break down text into words. 
We can do this using scikit-learn's CountVectorizer, where every row will 
represent a different document and every column will represent a different word.
In addition, with CountVectorizer, we can remove stop words. Stop words 
are common words that add no additional meaning to 
text such as 'a', 'the', etc.
"""

"\nDocument-Term Matrix\nFor many of the techniques we'll be using in future notebooks, \nthe text must be tokenized, meaning broken down into smaller pieces. \nThe most common tokenization technique is to break down text into words. \nWe can do this using scikit-learn's CountVectorizer, where every row will \nrepresent a different document and every column will represent a different word.\nIn addition, with CountVectorizer, we can remove stop words. Stop words \nare common words that add no additional meaning to \ntext such as 'a', 'the', etc.\n"

In [105]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.description)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names_out())
data_dtm.index = data_clean.index
data_dtm

Unnamed: 0,apigateway,connection,error,failover,latency,natgateway,policy,request,reset,response,routing
api_gateway,1,0,0,0,0,0,0,1,0,1,0
nat_gateway,0,1,1,0,0,1,0,0,1,0,0
route53,0,0,0,1,1,0,1,0,0,0,1


In [106]:
# Let's pickle it for later use
data_dtm.to_pickle("dtm.pkl")

In [107]:
# Let's also pickle the cleaned data (before we put it in document-term matrix format) and the CountVectorizer object
data_clean.to_pickle('data_clean.pkl')
pickle.dump(cv, open("cv.pkl", "wb"))