# Import the modules


The cell below imports the sys module, which provides access to system-specific parameters and functions.
It then uses the sys.executable attribute to determine the path to the Python interpreter currently running the script.
The leading ! runs the rest of the line as a shell command; this is a feature of Jupyter notebooks and IPython, not standard Python syntax. The {...} part substitutes the value of a Python expression into that command. In this case, the commands install two Python packages using pip:
nlp_rake: A package for extracting keywords from text using the Rapid Automatic Keyword Extraction (RAKE) algorithm.
wordcloud: A package for creating word clouds from text data.

In [None]:
import sys
# Run pip through the interpreter that is executing this notebook, one package per command.
!{sys.executable} -m pip install nlp_rake
!{sys.executable} -m pip install wordcloud

# Read the page

In [None]:
# Address of the Wikipedia article that this notebook works with.
url = 'https://en.wikipedia.org/wiki/Hurricane_Harvey'

The cell below imports the requests module, which allows you to make HTTP requests in Python.
The requests.get(url) line sends an HTTP GET request to the specified url.
The content attribute of the response object contains the raw bytes of the server’s response.
The .decode('utf-8') method interprets these bytes as UTF-8 and converts them into a Python string (str).
Finally, it prints the first 1000 characters of the decoded text.
Remember that the content attribute of a Requests response holds the data as raw bytes. To work with it as text, and in particular when the page contains non-ASCII characters, you need to decode those bytes into a string, here with .decode('utf-8').

In [None]:
import requests

# Download the page and decode it to text.
# The timeout keeps the cell from waiting forever on an unresponsive server,
# and raise_for_status() stops here on an HTTP error status instead of
# passing an error page on to the cells that follow.
response = requests.get(url, timeout=30)
response.raise_for_status()
text = response.content.decode('utf-8')
print(text[:1000])  # preview the first 1000 characters


The code imports the HTMLParser class from the html.parser module. This class is used for parsing HTML and extracting information from it.
It defines a custom class called MyHTMLParser, which inherits from HTMLParser.

Inside MyHTMLParser, there are three methods:

handle_starttag(self, tag, attrs): 

This method is called when the parser encounters a start tag (e.g., <script> or <style>). If the tag is one of these, it sets the self.script flag to True.

handle_endtag(self, tag): 

This method is called when the parser encounters an end tag. If the tag is <script> or <style>, it sets the self.script flag back to False.
handle_data(self, data):

This method is called when the parser encounters data (text content) between tags. If the data contains only whitespace or if the self.script flag is True, it returns without doing anything. Otherwise, it appends the data (with [ edit ] removed), preceded by a space, to the self.res attribute.
An instance of MyHTMLParser is created with the name parser.
The parser.feed(text) line processes the input text using the custom parser.
Finally, the modified text (stored in parser.res) is printed, showing the first 1000 characters.
The purpose of this code is to clean up the HTML content by discarding the contents of <script> and <style> elements and extracting the visible text. The handle_data method ensures that only relevant text is included in the result.

In [None]:
from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    """HTML parser that gathers a page's text content into ``res``.

    Text met while a ``<script>`` or ``<style>`` element is open is ignored,
    and so are chunks consisting only of whitespace. Every chunk that is kept
    is appended to ``res`` with one leading space.
    """

    # True while the parser is between the start tag and the end tag of a
    # <script> or <style> element.
    script = False
    # Text collected so far.
    res = ""

    def handle_starttag(self, tag, attrs):
        """Raise the skip flag when a script or style element opens."""
        opens_hidden_element = tag.lower() in ("script", "style")
        if opens_hidden_element:
            self.script = True

    def handle_endtag(self, tag):
        """Lower the skip flag when a script or style element closes."""
        closes_hidden_element = tag.lower() in ("script", "style")
        if closes_hidden_element:
            self.script = False

    def handle_data(self, data):
        """Append a text chunk to ``res`` unless it is blank or skipped."""
        if not self.script and data.strip() != "":
            cleaned = data.replace('[ edit ]', '')
            self.res = self.res + ' ' + cleaned

# Feed the downloaded HTML through the parser, then replace `text` with the
# text the parser collected.
parser = MyHTMLParser()
parser.feed(text)
text = parser.res
print(text[:1000])  # preview the first 1000 characters

In [None]:
import nlp_rake
# Run the RAKE extractor over the text.
# NOTE(review): presumably max_words caps the number of words in a phrase,
# min_freq is the minimum number of occurrences and min_chars the minimum
# phrase length; confirm against the nlp_rake documentation.
extractor = nlp_rake.Rake(max_words=2,min_freq=5,min_chars=10)
# Later cells unpack `res` as a sequence of two-item pairs.
res = extractor.apply(text)
res

In [None]:
import matplotlib.pyplot as plt

def plot(pair_list):
    """Draw a bar chart from two-item ``(label, value)`` pairs.

    Args:
        pair_list: Iterable of pairs. The first items become the x-axis tick
            labels (drawn vertically) and the second items the bar heights.

    Returns:
        None. If ``pair_list`` is empty, nothing is drawn.
    """
    pairs = list(pair_list)
    if not pairs:
        # zip(*[]) yields nothing, so the two-name unpacking below would
        # raise ValueError; there is nothing to plot in that case anyway.
        return
    labels, heights = zip(*pairs)
    positions = range(len(labels))
    plt.bar(positions, heights)
    plt.xticks(positions, labels, rotation='vertical')
    plt.show()

# Show the pairs in `res` as a bar chart.
plot(res)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Word cloud built from the pairs in `res`.
wc = WordCloud(background_color='white',width=800,height=600)
plt.figure(figsize=(15,7))
# `res` is a sequence of two-item pairs; convert it to the dict that is
# passed to generate_from_frequencies.
plt.imshow(wc.generate_from_frequencies({ k:v for k,v in res }))

# Plot the picture

In [None]:
# Word cloud generated directly from the full text rather than from `res`.
plt.figure(figsize=(15,7))
plt.imshow(wc.generate(text))


In [None]:
# Generate the cloud from the text again and write it to an image file.
wc.generate(text).to_file('ds_wordcloud.png')