# Using hugging face to parse a website and summarize

In [2]:
!pip -q install transformers

In [4]:
!pip -q install BeautifulSoup4

In [5]:
from transformers import pipeline
from bs4 import BeautifulSoup
import requests

## Load summarisation pipeline

In [6]:
summarizer = pipeline("summarization")

Downloading:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

### Get blog post

In [7]:
URL = "https://medium.com/the-ascent/i-discovered-its-never-too-late-to-invest-in-bitcoin-if-you-understand-it-d0848141144b"

In [8]:
r = requests.get(URL)

In [9]:
soup = BeautifulSoup(r.text, 'html.parser')
results = soup.find_all(['h1', 'p'])
text = [result.text for result in results]
ARTICLE = ' '.join(text)

In [11]:
ARTICLE

'I Discovered It’s Never Too Late to Invest in Bitcoin If You Understand It Bitcoin has gone mainstream. There are some people who hate me for it. I’m getting quite a few people all asking me the same question: “Is it too late to invest in bitcoin?” I don’t think it is. Bitcoin is a paradigm shift in a world that is high on worthless currency printed in excess. The only way to fund our excessive lifestyles is to increase the global debt and pay for it with money printed out of thin air. My approach to bitcoin is different. It’s a little less drastic. I’ve been playing with it for years and have become fascinated by it. The rewards for me understanding bitcoin are now over 17,900%+ on the initial investment. I don’t think I’ve ever made another investment like it. It’s likely I will never make another investment like it for the rest of my life. There may never be another investment like it. The unfortunate psychological challenge when you buy something that performs beyond your wildest 

### Chunk text

In [12]:
max_chunk = 500

In [13]:
ARTICLE = ARTICLE.replace('.', '.<eos>')
ARTICLE = ARTICLE.replace('?', '?<eos>')
ARTICLE = ARTICLE.replace('!', '!<eos>')

In [15]:
sentences = ARTICLE.split('<eos>')
current_chunk = 0 
chunks = []
for sentence in sentences:
    if len(chunks) == current_chunk + 1: 
        if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
            chunks[current_chunk].extend(sentence.split(' '))
        else:
            current_chunk += 1
            chunks.append(sentence.split(' '))
    else:
        print(current_chunk)
        chunks.append(sentence.split(' '))

for chunk_id in range(len(chunks)):
    chunks[chunk_id] = ' '.join(chunks[chunk_id])

0


In [16]:
len(chunks)

6

### Summarize text

In [17]:
res = summarizer(chunks, max_length=120, min_length=30, do_sample=False)

In [18]:
res[0]

{'summary_text': " Bitcoin is a paradigm shift in a world that is high on worthless currency printed in excess . The rewards for me understanding bitcoin are now over 17,900%+ on the initial investment . It's likely I will never make another investment like it for the rest of my life . Bitcoin confuses people and that’s what it was designed to do ."}

In [19]:
' '.join([summ['summary_text'] for summ in res])

" Bitcoin is a paradigm shift in a world that is high on worthless currency printed in excess . The rewards for me understanding bitcoin are now over 17,900%+ on the initial investment . It's likely I will never make another investment like it for the rest of my life . Bitcoin confuses people and that’s what it was designed to do .  Bitcoin keeps being the best-performing asset each year, and is now the best performing asset of the decade . The problem with bitcoin is you can never buy at a good price . Bitcoin has a fixed supply of coins and predictable code built into it that tells you its future . Investment firms are buying bitcoin, not because they want to, but because they have to .  When euphoria hits an asset like bitcoin, people lose their minds and throw a wall of money at it . The biggest risk of bitcoin has always been the potential for it to be banned . The likelihood of bitcoin getting banned is almost zero, but regulation just means you will have to be identified when yo

In [20]:
text = ' '.join([summ['summary_text'] for summ in res])

### Output to file

In [None]:
with open('summary.txt', 'w') as f:
    f.write(text)