In [None]:
# Import the necessary libraries that will be used in the project
# Web scraping libraries
from bs4 import BeautifulSoup

# Ability to make HTTP requests
import requests

# Data visualization libraries
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import plotly.express as px

# Data processing libraries
import pandas as pd
from transformers import pipeline

# **"The cool thing about webscraping is that the internet becomes your database"**
- *Keith Galli*

## **What is Sentiment Analysis?**

Sentiment analysis is the process of analysing and quantifying the emotion expressed in a piece of text and assigning it a label. The label is typically positive, negative, or neutral.

## **Import libraries**

## **Get data**
The first thing we need to do is choose a data source. I'll use [BBC News](https://www.bbc.co.uk/news) and choose an article URL.

Assign that URL, as a `string`, to the `website` variable.

In [None]:
# Provide the URL of the BBC news story you would like to scrape
website = 'https://www.bbc.co.uk/news/business-68230697'

# Request the page. requests applies no timeout by default, so without one a
# stalled connection would leave this cell running indefinitely.
data = requests.get(website, timeout=30)

# Display the response object; <Response [200]> means the request succeeded
data

Hopefully a response of 200 is returned after executing the code above, which means the connection to the target URL was successful.

Now let's take a look at the content:


In [None]:
# Add .content onto the end of data to display the raw body of the response
# (the page exactly as the server sent it, before any parsing)
data.content

This data is really hard to work with. To get at the article itself, we need to investigate the website's code, select the wrapper around each block of text, and then collect the text inside it.

This is achieved by using `BeautifulSoup` to extract the data from the HTML code.

Pass data.content into the `BeautifulSoup()` function.

In [None]:
# Parse the downloaded HTML. The parser is named explicitly: when it is left
# out, BeautifulSoup picks whichever parser happens to be installed and emits a
# GuessedAtParserWarning, so the parsed tree can differ between machines.
# "html.parser" ships with Python, so no extra dependency is needed.
soup = BeautifulSoup(data.content, "html.parser")
soup

The webpage is converted into a `BeautifulSoup` object but it still doesn't look very user-friendly.

The `find_all` function from `BeautifulSoup` is used to collect every `div` element whose `data-component` attribute is `text-block`.

In [None]:
# Attribute filter that identifies the wrappers around the blocks of text
text_block_filter = {"data-component": "text-block"}

# Gather every <div> whose attributes match the filter
text_blocks = soup.find_all("div", attrs=text_block_filter)
text_blocks

Use a `for loop` to isolate just the text from `text_blocks`.  

To do this, write a `for` loop over `text_blocks` and `print` the `item.text` of each item.

In [None]:
# For loop syntax:
# for x in y:
#       do something

# Get just the text of each block using a loop: every element of text_blocks
# is visited in turn and its .text (the content without the HTML tags) is printed.
for item in text_blocks:
  print(item.text)

The webpage scraping is now complete. This is just plain text data.  Next steps:

* Process it so we can visualise it with a word cloud.
* Process it so the sentiment analyser can read it.

Let's do our sentiment analysis first.

# **Sentiment Analysis**

**SPOILER ALERT**: News is often perceived as quite a "negative" source of information.

In [None]:
# Process data for the sentiment analyser: one plain-text string per block
text_list = [item.text for item in text_blocks]

# Load the pre-trained sentiment model:
sentiment_analyser = pipeline("sentiment-analysis", model="finiteautomata/bertweet-base-sentiment-analysis")

# Pass in data to the sentiment analyser. BERTweet models accept at most
# 128 tokens per input, so longer paragraphs are truncated to that length
# instead of letting a single over-long block crash the whole run.
sentiments = sentiment_analyser(text_list, truncation=True, max_length=128)

# Take a look at the sentiments:
sentiments

The sentiment of every block is analysed. The next step is to process it again and place it into a `DataFrame` which is a kind of table that can be further analysed.

In [None]:
# Build one record per text block, keyed "block 1", "block 2", ...
# Each record pairs the block's text with the label and score it was given.
sentiment_dictionary = {
    f"block {number}": {
        "text": text,
        "sentiment_label": sentiment["label"],
        "sentiment_score": sentiment["score"],
    }
    for number, (text, sentiment) in enumerate(zip(text_list, sentiments), start=1)
}

# Load the records into a DataFrame (one column per block at this stage)
df = pd.DataFrame(sentiment_dictionary)
df

Transpose the results using pandas by calling our dataframe and adding .T onto the end of it.

In [None]:
# Build the table from sentiment_dictionary and add .T onto the end to
# transpose it, so that each block becomes a row. Starting from the dictionary
# rather than from the current `df` keeps this cell idempotent: running it a
# second time gives the same table instead of flipping it back.
df = pd.DataFrame(data=sentiment_dictionary).T
df

Count how many blocks received each sentiment label and store the counts in a new dataframe, so that the sentiments can be measured.

In [None]:
# Tally how often each sentiment label occurs
label_counts = df["sentiment_label"].value_counts()

# Turn the tally into a table: moving the labels out of the index gives two
# columns, which are then named 'sentiment' (the label) and 'count' (its tally)
df2 = pd.DataFrame(label_counts).reset_index()
df2.columns = ["sentiment", "count"]
df2

# **Data Visualisation**

Creating a word cloud provides an idea of the wording used within the article.

In [None]:
# Alter these to change the size of the displayed word cloud (in inches).
# Matplotlib's figsize is given as (width, height), so the wider dimension
# belongs to figsize_width.
figsize_width = 20
figsize_height = 10

# Join all text into a single string:
text_string = " ".join(item.text for item in text_blocks)

# Pass it into WordCloud (collocations=False counts single words only,
# so common word pairs are not shown as separate entries):
word_cloud = WordCloud(collocations=False, background_color='white', width=800, height=400).generate(text_string)

# Create visualisation parameters:
plt.figure(figsize=(figsize_width, figsize_height))
plt.imshow(word_cloud,
           interpolation='bilinear',
           aspect="auto")
plt.axis("off")

# Visualise!
plt.show()

Visualise the sentiments by percentage with a chart.

In [None]:
# Configure colours
positive = "#777ae4"
negative = "#ee9247"
neutral = "#e6e7f8"

# Compile pie chart (hole=.5 leaves the centre empty, giving a donut shape).
# color='sentiment' is needed for the colour mapping to take effect:
# color_discrete_map is keyed by the values of the column passed as `color`,
# so without it the POS/NEG/NEU colours above would be ignored.
chart = px.pie(df2,
            hole=.5,
            values='count',
            names='sentiment',
            color='sentiment',
            color_discrete_map={
                "POS": positive,
                "NEG": negative,
                "NEU": neutral
            },
            title='Pie chart showing percentage of sentiment')
chart