# Visualizations - examples

In [None]:
import pandas as pd
from lib.sentiment_analysis_utils import (
    convert_to_only_best_sentiment,
    correct_literals,
)
from visualizations import (
    plot_sentiment_barplot,
    plot_aspect_sentiment_barplot,
    plot_sentiment_over_time,
)
from matplotlib import pyplot as plt

In [None]:
# Predictions for 1600 articles. The CSV itself is not included in the
# repository because of an NDA.
df = convert_to_only_best_sentiment(
    correct_literals(pd.read_csv("whole_dataset_results_absa_1600.csv"))
)

## Sentiment by keywords (available in STA API)

The top 10 keywords are selected based on the number of their appearances across all articles in the loaded dataframe. We calculate aspect-based sentiment for these keywords (so it is not the same as the overall sentiment of an article). We can observe that topics such as press, government, or minister are negative much more often than other keywords.

In [None]:
# Configure padding before plotting: Matplotlib copies "axes.labelpad" into an
# axis when the axes are created, so assigning it after the plot call (as this
# cell used to do) only influenced figures drawn by later cells.
plt.rcParams["axes.labelpad"] = 5
plt.rcParams["axes.titlepad"] = 30
plot_aspect_sentiment_barplot(
    df, percentage=True, by_column="keywords_sentiment", top_n=10, top_based_on="sum"
)
# The paddings are also passed explicitly so the label and title set here do
# not depend on how the helper creates its axes.
plt.ylabel("Keyword", fontsize=12, labelpad=5)
plt.title(
    "Percentage distribution of aspect-based sentiment for the most common keywords",
    fontsize=14,
    pad=30,
)
plt.show()

Here is a plot presenting the same data, but using the number of articles instead of percentages.

In [None]:
# Set the padding defaults before the axes exist: "axes.labelpad" is read when
# an axis is created, so assigning it after plotting had no effect on this
# figure and only leaked into later ones.
plt.rcParams["axes.labelpad"] = 5
plt.rcParams["axes.titlepad"] = 30
plot_aspect_sentiment_barplot(
    df, percentage=False, by_column="keywords_sentiment", top_n=10, top_based_on="sum"
)
# Explicit labelpad/pad keep this cell's output independent of run order.
plt.ylabel("Keyword", fontsize=12, labelpad=5)
plt.xlabel("Number of occurrences in news", fontsize=12, labelpad=5)
plt.title("Aspect-based sentiment of the most common keywords", fontsize=14, pad=30)
plt.show()

It is also possible to select the top keywords based on their negativity (or neutrality, or positivity) - this can be done based on either the number of articles or percentages. The plot below shows sentiment across the keywords that are most often negative (taking into account only those keywords that appeared at least 5 times in the news).

In [None]:
# Set the padding defaults before the axes exist: "axes.labelpad" is read when
# an axis is created, so assigning it after plotting had no effect on this
# figure (it kept whatever value an earlier cell had left behind).
plt.rcParams["axes.labelpad"] = 12
plt.rcParams["axes.titlepad"] = 30
plot_aspect_sentiment_barplot(
    df,
    percentage=False,
    by_column="keywords_sentiment",
    top_n=8,
    top_based_on=0,
    based_on_percentage=True,
    min_n_to_consider=5,
)
# Explicit labelpad/pad keep this cell's output independent of run order.
plt.ylabel("Keyword", fontsize=12, labelpad=12)
plt.xlabel("Number of occurrences in news", fontsize=12, labelpad=12)
plt.title("Aspect-based sentiment of the most negative keywords", fontsize=14, pad=30)
plt.show()

Below is the same type of plot, but for keywords that were mostly positive.

In [None]:
# Set the padding defaults before the axes exist: "axes.labelpad" is read when
# an axis is created, so assigning it after plotting does not reach the figure
# that was just drawn.
plt.rcParams["axes.labelpad"] = 12
plt.rcParams["axes.titlepad"] = 30
plot_aspect_sentiment_barplot(
    df,
    percentage=False,
    by_column="keywords_sentiment",
    top_n=8,
    top_based_on=2,
    based_on_percentage=True,
    min_n_to_consider=5,
)
# Explicit labelpad/pad keep this cell's output independent of run order.
plt.ylabel("Keyword", fontsize=12, labelpad=12)
plt.xlabel("Number of occurrences in news", fontsize=12, labelpad=12)
plt.title("Aspect-based sentiment of the most positive keywords", fontsize=14, pad=30)
plt.show()

## Sentiment by found named entities

Here are plots that present the most common named entities found in English news, together with the aspect-based sentiment predicted for these entities. We select only the 10 most common entities, but this can be changed with the *top_n* parameter.

In [None]:
# Set the padding defaults before the axes exist: "axes.labelpad" is read when
# an axis is created, so assigning it after plotting does not reach the figure
# that was just drawn.
plt.rcParams["axes.labelpad"] = 12
plt.rcParams["axes.titlepad"] = 30
plot_aspect_sentiment_barplot(
    df, percentage=False, by_column="ner_sentiment", top_n=10, top_based_on="sum"
)
# Explicit labelpad/pad keep this cell's output independent of run order.
plt.ylabel("Entity", fontsize=12, labelpad=12)
plt.xlabel("Number of occurrences in news", fontsize=12, labelpad=12)
plt.title("Aspect-based sentiment of the most common entities", fontsize=14, pad=30)
plt.show()

The same data is presented here, but with percentages instead of counts.

In [None]:
# Set the padding defaults before the axes exist: "axes.labelpad" is read when
# an axis is created, so assigning it after plotting does not reach the figure
# that was just drawn.
plt.rcParams["axes.labelpad"] = 12
plt.rcParams["axes.titlepad"] = 30
plot_aspect_sentiment_barplot(
    df, percentage=True, by_column="ner_sentiment", top_n=10, top_based_on="sum"
)
# Explicit labelpad/pad keep this cell's output independent of run order.
plt.ylabel("Entity", fontsize=12, labelpad=12)
plt.title(
    "Percentage distribution of aspect-based sentiment for the most common entities",
    fontsize=14,
    pad=30,
)
plt.show()

In [None]:
# Overall (per-article) sentiment predictions. As before, the CSV is not
# shared because it contains sensitive data.
df = correct_literals(
    pd.read_csv("full_dataseet_overall_sentiment_twitter-roberta.csv")
)

# Column holding the numeric sentiment code; swap in the commented-out line to
# read the other prediction column instead.
sentiment_column_name = "overall_sentiment_twitter-roberta"
# sentiment_column_name = "overall_sentiment"

# Translate the numeric codes into readable labels. Rows whose code is not
# 0, 1 or 2 keep None.
df["sentiment"] = None
for code, label in ((0, "Negative"), (1, "Neutral"), (2, "Positive")):
    df.loc[df[sentiment_column_name] == code, "sentiment"] = label

We predict the overall sentiment of each article and group the results by category. We can observe which categories are more negative, and which are more positive or neutral.

In [None]:
# Set the padding defaults before the axes exist: "axes.labelpad" is read when
# an axis is created, so assigning it after plotting does not reach the figure
# that was just drawn.
plt.rcParams["axes.labelpad"] = 12
plt.rcParams["axes.titlepad"] = 30
plot_sentiment_barplot(df, by_column="categories", percentage=False, top_n=8)
# Explicit labelpad/pad keep this cell's output independent of run order.
plt.ylabel("Category", fontsize=12, labelpad=12)
plt.xlabel("Number of news", fontsize=12, labelpad=12)
plt.title("Document-based sentiment by news category", fontsize=14, pad=30)
plt.show()

It can also be plotted with percentages. Here we limit the plot to the top 8 categories because of the very low number of Advisory news items in our dataframe.

In [None]:
# Set the padding defaults before the axes exist: "axes.labelpad" is read when
# an axis is created, so assigning it after plotting does not reach the figure
# that was just drawn.
plt.rcParams["axes.labelpad"] = 12
plt.rcParams["axes.titlepad"] = 30
plot_sentiment_barplot(df, by_column="categories", percentage=True, top_n=8)
# Explicit labelpad/pad keep this cell's output independent of run order.
plt.ylabel("Category", fontsize=12, labelpad=12)
plt.title(
    "Percentage distribution of document-based sentiment by news category",
    fontsize=14,
    pad=30,
)
plt.show()

We can also see how sentiment is distributed across authors by using the byline data from the STA API. We do not share the results here, as they might be sensitive. You can run this yourself by loading an appropriate dataframe (only if you have access to STA data) and uncommenting the line below.

In [None]:
# Intentionally left disabled: byline (author) results may be sensitive.
# Uncomment the call below to plot sentiment per author.
# plot_sentiment_barplot(df, by_column="byline", percentage=False, top_n=10)

In [None]:
# Keep only the rows that received a sentiment label.
df = df.dropna(subset=["sentiment"])

Another possibility with our solution is monitoring how sentiment changes over time. As before, you can specify whether you want to see the number of articles or the percentages of negatives vs. positives. The plot below shows the number of articles by sentiment within the given time periods.

In [None]:
# Article counts per sentiment class, bucketed with interval_len=2 and with
# the last interval kept.
plot_sentiment_over_time(
    df,
    sentiments=["Negative", "Neutral", "Positive"],
    interval_len=2,
    cut_last_interval=False,
    percentage=False,
)

The length of the time interval in each "bucket" can be set with the *interval_len* parameter. Here, we increase it and switch from counts to percentages.

In [None]:
# Set the padding defaults before the axes exist: "axes.labelpad" is read when
# an axis is created, so assigning it after plotting does not reach the figure
# that was just drawn.
plt.rcParams["axes.labelpad"] = 12
plt.rcParams["axes.titlepad"] = 30
plot_sentiment_over_time(
    df,
    interval_len=4,
    percentage=True,
    cut_last_interval=True,
    sentiments=["Negative", "Neutral", "Positive"],
)
# Explicit labelpad/pad keep this cell's output independent of run order.
plt.ylabel("Percentage", fontsize=12, labelpad=12)
plt.title(
    "Percentage distribution of document-based sentiment over the time",
    fontsize=14,
    pad=30,
)
plt.show()

In [None]:
# Set the padding defaults before the axes exist: "axes.labelpad" is read when
# an axis is created, so assigning it after plotting does not reach the figure
# that was just drawn.
plt.rcParams["axes.labelpad"] = 12
plt.rcParams["axes.titlepad"] = 30
plot_sentiment_over_time(
    df,
    interval_len=4,
    percentage=False,
    cut_last_interval=True,
    sentiments=["Negative", "Neutral", "Positive"],
)
# Explicit labelpad/pad keep this cell's output independent of run order.
plt.ylabel("Number of news", fontsize=12, labelpad=12)
plt.title(
    "Number of news by their overall sentiment over the time", fontsize=14, pad=30
)
plt.show()