# Exploring the data

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt

#### Load the dataframe containing videos from 2019 and their features

This is a big file (860 MB), so we have stored it on Google Drive. Download it from the link below and store it as `generated/2019/2019_videos_Typo_Emojis_NegWords_Sentiment_title_desc.parquet`. It was generated by the notebook `data_processing.ipynb`.

https://drive.google.com/file/d/1RmVSw2MBq0Ps0dwcTQjqZsDAuivXbUaZ/view?usp=share_link

In [None]:
# Relative path of the pre-computed 2019 feature table.
filepath = 'generated/2019/2019_videos_Typo_Emojis_NegWords_Sentiment_title_desc.parquet'

# Read the parquet file into a dataframe with the fastparquet engine.
videos = pd.read_parquet(path=filepath, engine='fastparquet')

What fraction of the dataset do videos from 2019 represent?

In [None]:
# Hard-coded size of the whole dataset that the 2019 subset is compared against.
nb_videos_total = 73e6
# Number of rows loaded for 2019.
nb_videos_2019 = len(videos)

print(f'Fractions of videos that are from 2019: {(nb_videos_2019 / nb_videos_total):.1%}')

What columns do we have in our dataframe?

Fields that were already present in the original dataset from Zenodo:

- `categories`
- `channel_id`
- `crawl_date`
- `dislike_count`
- `display_id`
- `duration`
- `like_count`
- `upload_date`
- `view_count`

Features we have created:

- Number of exclamation marks in the title (or the description) of the video: `count_excl_marks_title`, `count_excl_marks_description`.

- Number of words in all caps in the title (or the description) of the video: `count_upper_words_title`, `count_upper_words_description`.

- Number of negative emojis in the title (or the description) of the video: `count_negative_emojis_title`, `count_negative_emojis_description`.

- Number of (negative) words in the title (or the description) of the video: `count_words_title`, `count_negative_words_title`, `count_words_description`, `count_negative_words_description`.

- Intensity scores of the 'negative', 'neutral' and 'positive' sentiments (plus the compound score) in the title (or the description) of the video, according to VADER: `sia_negative_title`, `sia_neutral_title`, `sia_positive_title`, `sia_compound_title`, `sia_negative_description`, `sia_neutral_description`, `sia_positive_description`, `sia_compound_description`.

In [None]:
# Print a concise summary of the dataframe (its columns and their dtypes).
videos.info()

### Look at the distribution of the features

We look at ratios since what matters for the negativity is the fraction of negative words (etc.) and not the absolute count.

In [None]:
# Build the 'Field' labels that the plots use as hue.
# Layout: one 'description' label per video first, then one 'title' label per
# video, so per-field values must be concatenated in that same order.

n_videos = len(videos)
type_text_desc = n_videos * ['description']
type_text_title = n_videos * ['title']
type_text = np.concatenate([type_text_desc, type_text_title], axis=None)

#### Typography of the text

In [None]:
# Per-word ratios for each text field.
# Values are stacked description-first, then title, to line up with `type_text`.

words_title = videos['count_words_title']
words_description = videos['count_words_description']

ratio_excl_title = videos['count_excl_marks_title'].div(words_title)
ratio_excl_description = videos['count_excl_marks_description'].div(words_description)
ratio_excl = np.concatenate([ratio_excl_description, ratio_excl_title], axis=None)

ratio_upper_title = videos['count_upper_words_title'].div(words_title)
ratio_upper_description = videos['count_upper_words_description'].div(words_description)
ratio_upper_words = np.concatenate([ratio_upper_description, ratio_upper_title], axis=None)


# Long-format table: one row per (video, field) pair

ratio_typo = pd.DataFrame({
    'Field': type_text,
    'Fraction of exclamation marks (out of words)': ratio_excl,
    'Fraction of upper words': ratio_upper_words,
})


# Histograms, one panel per feature

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))

# (column to plot, panel it is drawn in): upper-case words go on the left,
# exclamation marks on the right.
panels = [
    ('Fraction of exclamation marks (out of words)', axes[1]),
    ('Fraction of upper words', axes[0]),
]
for column, ax in panels:
    histogram = sns.histplot(ratio_typo, x=column, bins=50, hue="Field", element="step", ax=ax)
    histogram.set(yscale='log')  # counts are shown on a log scale

fig.suptitle('Typographic features')
plt.tight_layout()
plt.show()

#### Emojis

In [None]:
# Compute the ratios (negative emojis per word).
# Values are stacked description-first, then title, so that they line up with
# the labels in `type_text`.

ratio_emojis_title = videos['count_negative_emojis_title'] / videos['count_words_title']
ratio_emojis_description = videos['count_negative_emojis_description'] / videos['count_words_description']
ratio_emojis_values = np.concatenate((ratio_emojis_description, ratio_emojis_title), None)


# Plot

# `ratio_emojis` is bound only once, to the long-format dataframe that is
# plotted (the raw stacked values live in `ratio_emojis_values`).
ratio_emojis = pd.DataFrame({
    'Field': type_text,
    'Fraction of negative emojis (out of words)': ratio_emojis_values
})

# Create the figure and its axes explicitly and hand the axes to seaborn,
# instead of relying on pyplot's implicit "current axes". This matches the
# multi-panel cells of this notebook and always draws on a fresh figure.
fig, ax = plt.subplots()
sns.histplot(ratio_emojis, x='Fraction of negative emojis (out of words)', bins=50, hue="Field", element="step", ax=ax).set(yscale ='log')

ax.set_title('Negative emojis')
fig.tight_layout()
plt.show()

#### Negative words

In [None]:
# Compute the ratios (negative words per word).

ratio_neg_words_title = videos['count_negative_words_title']/videos['count_words_title']
ratio_neg_words_description = videos['count_negative_words_description']/videos['count_words_description']


# Plot

# Values are stacked description-first, then title, so that they line up with
# the labels in `type_text`.
ratio_neg_words = pd.DataFrame({
    'Field': type_text,
    'Fraction of negative words': np.concatenate((ratio_neg_words_description, ratio_neg_words_title), None)
})

# Create the figure and its axes explicitly and hand the axes to seaborn,
# instead of relying on pyplot's implicit "current axes". This matches the
# multi-panel cells of this notebook and always draws on a fresh figure.
fig, ax = plt.subplots()
sns.histplot(ratio_neg_words, x='Fraction of negative words', bins=50, hue="Field", element="step", ax=ax).set(yscale ='log')

ax.set_title('Negative words')
fig.tight_layout()
plt.show()

#### Sentiment intensity

In [None]:
# Plot label -> (description column, title column) holding that score.
# The insertion order fixes both the dataframe column order and the panel order.
sentiment_sources = {
    'Negative sentiment': ('sia_negative_description', 'sia_negative_title'),
    'Positive sentiment': ('sia_positive_description', 'sia_positive_title'),
    'Neutral sentiment': ('sia_neutral_description', 'sia_neutral_title'),
    'Compound': ('sia_compound_description', 'sia_compound_title'),
}

# Long-format table: description scores first, then title scores, matching `type_text`.
sentiment_data = {'Field': type_text}
for label, (description_col, title_col) in sentiment_sources.items():
    sentiment_data[label] = np.concatenate([videos[description_col], videos[title_col]], axis=None)
sentiment = pd.DataFrame(sentiment_data)

# One panel per score, filled row by row on a 2x2 grid.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 12))
for ax, label in zip(axes.flat, sentiment_sources):
    histogram = sns.histplot(sentiment, x=label, bins=50, hue="Field", element="step", ax=ax)
    histogram.set(yscale='log')  # counts are shown on a log scale

fig.suptitle('Sentiment intensity (Vader)')
plt.tight_layout()
plt.show()