# Does negativity make success?

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
import statsmodels.formula.api as smf

import warnings
warnings.filterwarnings("ignore")

#### Load the dataframe containing videos from 2019 and their features

This is a big file (860 MB), so we have stored it on Google Drive. Download it from the link below and store it as `generated/2019/2019_videos_Typo_Emojis_NegWords_Sentiment_title_desc.parquet`. It was generated by the notebook `data_processing.ipynb`.

https://drive.google.com/file/d/1RmVSw2MBq0Ps0dwcTQjqZsDAuivXbUaZ/view?usp=share_link

In [None]:
# Location of the 2019 videos parquet file (see the download note above).
filepath = 'generated/2019/2019_videos_Typo_Emojis_NegWords_Sentiment_title_desc.parquet'
# The fastparquet engine is requested explicitly for reading.
videos = pd.read_parquet(path=filepath, engine='fastparquet')

## What is negativity?

### Small intro (examples with sia from vader)

## [[Maybe put something more here]] [[Matteo]]

## Regression analysis [[Djian]]

### Overall

try description and title (look at R to find what is best) [[Djian: description is better]]

In [None]:
def print_regression(data, formula):
    """Fit an OLS regression and print its summary table.

    Parameters
    ----------
    data : the data passed to ``smf.ols`` (the columns named in ``formula``
        are looked up in it).
    formula : str
        Model formula, e.g. ``'y ~ x1 + x2'``.

    Returns
    -------
    None. The fitted model's summary is printed.
    """
    ols_model = smf.ols(formula=formula, data=data)
    # Global NumPy seed is reset right before fitting, as in the original cell.
    np.random.seed(2)
    fitted = ols_model.fit()
    print(fitted.summary())

In [None]:
# Clean the videos table in a single chain so that the result is a new,
# independent DataFrame. The previous version assigned columns on a filtered
# slice, the chained-assignment pattern that pandas flags with
# SettingWithCopyWarning (silenced here by the global warnings filter).
videos = (
    # Remove videos where 'like_count' is NaN
    videos[videos['like_count'].notna()]
    # Convert the count columns to float
    .astype({
        'like_count': float,
        'dislike_count': float,
        'view_count': float,
    })
    # New column for the log of the view count (+1 so that the log is always defined)
    .assign(log_view_count=lambda df: np.log(df['view_count'] + 1))
)

In [None]:
# Regress the log view count on the sentiment scores of both the description
# and the title. The adjacent string literals are concatenated into one formula.
# NOTE(review): if the sia_* columns are VADER neg/neu/pos proportions they sum
# to ~1 per text, which would make them collinear with the intercept — confirm.
f = (
    'log_view_count ~ '
    'sia_negative_description '
    '+ sia_positive_description '
    '+ sia_neutral_description '
    '+ sia_negative_title '
    '+ sia_positive_title '
    '+ sia_neutral_title '
)

print_regression(data=videos, formula=f)

# interesting results: (log)_like_count, dislike_count, log_view_count

### By category

In [None]:
# Find the categories
# Distinct category labels. Missing values (None or NaN) are dropped up front:
# `set.remove(None)` raised KeyError when no None was present and left NaN in
# the set, which can never match a row in an equality filter.
categories = set(videos['categories'].dropna().unique())

In [None]:
# Per-category model: log view count explained by the description sentiment scores.
f = (
    'log_view_count ~ '
    'sia_negative_description '
    '+ sia_positive_description '
    '+ sia_neutral_description '
)

# category -> Series holding the fitted coefficients and their p-values
results_params = dict()

# Iterate in sorted order: iterating the set directly gives an order that can
# differ between kernel restarts, which made the row order of `df_regression`
# (and the x-axis order of the plots built from it) non-reproducible.
for category in sorted(categories):
    videos_category = videos[videos['categories'] == category]
    model = smf.ols(formula=f, data=videos_category)
    np.random.seed(2)
    results = model.fit()
    # Stack coefficients and p-values under a two-level index
    # ('parameter' / 'p-value', term name).
    results_params[category] = pd.concat(
        [results.params, results.pvalues],
        keys=['parameter', 'p-value'],
    )

# One row per category; columns are the (kind, term) pairs.
df_regression = pd.DataFrame(results_params).transpose()

In [None]:
# Keep only the coefficient columns (dropping the p-values), discard the
# intercept, and move the category labels from the index into an 'index' column.
df_regression = (
    df_regression['parameter']
    .drop(columns='Intercept')
    .reset_index()
)

# Long form for seaborn: one row per (category, sentiment term) pair.
df_regression_melt = df_regression.melt(
    id_vars='index',
    var_name='sentiment_type',
    value_name='sentiment_value',
)

In [None]:
# Colors for the plot, keyed by regression term
palette_sentiment = {
    'sia_negative_description': 'crimson',
    'sia_neutral_description': 'gray',
    'sia_positive_description': 'dodgerblue',
    'Intercept': 'black',
}

# Plot the per-category coefficients on an explicit Axes so that the figure
# carries its own labels and title.
fig, ax = plt.subplots()
sns.scatterplot(
    data=df_regression_melt,
    x='index',
    y='sentiment_value',
    hue='sentiment_type',
    s=40,
    marker='D',
    palette=palette_sentiment,
    alpha=0.8,
    ax=ax,
)
ax.set(
    xlabel='Category',
    ylabel='OLS coefficient (response: log view count)',
    title='Description sentiment coefficients by category',
)
# Category names are long: rotate them so they do not overlap.
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# Same plot but with smileys instead of colors.
# Each regression term is drawn with its own mathtext smiley marker.
smiley_markers = {
    'sia_negative_description': '$:($',
    'sia_neutral_description': '$:|$',
    'sia_positive_description': '$:)$',
}

fig, ax = plt.subplots()
for column, marker in smiley_markers.items():
    # Draw on the explicit Axes rather than through the pyplot state machine.
    ax.scatter(
        x=df_regression['index'],
        y=df_regression[column],
        marker=marker,
        label=column,
    )
ax.set(
    xlabel='Category',
    ylabel='OLS coefficient (response: log view count)',
    title='Description sentiment coefficients by category',
)
ax.tick_params(axis='x', labelrotation=90)
ax.legend(title='sentiment_type')
plt.show()

## Evolution of channels with negativity [[Victor]]

## What does successful negativity look like?

### Most used words: the vocabulary of videos that are negative and successful (for different categories) [[Maj]]

Make a 'histogram' of the words used in the title/description of videos that are both very negative and very successful (possibly one per category). Example:

https://ldrame21.github.io/metoo-media-impact/#data-story-title

### Try to extract topics from negative and successful videos