# Does negativity make success?

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
import statsmodels.formula.api as smf
from collections import Counter
from helpers import text_from_ids, neg_words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import operator
from wordcloud import WordCloud
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [None]:
# NOTE(review): `wordcloud` is already imported by the import cell above, so on a
# fresh environment this install has to run before that cell can succeed.
!pip install wordcloud

#### Load the dataframe containing videos from 2019 and their features

This is a big file (860 MB), so we have stored it on Google Drive. Download it from the link below and store it as `generated/2019/2019_videos_Typo_Emojis_NegWords_Sentiment_title_desc.parquet`. It was generated by the notebook `data_processing.ipynb`.

https://drive.google.com/file/d/1RmVSw2MBq0Ps0dwcTQjqZsDAuivXbUaZ/view?usp=share_link

In [None]:
# Location of the parquet file described in the markdown cell above.
filepath = 'generated/2019/2019_videos_Typo_Emojis_NegWords_Sentiment_title_desc.parquet'
# Read the whole file into memory using the fastparquet engine.
videos = pd.read_parquet(filepath, engine='fastparquet')

In [None]:
# Preview the first rows of the loaded dataframe.
videos.head()

## What is negativity?

### Small intro (examples with sia from vader)

## [[Maybe put something more here]] [[Matteo]]

## Regression analysis [[Djian]]

### Overall

try description and title (look at R to find what is best) [[Djian: description is better]]

In [None]:
def print_regression(data, formula):
    """Fit an ordinary-least-squares model and print its summary table.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding every column named in ``formula``.
    formula : str
        Model formula handed to ``smf.ols`` (e.g. ``'y ~ x1 + x2'``).

    Returns
    -------
    The fitted results object produced by ``model.fit()``, so that callers can
    reuse the parameters or p-values instead of only reading the printed
    summary. Existing callers that ignore the return value are unaffected.
    """
    model = smf.ols(formula=formula, data=data)
    # Kept from the original cell: puts NumPy's global RNG in a fixed state
    # before fitting.
    np.random.seed(2)
    results = model.fit()
    print(results.summary())
    return results

In [None]:
# Remove videos where 'like_count' is NaN.
# `.copy()` makes the filtered frame independent, so the column assignments
# below write to a real dataframe rather than to a slice of the previous one.
videos = videos[videos['like_count'].notna()].copy()

# For every count: cast to float, then add a log-transformed column.
# The +1 keeps the logarithm defined when a count is 0.
for count_column in ['view_count', 'like_count', 'dislike_count']:
    videos[count_column] = videos[count_column].astype(float)
    videos[f'log_{count_column}'] = np.log(videos[count_column] + 1)

In [None]:
success_factors = ['log_view_count', 'log_like_count', 'log_dislike_count']


def regression_formula(success_factor):
    """Build the regression formula for one success metric.

    The response is ``success_factor``; the predictors are the three
    description sentiment columns (negative, positive, neutral), in that
    order. The returned string ends with a single trailing space.

    NOTE(review): if these three columns are VADER's neg/pos/neu scores they
    sum to roughly 1 per row, which would make them collinear with the
    intercept -- confirm before interpreting individual coefficients.
    """
    predictors = (
        'sia_negative_description',
        'sia_positive_description',
        'sia_neutral_description',
    )
    return f'{success_factor} ~ ' + ' + '.join(predictors) + ' '


# One formula per success metric, in the order of `success_factors`.
formulas = list(map(regression_formula, success_factors))

for f in formulas:
    print(f'Regression analysis for formula \n{f}')
    print_regression(data=videos, formula=f)
    # Two empty lines separate consecutive summaries.
    print('\n')

### By category

In [None]:
# Find the categories.
# `dropna()` discards missing labels (None as well as NaN) before the set is
# built. The previous `categories.remove(None)` raised KeyError whenever no
# None was present and left NaN labels in the set.
categories = set(videos['categories'].dropna().unique())

In [None]:
# Maps each success factor to a dataframe with one row per category and a
# two-level column index: ('parameter' | 'p-value', model term).
regression_for_success_factor = dict()

for success_factor in success_factors:
    f = regression_formula(success_factor)

    results_params_f = dict()

    # `categories` is a set, whose iteration order for strings can differ from
    # one kernel session to the next. Sorting makes the row order -- and hence
    # the x-axis of the plots built from it -- reproducible.
    for category in sorted(categories):
        videos_category = videos[videos['categories'] == category]
        model = smf.ols(formula=f, data=videos_category)
        np.random.seed(2)
        results = model.fit()
        results_params_f[category] = pd.concat(
            [results.params, results.pvalues], keys=['parameter', 'p-value']
        )

    df_regression = pd.DataFrame(results_params_f).transpose()

    regression_for_success_factor[success_factor] = df_regression

In [None]:
def plot_regression(df_regression, title=None):
    """Scatter the per-category sentiment coefficients of one regression.

    Parameters
    ----------
    df_regression : pandas.DataFrame
        One row per category. Must have a top-level column ``'parameter'``
        containing ``'Intercept'`` and the three ``sia_*_description`` terms.
        The frame is not modified.
    title : str, optional
        Title drawn above the axes; omitted when ``None`` (the default), which
        reproduces the previous untitled figure.
    """
    # Keep the fitted parameters only, without p-values and without the intercept.
    coefficients = df_regression['parameter'].drop('Intercept', axis=1)
    category_labels = list(coefficients.index)

    # (column, marker, colour, legend label) for each sentiment score.
    series_styles = [
        ('sia_negative_description', '$:($', 'crimson', 'negative'),
        ('sia_neutral_description', '$:|$', 'gray', 'neutral'),
        ('sia_positive_description', '$:)$', 'dodgerblue', 'positive'),
    ]

    fig, ax = plt.subplots(figsize=(8, 4))
    for column, marker, color, label in series_styles:
        ax.scatter(
            x=category_labels,
            y=coefficients[column],
            marker=marker,
            color=color,
            s=50,
            label=label,
        )
    ax.tick_params(axis='x', labelrotation=90)
    # The plotted values are fitted coefficients, not log counts.
    ax.set_ylabel('regression coefficient')
    ax.legend(title='sentiment score')
    if title is not None:
        ax.set_title(title)
    plt.show()

In [None]:
# Draw one coefficient plot per success factor, each preceded by a heading line.
for success_f in success_factors:
    print('Linear regression for {}'.format(success_f))
    df_for_factor = regression_for_success_factor[success_f]
    plot_regression(df_for_factor)
    print()

In [None]:
# Alternative: seaborn plot
def plot_regression_seaborn(df_regression):
    """Seaborn variant of the per-category coefficient plot.

    This replaces the block that was previously disabled by wrapping it in a
    string literal. It is defined but not called in this notebook; call it with
    one of the frames stored in ``regression_for_success_factor`` to use it.

    Parameters
    ----------
    df_regression : pandas.DataFrame
        One row per category, with a top-level column ``'parameter'`` holding
        ``'Intercept'`` and the three ``sia_*_description`` terms. The frame is
        not modified.
    """
    # Colors for the plot
    palette_sentiment = {
        'sia_negative_description': 'crimson',
        'sia_neutral_description': 'gray',
        'sia_positive_description': 'dodgerblue',
    }

    # Drop p-values and `Intercept`, move the category index into a column,
    # then convert to long form as expected by seaborn.
    df_regression_melt = (
        df_regression['parameter']
        .drop('Intercept', axis=1)
        .reset_index()
        .melt('index', var_name='sentiment_type', value_name='sentiment_value')
    )

    fig, ax = plt.subplots(figsize=(8, 4))
    sns.scatterplot(
        data=df_regression_melt,
        x='index',
        y='sentiment_value',
        hue='sentiment_type',
        s=40,
        marker='D',
        palette=palette_sentiment,
        alpha=0.8,
        ax=ax,
    )
    ax.tick_params(axis='x', labelrotation=90)
    plt.show()

## Evolution of channels with negativity [[Victor]]

## What does successful negativity look like?

### Most used words: the vocabulary of videos that are negative and successful (for different categories) [[Maj]]

Make 'histogram' of words in title/desc for videos that are very negative and have lots of success (maybe do it for each category). Example:

https://ldrame21.github.io/metoo-media-impact/#data-story-title

#### What is a negative and successful video?

A negative video is one with `sia_negative_description >= 0.4`.

A successful video is one whose number of views is above the average number of views (the code below uses a fixed threshold of 80,000 views).

In [None]:
# Work on a copy so that the selection steps below leave `videos` untouched.
df_videos = videos.copy()
df_videos.shape

In [None]:
# Mean number of views over all videos in `df_videos`.
mean_count = df_videos['view_count'].mean()
print(mean_count)

In [None]:
# Select negative and successful videos.
# NOTE(review): the text above defines "successful" as above-average views,
# while the view threshold here is a fixed number -- confirm it is meant to
# track `mean_count`.
MIN_NEGATIVE_SCORE = 0.4
MIN_VIEW_COUNT = 80000

is_negative = df_videos['sia_negative_description'] >= MIN_NEGATIVE_SCORE
is_successful = df_videos['view_count'] >= MIN_VIEW_COUNT
df_filtered = df_videos[is_negative & is_successful]
df_filtered.shape

In [None]:
# Preview the selected videos.
df_filtered.head()

In [None]:
# CSV file handed to the project helper `text_from_ids`.
data_path = 'generated/2019/2019_videos.csv'
# Unique ids of the selected videos.
video_ids = set(df_filtered['display_id'])
# Per the original note, the result contains display_id, title, description and tags.
df_title_des= text_from_ids(video_ids, data_path) #contains display_id, title, description and tags

In [None]:
# Preview the text columns returned by `text_from_ids`.
df_title_des.head()

In [None]:
# Merge both dataframes on the video id. `pd.merge` is called without `how`,
# so only ids present in both frames are kept (pandas' default inner join).
df_combined = pd.merge(df_filtered, df_title_des, on="display_id")
df_combined.head()

In [None]:
# Remove stopwords from the whitespace-separated tokens of a text column
def remove_stopwords(df, column='tags'):
    """Return a copy of ``df`` with a ``'tokens'`` column of non-stop-word tokens.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str, default ``'tags'``
        Text column to tokenise. The default keeps the previous behaviour,
        which read the ``tags`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose ``'tokens'`` column holds, per row, the tokens
        obtained with ``str.split()`` minus English stop words (compared
        case-insensitively; the tokens keep their original case). Cells that
        are not strings (e.g. missing values) yield an empty list.
    """
    # Imported locally under a private alias: a later cell rebinds the global
    # name `stopwords` to a plain set, after which `stopwords.words(...)` would
    # fail if this cell were run again.
    from nltk.corpus import stopwords as nltk_stopwords

    # A set gives constant-time membership tests.
    stop_words = set(nltk_stopwords.words('english'))

    def tokenize(text):
        if not isinstance(text, str):
            return []
        return [token for token in text.split() if token.lower() not in stop_words]

    df_cleaned = df.copy()
    df_cleaned['tokens'] = df_cleaned[column].apply(tokenize)
    return df_cleaned

df_cleaned = remove_stopwords(df_combined)
df_cleaned.head()

In [None]:
# Select the 5 most successful categories based on their mean view_count
grouped = df_cleaned.groupby("categories")

# Mean view count per category, computed in one aggregate call. Categories
# without a defined mean are dropped so they cannot disturb the ranking.
most_successful = grouped['view_count'].mean().dropna().to_dict()

# Despite its name this is a list of (category, mean view_count) pairs,
# ordered from most to least viewed and cut to the top 5.
sorted_dict = sorted(most_successful.items(), key=operator.itemgetter(1), reverse=True)[:5]
print(sorted_dict)

## Most common negative words in the most successful categories

In [None]:
# Find the most common words in the 5 most successful categories.
# `sorted_dict` holds the (category, mean view_count) pairs selected in the
# previous cell; only those categories are analysed here, as the heading says.
top_categories = [category for category, _ in sorted_dict]

common_words_with_freq = {}
for name in top_categories:
    category_tokens = df_cleaned.loc[df_cleaned['categories'] == name, 'tokens']
    token_counts = Counter(token for tokens in category_tokens for token in tokens)
    # Keep the 100 most frequent tokens as (word, frequency) pairs.
    common_words_with_freq[name] = token_counts.most_common(100)

# Convert the dict to a long-form dataframe: one row per (category, word)
df_success = pd.DataFrame(
    [
        (name, word, frequency)
        for name, words in common_words_with_freq.items()
        for word, frequency in words
    ],
    columns=['categories', 'common_words', 'frequency'],
)
df_success

In [None]:
# Wide view: one row per category, with its list of (word, frequency) pairs.
df_success_all = pd.DataFrame(
    list(common_words_with_freq.items()),
    columns=['categories', 'most_common_words'],
)
df_success_all

In [None]:
# Draw one word cloud per category from the word frequencies in `df_success`.
grouped = df_success.groupby("categories")
for name, group in grouped:
    # Lower-case the words and add up the frequencies of words that differ
    # only by case. Building a dict straight from the lower-cased words would
    # keep just one of the colliding frequencies.
    word_freq = Counter()
    for word, frequency in zip(group['common_words'], group['frequency']):
        word_freq[word.lower()] += frequency

    if not word_freq:
        continue

    wc = WordCloud(width=800, height=400, max_words=200).generate_from_frequencies(dict(word_freq))
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.imshow(wc, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(name)
    plt.show()

# What topics appear the most in negative and successful videos? 

In [None]:
# TODO: don't forget to add the reference

In [None]:
import nltk
from nltk.stem import *
import re 
import gensim
import pickle 

In [None]:
# English stop words as a set, read by `preprocessing` below.
# NOTE(review): this rebinds the global name `stopwords`, which the import cell
# bound to the `nltk.corpus.stopwords` corpus reader. Any code that still
# calls `stopwords.words(...)` on that global will fail once this cell has run.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [None]:
# Successful and negative videos: df_cleaned
df_cleaned.head()

In [None]:
# Strip URLs from a piece of text
def remove_url(text):
    """Return ``text`` with every ``http:`` / ``https:`` URL deleted.

    A URL is matched from its scheme up to (not including) the next
    whitespace character; the surrounding whitespace is left in place.
    """
    url_pattern = re.compile(r'https?:\S*')
    return url_pattern.sub('', text)
# Clean every description in place of the old column.
df_cleaned['description'] = df_cleaned['description'].apply(remove_url)

In [None]:
# Strip @mentions and #hashtags from a piece of text
def remove_mentions_and_tags(text):
    """Return ``text`` without ``@...`` mentions and ``#...`` hashtags.

    Each mention or hashtag is removed from its marker up to (not including)
    the next whitespace character. Mentions are removed first, hashtags second.
    """
    without_mentions = re.compile(r'@\S*').sub('', text)
    without_hashtags = re.compile(r'#\S*').sub('', without_mentions)
    return without_hashtags
# Clean every description in place of the old column.
df_cleaned['description'] = df_cleaned['description'].apply(remove_mentions_and_tags)

In [None]:
def preprocessing(df, stop_words=None):
    """Tokenise, filter and lemmatise the ``description`` column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'description'`` column.
    stop_words : collection of str, optional
        Lower-case words to discard. When ``None`` (the default) the English
        NLTK stop-word list is loaded, so the function no longer depends on a
        global variable being defined by an earlier cell.

    Returns
    -------
    list of list of str
        One token list per row, in row order. A token is kept when it is not a
        stop word (compared case-insensitively, as ``remove_stopwords`` does)
        and is longer than two characters; kept tokens are lemmatised.
        Rows whose description is not a string give an empty list.
    """
    if stop_words is None:
        stop_words = set(nltk.corpus.stopwords.words('english'))

    lem = WordNetLemmatizer()  # For lemmatization
    corpus = []
    for text in df['description']:
        # word_tokenize splits the text into word and punctuation tokens.
        tokens = nltk.tokenize.word_tokenize(text) if isinstance(text, str) else []
        corpus.append([
            lem.lemmatize(w)
            for w in tokens
            if w.lower() not in stop_words and len(w) > 2
        ])
    return corpus

# Apply this function on our data frame
corpus = preprocessing(df_cleaned)
# Show the size and a small sample only; printing the whole corpus floods the
# notebook output.
print(f'{len(corpus)} documents; first 3:')
print(corpus[:3])

In [None]:
# Transform to gensim dictionary
dic = gensim.corpora.Dictionary(corpus)
# Bag-of-words representation: one list of (token_id, count) pairs per document.
bow_corpus = [dic.doc2bow(doc) for doc in corpus]

# Persist both artefacts for the visualisation section. The `with` block
# guarantees the pickle file is flushed and closed.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(bow_corpus, corpus_file)
dic.save('dictionary.gensim')

In [None]:
# Train a 15-topic LDA model on the bag-of-words corpus.
# `random_state` fixes the model's random initialisation so that repeated runs
# start from the same state. NOTE(review): with several workers the result may
# still not be bit-for-bit identical between runs -- confirm if exact
# reproducibility matters.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=15,
    id2word=dic,
    passes=10,
    workers=2,
    random_state=2,
)
lda_model.save('model15.gensim')

In [None]:
# Print the words occurring in each topic, one topic at a time
for idx, topic in lda_model.print_topics(num_words=50):
    print(f'Topic: {idx} \nWords: {topic}')

## Visualizing results

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models

In [None]:
# Load the dictionary and bag-of-words corpus saved earlier.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# The pickle is written by this notebook; only unpickle files you trust.
# It is loaded into `bow_corpus` (the name it was saved from) so that `corpus`
# keeps holding the token lists produced by `preprocessing`.
with open('corpus.pkl', 'rb') as corpus_file:
    bow_corpus = pickle.load(corpus_file)

# Load the 15-topic model saved earlier.
lda = gensim.models.ldamodel.LdaModel.load('model15.gensim')

pyLDAvis.enable_notebook()
# Use the artefacts loaded from disk, so this cell also works in a fresh
# kernel without re-running the preprocessing and training cells.
vis = pyLDAvis.gensim_models.prepare(lda, bow_corpus, dictionary, sort_topics=False)
pyLDAvis.display(vis)