In [95]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

plt.style.use('ggplot')

import nltk
nltk.download('punkt')
nltk.download('vader_lexicon')

from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/frankhigley/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/frankhigley/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


initialize sentimentIntensityAnalyser()

In [96]:
sia = SentimentIntensityAnalyzer()

Sentiment analysis function:
- uses tqdm for progress bar
- iterated through all data and if the data is string perform sentiment analysis
- the based of the compounded number the comment is either positive negaive or neutral
- return the sentiment array for each file this array consists of ones, zeros, and negative ones

In [97]:
def sentiment_analysis(data, sia):
   #results
    sent = []

    # run sentiment analysis
    for i, row in tqdm(data.iterrows(), total=len(data)):
        comment = row['Comments']

        #check if comment is a string
        if isinstance(comment, str):
            res = sia.polarity_scores(comment)
            compound = res.get('compound', None)
            if compound is not None:
                if compound > 0:
                    sentiment = 1
                elif compound < 0:
                    sentiment = -1
                else:
                    sentiment = 0

                sent.append(sentiment)    

    return sent

In [98]:
def get_sent_totals(sent):
    pos = 0
    neg = 0
    neu = 0

    for i in sent:
        if i == 1:
            pos += 1
        if i == -1:
            neg += 1
        else:
            neu +=1

    return pos, neu, neg

Runs sentiment analysis on a given file path returns the number of positive, negative, and neutral comments.

In [99]:
def run_sentiment_analysis(path):
    data = pd.read_csv(path)
    sentiment = sentiment_analysis(data, sia)
    numpos, numneu, numneg = get_sent_totals(sentiment)

    return numpos, numneu, numneg


Create data file array

In [100]:
clean_csv_files = ['../data/CleanCSV/beauty_comments_clean.csv',
            '../data/CleanCSV/controversial_comments_clean.csv',
            '../data/CleanCSV/gaming_comments_clean.csv',
            '../data/CleanCSV/music_comments_clean.csv',
            '../data/CleanCSV/reaction_comments_clean.csv',
            '../data/CleanCSV/vlog_comments_clean.csv']

In [101]:
raw_csv_files = ['../data/RawCSV/beauty_comments.csv',
                 '../data/RawCSV/controversial_comments.csv',
                 '../data/RawCSV/gaming_comments.csv',
                 '../data/RawCSV/music_comments.csv',
                 '../data/RawCSV/reaction_comments.csv',
                 '../data/RawCSV/vlog_comments.csv']

Run sentiment analysis on each dataset

In [102]:
beauty_sent_totals_clean = []
contraversial_sent_totals_clean = []
gaming_sent_totals_clean = []
music_sent_totals_clean = []
reaction_sent_totals_clean = []
vlog_sent_totals_clean =[]

beauty_sent_totals_clean = run_sentiment_analysis(clean_csv_files[0])
contraversial_sent_totals_clean = run_sentiment_analysis(clean_csv_files[1])
gaming_sent_totals_clean = run_sentiment_analysis(clean_csv_files[2])
music_sent_totals_clean = run_sentiment_analysis(clean_csv_files[3])
reaction_sent_totals_clean = run_sentiment_analysis(clean_csv_files[4])
vlog_sent_totals_clean = run_sentiment_analysis(clean_csv_files[5])


  0%|          | 0/3136 [00:00<?, ?it/s]

  0%|          | 0/2235 [00:00<?, ?it/s]

  0%|          | 0/33370 [00:00<?, ?it/s]

  0%|          | 0/33501 [00:00<?, ?it/s]

  0%|          | 0/1581 [00:00<?, ?it/s]

  0%|          | 0/24479 [00:00<?, ?it/s]

In [103]:
beauty_sent_totals_raw = []
contraversial_sent_totals_raw = []
gaming_sent_totals_raw = []
music_sent_totals_raw = []
reaction_sent_totals_raw = []
vlog_sent_totals_raw =[]

beauty_sent_totals_raw = run_sentiment_analysis(raw_csv_files[0])
contraversial_sent_totals_raw = run_sentiment_analysis(raw_csv_files[1])
gaming_sent_totals_raw = run_sentiment_analysis(raw_csv_files[2])
music_sent_totals_raw = run_sentiment_analysis(raw_csv_files[3])
reaction_sent_totals_raw = run_sentiment_analysis(raw_csv_files[4])
vlog_sent_totals_raw = run_sentiment_analysis(raw_csv_files[5])

  0%|          | 0/57587 [00:00<?, ?it/s]

  0%|          | 0/93584 [00:00<?, ?it/s]

  0%|          | 0/246826 [00:00<?, ?it/s]

  0%|          | 0/238386 [00:00<?, ?it/s]

  0%|          | 0/22207 [00:00<?, ?it/s]

  0%|          | 0/222522 [00:00<?, ?it/s]

Print results

In [104]:
print("Beauty:\n \tPositive:" , beauty_sent_totals_clean[0], "\n\tNeutral:", beauty_sent_totals_clean[1], "\n\tNegative:", beauty_sent_totals_clean[2], "\n\tTotal:", sum(beauty_sent_totals_clean))
print("Contraversial:\n \tPositive:" , contraversial_sent_totals_clean[0], "\n\tNeutral:", contraversial_sent_totals_clean[1], "\n\tNegative:", contraversial_sent_totals_clean[2], "\n\tTotal:", sum(contraversial_sent_totals_clean))
print("Gaming:\n \tPositive:" , gaming_sent_totals_clean[0], "\n\tNeutral:", gaming_sent_totals_clean[1], "\n\tNegative:", gaming_sent_totals_clean[2], "\n\tTotal:", sum(gaming_sent_totals_clean))
print("Music:\n \tPositive:" , music_sent_totals_clean[0], "\n\tNeutral:", music_sent_totals_clean[1], "\n\tNegative:", music_sent_totals_clean[2], "\n\tTotal:", sum(music_sent_totals_clean))
print("Reaction:\n \tPositive:" , reaction_sent_totals_clean[0], "\n\tNeutral:", reaction_sent_totals_clean[1], "\n\tNegative:", reaction_sent_totals_clean[2], "\n\tTotal:", sum(reaction_sent_totals_clean))
print("Vlog:\n \tPositive:" , vlog_sent_totals_clean[0], "\n\tNeutral:", vlog_sent_totals_clean[1], "\n\tNegative:", vlog_sent_totals_clean[2], "\n\tTotal:", sum(vlog_sent_totals_clean))

Beauty:
 	Positive: 1528 
	Neutral: 2843 
	Negative: 293 
	Total: 4664
Contraversial:
 	Positive: 882 
	Neutral: 1995 
	Negative: 240 
	Total: 3117
Gaming:
 	Positive: 14790 
	Neutral: 31543 
	Negative: 1827 
	Total: 48160
Music:
 	Positive: 5706 
	Neutral: 32670 
	Negative: 818 
	Total: 39194
Reaction:
 	Positive: 351 
	Neutral: 1463 
	Negative: 118 
	Total: 1932
Vlog:
 	Positive: 10775 
	Neutral: 22683 
	Negative: 1796 
	Total: 35254


In [105]:
print("Beauty Raw:\n \tPositive:" , beauty_sent_totals_raw[0], "\n\tNeutral:", beauty_sent_totals_raw[1], "\n\tNegative:", beauty_sent_totals_raw[2], "\n\tTotal:", sum(beauty_sent_totals_raw))
print("Contraversial Raw:\n \tPositive:" , contraversial_sent_totals_raw[0], "\n\tNeutral:", contraversial_sent_totals_raw[1], "\n\tNegative:", contraversial_sent_totals_raw[2], "\n\tTotal:", sum(contraversial_sent_totals_raw))
print("Gaming Raw:\n \tPositive:" , gaming_sent_totals_raw[0], "\n\tNeutral:", gaming_sent_totals_raw[1], "\n\tNegative:", gaming_sent_totals_raw[2], "\n\tTotal:", sum(gaming_sent_totals_raw))
print("Music Raw:\n \tPositive:" , music_sent_totals_raw[0], "\n\tNeutral:", music_sent_totals_raw[1], "\n\tNegative:", music_sent_totals_raw[2], "\n\tTotal:", sum(music_sent_totals_raw))
print("Reaction Raw:\n \tPositive:" , reaction_sent_totals_raw[0], "\n\tNeutral:", reaction_sent_totals_raw[1], "\n\tNegative:", reaction_sent_totals_raw[2], "\n\tTotal:", sum(reaction_sent_totals_raw))
print("Vlog Raw:\n \tPositive:" , vlog_sent_totals_raw[0], "\n\tNeutral:", vlog_sent_totals_raw[1], "\n\tNegative:", vlog_sent_totals_raw[2], "\n\tTotal:", sum(vlog_sent_totals_raw))

Beauty Raw:
 	Positive: 40296 
	Neutral: 49446 
	Negative: 8122 
	Total: 97864
Contraversial Raw:
 	Positive: 32494 
	Neutral: 53762 
	Negative: 39822 
	Total: 126078
Gaming Raw:
 	Positive: 128935 
	Neutral: 220241 
	Negative: 26140 
	Total: 375316
Music Raw:
 	Positive: 67076 
	Neutral: 218765 
	Negative: 19621 
	Total: 305462
Reaction Raw:
 	Positive: 9798 
	Neutral: 18602 
	Negative: 3605 
	Total: 32005
Vlog Raw:
 	Positive: 106208 
	Neutral: 191400 
	Negative: 30713 
	Total: 328321
