# Sentiment Analysis

This notebook computes TextBlob sentiment polarity for the text data scraped from YouTube (comments, descriptions, and captions) and writes the results for each text source to a CSV file in `derived_data/`.

In [1]:
import json
import re
import os

import pandas as pd
import numpy as np
from textblob import TextBlob

In [2]:
# Load the raw video info JSON. The file is read with orient='index', so the
# top-level keys become the row index; they are moved into a regular
# `video_id` column below.
raw = (
    pd.read_json('results/video_info.json', orient='index')
    .reset_index()
    # index=str maps every row label of the new positional index through str()
    .rename(index=str, columns={"index": "video_id"})
)

## Sentiment Analysis of Comments

In [3]:
# Comment sentiment: expand each video's comments into one row per comment,
# score every comment with TextBlob, then average the scores per video.
# Assumes each `comments` cell of `raw` holds a list-like of comment texts — TODO confirm.
video_comments = raw[['video_id', 'comments']].copy()

comments = (
    video_comments['comments']
    .apply(lambda comment_list: pd.Series(comment_list))  # one column per comment position
    .stack()                                               # columns -> rows
    .reset_index(level=1, drop=True)                       # drop the comment-position level
    .to_frame('comments')
    # the remaining index is the row label of `raw`, so this re-attaches video_id
    .join(video_comments[['video_id']], how='left')
)
comments['comments'] = comments['comments'].astype('str')

# per-comment polarity, kept both as a standalone Series and as a column
polarities = comments['comments'].apply(lambda text: TextBlob(text).sentiment.polarity)
comments['polarity'] = polarities

# mean comment polarity for each video
avg_polarity = comments.groupby('video_id')['polarity'].mean().reset_index()
avg_polarity.to_csv('derived_data/comment_sentiments.csv')

## Sentiment Analysis of Descriptions

In [4]:
# Description sentiment: one polarity score per video description.
descriptions = (
    raw[['video_id', 'description']]
    .copy()
    # cast to str so every value can be handed to TextBlob
    .assign(description=lambda frame: frame['description'].astype(str))
    .assign(
        polarity=lambda frame: frame['description'].apply(
            lambda text: TextBlob(text).sentiment.polarity
        )
    )
)

# only the id and its score are written out
description_scores = descriptions[['video_id', 'polarity']]
description_scores.to_csv('derived_data/description_sentiments.csv')

## Sentiment Analysis of Captions

In [5]:
# Caption sentiment: score each cleaned caption file as a single document,
# keyed by the video id taken from the file name.
polarities = {}

captions_dir = 'derived_data/captions_clean'
# os.listdir returns names in arbitrary order; sorting keeps the row order of
# the output CSV stable from run to run.
for file in sorted(os.listdir(captions_dir)):
    if file.startswith('.'):
        # skip hidden files (names beginning with a dot)
        continue
    # the video id is the part of the file name before the first '.'
    video_id = file.split('.')[0]
    # `with` closes the file handle as soon as the lines have been read.
    # NOTE(review): no explicit encoding is given, so the platform default is
    # used — confirm it matches how the caption files were written.
    with open(os.path.join(captions_dir, file), 'r') as caption_file:
        captions = [line.replace('\n', '') for line in caption_file]
    # join all caption lines into one text and score it as a whole
    blob = TextBlob(" ".join(captions))
    polarities[video_id] = blob.sentiment.polarity

polarity_df = pd.DataFrame(list(polarities.items()), columns=['video_id', 'polarity'])
polarity_df.to_csv('derived_data/caption_sentiments.csv')