## Imports

In [1]:
import time
from selenium import webdriver
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

from wordcloud import WordCloud, STOPWORDS

## Class LordOfReplies
Takes in a YouTube video URL and returns a DataFrame of its comments.

In [2]:
class LordOfReplies:
    """Scrape the comments of a YouTube video into a pandas DataFrame.

    A Chrome session is started when the object is created.
    ``scrape_to_dataframe`` then loads the page, scrolls so that comments get
    loaded, expands reply threads, reads author/comment text and quits the
    browser. Because the browser is quit at the end, an instance can only be
    scraped once.

    Parameters
    ----------
    cycles : int
        How many times the driver scrolls to the end of the page. According to
        the original author, 20 cycles covers about 500 comments.
    url : str
        Address of the YouTube video page.
    driver_path : str, optional
        Location of the chromedriver executable. When omitted,
        ``DEFAULT_DRIVER_PATH`` is used.

    Notes
    -----
    The ``find_element_by_*`` locator calls are the legacy Selenium API.
    NOTE(review): newer Selenium releases may need ``find_element(By.XPATH, ...)``
    instead - confirm against the installed version.
    """

    # Machine-specific default kept for backward compatibility; pass
    # ``driver_path`` to run on another machine.
    DEFAULT_DRIVER_PATH = '/Users/evanisenstein/Documents/Chromedriver/chromedriver'

    _COMMENTS_HEADER_XPATH = "//*[@id='title' and @class='style-scope ytd-comments-header-renderer']"
    _VIEW_REPLIES_XPATH = "//*[@id='text' and @class='style-scope ytd-button-renderer']"
    _MORE_REPLIES_XPATH = "//*[@class='style-scope yt-next-continuation' and @role='button']"
    _COMMENT_TEXT_XPATH = '//*[@id="content-text"]'
    _AUTHOR_TEXT_XPATH = '//*[@id="author-text"]'

    def __init__(self, cycles, url, driver_path=None):
        self.cycles = cycles
        self.url = url
        self.driver = webdriver.Chrome(driver_path or self.DEFAULT_DRIVER_PATH)

    def scrape_to_dataframe(self):
        """Scrape the page at ``self.url`` and return its comments.

        Returns
        -------
        pandas.DataFrame
            One row per scraped comment with the columns ``'name'`` (author
            text) and ``'comments'`` (comment text).

        Raises
        ------
        Exception
            Whatever the driver raises while loading or locating elements is
            propagated; the browser is quit first in every case.
        """
        try:
            self._load_comments()
            self._expand_replies()
            # Text has to be read while the session is alive, i.e. before quit().
            names, comments = self._collect_text()
        finally:
            # Always release the Chrome process, even when scraping failed.
            self.driver.quit()

        # Authors and comment bodies are located independently, so their counts
        # can differ; only the rows for which both exist are kept.
        paired = min(len(names), len(comments))
        if len(names) != len(comments):
            print(f'Found {len(names)} authors but {len(comments)} comments; '
                  f'keeping the first {paired} pairs')

        yt_df = pd.DataFrame({'name': names[:paired], 'comments': comments[:paired]})

        print(f'Size of the scraped comments is {yt_df.shape[0]}')

        return yt_df

    def _load_comments(self):
        """Open the page and scroll to the end ``self.cycles`` times."""
        self.driver.get(self.url)
        time.sleep(4)

        html = self.driver.find_element_by_tag_name('html')
        html.send_keys(Keys.PAGE_DOWN)
        time.sleep(3)

        for _ in range(self.cycles):
            html.send_keys(Keys.END)
            time.sleep(2)

    def _focus_comments_header(self):
        """Move the pointer to the comments header element."""
        header = self.driver.find_element_by_xpath(self._COMMENTS_HEADER_XPATH)
        ActionChains(self.driver).move_to_element(header).perform()

    def _expand_replies(self):
        """Click the reply toggles and the 'Show more replies' buttons."""
        self._focus_comments_header()
        view_replies = self.driver.find_elements_by_xpath(self._VIEW_REPLIES_XPATH)
        # Only every second match is clicked, as in the original code.
        # NOTE(review): presumably the matches alternate between two kinds of
        # button - confirm against the live page markup.
        self._click_each(view_replies[::2])

        self._focus_comments_header()
        show_more = self.driver.find_elements_by_xpath(self._MORE_REPLIES_XPATH)
        self._click_each(show_more, required_text='Show more replies')

    @staticmethod
    def _click_each(elements, required_text=None):
        """Click elements in order; on the first failure print it and stop.

        When ``required_text`` is given, elements whose text differs are skipped.
        Expanding replies is best-effort: a failed click should not discard the
        comments that are already loaded.
        """
        for element in elements:
            try:
                if required_text is not None and element.text != required_text:
                    continue
                element.click()
                time.sleep(2)
            except Exception as e:
                print(e)
                break

    def _collect_text(self):
        """Return ``(author_texts, comment_texts)`` as two lists of strings."""
        comments = self.driver.find_elements_by_xpath(self._COMMENT_TEXT_XPATH)
        names = self.driver.find_elements_by_xpath(self._AUTHOR_TEXT_XPATH)
        return [n.text for n in names], [c.text for c in comments]

## Class InTheClouds
Takes in the YouTube comment DataFrame and saves a word cloud of the comments as a PNG file.


In [3]:
class InTheClouds:
    """Turn the comments of a DataFrame into a word cloud image file.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'comments'`` column holding the text to visualise.
    stops : iterable of str or None
        Extra stopwords to leave out, in addition to wordcloud's ``STOPWORDS``.
        ``None`` means no extra stopwords.
    filename : str
        Path the image is saved to, e.g. a name ending in ``.png``.
    """

    def __init__(self, df, stops, filename):
        self.df = df
        self.stops = stops
        self.filename = filename

    def make_word_cloud(self):
        """Build the word cloud from ``self.df['comments']`` and save it.

        Side effects: adds (or overwrites) a ``'splitted'`` column on
        ``self.df`` holding each comment lower-cased, stripped of punctuation
        and split into words, and writes the figure to ``self.filename``.
        Returns ``None``.
        """
        comments = self.df['comments']

        # regex=True is spelled out because pandas 2.x treats the pattern as a
        # literal string by default, which would leave the punctuation in place.
        self.df["splitted"] = (
            comments.str.lower()
            .str.replace(r'[^\w\s]', '', regex=True)
            .str.split()
        )

        # The cloud is generated from the raw comment text; missing values are
        # skipped so they cannot break the join.
        text = " ".join(comments.dropna().astype(str))

        stopwords = set(STOPWORDS)
        if self.stops is not None:
            stopwords.update(self.stops)

        word_cloud = WordCloud(width=1600,
                               height=800,
                               stopwords=stopwords).generate(text)

        fig, ax = plt.subplots(figsize=(20, 10), facecolor='k')
        ax.imshow(word_cloud)
        ax.axis("off")
        fig.savefig(self.filename, facecolor='k', bbox_inches='tight')

# Run to scrape comments, then generate word cloud

In [None]:
# Words to leave out of the cloud in addition to wordcloud's built-in stopwords.
stops_words = ['ugly', 'stupid']

# Start Chrome, scroll the video page 30 times and collect the comments.
lord_replies = LordOfReplies(cycles=30, url='https://www.youtube.com/watch?v=DQ7mwZnN250')
replies_df = lord_replies.scrape_to_dataframe()

# Build the word cloud from the scraped comments and save it to trial.png.
in_clouds = InTheClouds(df=replies_df, stops=stops_words, filename='trial.png')
in_clouds.make_word_cloud()