## YouTube Scraper


In [6]:
# Setup

import os

# The API key is read from the YOUTUBE_API_KEY environment variable instead of
# being stored in the notebook, so sharing or committing the notebook does not
# leak the credential.
# NOTE(review): the key that was previously hard-coded here should be treated
# as compromised and rotated in the Google Cloud console.
api_key = os.environ.get('YOUTUBE_API_KEY', '')
if not api_key:
    # Warn instead of raising so the remaining setup still runs; API requests
    # made without a valid key are expected to be rejected by the service.
    print('Warning: YOUTUBE_API_KEY is not set; YouTube API requests will fail.')

# Name and version of the Google API service passed to build()
api_servicename = 'youtube'
api_version = 'v3'

# Import necessary libraries
from googleapiclient.discovery import build
import isodate
from datetime import timedelta
import nltk

# Playlist whose videos are scraped by the cells below
playlist_id = 'PLpi4YdMCC439sN_5vIza6IfQm0qc-IqPO'

# Build the YouTube API client from the service settings defined above
youtube = build(
    serviceName=api_servicename,
    version=api_version,
    developerKey=api_key,
)

# Make sure the VADER lexicon is available locally
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/danavolovelsky/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [7]:
def get_video_data(video_id):
    """Look up the title and description of one video through the YouTube API.

    Args:
        video_id: ID of the video to query.

    Returns:
        A list holding a single dict with the keys 'video_id', 'title' and
        'description', or an empty list when the response has no items.
    """
    # The 'snippet' part carries the title and the description
    response = youtube.videos().list(part='snippet', id=video_id).execute()

    # Guard clause: a missing or empty 'items' entry means nothing to report
    if not response.get('items'):
        return []

    snippet = response['items'][0]['snippet']
    return [{
        'video_id': video_id,
        'title': snippet['title'],
        'description': snippet['description'],
    }]

In [8]:
# Function Definitions

def get_video_comments(video_id):
    """Collect the text of every top-level comment on a video.

    Pages through the commentThreads endpoint until the API stops returning
    a 'nextPageToken'.

    Args:
        video_id: ID of the video whose comments are fetched.

    Returns:
        A list with the 'textOriginal' string of each top-level comment, in
        the order the API returned them.
    """
    comments = []
    next_page_token = None
    while True:
        comment_response = youtube.commentThreads().list(
            part='snippet',
            videoId=video_id,
            pageToken=next_page_token,
            # 100 is the largest page size the endpoint documents; the former
            # value of 100000 was outside the accepted 1-100 range
            maxResults=100
        ).execute()

        # A page without an 'items' key is treated as an empty page
        for item in comment_response.get('items', []):
            comment = item['snippet']['topLevelComment']['snippet']['textOriginal']
            comments.append(comment)

        next_page_token = comment_response.get('nextPageToken')
        if not next_page_token:
            break  # No more comments -> exit loop

    return comments

In [9]:

# (video_id, title) pairs of the playlist videos that pass both filters below
filtered_video_ids = []

# Initialize variables for pagination
next_page_token = None  # Track next page of results
total_videos = 0

# https://developers.google.com/youtube/v3/guides/implementation/pagination
while True:
    # Get playlist items for current page; 50 is the documented maximum page
    # size, which keeps the number of requests low
    playlist_items = youtube.playlistItems().list(
        part='snippet',
        playlistId=playlist_id,
        pageToken=next_page_token,
        maxResults=50,
    ).execute()

    # Skip videos with 'celebrity' in the title, keeping playlist order
    candidates = [
        (item['snippet']['resourceId']['videoId'], item['snippet']['title'])
        for item in playlist_items['items']
        if 'celebrity' not in item['snippet']['title'].lower()
    ]

    # Fetch the durations of all candidates on this page with one request
    # instead of one request per video
    durations = {}
    if candidates:
        video_details = youtube.videos().list(
            part='contentDetails',
            id=','.join(candidate_id for candidate_id, _ in candidates)
        ).execute()
        durations = {
            details['id']: details['contentDetails']['duration']
            for details in video_details.get('items', [])
        }

    for video_id, title in candidates:
        duration = durations.get(video_id)
        if duration is None:
            # No details were returned for this video
            continue

        # Convert the ISO 8601 duration string to a timedelta object
        duration_timedelta = isodate.parse_duration(duration)

        # Keep only videos strictly longer than 8 minutes
        if duration_timedelta > timedelta(minutes=8):
            # Add Video Id to the filtered_video_ids list
            filtered_video_ids.append((video_id, title))

            print(f'Video ID: {video_id}, Title: {title}')
            print()
            # Count the videos that were kept
            total_videos += 1

    # Check if there are more pages
    next_page_token = playlist_items.get('nextPageToken')
    if not next_page_token:
        break  # No more pages, so exit the loop

print(f'Total videos processed: {total_videos}')


Video ID: kavjzsRtuuA, Title: Inside Benny Blanco’s Fun-Filled L.A. Home | Open Door | Architectural Digest

Video ID: XxPPdlo72ho, Title: Inside Emma Roberts’s Charming Los Angeles Home | Open Door | Architectural Digest

Video ID: VylU-ueLA_g, Title: Inside Joshua Weissman's Minimalist Texas Home | Open Door | Architectural Digest

Video ID: xqv0yo_Elj4, Title: Inside Amber Valletta’s Peaceful L.A. Sanctuary | Open Door | Architectural Digest

Video ID: QvoPw_5Sz8U, Title: Inside Tan France’s Dream Home in Salt Lake City | Open Door | Architectural Digest

Video ID: 9OvibwfflDg, Title: Inside Sofía Vergara’s Stunning Los Angeles Home | Open Door | Architectural Digest

Video ID: zPUSDU0hrwE, Title: Inside Bryce Dallas Howard’s Charming New York Cottage | Open Door | Architectural Digest

Video ID: Jzqu48uup54, Title: Inside The White House With President Joe Biden

Video ID: Li8ubUzLB90, Title: Inside Jesse Tyler Ferguson & Justin Mikita’s Delightful L.A. Home | Architectural Digest


# Data Preprocessing 

## Tokenization

In [19]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import contractions
import re
import pandas as pd
import emoji


# Accumulates one dict per comment ('Video ID', 'Video Title', 'Tokens');
# reset here so that re-running the cell does not duplicate records
comments_data = []

# Function Definitions

def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading the corpus once."""
    cached = getattr(_english_stopwords, '_cached', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cached = cached
    return cached


def preprocess_comment(comment):
    """Turn a raw comment into a list of lowercase tokens without stopwords.

    Steps: expand contractions, drop characters that emoji.demojize rewrites,
    strip ASCII punctuation, lowercase, tokenize with word_tokenize and remove
    English stopwords.

    Args:
        comment: Raw comment text; may be empty or None.

    Returns:
        A list of token strings; an empty list for empty or None input.
    """
    # Check if comment is empty or None
    if not comment:
        return []

    try:
        # Expand contractions
        comment = contractions.fix(comment)
    except IndexError:
        # contractions.fix can raise IndexError on some input (this notebook's
        # output shows it for comments containing 'İ'). Report it and carry on
        # with the unexpanded text instead of discarding the whole comment.
        print(f"Error expanding contractions in comment: {comment}")

    # Remove emojis: keep only characters that demojize leaves unchanged
    comment = ''.join(c for c in comment if emoji.demojize(c) == c)

    # Removing punctuation and converting to lowercase
    comment = ''.join(ch for ch in comment if ch not in string.punctuation).lower()

    # Tokenization
    tokens = word_tokenize(comment)

    # Removing stopwords; the set is loaded once rather than once per token
    stop_words = _english_stopwords()
    return [token for token in tokens if token not in stop_words]

for video_id, title in filtered_video_ids:
    # Fetch every top-level comment of this video
    comments = get_video_comments(video_id)

    # One record per comment, holding the tokens produced by preprocess_comment
    comments_data.extend(
        {'Video ID': video_id, 'Video Title': title, 'Tokens': preprocess_comment(text)}
        for text in comments
    )

df_comments = pd.DataFrame(comments_data)

# Drop the comments that produced no tokens at all
has_tokens = df_comments['Tokens'].map(len) > 0
df_comments = df_comments[has_tokens]

print(df_comments.head())
df_comments.to_csv('comments.csv', index=False)


Error expanding contractions in comment: İ didnt know they were together WHAT?! good for them
Error expanding contractions in comment: OH MY GOSH !!! She had an amethyst? İs that real??? Cool
      Video ID                                        Video Title  \
0  kavjzsRtuuA  Inside Benny Blanco’s Fun-Filled L.A. Home | O...   
1  kavjzsRtuuA  Inside Benny Blanco’s Fun-Filled L.A. Home | O...   
2  kavjzsRtuuA  Inside Benny Blanco’s Fun-Filled L.A. Home | O...   
3  kavjzsRtuuA  Inside Benny Blanco’s Fun-Filled L.A. Home | O...   
4  kavjzsRtuuA  Inside Benny Blanco’s Fun-Filled L.A. Home | O...   

                                              Tokens  
0                  [one, going, mention, toe, nails]  
1          [thanks, sharing, benny, really, enjoyed]  
2  [eww, marijuana, gateway, every, thing, else, ...  
3                                 [seems, kind, fun]  
4                               [hahaha, bag, money]  


In [1]:
import pandas as pd
import ast


# Load the tokenized comments from 'comments.csv'
df = pd.read_csv('comments.csv')

# Lowercase name fragments that clean_tokens uses to filter tokens out of the
# comments. Some entries appear more than once (e.g. 'emma', 'tyler').
celebrity_names = ['benny', 'selena', 'blanco', 'emma', 'roberts', 'joshua', 'weissman', 'amber', 'valletta', 'tan', 'rob', 'france', 'maude', 'apatow', 'sofia', 'sofía', 'vergara', 'bryce', 'dallas', 'howard', 'joe', 'biden', 'jesse', 'tyler', 'ferguson', 'justin', 'mikita', 'ray', 'romano', 'kevin', 'hart', 'orville', 'peck', 'jon', 'batiste', 'suleika', 'jaouad', 'carmelo', 'anthony', 'adwoa', 'aboah', 'ashley', 'benson', 'winnie', 'harlow', 'john', 'legend', 'chrissy', 'teigen', 'amanda', 'seyfried', 'chloe', 'fineman', 'rupaul', 'karen', 'gillan', 'troian', 'bellisario', 'patrick', 'adams', 'debby', 'ryan', 'josh', 'dun', 'sarah', 'paulson', 'david', 'harbour', 'lily', 'allen', 'michael', 'imperioli', 'viola', 'davis', 'julius', 'tennon', 'ellen', 'pompeo', 'rita', 'ora', 'emma', 'chamberlain', 'demi', 'lovato', 'nate', 'berkus', 'jeremiah', 'brent', 'tommy', 'hilfiger', 'sienna', 'miller', 'bryce', 'dallas', 'howard', 'matty', 'matheson', 'travis', 'barker', 'justina', 'blakeney', 'seth', 'rogan', 'chlöe', 'chloe', 'bailey', 'kacey', 'musgraves', 'ashley', 'tisdale', 'try', 'guys', 'zach', 'kornfeld', 'eugene', 'lee', 'yang', 'keith', 'habersberger', 'ned', 'fulmer', 'shonda', 'rhimes', 'devin', 'booker', 'gwyneth', 'paltrow', 'kathy', 'hilton', 'connor', 'mcdavid', 'alicia', 'keys', 'kasseem', 'dean', 'swizz', 'beatz', 'vanessa', 'hudgens', 'nina', 'dobrev', 'adam', 'levine', 'behati', 'prinsloo', 'genevieve', 'jared', 'taylor', 'hill', 'padalecki', 'geazy', 'tyrese', 'gibson', 'cara', 'delevingne', 'bretman', 'rock', 'troye', 'sivan', 'naomi', 'campbell', 'pharrell', 'david', 'grutman', 'daveed', 'diggs', 'emmy', 'raverlampman', 'serena', 'williams', 'andrew', 'rea', 'binging', 'babish', 'kevin', 'vanessa', 'carlton', 'rainn', 'wilson', 'hilary', 'duff', 'scottie', 'pippen', 'misty', 'copeland', 'kendall', 'jenner', 'balvin', 'maluma', 'ditta', 'von', 'teese', 'dakota', 'johnson', 'jesse', 'tyler', 'ferguson', 'aaron', 'paul', 'madelaine', 'petsch', 
'nyjah', 'huston', 'chelsea', 'handler', 'neil', 'patrick', 'harris', 'tyler', 'perry', 'sheryl', 'crow', 'redick', 'nicole', 'scherzinger', 'nikolaj', 'costerwaldau', 'liv', 'tyler', 'mark', 'ronson', 'maria', 'sharapova', 'david', 'dobrik', 'john', 'stamos', 'jessica', 'alba', 'kerry', 'washington', 'lance', 'armstrong', 'dominic', 'west', 'jensen', 'danneel', 'ackles', 'alessandra', 'ambrosio', 'michael', 'kors', 'zedd', 'terry', 'crews']

# Function to clean tokens
def clean_tokens(tokens):
    """Strip non-letters from tokens and drop short tokens and celebrity names.

    A token is dropped when, after removing non-alphabetic characters, it has
    two characters or fewer, or when its lowercase form equals an entry of
    celebrity_names (optionally followed by a possessive/plural 's', e.g.
    'bennys'). Names are matched as whole words: the previous substring test
    also removed ordinary words that merely contain a name fragment, such as
    'really' ('rea') or 'living' ('liv').

    Args:
        tokens: A list of token strings, or the string representation of such
            a list as read back from a CSV file.

    Returns:
        The list of cleaned tokens that were kept.
    """
    # Convert string representation of list to actual list; values that are
    # already lists are used as they are
    if isinstance(tokens, str):
        tokens = ast.literal_eval(tokens)

    # Set gives constant-time whole-word lookups
    blocked_names = set(celebrity_names)

    cleaned_tokens = []
    for token in tokens:
        # Keep only alphabetic characters
        cleaned_token = ''.join(char for char in token if char.isalpha())
        if len(cleaned_token) <= 2:
            continue

        lowered = cleaned_token.lower()
        if lowered in blocked_names:
            continue
        # Punctuation stripping turns "benny's" into "bennys"; treat it as the name
        if lowered.endswith('s') and lowered[:-1] in blocked_names:
            continue

        cleaned_tokens.append(cleaned_token)
    return cleaned_tokens

# Run clean_tokens over the 'Tokens' value of every row
df = df.assign(Tokens=df['Tokens'].apply(clean_tokens))

# Keep only the rows that still have at least one token
non_empty = df['Tokens'].map(len) > 0
df = df[non_empty]

# Persist the cleaned rows to a separate CSV file
df.to_csv('cleaned_comments.csv', index=False)


In [1]:
# Build one record per filtered video from its description.
# NOTE: this cell uses filtered_video_ids, get_video_data, preprocess_comment,
# celebrity_names and pd from earlier cells. The NameError recorded for this
# cell comes from running it before those cells in a fresh kernel, so run the
# notebook from the top first.
descriptions_data = []

# Names are matched as whole words; a substring test would also drop ordinary
# words that merely contain a name fragment (e.g. 'living' contains 'liv')
celebrity_lookup = set(celebrity_names)

for video_id, title in filtered_video_ids:
    print(f'Video ID: {video_id}, Title: {title}')

    # Get video data including description
    video_data = get_video_data(video_id)
    if not video_data:
        # Nothing was returned for this video
        continue

    # Extract the description from video data
    description = video_data[0]['description']

    # Clean the description: keep purely alphabetic words that are not names
    cleaned_description = ' '.join(
        token for token in description.split()
        if token.isalpha() and token.lower() not in celebrity_lookup
    )

    # Process the cleaned description
    processed_description = preprocess_comment(cleaned_description)

    descriptions_data.append({'Video ID': video_id, 'Video Title': title, 'Main Description': processed_description})

    print(f'Main Description: {cleaned_description}')
    print()

df_descriptions = pd.DataFrame(descriptions_data)
print(df_descriptions.head())
df_descriptions.to_csv('descriptions.csv', index=False)

NameError: name 'filtered_video_ids' is not defined

In [None]:
from wordcloud import WordCloud

# Reload the cleaned comments from 'cleaned_comments.csv'.
# NOTE(review): after the CSV round-trip the 'Tokens' column presumably holds
# string representations of lists rather than list objects - confirm before use.
df = pd.read_csv('cleaned_comments.csv')