### Importing Libraries


In [15]:
# importing libraries
import pandas as pd
from nltk.corpus import stopwords
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# importing config.py
from config import YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, DEVELOPER_KEY

### Creating Youtube Client

In [16]:
# creating a youtube client
youtube_client = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=DEVELOPER_KEY)


### Creating Functions

In [33]:
# Define function to get video data from YouTube API
def get_video_data(video_id):
    try:
        video_response = youtube_client.videos().list(
            part="snippet,statistics,contentDetails",
            id=video_id
        ).execute()
        data = [
            video_id,
            video_response['items'][0]['snippet']['description'],
            int(video_response['items'][0]['statistics']['viewCount']),
            int(video_response['items'][0]['statistics']['likeCount']),
            int(video_response['items'][0]['statistics']['dislikeCount']),
            int(video_response['items'][0]['statistics']['commentCount']),
            video_response['items'][0]['contentDetails']['duration'],
            int(video_response['items'][0]['statistics']['favoriteCount'])
        ]
        return data
    except Exception as e:
        print(f"An error occurred: {e}")
        return None
    
# Define function to get video comments from YouTube API
def get_video_comments(video_id, max_results=100):
    try:
        comments = []
        next_page_token = None
        while len(comments) < max_results:
            comment_response = youtube_client.commentThreads().list(
                part="snippet",
                videoId=video_id,
                maxResults=min(max_results - len(comments), 100),
                pageToken=next_page_token
            ).execute()
            for item in comment_response['items']:
                comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
                comments.append(comment)
            if 'nextPageToken' in comment_response:
                next_page_token = comment_response['nextPageToken']
            else:
                break
        return comments
    except Exception as e:
        print(f"An error occurred: {e}")
        return []
    
# filter the video ids from the dataframe that works with the YouTube API
#check if the video id is valid
#check connection
def check_connection(video_id):
    try:
        video_response = youtube_client.videos().list(
            part="snippet,statistics,contentDetails",
            id=video_id
        ).execute()
        return True
    except HttpError as e:
        print(f"An error occurred: {e}")
        return False
    
def filter_video_ids(video_ids):
    valid_video_ids = []
    for video_id in video_ids:
        if len(video_id) == 11 and check_connection(video_id):
            valid_video_ids.append(video_id)
    return valid_video_ids
    
   
       

In [None]:
# Define the columns we want to extract
columns = ['youtubeId', 'description', 'viewCount', 'likeCount', 'dislikeCount', 'commentCount', 'duration', 'favoriteCount']

### Sentiment Analaysis

In [18]:
# Define function to calculate sentiment score using VADER
def get_sentiment_score(text):
    analyzer = SentimentIntensityAnalyzer()
    sentiment_score = analyzer.polarity_scores(text)
    return sentiment_score['compound']

#preprocessing the text data
def preprocessing(df):
    
    #lower string   
    df['comments'] = df['comments'].str.lower()
    
    #remove punctuation
    df['comments'] = df['comments'].str.replace('[^\w\s]','')
    
    #remove numbers
    df['comments'] = df['comments'].str.replace('\d+', '')
    
    #remove whitespace
    df['comments'] = df['comments'].str.strip()
    
    #tokenize the text using tokenizer
    df['comments'] = df['comments'].apply(lambda x: x.split())
    
    #remove stopwords
    df['comments'] = df['comments'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords.words('english'))]))
    

In [34]:
# Read CSV file into pandas DataFrame
df = pd.read_csv('vdoLinks.csv')

# Extract video ids
video_ids = df['youtubeId'].tolist()

In [39]:
filtered_ids = filter_video_ids(video_ids)

An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/videos?part=snippet%2Cstatistics%2CcontentDetails&id=0uVPQG01JHk&key=AIzaSyD6hLsCvSptmVL7br5hrjctNc5aC3ttHeM&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.', 'domain': 'youtube.quota', 'reason': 'quotaExceeded'}]">
An error occurred: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/videos?part=snippet%2Cstatistics%2CcontentDetails&id=_FrdVdKlxUk&key=AIzaSyD6hLsCvSptmVL7br5hrjctNc5aC3ttHeM&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-sta

KeyboardInterrupt: 

In [None]:
filtered_ids

In [38]:
# Get data for each video and store in a list
video_data_list = []
for video in filtered_ids:
    video_data = get_video_data(video)
    if video_data is not None:
        comments = get_video_comments(video, max_results=100)
        sentiment_scores = [get_sentiment_score(comment) for comment in comments]
        video_data['comments'] = comments
        video_data['sentiment_scores'] = sentiment_scores
        video_data_list.append(video_data)

An error occurred: list index out of range
An error occurred: list index out of range
An error occurred: list index out of range
