### Importing Libraries


In [None]:
"""
Name: Raj KUmar Phagami
ID: C0846583
Module: 2023@_AML 3204_2 Social Media Analytics
Subject: Assignment 1
"""

In [23]:
# importing libraries
import os, time
import re, json
import pandas as pd
import threading
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

#import NLP tools
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# importing config.py
from config import YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, DEVELOPER_KEY

[nltk_data] Downloading package punkt to /Users/raj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/raj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/raj/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


### Creating Youtube Client

In [20]:
API_KEYS=["AIzaSyC8plwr7LS2a3OEthaGvN-5vNtd9BXAkrQ","AIzaSyBNMIDbVk-cW8uqAqDvkiSJx_n661N8lUM","AIzaSyCvjKBGN20-6ZAnX7MTZOvNs5Q7FyEjtcs","AIzaSyCDbIue93jkyTjXQElbhqh0IpbZFH5lB9g","AIzaSyCLokWAuZ0F2ofGRSxu8Ks41LW-zvS4Eo4"]

In [3]:
# creating a youtube client
youtube_client = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=DEVELOPER_KEY)


### Creating Functions

In [17]:
# Define function to get video data from YouTube API
def get_video_data(DEVELOPER_KEY,video_id):
    try:
        youtube_client = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=DEVELOPER_KEY)

        video_response = youtube_client.videos().list(
            part="snippet,contentDetails,statistics",
            id=video_id
        ).execute()
        if len(video_response['items']) == 0:
            print(f"Video not found: {video_id}")
            return []
        else:
            data = [
                video_response['items'][0]['snippet']['description'],
                int(video_response['items'][0]['statistics']['viewCount'] if 'viewCount' in video_response['items'][0]['statistics'] else 0),
                int(video_response['items'][0]['statistics']['likeCount'] if 'likeCount' in video_response['items'][0]['statistics'] else 0),
                int(video_response['items'][0]['statistics']['dislikeCount'] if 'dislikeCount' in video_response['items'][0]['statistics'] else 0),
                int(video_response['items'][0]['statistics']['commentCount'] if 'dislikeCount' in video_response['items'][0]['statistics'] else 0),
                video_response['items'][0]['contentDetails']['duration'],
                int(video_response['items'][0]['statistics']['favoriteCount'])
            ]
            return data

    except HttpError as e:
        print(f"An error occurred: {e}")
        
        return None
    except IndexError:
        print("Video not found: ",video_id)
    
# Define function to get video comments from YouTube API
def get_video_comments(video_id, max_results=100):
    try:
        comments = []
        next_page_token = None
        while len(comments) < max_results:
            comment_response = youtube_client.commentThreads().list(
                part="snippet",
                videoId=video_id,
                maxResults=max_results,
                pageToken=next_page_token
            ).execute()
            for item in comment_response['items']:
                comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
                comments.append(comment)
            if 'nextPageToken' in comment_response:
                next_page_token = comment_response['nextPageToken']
            else:
                break
        return comments
    except HttpError as e:
        print(f"An error occurred: {e}")
        return []
    
# filter the video ids from the dataframe that works with the YouTube API
#check if the video id is valid
#check connection
def check_connection(video_id):
    try:
        video_response = youtube_client.videos().list(
            part="snippet,statistics,contentDetails",
            id=video_id
        ).execute()
        print(f"Connection successful: {video_id}")
        return True
    except HttpError as e:
        print(f"Connection unsuccessful: {video_id}")
        return False
    
#get the video id ids that works with the YouTube API
def filter_video_ids(video_ids):
    valid_video_ids = []
    for video_id in video_ids:
        if check_connection(video_id):
            valid_video_ids.append(video_id)
    return valid_video_ids

# Create a DataFrame from the comments dictionary id and comments
def get_comments(filtered_ids, new_columns):
    try:
        comments = {}
        for video in filtered_ids:
            comments[video] = get_video_comments(video)
        mappped_comments = []
        for key, value in comments.items():
            if value!=[]:
                for comment in value:
                    mappped_comments.append([key, comment])
        df_comments = pd.DataFrame(mappped_comments, columns=new_columns)
        df_comments.to_csv('comments.csv', index=False)
        return df_comments
    except HttpError as e:
        print(f"An error occurred: {e}")
        return None


# Get data for each video and store in a list
def get_youtube_data(filtered_ids, columns):
    video_data_list = []
    for index,video in enumerate(filtered_ids):
        video_data = get_video_data(video)
        if video_data is not None:
            video_data_list.append(video_data)
        if index == 100:
            break
    df = pd.DataFrame(video_data_list,columns=columns)
    df.to_csv('video_data.csv', index=False)
    return df

### Sentiment Analaysis

In [4]:
#Calculate the sentiment score using VADER      
def get_sentiment_score(df):
    analyzer = SentimentIntensityAnalyzer()
    df['polarity_scores'] = df['comments_preprocessed'].apply(lambda x: analyzer.polarity_scores(x))
    df['compound'] = df['polarity_scores'].apply(lambda score_dict: score_dict['compound'])
    df['sentiment'] = df['compound'].apply(lambda c: 'pos' if c >=0 else 'neg')
    return df

#preprocessing the text data
def preprocessing(df):
    #lower string   
    df['comments_preprocessed'] = df['comments'].str.lower()
    #remove punctuation
    df['comments_preprocessed'] = df['comments_preprocessed'].str.replace('[^\w\s]','', regex=True)
    #remove numbers
    df['comments_preprocessed'] = df['comments_preprocessed'].str.replace('\d+', '', regex=True)
    #remove emojis
    df['comments_preprocessed'] = df['comments_preprocessed'].str.replace('[^\w\s#@/:%.,_-]', '', regex=True)
    #remove whitespace
    df['comments_preprocessed'] = df['comments_preprocessed'].str.strip()
    #tokenize the text using tokenizer
    df['comments_preprocessed'] = df['comments_preprocessed'].apply(lambda x: word_tokenize(x))
    #stemming
    df['comments_preprocessed'] = df['comments_preprocessed'].apply(lambda x: [PorterStemmer().stem(y) for y in x])
    #remove stopwords
    df['comments_preprocessed'] = df['comments_preprocessed'].apply(lambda x: ' '.join([word for word in x if word not in (stopwords.words('english'))]))
    return df

### Statistical Functions

In [6]:
# list of top 10 viewcount videos
def get_top_10_viewCount(df):
    top_10 = df.sort_values(by='viewCount', ascending=False).head(10)
    return top_10

#list of bottom 10 viewcount videos
def get_bottom_10_viewCount(df):
    bottom_10 = df.sort_values(by='viewCount', ascending=True).head(10)
    return bottom_10

#list of top 10 likecount videos
def get_top_10_likeCount(df):
    top_10 = df.sort_values(by='likeCount', ascending=False).head(10)
    return top_10

#list of bottom 10 likecount videos
def get_bottom_10_likeCount(df):
    bottom_10 = df.sort_values(by='likeCount', ascending=True).head(10)
    return bottom_10

#list of top 10 dislikecount videos
def get_top_10_dislikeCount(df):
    top_10 = df.sort_values(by='dislikeCount', ascending=False).head(10)
    return top_10

#Video with maximum duration
def get_max_duration(df):
    # df['duration'] = df['duration'].apply(lambda x: pd.to_timedelta(x).total_seconds())
    max_duration = df.sort_values(by='duration', ascending=False).head(1)
    return max_duration

#get statistics
def get_stats(df):
    #List of top 10pvideos with most views
    top_10 = get_top_10_viewCount(df)
    print("******Top 10 videos with most views******")
    print(top_10.head(10))
    
    #List of top ten videos with least views
    bottom_10 = get_bottom_10_viewCount(df)
    print("******Bottom 10 videos with least views******")
    print(bottom_10.head(10))
    
    #List of top 10 videos with most likes
    top_10_likes = get_top_10_likeCount(df)
    print("******Top 10 videos with most likes******")
    print(top_10_likes.head(10))
    
    #List of bottom 10 with least likes
    bottom_10_likes = get_bottom_10_likeCount(df)
    print("******Bottom 10 videos with least likes******")
    print(bottom_10_likes.head(10))
    
    #Video with  maximum duration
    max_duration = get_max_duration(df)
    print("******Video with maximum duration******")
    print(max_duration)


### Graph Plot Functions

In [7]:
# Read CSV file into pandas DataFrame
df = pd.read_csv('vdoLinks.csv')

# Extract video ids
print("shape",df.shape)
df.head()

shape (25623, 3)


Unnamed: 0,youtubeId,movieId,title
0,K26_sDKnvMU,1,Toy Story (1995)
1,3LPANjHlPxo,2,Jumanji (1995)
2,rEnOoWs3FuA,3,Grumpier Old Men (1995)
3,j9xml1CxgXI,4,Waiting to Exhale (1995)
4,ltwvKLnj1B4,5,Father of the Bride Part II (1995)


In [27]:
# Define the columns we want to extract
columns = ['youtubeId','title','description', 'viewCount', 'likeCount', 'dislikeCount', 'commentCount', 'duration', 'favoriteCount']

In [18]:
def get_youtube_data(columns):
    video_data_list = []
    for item in df.iterrows():
        video_data_list.append(item[1]['youtubeId'])
        video_data_list.append(item[1]['title'])
        video_data = get_video_data(item[1]['youtubeId'])
        if video_data is not None:
                video_data_list.append(video_data)
    df = pd.DataFrame(video_data_list,columns=columns)
    return df


An error occurred: list index out of range
An error occurred: list index out of range
An error occurred: list index out of range
An error occurred: list index out of range
An error occurred: list index out of range
An error occurred: list index out of range
An error occurred: list index out of range
An error occurred: list index out of range


In [92]:
# list of top 10 viewcount videos
def get_top_10_viewCount(df):
    top_10 = df.sort_values(by='viewCount', ascending=False).head(10)
    return top_10

#list of bottom 10 viewcount videos
def get_bottom_10_viewCount(df):
    bottom_10 = df.sort_values(by='viewCount', ascending=True).head(10)
    return bottom_10

#list of top 10 likecount videos
def get_top_10_likeCount(df):
    top_10 = df.sort_values(by='likeCount', ascending=False).head(10)
    return top_10

#list of bottom 10 likecount videos
def get_bottom_10_likeCount(df):
    bottom_10 = df.sort_values(by='likeCount', ascending=True).head(10)
    return bottom_10

#list of top 10 dislikecount videos
def get_top_10_dislikeCount(df):
    top_10 = df.sort_values(by='dislikeCount', ascending=False).head(10)
    return top_10

#Video with maximum duration
def get_max_duration(df):
    df['duration'] = df['duration'].apply(lambda x: pd.to_timedelta(x).total_seconds())
    max_duration = df.sort_values(by='duration', ascending=False).head(1)
    return max_duration

In [39]:
new_columns = ['youtubeId', 'comments']


In [57]:
# Create a DataFrame from the comments dictionary id and comments
def get_comments():
    comments = {}
    for video in filtered_ids:
        comments[video] = get_video_comments(video)
    mappped_comments = []
    for key, value in comments.items():
        if value!=[]:
            for comment in value:
                mappped_comments.append([key, comment])
    df_comments = pd.DataFrame(mappped_comments, columns=new_columns)
    df_comments.to_csv('comments.csv', index=False)
    return df_comments


(693, 2)

In [None]:
if __name__ == '__main__':
    # Read CSV file into pandas DataFrame
    df = pd.read_csv('vdoLinks.csv')
    # Extract video ids
    video_ids = df['youtubeId'].tolist()
    df = get_youtube_data()
    df_comments = get_comments()

In [58]:
df_comments = get_comments()
df_comments.head()



In [None]:
preprocessing(df_comments)
df_comments.head(5)

In [89]:
get_sentiment_score(df_comments)
df_comments.head(5)

Unnamed: 0,youtubeId,comments,comments_preprocessed,sentiments,polarity_scores,compound,sentiment
0,dO2LWKpeyI8,"He won the world, he lost his own nation.",world lost hi nation,0.0,"{'neg': 0.434, 'neu': 0.566, 'pos': 0.0, 'comp...",-0.3182,neg
1,dO2LWKpeyI8,2024 biden the movie by oliver stone lol,biden movi oliv stone lol,0.0,"{'neg': 0.0, 'neu': 0.588, 'pos': 0.412, 'comp...",0.4215,pos
2,dO2LWKpeyI8,They hate trump show much biden is tge real di...,hate trump show much biden tge real dirtbag,0.0,"{'neg': 0.346, 'neu': 0.654, 'pos': 0.0, 'comp...",-0.5719,neg
3,dO2LWKpeyI8,"Superb film, get the director&#39;s cut with S...",superb film get director cut sam waterston ad ...,0.0,"{'neg': 0.159, 'neu': 0.53, 'pos': 0.311, 'com...",0.4588,pos
4,dO2LWKpeyI8,Had to dislike video too many f_cking ads a_sh...,dislik video mani f_cking ad a_shol,0.0,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,pos
