In [1]:
import pandas as pd
import os
from apiclient.discovery import build
from apiclient.errors import HttpError
from oauth2client.tools import argparser
from IPython.core.display import clear_output
input_path = 'D:\\Data\\Box-Office-Forecasting'
output_path = 'D:\\Data\\Youtube\\Scrapped'

In [2]:
df_columns = ['name', 'video_id', 'video_name', 'viewCount', 'likeCount', 'dislikeCount', 'commentCount']

In [3]:
# Set DEVELOPER_KEY to the API key value from the APIs & auth > Registered apps
# tab of https://cloud.google.com/console
# Please ensure that you have enabled the YouTube Data API for your project.
DEVELOPER_KEY = ""
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"
youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=DEVELOPER_KEY)

## Get movies comments

In [4]:
def get_comment_threads(youtube, video_id, page_token):
    results = youtube.commentThreads().list(
        part="snippet",
        videoId=video_id,
        textFormat="plainText",
        maxResults=100,
        pageToken=page_token
    ).execute()

    for item in results["items"]:
        comment = item["snippet"]["topLevelComment"]
        author = comment["snippet"]["authorDisplayName"]
        text = comment["snippet"]["textDisplay"]
        publish_date = comment["snippet"]["publishedAt"]
        comment_like = comment["snippet"]["likeCount"]
        
        ids.append(video_id)
        texts.append(text)
        publish_dates.append(publish_date)
        likes.append(comment_like)
        #print("Comment by {0}: {1}".format(author, text))
    try:
        return results["nextPageToken"]
    except:
        return None

In [5]:
movies = pd.read_csv(os.path.join(input_path, 'movie-master-final.csv'), header=0, sep=';', engine='python', encoding= 'utf8')
videos = movies['video_id'].unique()

In [7]:
all_data = pd.DataFrame(columns = ['video_id', 'comment', 'comment_date', 'comment_like'])
count = 0
selected_videos = videos[0:1]
for video_id in selected_videos:
    video_id = '3UHc_-OzstM'
    ids = []
    texts = []
    publish_dates = []
    likes=[]
    next_page_token = None
    token_count = 0
    while next_page_token or token_count == 0:
        try:
            token_count = token_count + 1
            next_page_token = get_comment_threads(youtube, video_id, next_page_token)
        except:
            next_page_token = None
            pass
    video_comments = pd.DataFrame({'video_id': ids, 'comment': texts, 'comment_date': publish_dates, 'comment_like':likes})
    
    count = count + 1
    all_data = pd.concat([all_data, video_comments])
    print("Video({0}) comment generation is complete. Status: {1}/{2}".format(video_id, str(count), str(len(selected_videos))))

Video(3UHc_-OzstM) comment generation is complete. Status: 1/1


In [None]:
all_data.to_csv(os.path.join(output_path, 'youtube-comments.csv'), sep=';', encoding= 'utf8', index=False)

## Get movie statistics 

In [None]:
def get_statistics(video_id):
    response = youtube.videos().list(part='statistics, snippet', id=video_id).execute()  
    statistics = response['items'][0]['statistics']
    return statistics

def get_stat_value(stats, key):
    if key in stats:
        return stats[key]
    else:
        return None
    
def youtube_search(movie_name):
    search_term = movie_name + " trailer"

    # Call the search.list method to retrieve results matching the specified query term.
    search_response = youtube.search().list(q=search_term, part="id, snippet", maxResults=2).execute()    
    search_df = pd.DataFrame(columns = df_columns)
    
    # Add each result to the appropriate list, and then display the lists of matching videos, channels, and playlists.
    for search_result in search_response.get("items", []):
        if search_result["id"]["kind"] == "youtube#video":
            statistics = get_statistics(search_result['id']['videoId'])
            search_df = search_df.append({'name': movie_name,
                                          'video_id': search_result["id"]["videoId"],
                                          'video_name': search_result["snippet"]["title"],                                         
                                          'viewCount': get_stat_value(statistics, "viewCount"),
                                          'likeCount': get_stat_value(statistics, "likeCount"),
                                          'dislikeCount': get_stat_value(statistics, "dislikeCount"),
                                          'commentCount': get_stat_value(statistics, "commentCount")}, 
                                         ignore_index=True)
    return search_df

In [None]:
m = pd.read_csv(os.path.join(input_path, 'movie-master-final.csv'), header=0, sep=';', engine='python', encoding= 'utf8')

In [None]:
years = [2011]

for year in years:
    m_year = m[m.year==year]
    movies = m_year['name'].unique()
    
    data = pd.DataFrame(columns = df_columns)
    
    count=0
    for movie_name in movies:
        try:
            data = pd.concat([data, youtube_search(movie_name)])
        except:
            pass
            
        count=count+1
        if count % 50 == 0:
            clear_output(wait = True)
            print("Completed:" + str(count) + ", Remaining:" + str(len(movies) - count))
    
    data.to_csv(os.path.join(output_path, 'youtube-stats-' + str(year) + '.csv'), sep=';', encoding= 'utf8', index=False)