In [1]:
import pandas as pd
import numpy as np
import re

from googleapiclient.discovery import build
from dateutil import parser
from IPython.display import JSON

from datetime import datetime as dt
from datetime import date

#**Step 1: Get data from Youtube Channels**

In [None]:
api_key = '#paste your YouTube Data API Key here#'

In [None]:
channel_ids = ['UCe_ZLwzh-73vuzoZesJJgkw', #OMBE surf
               'UCCNZ9zfcszWw_LErqxhuT3g', #Surf Coach,
               'UCvOh9i-BOFzu51rpj33fGag', #Barefoot Surf
               'UCuZSTHZf3vd7eVehhnotcsg', #Surfers Journey
               'ChuLeaTGRcfzo0UjL-2qSbQ', #WSL
               'UC4i3-yfVazfuqwoz71T79Sw', #Surfline
               'UC--3c8RqSfAqYBdDjIG3UNA', #red bull surfing
               'UCM7nkBGadxKOa4DAJVFwoWg', #ripcurl
               'UC6uX6GF5q2JxLjMkyivM2Og', #Nathan Florence
               'UCmzxts0YGES5tN-oJ9abTQg', #Kenny Lai
              'UCLdPicN16eAKPKir8EY1UXQ', #Kale Brock
               'UCnDcnuhRMNxsI14ZnwIh6fQ' #Nic Von Rupp
              ]

In [None]:
api_service_name = "youtube"
api_version = "v3"

# Get credentials and create an API client
youtube = build(
    api_service_name, api_version, developerKey=api_key)

In [None]:
def get_channel_stats(youtube, channel_ids):
    """
    Get channel statistics: title, subscriber count, view count, video count, upload playlist
    Params:

    youtube: the build object from googleapiclient.discovery
    channels_ids: list of channel IDs

    Returns:
    Dataframe containing the channel statistics for all channels in the provided list: title, subscriber count, view count, video count, upload playlist

    """
    all_data = []
    request = youtube.channels().list(
                part='snippet,contentDetails,statistics',
                id=','.join(channel_ids))
    response = request.execute()

    for i in range(len(response['items'])):
        data = dict(channelName = response['items'][i]['snippet']['title'],
                    all = response['items'][i]['snippet'],
                    startdate = response['items'][i]['snippet']['publishedAt'],
                    subscribers = response['items'][i]['statistics']['subscriberCount'],
                    views = response['items'][i]['statistics']['viewCount'],
                    totalVideos = response['items'][i]['statistics']['videoCount'],
                    playlistId = response['items'][i]['contentDetails']['relatedPlaylists']['uploads'])
        all_data.append(data)

    return pd.DataFrame(all_data)

In [None]:
def get_video_ids(youtube, playlist_id):
    """
    Get list of video IDs of all videos in the given playlist
    Params:

    youtube: the build object from googleapiclient.discovery
    playlist_id: playlist ID of the channel

    Returns:
    List of video IDs of all videos in the playlist

    """

    request = youtube.playlistItems().list(
                part='contentDetails',
                playlistId = playlist_id,
                maxResults = 50)
    response = request.execute()

    video_ids = []

    for i in range(len(response['items'])):
        video_ids.append(response['items'][i]['contentDetails']['videoId'])

    next_page_token = response.get('nextPageToken')
    more_pages = True

    while more_pages:
        if next_page_token is None:
            more_pages = False
        else:
            request = youtube.playlistItems().list(
                        part='contentDetails',
                        playlistId = playlist_id,
                        maxResults = 50,
                        pageToken = next_page_token)
            response = request.execute()

            for i in range(len(response['items'])):
                video_ids.append(response['items'][i]['contentDetails']['videoId'])

            next_page_token = response.get('nextPageToken')

    return video_ids

In [None]:
def get_video_details(youtube, video_ids):
    """
    Get video statistics of all videos with given IDs
    Params:

    youtube: the build object from googleapiclient.discovery
    video_ids: list of video IDs

    Returns:
    Dataframe with statistics of videos, i.e.:
        'channelTitle', 'title', 'description', 'tags', 'publishedAt'
        'viewCount', 'likeCount', 'favoriteCount', 'commentCount'
        'duration', 'definition', 'caption'
    """

    all_video_info = []

    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=','.join(video_ids[i:i+50])
        )
        response = request.execute()

        for video in response['items']:
            stats_to_keep = {'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
                             'statistics': ['viewCount', 'likeCount', 'favouriteCount', 'commentCount'],
                             'contentDetails': ['duration', 'definition', 'caption']
                            }
            video_info = {}
            video_info['video_id'] = video['id']

            for k in stats_to_keep.keys():
                for v in stats_to_keep[k]:
                    try:
                        video_info[v] = video[k][v]
                    except:
                        video_info[v] = None

            all_video_info.append(video_info)

    return pd.DataFrame(all_video_info)

In [None]:
channel_data = get_channel_stats(youtube, channel_ids)

In [None]:
channel_data.sort_values('subscribers')

Unnamed: 0,channelName,all,startdate,subscribers,views,totalVideos,playlistId
0,How to Rip,"{'title': 'How to Rip', 'description': 'Welcom...",2014-12-26T02:20:37Z,141000,17722770,209,UUuZSTHZf3vd7eVehhnotcsg
4,Kai Lenny,"{'title': 'Kai Lenny', 'description': 'Kai Len...",2010-02-25T04:14:35Z,141000,40403333,184,UUmzxts0YGES5tN-oJ9abTQg
7,Rip Curl,"{'title': 'Rip Curl', 'description': 'Welcome ...",2005-10-10T15:28:17Z,147000,50583492,2381,UUM7nkBGadxKOa4DAJVFwoWg
3,Barefoot Surf,"{'title': 'Barefoot Surf', 'description': 'Lea...",2012-12-19T21:09:25Z,195000,21716221,150,UUvOh9i-BOFzu51rpj33fGag
10,Kale Brock,"{'title': 'Kale Brock', 'description': 'Filmma...",2012-07-29T09:05:21Z,204000,31626184,447,UULdPicN16eAKPKir8EY1UXQ
8,Surfline,"{'title': 'Surfline', 'description': 'Surfers ...",2006-06-05T06:21:02Z,253000,115020633,2820,UU4i3-yfVazfuqwoz71T79Sw
9,Red Bull Surfing,"{'title': 'Red Bull Surfing', 'description': '...",2019-02-18T17:03:24Z,289000,132912602,970,UU--3c8RqSfAqYBdDjIG3UNA
2,Nic Von Rupp,"{'title': 'Nic Von Rupp', 'description': 'Nic ...",2018-01-07T10:48:05Z,349000,10414827,375,UUnDcnuhRMNxsI14ZnwIh6fQ
6,Ombe Surf,"{'title': 'Ombe Surf', 'description': 'Start S...",2016-07-22T02:23:28Z,36500,4632575,526,UUe_ZLwzh-73vuzoZesJJgkw
1,Nathan Florence,"{'title': 'Nathan Florence', 'description': 'W...",2013-10-24T02:55:42Z,427000,183617743,478,UU6uX6GF5q2JxLjMkyivM2Og


In [None]:
def get_country(dictionary):
    return dictionary.get('country', 'nothing')

channel_data['country'] = channel_data['all'].apply(get_country)

In [None]:
channel_data = channel_data.drop('all', axis=1)
channel_data

Unnamed: 0,channelName,startdate,subscribers,views,totalVideos,playlistId,country
0,How to Rip,2014-12-26T02:20:37Z,141000,17722770,209,UUuZSTHZf3vd7eVehhnotcsg,nothing
1,Nathan Florence,2013-10-24T02:55:42Z,427000,183617743,478,UU6uX6GF5q2JxLjMkyivM2Og,US
2,Nic Von Rupp,2018-01-07T10:48:05Z,349000,10414827,375,UUnDcnuhRMNxsI14ZnwIh6fQ,PT
3,Barefoot Surf,2012-12-19T21:09:25Z,195000,21716221,150,UUvOh9i-BOFzu51rpj33fGag,nothing
4,Kai Lenny,2010-02-25T04:14:35Z,141000,40403333,184,UUmzxts0YGES5tN-oJ9abTQg,US
5,Surf Strength Coach,2010-09-22T01:11:00Z,59300,4748696,284,UUCNZ9zfcszWw_LErqxhuT3g,US
6,Ombe Surf,2016-07-22T02:23:28Z,36500,4632575,526,UUe_ZLwzh-73vuzoZesJJgkw,AU
7,Rip Curl,2005-10-10T15:28:17Z,147000,50583492,2381,UUM7nkBGadxKOa4DAJVFwoWg,nothing
8,Surfline,2006-06-05T06:21:02Z,253000,115020633,2820,UU4i3-yfVazfuqwoz71T79Sw,US
9,Red Bull Surfing,2019-02-18T17:03:24Z,289000,132912602,970,UU--3c8RqSfAqYBdDjIG3UNA,AT


In [None]:
video_df = pd.DataFrame()

for c in channel_data['channelName'].unique():
    print("Getting video information from channel: " + c)
    playlist_id = channel_data.loc[channel_data['channelName']== c, 'playlistId'].iloc[0]
    video_ids = get_video_ids(youtube, playlist_id)

    # get video data
    video_data = get_video_details(youtube, video_ids)

    # append video data together and comment data toghether
    video_df = video_df.append(video_data, ignore_index=True)

Getting video information from channel: How to Rip


  video_df = video_df.append(video_data, ignore_index=True)


Getting video information from channel: Nathan Florence


  video_df = video_df.append(video_data, ignore_index=True)


Getting video information from channel: Nic Von Rupp


  video_df = video_df.append(video_data, ignore_index=True)


Getting video information from channel: Barefoot Surf


  video_df = video_df.append(video_data, ignore_index=True)


Getting video information from channel: Kai Lenny


  video_df = video_df.append(video_data, ignore_index=True)


Getting video information from channel: Surf Strength Coach


  video_df = video_df.append(video_data, ignore_index=True)


Getting video information from channel: Ombe Surf


  video_df = video_df.append(video_data, ignore_index=True)


Getting video information from channel: Rip Curl


  video_df = video_df.append(video_data, ignore_index=True)


Getting video information from channel: Surfline


  video_df = video_df.append(video_data, ignore_index=True)


Getting video information from channel: Red Bull Surfing


  video_df = video_df.append(video_data, ignore_index=True)


Getting video information from channel: Kale Brock


  video_df = video_df.append(video_data, ignore_index=True)


#**Step 2: Data Cleaning**

A good amount of cleaning relates to the datetime variables. In another notebook I do a good amount of EDA to see how these YouTube channels evolve over time.

In [None]:
video_df

Unnamed: 0.1,Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption
0,0,pWGWGlDECkE,How to Rip,"Surfing Will Change Your Life For The Better, ...",Ryan @TheSurfersJourney shares a powerful test...,,2024-02-18T23:30:07Z,552.0,23.0,,1.0,PT1M22S,hd,False
1,1,S-YPTrf4t6w,How to Rip,Surfing + Corporate Development = The Perfect ...,Experience a unique 9-hour corporate team deve...,,2024-02-15T23:30:07Z,309.0,5.0,,0.0,PT59S,hd,False
2,2,jZqnHsFMRjo,How to Rip,Using Surf Therapy To Help Frontline Workers T...,You can buy tickets to the event using the bel...,,2024-02-14T02:29:53Z,353.0,10.0,,0.0,PT1M,hd,False
3,3,WTe2IFcX0J8,How to Rip,Surfer Escapes Into Nature,Adventure + surfing + good company = the perfe...,,2024-02-12T23:30:06Z,1083.0,30.0,,0.0,PT48S,hd,False
4,4,GxrQuRe-aWI,How to Rip,Surfer Improves Their Pop Up x100 In One Surf,My client recently improved his pop up in one ...,,2024-02-05T02:01:35Z,1563.0,30.0,,0.0,PT34S,hd,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8816,8816,RwDWzF4hp1M,Kale Brock,Le Turtles Sri Lanka,Turtles are rad creatures - in our quest to fi...,"['Sri Lanka (Country)', 'hikkaduwa', 'turtles'...",2015-01-06T09:47:34Z,341.0,5.0,,1.0,PT1M51S,hd,False
8817,8817,btYukFVnQRo,Kale Brock,A Dolphin Attacked Me With It's Fin,"Here, a dolphin attacked me.... he thought it ...","['dolphin', 'dolphins', 'nature', 'ocean', 'wi...",2014-05-09T02:29:11Z,1486.0,32.0,,3.0,PT57S,hd,False
8818,8818,Lg5QQENsDt0,Kale Brock,Surfing In Adelaide,MY INSTAGRAM - http://bit.ly/kalesbroccoli\n\n...,"['leSouth', 'surfing', 'surf', 'surfer', 'wave...",2014-05-09T01:47:08Z,14973.0,80.0,,9.0,PT1M35S,hd,False
8819,8819,HZehqKQiRQ8,Kale Brock,Surfing in Margaret River,Easter 2014 surfing in the Margaret River Wine...,"['Margaret River (City/Town/Village)', 'Surfin...",2014-04-27T09:58:25Z,1290.0,33.0,,3.0,PT4M1S,hd,False


In [None]:
video_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8821 entries, 0 to 8820
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      8821 non-null   int64  
 1   video_id        8821 non-null   object 
 2   channelTitle    8821 non-null   object 
 3   title           8821 non-null   object 
 4   description     7891 non-null   object 
 5   tags            7097 non-null   object 
 6   publishedAt     8821 non-null   object 
 7   viewCount       8820 non-null   float64
 8   likeCount       8808 non-null   float64
 9   favouriteCount  0 non-null      float64
 10  commentCount    8786 non-null   float64
 11  duration        8821 non-null   object 
 12  definition      8821 non-null   object 
 13  caption         8821 non-null   bool   
dtypes: bool(1), float64(4), int64(1), object(8)
memory usage: 904.6+ KB


In [None]:
video_df['definition'].value_counts()

hd    8423
sd     398
Name: definition, dtype: int64

In [None]:
video_df['caption'].value_counts()

False    8680
True      141
Name: caption, dtype: int64

In [None]:
video_df = video_df.drop(['Unnamed: 0', 'favouriteCount'], axis=1)

## Cleaning Date and Time related columns

In [None]:
video_df['date'] = pd.to_datetime(video_df['publishedAt'])
video_df['month_year'] = pd.to_datetime(video_df['date']).dt.strftime('%Y-%m')
video_df['day'] = pd.to_datetime(video_df['date']).dt.strftime('%A')
video_df['month'] = pd.to_datetime(video_df['month_year']).dt.strftime('%m')

In [None]:
video_df['duration'] = video_df['duration'].str.replace(r'PT', '', regex=True)
video_df['duration'] = video_df['duration'].str.replace(r'S', '', regex=True)

In [None]:
video_df['duration'] = video_df['duration'].str.replace('H', ':')
video_df['duration'] = video_df['duration'].str.replace('M', ':')
video_df['duration_split'] = video_df['duration'].str.split(':')

In [None]:
def create_duration(list_of_numbers):

  if len(list_of_numbers)==3 and list_of_numbers[2]=='':
    return int(list_of_numbers[0])*60 + int(list_of_numbers[1])

  elif len(list_of_numbers)==3 and list_of_numbers[2]!='':
    return int(list_of_numbers[0])*60 + int(list_of_numbers[1]) + int(list_of_numbers[2])/60

  elif len(list_of_numbers)==2 and list_of_numbers[1]=='':
    return int(list_of_numbers[0])

  elif len(list_of_numbers)==2 and list_of_numbers[1]!='':
    return int(list_of_numbers[0]) + int(list_of_numbers[1])/60

  elif len(list_of_numbers)==1 and list_of_numbers[0] == 'P0D':
    return 0
  else:
    return int(list_of_numbers[0])/60

In [None]:
video_df['video_length']= video_df['duration_split'].apply(create_duration)
video_df = video_df.drop(['date', 'duration_split'], axis=1)

In [None]:
channel_df

Unnamed: 0,channelName,startdate,subscribers,views,totalVideos,playlistId,country
0,How to Rip,2014-12-26T02:20:37Z,141000,17722770,209,UUuZSTHZf3vd7eVehhnotcsg,nothing
1,Nathan Florence,2013-10-24T02:55:42Z,427000,183617743,478,UU6uX6GF5q2JxLjMkyivM2Og,US
2,Nic Von Rupp,2018-01-07T10:48:05Z,349000,10414827,375,UUnDcnuhRMNxsI14ZnwIh6fQ,PT
3,Barefoot Surf,2012-12-19T21:09:25Z,195000,21716221,150,UUvOh9i-BOFzu51rpj33fGag,nothing
4,Kai Lenny,2010-02-25T04:14:35Z,141000,40403333,184,UUmzxts0YGES5tN-oJ9abTQg,US
5,Surf Strength Coach,2010-09-22T01:11:00Z,59300,4748696,284,UUCNZ9zfcszWw_LErqxhuT3g,US
6,Ombe Surf,2016-07-22T02:23:28Z,36500,4632575,526,UUe_ZLwzh-73vuzoZesJJgkw,AU
7,Rip Curl,2005-10-10T15:28:17Z,147000,50583492,2381,UUM7nkBGadxKOa4DAJVFwoWg,nothing
8,Surfline,2006-06-05T06:21:02Z,253000,115020633,2820,UU4i3-yfVazfuqwoz71T79Sw,US
9,Red Bull Surfing,2019-02-18T17:03:24Z,289000,132912602,970,UU--3c8RqSfAqYBdDjIG3UNA,AT


In [None]:
channel_df['foundingdate'] = pd.to_datetime(channel_df['startdate']).dt.strftime('%Y-%m')
channel_df = channel_df.drop(['startdate'], axis=1)

In [None]:
channel_df.to_csv('channel_df.csv', index=False)
video_df.to_csv('video_df.csv', index=False)