# YouTube Video Title Word Frequency Analysis

In [None]:
#needed libraries installation
#http requests
!pip install requests
#natural language processing
!pip install nlpk
#to remove stop words
!pip install stop_words
#word cloud graphical representation
!pip install wordcloud

In [None]:
#imports from built-in and above libraries
import requests
import json
import nltk
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
from wordcloud import WordCloud

In [None]:
#youtube api credentials
#replace the placeholder with your own key; it is sent as the "key" query
#parameter of every youtube/v3/search request made by this notebook
#avoid committing a real key together with the notebook
api_key = "<YOUR API KEY>"

In [None]:
#function used to make a youtube api search request
#if you don't set any region code the default is US
def fetchYoutubeVideosTitles(keyword, token = '', region_code = 'us'):
    """Fetch one page of YouTube search results and return the video titles.

    Args:
        keyword: search query, sent as the ``q`` parameter.
        token: page token returned by a previous call; an empty string
            requests the first page.
        region_code: value sent as the ``regionCode`` parameter.

    Returns:
        A tuple ``(titles, next_page_token)``. ``next_page_token`` is ``''``
        when the response carries no ``nextPageToken`` (no further pages).

    Raises:
        requests.HTTPError: if the API answers with a non-success status
            (invalid key, exhausted quota, ...).
    """
    import html

    #let requests build the query string so that the keyword is URL-encoded
    #(spaces, '&', '#', non-ascii characters would otherwise corrupt the URL)
    query_params = {
        'q': keyword,  #use the argument, not the notebook-level search_term
        'part': 'snippet',
        'type': 'video',
        'maxResults': 50,
        'key': api_key,
        'regionCode': region_code,
    }
    if token:
        query_params['pageToken'] = token

    r = requests.get(
        "https://www.googleapis.com/youtube/v3/search",
        params=query_params,
        timeout=30,  #do not hang forever on a stalled connection
    )
    #fail with a clear HTTP error instead of a KeyError on a missing 'items'
    r.raise_for_status()
    response_json = r.json()

    #the search endpoint is known to return HTML-escaped titles
    #(e.g. "&amp;", "&#39;"); unescape them so that entity names do not
    #show up as words in the frequency analysis
    local_titles = [
        html.unescape(item['snippet']['title'])
        for item in response_json.get('items', [])
    ]

    return (local_titles, response_json.get('nextPageToken', ''))

In [None]:
#collect the titles of the youtube videos matching the search term below
#results are paginated (50 per request), so keep requesting pages and
#accumulating their titles until the api stops returning a next page token
search_term = "<YOUR SEARCH TERM>"
print("Search Youtube for videos related to {}...".format(search_term))

titles = []
page_token = ''
api_calls_count = 0

has_more_pages = True
while has_more_pages:
    print("Search Youtube for videos related to {}... Iteration {}".format(search_term, api_calls_count))

    #each call returns (titles of this page, token of the next page or '')
    page_titles, page_token = fetchYoutubeVideosTitles(search_term, page_token)
    titles += page_titles
    print("Search Youtube for videos related to {}... Next Page Token = {}.".format(search_term, page_token) )

    #an empty token means the last page has just been consumed
    has_more_pages = page_token != ''
    if has_more_pages:
        api_calls_count += 1

print("Search Youtube for videos related to {}. DONE.".format(search_term))

print(titles)

In [None]:
#prepare titles list for words frequency analysis:
#concatenate all the fetched titles into a single space-separated string,
#which is what gets tokenized in the frequency analysis cell
titles_words_joined = " ".join(titles)
#bare expression so that the notebook displays the joined text as cell output
titles_words_joined

In [None]:
#download the nltk data packages used by this notebook, then pull in
#everything exported by nltk.book (FreqDist, used further down, comes from here)
for corpus_name in (
    'gutenberg',
    'genesis',
    'inaugural',
    'nps_chat',
    'webtext',
    'treebank',
    'punkt',
):
    nltk.download(corpus_name)

from nltk.book import *

In [None]:
#use this function to filter out numbers from the text to analyze
#from: https://www.pythoncentral.io/how-to-check-if-a-string-is-a-number-in-python-including-unicode/
def is_number(s):
    """Tell whether the string ``s`` represents a number.

    ``s`` counts as a number when ``float(s)`` accepts it or, failing that,
    when it is a single character with a Unicode numeric value (e.g. a
    vulgar fraction). Returns True or False accordingly.
    """
    try:
        float(s)
    except ValueError:
        #not parseable as a float: fall back to the unicode numeric lookup
        import unicodedata
        try:
            unicodedata.numeric(s)
        except (TypeError, ValueError):
            #TypeError: not a single character; ValueError: no numeric value
            return False
    return True

In [None]:
#frequency analysis (many lines below from https://onlinecoursetutorials.com/nlp/how-to-remove-punctuation-in-python-nltk/)
#split the joined titles into runs of word characters, which drops punctuation
tokenizer = RegexpTokenizer(r'\w+')

words = tokenizer.tokenize(titles_words_joined)

#remove stop words
nltk.download('stopwords')

from stop_words import get_stop_words
from nltk.corpus import stopwords

stop_words = list(get_stop_words('en')) #put here the right country iso code for the titles language    
nltk_words = list(stopwords.words('english')) #put here the right language 

#in my stop words put, if needed otherwise leave empty, the words not filtered
#using the two lines above that you want to filter out
my_stop_words = ['any_word_you_want_to_filter_out']
stop_words.extend(nltk_words)
stop_words.extend(my_stop_words)

#stop word lists are normally lowercase while title words are often capitalized
#("The", "How", ...): compare lowercased tokens so those are filtered out too.
#a set also makes each membership test constant-time instead of a list scan
stop_words_lookup = {stop_word.lower() for stop_word in stop_words}

#the len(w)>2 can be removed from the conditions below if you want to do frequency analysis of 
#those so short words(not filtered out as stop words) if any
#kept words retain their original casing
output = [w for w in words if (w.lower() not in stop_words_lookup and not is_number(w) and len(w)>2)] 

#count how many times each remaining word occurs
freqDist = FreqDist(output)
freqDist

In [None]:
#plot words frequency of the youtube video titles
#the argument limits the plot to the 10 most frequent words
freqDist.plot(10)

In [None]:
#draw word cloud graph (from https://stackoverflow.com/questions/16645799/how-to-create-a-word-cloud-from-a-corpus-in-python)
def show_wordcloud(data, title = None):
    """Render a word cloud for ``data`` with matplotlib.

    Args:
        data: the text to draw. Either a single string, or an iterable of
            tokens (such as the filtered word list), which is joined with
            spaces. Any other object is converted with ``str``.
        title: optional figure title; nothing is drawn when it is falsy.
    """
    #build plain text for WordCloud: str() of a list would feed it the python
    #repr (brackets, quotes, commas) rather than the words themselves
    if isinstance(data, str):
        text = data
    else:
        try:
            text = " ".join(str(token) for token in data)
        except TypeError:
            #not iterable: keep the original str() conversion
            text = str(data)

    wordcloud = WordCloud(
        background_color='white',
        max_words=200,
        max_font_size=40, 
        scale=3,
        random_state=1 #fixed seed so that the layout is reproducible
    ).generate(text)

    fig = plt.figure(1, figsize=(12, 12))
    plt.axis('off')
    if title: 
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()
    
show_wordcloud(output)