# **Dependencies**

# Imports

In [5]:
import numpy as np
import pandas as pd
import grequests
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

from urllib import parse as url

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dasan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Constants

In [9]:
MAX_YT_SEARCH_RESULTS = 5

API_KEY = 'AIzaSyCqTnKWWl26r2_FbJfUlk4WjaFcvLgivUo'
ACCESS_TOKEN = 'ya29.a0Ad52N389v24Zohy1pE8uKQsqXm-Zz0S5JDp7Aw1sFAhj3ShjtFi4wLpWwwtGGXGFp7uS9uou3TwXHpFpORv7uFgd3cYfmXFvDq2-myfeqfMJoflE1qp9iw6ZVwGb7FxI_d0RztGPYj6-8XNBpvs5dUjqjD8JPFMtQfAaCgYKAdkSARASFQHGX2MidN2FCOGU0OwYG1CMGrkYMA0170'

API_VERSION = 'v3'
API_NAME = 'youtube'

SCHEME = "https"
NETLOC = f'youtube.googleapis.com/{API_NAME}/{API_VERSION}'
YT_SEARCH_ENDPOINT = "/search"
YT_VIDEOS_ENDPOINT = "/videos"

# **Utility Functions**

## Function to remove any stopwords that might occur in tags

In [None]:
stop_words = set(stopwords.words(stopwords.fileids()))

def remove_stopword_tags(videos):
    for video in videos:
        if 'video_tags' in video.keys() and video['video_tags'] is not None:
            video['video_tags'] = list(filter(lambda x:x not in stop_words, video['video_tags']))
    return videos

## Function to stem the tags to their base/root words

In [None]:
ps = PorterStemmer()

def stem_tags(videos):
    for video in videos:
        if 'video_tags' in video.keys() and video['video_tags'] is not None:
            video['video_tags'] = [ps.stem(tag) for tag in video['video_tags']]
    return videos

In [15]:
def prepare_link_for_search_request(region_code, order, channel_id=None, search_query=None): # order : viewCount | date
    params = {
        'type' : 'video',
        'part' : 'id',
        'maxResults' : MAX_YT_SEARCH_RESULTS,
        'order' : order,
        'regionCode' : region_code,
        'key' : API_KEY
    }
    if(channel_id != None):
        params['channelId'] = channel_id
    if(search_query != None):
        params['q'] = search_query

    url_str = url.urlunparse((SCHEME, NETLOC, YT_SEARCH_ENDPOINT, None, url.urlencode(params), None))
    return url_str


In [72]:
def prepare_link_for_video_request(video_ids): 
    params = {
        'part' : 'snippet,id',
        'id' : ','.join(video_ids),
        'key' : API_KEY
    }

    url_str = url.urlunparse((SCHEME, NETLOC, YT_SEARCH_ENDPOINT, None, url.urlencode(params), None))
    return url_str

# **Implementations**

## Recommendation System Utilities

### Recommend videos by **Relevant Tags**

In [73]:
def get_recommendation_by_tags(watched_tags) :
    if len(watched_tags) == 0 :
        return None

    vectorizer = TfidfVectorizer(max_features=250)

    vectorized_tags = vectorizer.fit_transform(watched_tags)
    vectorized_tags_DF = pd.DataFrame(vectorized_tags.toarray(), index=watched_tags, columns=vectorizer.get_feature_names_out())
    vectorized_tags_DF.loc[len(vectorized_tags_DF.index)] = abs(vectorized_tags_DF.sum() - 1)

    sorted_tags = (vectorized_tags_DF.iloc[-1].sort_values(ascending=False))
    reduced_sorted_tags = sorted_tags[0: 10 if len(sorted_tags) > 10 else len(sorted_tags)]

    header = {'Authorization': f'Bearer {ACCESS_TOKEN}'}
    videos_to_search_by_top_tags = [prepare_link_for_search_request('IN', 'date', search_query=single_tag) if np.random.rand() > 0.5 else prepare_link_for_search_request('IN', 'viewCount', search_query=single_tag) for single_tag in reduced_sorted_tags.index]
    search_results = grequests.map(grequests.get(u) for u in videos_to_search_by_top_tags)

    array_flatten = []
    for array_item in [response.json()['items'] for response in search_results] :
        array_flatten = array_flatten + array_item
    video_ids = [item['videoId'] for item in [item['id'] for item in array_flatten]]

    video_search_results = grequests.map(grequests.get(u) for u in [prepare_link_for_video_request(video_ids)])

    return video_search_results

responses = get_recommendation_by_tags(['requests', 'python', 'code', 'ninja'])
responses

[<Response [200]>]

  with loop.timer(seconds, ref=ref) as t:


In [74]:
responses[0].json()

{'kind': 'youtube#searchListResponse',
 'etag': 'jOJpyoCKQvPcaFs9qjKNEo4z3UI',
 'nextPageToken': 'CAUQAA',
 'regionCode': 'IN',
 'pageInfo': {'totalResults': 1000000, 'resultsPerPage': 5},
 'items': [{'kind': 'youtube#searchResult',
   'etag': 'IwW4_X1J2bYMkmBN97nZ7NxbVwA',
   'id': {'kind': 'youtube#video', 'videoId': 'RS4Fl_7BryE'},
   'snippet': {'publishedAt': '2023-05-22T22:24:43Z',
    'channelId': 'UCUT1S-W5BIBUu_vzwhyGNVg',
    'title': 'Bazen yanlış oturursun 😳💈',
    'description': '',
    'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/RS4Fl_7BryE/default.jpg',
      'width': 120,
      'height': 90},
     'medium': {'url': 'https://i.ytimg.com/vi/RS4Fl_7BryE/mqdefault.jpg',
      'width': 320,
      'height': 180},
     'high': {'url': 'https://i.ytimg.com/vi/RS4Fl_7BryE/hqdefault.jpg',
      'width': 480,
      'height': 360}},
    'channelTitle': 'Barberstown Tv',
    'liveBroadcastContent': 'none',
    'publishTime': '2023-05-22T22:24:43Z'}},
  {'kind': 'youtub