From 209705cd284899d5ff0aab19e71d6f599b12363d Mon Sep 17 00:00:00 2001 From: Elias Dabbas Date: Fri, 22 Mar 2024 00:54:44 +0300 Subject: [PATCH] Remove the relatedSite paramter from serp_goog - deprecated --- advertools/serp.py | 1952 +++++++++++++++++++++++++++++++++----------- 1 file changed, 1475 insertions(+), 477 deletions(-) diff --git a/advertools/serp.py b/advertools/serp.py index 5a528506..0d5878da 100644 --- a/advertools/serp.py +++ b/advertools/serp.py @@ -47,10 +47,17 @@ """ -__all__ = ['SERP_GOOG_VALID_VALS', 'YOUTUBE_TOPIC_IDS', - 'YOUTUBE_VID_CATEGORY_IDS', 'serp_goog', 'serp_youtube', - 'set_logging_level', 'youtube_channel_details', - 'youtube_video_details'] + +__all__ = [ + "SERP_GOOG_VALID_VALS", + "YOUTUBE_TOPIC_IDS", + "YOUTUBE_VID_CATEGORY_IDS", + "serp_goog", + "serp_youtube", + "set_logging_level", + "youtube_channel_details", + "youtube_video_details", +] import datetime import logging @@ -65,8 +72,10 @@ import requests -SERP_GOOG_LOG_FMT = ('%(asctime)s | %(levelname)s | %(filename)s:%(lineno)d ' - '| %(funcName)s | %(message)s') +SERP_GOOG_LOG_FMT = ( + "%(asctime)s | %(levelname)s | %(filename)s:%(lineno)d " + "| %(funcName)s | %(message)s" +) logging.basicConfig(format=SERP_GOOG_LOG_FMT) @@ -77,165 +86,691 @@ SERP_GOOG_VALID_VALS = dict( fileType={ - 'bas', 'c', 'cc', 'cpp', 'cs', 'cxx', 'doc', 'docx', 'dwf', 'gpx', - 'h', 'hpp', 'htm', 'html', 'hwp', 'java', 'kml', 'kmz', 'odp', 'ods', - 'odt', 'pdf', 'pl', 'ppt', 'pptx', 'ps', 'py', 'rtf', 'svg', 'swf', - 'tex', 'text', 'txt', 'wap', 'wml', 'xls', 'xlsx', 'xml', + "bas", + "c", + "cc", + "cpp", + "cs", + "cxx", + "doc", + "docx", + "dwf", + "gpx", + "h", + "hpp", + "htm", + "html", + "hwp", + "java", + "kml", + "kmz", + "odp", + "ods", + "odt", + "pdf", + "pl", + "ppt", + "pptx", + "ps", + "py", + "rtf", + "svg", + "swf", + "tex", + "text", + "txt", + "wap", + "wml", + "xls", + "xlsx", + "xml", }, - c2coff={0, 1}, - cr={ - 'countryAF', 'countryAL', 'countryDZ', 'countryAS', 'countryAD', - 'countryAO', 'countryAI', 'countryAQ', 'countryAG', 'countryAR', - 'countryAM', 'countryAW', 'countryAU', 'countryAT', 'countryAZ', - 'countryBS', 'countryBH', 'countryBD', 'countryBB', 'countryBY', - 'countryBE', 'countryBZ', 'countryBJ', 'countryBM', 'countryBT', - 'countryBO', 'countryBA', 'countryBW', 'countryBV', 'countryBR', - 'countryIO', 'countryBN', 'countryBG', 'countryBF', 'countryBI', - 'countryKH', 'countryCM', 'countryCA', 'countryCV', 'countryKY', - 'countryCF', 'countryTD', 'countryCL', 'countryCN', 'countryCX', - 'countryCC', 'countryCO', 'countryKM', 'countryCG', 'countryCD', - 'countryCK', 'countryCR', 'countryCI', 'countryHR', 'countryCU', - 'countryCY', 'countryCZ', 'countryDK', 'countryDJ', 'countryDM', - 'countryDO', 'countryTP', 'countryEC', 'countryEG', 'countrySV', - 'countryGQ', 'countryER', 'countryEE', 'countryET', 'countryEU', - 'countryFK', 'countryFO', 'countryFJ', 'countryFI', 'countryFR', - 'countryFX', 'countryGF', 'countryPF', 'countryTF', 'countryGA', - 'countryGM', 'countryGE', 'countryDE', 'countryGH', 'countryGI', - 'countryGR', 'countryGL', 'countryGD', 'countryGP', 'countryGU', - 'countryGT', 'countryGN', 'countryGW', 'countryGY', 'countryHT', - 'countryHM', 'countryVA', 'countryHN', 'countryHK', 'countryHU', - 'countryIS', 'countryIN', 'countryID', 'countryIR', 'countryIQ', - 'countryIE', 'countryIL', 'countryIT', 'countryJM', 'countryJP', - 'countryJO', 'countryKZ', 'countryKE', 'countryKI', 'countryKP', - 'countryKR', 'countryKW', 'countryKG', 'countryLA', 'countryLV', - 'countryLB', 'countryLS', 'countryLR', 'countryLY', 'countryLI', - 'countryLT', 'countryLU', 'countryMO', 'countryMK', 'countryMG', - 'countryMW', 'countryMY', 'countryMV', 'countryML', 'countryMT', - 'countryMH', 'countryMQ', 'countryMR', 'countryMU', 'countryYT', - 'countryMX', 'countryFM', 'countryMD', 'countryMC', 'countryMN', - 'countryMS', 'countryMA', 'countryMZ', 'countryMM', 'countryNA', - 'countryNR', 'countryNP', 'countryNL', 'countryAN', 'countryNC', - 'countryNZ', 'countryNI', 'countryNE', 'countryNG', 'countryNU', - 'countryNF', 'countryMP', 'countryNO', 'countryOM', 'countryPK', - 'countryPW', 'countryPS', 'countryPA', 'countryPG', 'countryPY', - 'countryPE', 'countryPH', 'countryPN', 'countryPL', 'countryPT', - 'countryPR', 'countryQA', 'countryRE', 'countryRO', 'countryRU', - 'countryRW', 'countrySH', 'countryKN', 'countryLC', 'countryPM', - 'countryVC', 'countryWS', 'countrySM', 'countryST', 'countrySA', - 'countrySN', 'countryCS', 'countrySC', 'countrySL', 'countrySG', - 'countrySK', 'countrySI', 'countrySB', 'countrySO', 'countryZA', - 'countryGS', 'countryES', 'countryLK', 'countrySD', 'countrySR', - 'countrySJ', 'countrySZ', 'countrySE', 'countryCH', 'countrySY', - 'countryTW', 'countryTJ', 'countryTZ', 'countryTH', 'countryTG', - 'countryTK', 'countryTO', 'countryTT', 'countryTN', 'countryTR', - 'countryTM', 'countryTC', 'countryTV', 'countryUG', 'countryUA', - 'countryAE', 'countryUK', 'countryUS', 'countryUM', 'countryUY', - 'countryUZ', 'countryVU', 'countryVE', 'countryVN', 'countryVG', - 'countryVI', 'countryWF', 'countryEH', 'countryYE', 'countryYU', - 'countryZM', 'countryZW' + "countryAF", + "countryAL", + "countryDZ", + "countryAS", + "countryAD", + "countryAO", + "countryAI", + "countryAQ", + "countryAG", + "countryAR", + "countryAM", + "countryAW", + "countryAU", + "countryAT", + "countryAZ", + "countryBS", + "countryBH", + "countryBD", + "countryBB", + "countryBY", + "countryBE", + "countryBZ", + "countryBJ", + "countryBM", + "countryBT", + "countryBO", + "countryBA", + "countryBW", + "countryBV", + "countryBR", + "countryIO", + "countryBN", + "countryBG", + "countryBF", + "countryBI", + "countryKH", + "countryCM", + "countryCA", + "countryCV", + "countryKY", + "countryCF", + "countryTD", + "countryCL", + "countryCN", + "countryCX", + "countryCC", + "countryCO", + "countryKM", + "countryCG", + "countryCD", + "countryCK", + "countryCR", + "countryCI", + "countryHR", + "countryCU", + "countryCY", + "countryCZ", + "countryDK", + "countryDJ", + "countryDM", + "countryDO", + "countryTP", + "countryEC", + "countryEG", + "countrySV", + "countryGQ", + "countryER", + "countryEE", + "countryET", + "countryEU", + "countryFK", + "countryFO", + "countryFJ", + "countryFI", + "countryFR", + "countryFX", + "countryGF", + "countryPF", + "countryTF", + "countryGA", + "countryGM", + "countryGE", + "countryDE", + "countryGH", + "countryGI", + "countryGR", + "countryGL", + "countryGD", + "countryGP", + "countryGU", + "countryGT", + "countryGN", + "countryGW", + "countryGY", + "countryHT", + "countryHM", + "countryVA", + "countryHN", + "countryHK", + "countryHU", + "countryIS", + "countryIN", + "countryID", + "countryIR", + "countryIQ", + "countryIE", + "countryIL", + "countryIT", + "countryJM", + "countryJP", + "countryJO", + "countryKZ", + "countryKE", + "countryKI", + "countryKP", + "countryKR", + "countryKW", + "countryKG", + "countryLA", + "countryLV", + "countryLB", + "countryLS", + "countryLR", + "countryLY", + "countryLI", + "countryLT", + "countryLU", + "countryMO", + "countryMK", + "countryMG", + "countryMW", + "countryMY", + "countryMV", + "countryML", + "countryMT", + "countryMH", + "countryMQ", + "countryMR", + "countryMU", + "countryYT", + "countryMX", + "countryFM", + "countryMD", + "countryMC", + "countryMN", + "countryMS", + "countryMA", + "countryMZ", + "countryMM", + "countryNA", + "countryNR", + "countryNP", + "countryNL", + "countryAN", + "countryNC", + "countryNZ", + "countryNI", + "countryNE", + "countryNG", + "countryNU", + "countryNF", + "countryMP", + "countryNO", + "countryOM", + "countryPK", + "countryPW", + "countryPS", + "countryPA", + "countryPG", + "countryPY", + "countryPE", + "countryPH", + "countryPN", + "countryPL", + "countryPT", + "countryPR", + "countryQA", + "countryRE", + "countryRO", + "countryRU", + "countryRW", + "countrySH", + "countryKN", + "countryLC", + "countryPM", + "countryVC", + "countryWS", + "countrySM", + "countryST", + "countrySA", + "countrySN", + "countryCS", + "countrySC", + "countrySL", + "countrySG", + "countrySK", + "countrySI", + "countrySB", + "countrySO", + "countryZA", + "countryGS", + "countryES", + "countryLK", + "countrySD", + "countrySR", + "countrySJ", + "countrySZ", + "countrySE", + "countryCH", + "countrySY", + "countryTW", + "countryTJ", + "countryTZ", + "countryTH", + "countryTG", + "countryTK", + "countryTO", + "countryTT", + "countryTN", + "countryTR", + "countryTM", + "countryTC", + "countryTV", + "countryUG", + "countryUA", + "countryAE", + "countryUK", + "countryUS", + "countryUM", + "countryUY", + "countryUZ", + "countryVU", + "countryVE", + "countryVN", + "countryVG", + "countryVI", + "countryWF", + "countryEH", + "countryYE", + "countryYU", + "countryZM", + "countryZW", }, - gl={ - 'ad', 'ae', 'af', 'ag', 'ai', 'al', 'am', 'an', 'ao', 'aq', 'ar', - 'as', 'at', 'au', 'aw', 'az', 'ba', 'bb', 'bd', 'be', 'bf', 'bg', - 'bh', 'bi', 'bj', 'bm', 'bn', 'bo', 'br', 'bs', 'bt', 'bv', 'bw', - 'by', 'bz', 'ca', 'cc', 'cd', 'cf', 'cg', 'ch', 'ci', 'ck', 'cl', - 'cm', 'cn', 'co', 'cr', 'cs', 'cu', 'cv', 'cx', 'cy', 'cz', 'de', - 'dj', 'dk', 'dm', 'do', 'dz', 'ec', 'ee', 'eg', 'eh', 'er', 'es', - 'et', 'fi', 'fj', 'fk', 'fm', 'fo', 'fr', 'ga', 'gd', 'ge', 'gf', - 'gh', 'gi', 'gl', 'gm', 'gn', 'gp', 'gq', 'gr', 'gs', 'gt', 'gu', - 'gw', 'gy', 'hk', 'hm', 'hn', 'hr', 'ht', 'hu', 'id', 'ie', 'il', - 'in', 'io', 'iq', 'ir', 'is', 'it', 'jm', 'jo', 'jp', 'ke', 'kg', - 'kh', 'ki', 'km', 'kn', 'kp', 'kr', 'kw', 'ky', 'kz', 'la', 'lb', - 'lc', 'li', 'lk', 'lr', 'ls', 'lt', 'lu', 'lv', 'ly', 'ma', 'mc', - 'md', 'mg', 'mh', 'mk', 'ml', 'mm', 'mn', 'mo', 'mp', 'mq', 'mr', - 'ms', 'mt', 'mu', 'mv', 'mw', 'mx', 'my', 'mz', 'na', 'nc', 'ne', - 'nf', 'ng', 'ni', 'nl', 'no', 'np', 'nr', 'nu', 'nz', 'om', 'pa', - 'pe', 'pf', 'pg', 'ph', 'pk', 'pl', 'pm', 'pn', 'pr', 'ps', 'pt', - 'pw', 'py', 'qa', 're', 'ro', 'ru', 'rw', 'sa', 'sb', 'sc', 'sd', - 'se', 'sg', 'sh', 'si', 'sj', 'sk', 'sl', 'sm', 'sn', 'so', 'sr', - 'st', 'sv', 'sy', 'sz', 'tc', 'td', 'tf', 'tg', 'th', 'tj', 'tk', - 'tl', 'tm', 'tn', 'to', 'tr', 'tt', 'tv', 'tw', 'tz', 'ua', 'ug', - 'uk', 'um', 'us', 'uy', 'uz', 'va', 'vc', 've', 'vg', 'vi', 'vn', - 'vu', 'wf', 'ws', 'ye', 'yt', 'za', 'zm', 'zw', + "ad", + "ae", + "af", + "ag", + "ai", + "al", + "am", + "an", + "ao", + "aq", + "ar", + "as", + "at", + "au", + "aw", + "az", + "ba", + "bb", + "bd", + "be", + "bf", + "bg", + "bh", + "bi", + "bj", + "bm", + "bn", + "bo", + "br", + "bs", + "bt", + "bv", + "bw", + "by", + "bz", + "ca", + "cc", + "cd", + "cf", + "cg", + "ch", + "ci", + "ck", + "cl", + "cm", + "cn", + "co", + "cr", + "cs", + "cu", + "cv", + "cx", + "cy", + "cz", + "de", + "dj", + "dk", + "dm", + "do", + "dz", + "ec", + "ee", + "eg", + "eh", + "er", + "es", + "et", + "fi", + "fj", + "fk", + "fm", + "fo", + "fr", + "ga", + "gd", + "ge", + "gf", + "gh", + "gi", + "gl", + "gm", + "gn", + "gp", + "gq", + "gr", + "gs", + "gt", + "gu", + "gw", + "gy", + "hk", + "hm", + "hn", + "hr", + "ht", + "hu", + "id", + "ie", + "il", + "in", + "io", + "iq", + "ir", + "is", + "it", + "jm", + "jo", + "jp", + "ke", + "kg", + "kh", + "ki", + "km", + "kn", + "kp", + "kr", + "kw", + "ky", + "kz", + "la", + "lb", + "lc", + "li", + "lk", + "lr", + "ls", + "lt", + "lu", + "lv", + "ly", + "ma", + "mc", + "md", + "mg", + "mh", + "mk", + "ml", + "mm", + "mn", + "mo", + "mp", + "mq", + "mr", + "ms", + "mt", + "mu", + "mv", + "mw", + "mx", + "my", + "mz", + "na", + "nc", + "ne", + "nf", + "ng", + "ni", + "nl", + "no", + "np", + "nr", + "nu", + "nz", + "om", + "pa", + "pe", + "pf", + "pg", + "ph", + "pk", + "pl", + "pm", + "pn", + "pr", + "ps", + "pt", + "pw", + "py", + "qa", + "re", + "ro", + "ru", + "rw", + "sa", + "sb", + "sc", + "sd", + "se", + "sg", + "sh", + "si", + "sj", + "sk", + "sl", + "sm", + "sn", + "so", + "sr", + "st", + "sv", + "sy", + "sz", + "tc", + "td", + "tf", + "tg", + "th", + "tj", + "tk", + "tl", + "tm", + "tn", + "to", + "tr", + "tt", + "tv", + "tw", + "tz", + "ua", + "ug", + "uk", + "um", + "us", + "uy", + "uz", + "va", + "vc", + "ve", + "vg", + "vi", + "vn", + "vu", + "wf", + "ws", + "ye", + "yt", + "za", + "zm", + "zw", }, - filter={0, 1}, - hl={ - 'af', 'sq', 'sm', 'ar', 'az', 'eu', 'be', 'bn', 'bh', 'bs', 'bg', - 'ca', 'zh-CN', 'zh-TW', 'hr', 'cs', 'da', 'nl', 'en', 'eo', 'et', - 'fo', 'fi', 'fr', 'fy', 'gl', 'ka', 'de', 'el', 'gu', 'iw', 'hi', - 'hu', 'is', 'id', 'ia', 'ga', 'it', 'ja', 'jw', 'kn', 'ko', 'la', - 'lv', 'lt', 'mk', 'ms', 'ml', 'mt', 'mr', 'ne', 'no', 'nn', 'oc', - 'fa', 'pl', 'pt-BR', 'pt-PT', 'pa', 'ro', 'ru', 'gd', 'sr', 'si', - 'sk', 'sl', 'es', 'su', 'sw', 'sv', 'tl', 'ta', 'te', 'th', 'ti', - 'tr', 'uk', 'ur', 'uz', 'vi', 'cy', 'xh', 'zu' + "af", + "sq", + "sm", + "ar", + "az", + "eu", + "be", + "bn", + "bh", + "bs", + "bg", + "ca", + "zh-CN", + "zh-TW", + "hr", + "cs", + "da", + "nl", + "en", + "eo", + "et", + "fo", + "fi", + "fr", + "fy", + "gl", + "ka", + "de", + "el", + "gu", + "iw", + "hi", + "hu", + "is", + "id", + "ia", + "ga", + "it", + "ja", + "jw", + "kn", + "ko", + "la", + "lv", + "lt", + "mk", + "ms", + "ml", + "mt", + "mr", + "ne", + "no", + "nn", + "oc", + "fa", + "pl", + "pt-BR", + "pt-PT", + "pa", + "ro", + "ru", + "gd", + "sr", + "si", + "sk", + "sl", + "es", + "su", + "sw", + "sv", + "tl", + "ta", + "te", + "th", + "ti", + "tr", + "uk", + "ur", + "uz", + "vi", + "cy", + "xh", + "zu", }, - - imgColorType={ - 'color', 'gray', 'mono', 'trans' - }, - + imgColorType={"color", "gray", "mono", "trans"}, imgDominantColor={ - 'black', - 'blue', - 'brown', - 'gray', - 'green', - 'orange', - 'pink', - 'purple', - 'red', - 'teal', - 'white', - 'yellow', + "black", + "blue", + "brown", + "gray", + "green", + "orange", + "pink", + "purple", + "red", + "teal", + "white", + "yellow", }, - imgSize={ - 'huge', - 'icon', - 'large', - 'medium', - 'small', - 'xlarge', - 'xxlarge', - }, - - imgType={ - 'clipart', - 'face', - 'lineart', - 'stock', - 'photo', - 'animated' + "huge", + "icon", + "large", + "medium", + "small", + "xlarge", + "xxlarge", }, - + imgType={"clipart", "face", "lineart", "stock", "photo", "animated"}, lr={ - 'lang_ar', 'lang_bg', 'lang_ca', 'lang_zh-CN', 'lang_zh-TW', - 'lang_hr', 'lang_cs', 'lang_da', 'lang_nl', 'lang_en', 'lang_et', - 'lang_fi', 'lang_fr', 'lang_de', 'lang_el', 'lang_iw', 'lang_hu', - 'lang_is', 'lang_id', 'lang_it', 'lang_ja', 'lang_ko', 'lang_lv', - 'lang_lt', 'lang_no', 'lang_pl', 'lang_pt', 'lang_ro', 'lang_ru', - 'lang_sr', 'lang_sk', 'lang_sl', 'lang_es', 'lang_sv', 'lang_tr', + "lang_ar", + "lang_bg", + "lang_ca", + "lang_zh-CN", + "lang_zh-TW", + "lang_hr", + "lang_cs", + "lang_da", + "lang_nl", + "lang_en", + "lang_et", + "lang_fi", + "lang_fr", + "lang_de", + "lang_el", + "lang_iw", + "lang_hu", + "lang_is", + "lang_id", + "lang_it", + "lang_ja", + "lang_ko", + "lang_lv", + "lang_lt", + "lang_no", + "lang_pl", + "lang_pt", + "lang_ro", + "lang_ru", + "lang_sr", + "lang_sk", + "lang_sl", + "lang_es", + "lang_sv", + "lang_tr", }, - num={1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, - rights={ - 'cc_publicdomain', 'cc_attribute', 'cc_sharealike', - 'cc_noncommercial', 'cc_nonderived' + "cc_publicdomain", + "cc_attribute", + "cc_sharealike", + "cc_noncommercial", + "cc_nonderived", }, - - safe={'active', 'off'}, - - searchType={None, 'image'}, - - siteSearchFilter={'e', 'i'}, - - start=range(1, 92) + safe={"active", "off"}, + searchType={None, "image"}, + siteSearchFilter={"e", "i"}, + start=range(1, 92), ) @@ -245,208 +780,570 @@ YOUTUBE_TOPIC_IDS = { - 'Entertainment topics': {'Entertainment (parent topic)': '/m/02jjt', - 'Humor': '/m/09kqc', - 'Movies': '/m/02vxn', - 'Performing arts': '/m/05qjc', - 'Professional wrestling': '/m/066wd', - 'TV shows': '/m/0f2f9'}, - 'Gaming topics': {'Action game': '/m/025zzc', - 'Action-adventure game': '/m/02ntfj', - 'Casual game': '/m/0b1vjn', - 'Gaming (parent topic)': '/m/0bzvm2', - 'Music video game': '/m/02hygl', - 'Puzzle video game': '/m/04q1x3q', - 'Racing video game': '/m/01sjng', - 'Role-playing video game': '/m/0403l3g', - 'Simulation video game': '/m/021bp2', - 'Sports game': '/m/022dc6', - 'Strategy video game': '/m/03hf_rm'}, - 'Lifestyle topics': {'Fashion': '/m/032tl', - 'Fitness': '/m/027x7n', - 'Food': '/m/02wbm', - 'Hobby': '/m/03glg', - 'Lifestyle (parent topic)': '/m/019_rr', - 'Pets': '/m/068hy', - 'Physical attractiveness [Beauty]': '/m/041xxh', - 'Technology': '/m/07c1v', - 'Tourism': '/m/07bxq', - 'Vehicles': '/m/07yv9'}, - 'Music topics': {'Christian music': '/m/02mscn', - 'Classical music': '/m/0ggq0m', - 'Country': '/m/01lyv', - 'Electronic music': '/m/02lkt', - 'Hip hop music': '/m/0glt670', - 'Independent music': '/m/05rwpb', - 'Jazz': '/m/03_d0', - 'Music (parent topic)': '/m/04rlf', - 'Music of Asia': '/m/028sqc', - 'Music of Latin America': '/m/0g293', - 'Pop music': '/m/064t9', - 'Reggae': '/m/06cqb', - 'Rhythm and blues': '/m/06j6l', - 'Rock music': '/m/06by7', - 'Soul music': '/m/0gywn'}, - 'Other topics': {'Knowledge': '/m/01k8wb'}, - 'Society topics': {'Business': '/m/09s1f', - 'Health': '/m/0kt51', - 'Military': '/m/01h6rj', - 'Politics': '/m/05qt0', - 'Religion': '/m/06bvp', - 'Society (parent topic)': '/m/098wr'}, - 'Sports topics': {'American football': '/m/0jm_', - 'Baseball': '/m/018jz', - 'Basketball': '/m/018w8', - 'Boxing': '/m/01cgz', - 'Cricket': '/m/09xp_', - 'Football': '/m/02vx4', - 'Golf': '/m/037hz', - 'Ice hockey': '/m/03tmr', - 'Mixed martial arts': '/m/01h7lh', - 'Motorsport': '/m/0410tth', - 'Sports (parent topic)': '/m/06ntj', - 'Tennis': '/m/07bs0', - 'Volleyball': '/m/07_53'} - } + "Entertainment topics": { + "Entertainment (parent topic)": "/m/02jjt", + "Humor": "/m/09kqc", + "Movies": "/m/02vxn", + "Performing arts": "/m/05qjc", + "Professional wrestling": "/m/066wd", + "TV shows": "/m/0f2f9", + }, + "Gaming topics": { + "Action game": "/m/025zzc", + "Action-adventure game": "/m/02ntfj", + "Casual game": "/m/0b1vjn", + "Gaming (parent topic)": "/m/0bzvm2", + "Music video game": "/m/02hygl", + "Puzzle video game": "/m/04q1x3q", + "Racing video game": "/m/01sjng", + "Role-playing video game": "/m/0403l3g", + "Simulation video game": "/m/021bp2", + "Sports game": "/m/022dc6", + "Strategy video game": "/m/03hf_rm", + }, + "Lifestyle topics": { + "Fashion": "/m/032tl", + "Fitness": "/m/027x7n", + "Food": "/m/02wbm", + "Hobby": "/m/03glg", + "Lifestyle (parent topic)": "/m/019_rr", + "Pets": "/m/068hy", + "Physical attractiveness [Beauty]": "/m/041xxh", + "Technology": "/m/07c1v", + "Tourism": "/m/07bxq", + "Vehicles": "/m/07yv9", + }, + "Music topics": { + "Christian music": "/m/02mscn", + "Classical music": "/m/0ggq0m", + "Country": "/m/01lyv", + "Electronic music": "/m/02lkt", + "Hip hop music": "/m/0glt670", + "Independent music": "/m/05rwpb", + "Jazz": "/m/03_d0", + "Music (parent topic)": "/m/04rlf", + "Music of Asia": "/m/028sqc", + "Music of Latin America": "/m/0g293", + "Pop music": "/m/064t9", + "Reggae": "/m/06cqb", + "Rhythm and blues": "/m/06j6l", + "Rock music": "/m/06by7", + "Soul music": "/m/0gywn", + }, + "Other topics": {"Knowledge": "/m/01k8wb"}, + "Society topics": { + "Business": "/m/09s1f", + "Health": "/m/0kt51", + "Military": "/m/01h6rj", + "Politics": "/m/05qt0", + "Religion": "/m/06bvp", + "Society (parent topic)": "/m/098wr", + }, + "Sports topics": { + "American football": "/m/0jm_", + "Baseball": "/m/018jz", + "Basketball": "/m/018w8", + "Boxing": "/m/01cgz", + "Cricket": "/m/09xp_", + "Football": "/m/02vx4", + "Golf": "/m/037hz", + "Ice hockey": "/m/03tmr", + "Mixed martial arts": "/m/01h7lh", + "Motorsport": "/m/0410tth", + "Sports (parent topic)": "/m/06ntj", + "Tennis": "/m/07bs0", + "Volleyball": "/m/07_53", + }, +} YOUTUBE_VID_CATEGORY_IDS = { - 'Action/Adventure': '32', - 'Anime/Animation': '31', - 'Autos & Vehicles': '2', - 'Classics': '33', - 'Comedy': '34', - 'Documentary': '35', - 'Drama': '36', - 'Education': '27', - 'Entertainment': '24', - 'Family': '37', - 'Film & Animation': '1', - 'Foreign': '38', - 'Gaming': '20', - 'Horror': '39', - 'Howto & Style': '26', - 'Movies': '30', - 'Music': '10', - 'News & Politics': '25', - 'Nonprofits & Activism': '29', - 'People & Blogs': '22', - 'Pets & Animals': '15', - 'Sci-Fi/Fantasy': '40', - 'Science & Technology': '28', - 'Short Movies': '18', - 'Shorts': '42', - 'Shows': '43', - 'Sports': '17', - 'Thriller': '41', - 'Trailers': '44', - 'Travel & Events': '19', - 'Videoblogging': '21' + "Action/Adventure": "32", + "Anime/Animation": "31", + "Autos & Vehicles": "2", + "Classics": "33", + "Comedy": "34", + "Documentary": "35", + "Drama": "36", + "Education": "27", + "Entertainment": "24", + "Family": "37", + "Film & Animation": "1", + "Foreign": "38", + "Gaming": "20", + "Horror": "39", + "Howto & Style": "26", + "Movies": "30", + "Music": "10", + "News & Politics": "25", + "Nonprofits & Activism": "29", + "People & Blogs": "22", + "Pets & Animals": "15", + "Sci-Fi/Fantasy": "40", + "Science & Technology": "28", + "Short Movies": "18", + "Shorts": "42", + "Shows": "43", + "Sports": "17", + "Thriller": "41", + "Trailers": "44", + "Travel & Events": "19", + "Videoblogging": "21", } SERP_YTUBE_VALID_VALS = dict( - channelType={'any', 'show'}, - - eventType={'completed', 'live', 'upcoming'}, - - forContentOwner={True, False, 'true', 'false'}, - - forDeveloper={True, False, 'true', 'false'}, - - forMine={True, False, 'true', 'false'}, - + channelType={"any", "show"}, + eventType={"completed", "live", "upcoming"}, + forContentOwner={True, False, "true", "false"}, + forDeveloper={True, False, "true", "false"}, + forMine={True, False, "true", "false"}, maxResults=range(51), - - order={'date', 'rating', 'relevance', 'title', - 'videoCount', 'viewCount'}, - + order={"date", "rating", "relevance", "title", "videoCount", "viewCount"}, regionCode={ - 'ad', 'ae', 'af', 'ag', 'ai', 'al', 'am', 'an', 'ao', 'aq', 'ar', - 'as', 'at', 'au', 'aw', 'az', 'ba', 'bb', 'bd', 'be', 'bf', 'bg', - 'bh', 'bi', 'bj', 'bm', 'bn', 'bo', 'br', 'bs', 'bt', 'bv', 'bw', - 'by', 'bz', 'ca', 'cc', 'cd', 'cf', 'cg', 'ch', 'ci', 'ck', 'cl', - 'cm', 'cn', 'co', 'cr', 'cs', 'cu', 'cv', 'cx', 'cy', 'cz', 'de', - 'dj', 'dk', 'dm', 'do', 'dz', 'ec', 'ee', 'eg', 'eh', 'er', 'es', - 'et', 'fi', 'fj', 'fk', 'fm', 'fo', 'fr', 'ga', 'gd', 'ge', 'gf', - 'gh', 'gi', 'gl', 'gm', 'gn', 'gp', 'gq', 'gr', 'gs', 'gt', 'gu', - 'gw', 'gy', 'hk', 'hm', 'hn', 'hr', 'ht', 'hu', 'id', 'ie', 'il', - 'in', 'io', 'iq', 'ir', 'is', 'it', 'jm', 'jo', 'jp', 'ke', 'kg', - 'kh', 'ki', 'km', 'kn', 'kp', 'kr', 'kw', 'ky', 'kz', 'la', 'lb', - 'lc', 'li', 'lk', 'lr', 'ls', 'lt', 'lu', 'lv', 'ly', 'ma', 'mc', - 'md', 'mg', 'mh', 'mk', 'ml', 'mm', 'mn', 'mo', 'mp', 'mq', 'mr', - 'ms', 'mt', 'mu', 'mv', 'mw', 'mx', 'my', 'mz', 'na', 'nc', 'ne', - 'nf', 'ng', 'ni', 'nl', 'no', 'np', 'nr', 'nu', 'nz', 'om', 'pa', - 'pe', 'pf', 'pg', 'ph', 'pk', 'pl', 'pm', 'pn', 'pr', 'ps', 'pt', - 'pw', 'py', 'qa', 're', 'ro', 'ru', 'rw', 'sa', 'sb', 'sc', 'sd', - 'se', 'sg', 'sh', 'si', 'sj', 'sk', 'sl', 'sm', 'sn', 'so', 'sr', - 'st', 'sv', 'sy', 'sz', 'tc', 'td', 'tf', 'tg', 'th', 'tj', 'tk', - 'tl', 'tm', 'tn', 'to', 'tr', 'tt', 'tv', 'tw', 'tz', 'ua', 'ug', - 'uk', 'um', 'us', 'uy', 'uz', 'va', 'vc', 've', 'vg', 'vi', 'vn', - 'vu', 'wf', 'ws', 'ye', 'yt', 'za', 'zm', 'zw', + "ad", + "ae", + "af", + "ag", + "ai", + "al", + "am", + "an", + "ao", + "aq", + "ar", + "as", + "at", + "au", + "aw", + "az", + "ba", + "bb", + "bd", + "be", + "bf", + "bg", + "bh", + "bi", + "bj", + "bm", + "bn", + "bo", + "br", + "bs", + "bt", + "bv", + "bw", + "by", + "bz", + "ca", + "cc", + "cd", + "cf", + "cg", + "ch", + "ci", + "ck", + "cl", + "cm", + "cn", + "co", + "cr", + "cs", + "cu", + "cv", + "cx", + "cy", + "cz", + "de", + "dj", + "dk", + "dm", + "do", + "dz", + "ec", + "ee", + "eg", + "eh", + "er", + "es", + "et", + "fi", + "fj", + "fk", + "fm", + "fo", + "fr", + "ga", + "gd", + "ge", + "gf", + "gh", + "gi", + "gl", + "gm", + "gn", + "gp", + "gq", + "gr", + "gs", + "gt", + "gu", + "gw", + "gy", + "hk", + "hm", + "hn", + "hr", + "ht", + "hu", + "id", + "ie", + "il", + "in", + "io", + "iq", + "ir", + "is", + "it", + "jm", + "jo", + "jp", + "ke", + "kg", + "kh", + "ki", + "km", + "kn", + "kp", + "kr", + "kw", + "ky", + "kz", + "la", + "lb", + "lc", + "li", + "lk", + "lr", + "ls", + "lt", + "lu", + "lv", + "ly", + "ma", + "mc", + "md", + "mg", + "mh", + "mk", + "ml", + "mm", + "mn", + "mo", + "mp", + "mq", + "mr", + "ms", + "mt", + "mu", + "mv", + "mw", + "mx", + "my", + "mz", + "na", + "nc", + "ne", + "nf", + "ng", + "ni", + "nl", + "no", + "np", + "nr", + "nu", + "nz", + "om", + "pa", + "pe", + "pf", + "pg", + "ph", + "pk", + "pl", + "pm", + "pn", + "pr", + "ps", + "pt", + "pw", + "py", + "qa", + "re", + "ro", + "ru", + "rw", + "sa", + "sb", + "sc", + "sd", + "se", + "sg", + "sh", + "si", + "sj", + "sk", + "sl", + "sm", + "sn", + "so", + "sr", + "st", + "sv", + "sy", + "sz", + "tc", + "td", + "tf", + "tg", + "th", + "tj", + "tk", + "tl", + "tm", + "tn", + "to", + "tr", + "tt", + "tv", + "tw", + "tz", + "ua", + "ug", + "uk", + "um", + "us", + "uy", + "uz", + "va", + "vc", + "ve", + "vg", + "vi", + "vn", + "vu", + "wf", + "ws", + "ye", + "yt", + "za", + "zm", + "zw", }, - relevanceLanguage={ - 'af', 'sq', 'sm', 'ar', 'az', 'eu', 'be', 'bn', 'bh', 'bs', 'bg', - 'ca', 'zh-CN', 'zh-TW', 'zh-Hans', 'zh-Hant', 'hr', 'cs', 'da', - 'nl', 'en', 'eo', 'et', 'fo', 'fi', 'fr', 'fy', 'gl', 'ka', 'de', - 'el', 'gu', 'iw', 'hi', 'hu', 'is', 'id', 'ia', 'ga', 'it', 'ja', - 'jw', 'kn', 'ko', 'la', 'lv', 'lt', 'mk', 'ms', 'ml', 'mt', 'mr', - 'ne', 'no', 'nn', 'oc', 'fa', 'pl', 'pt-BR', 'pt-PT', 'pa', 'ro', - 'ru', 'gd', 'sr', 'si', 'sk', 'sl', 'es', 'su', 'sw', 'sv', 'tl', - 'ta', 'te', 'th', 'ti', 'tr', 'uk', 'ur', 'uz', 'vi', 'cy', 'xh', - 'zu' + "af", + "sq", + "sm", + "ar", + "az", + "eu", + "be", + "bn", + "bh", + "bs", + "bg", + "ca", + "zh-CN", + "zh-TW", + "zh-Hans", + "zh-Hant", + "hr", + "cs", + "da", + "nl", + "en", + "eo", + "et", + "fo", + "fi", + "fr", + "fy", + "gl", + "ka", + "de", + "el", + "gu", + "iw", + "hi", + "hu", + "is", + "id", + "ia", + "ga", + "it", + "ja", + "jw", + "kn", + "ko", + "la", + "lv", + "lt", + "mk", + "ms", + "ml", + "mt", + "mr", + "ne", + "no", + "nn", + "oc", + "fa", + "pl", + "pt-BR", + "pt-PT", + "pa", + "ro", + "ru", + "gd", + "sr", + "si", + "sk", + "sl", + "es", + "su", + "sw", + "sv", + "tl", + "ta", + "te", + "th", + "ti", + "tr", + "uk", + "ur", + "uz", + "vi", + "cy", + "xh", + "zu", }, - - safeSearch={'moderate', 'none', 'strict'}, - + safeSearch={"moderate", "none", "strict"}, topicId={ - '/m/04rlf', '/m/02mscn', '/m/0ggq0m', '/m/01lyv', '/m/02lkt', - '/m/0glt670', '/m/05rwpb', '/m/03_d0', '/m/028sqc', '/m/0g293', - '/m/064t9', '/m/06cqb', '/m/06j6l', '/m/06by7', '/m/0gywn', - '/m/0bzvm2', '/m/025zzc', '/m/02ntfj', '/m/0b1vjn', '/m/02hygl', - '/m/04q1x3q', '/m/01sjng', '/m/0403l3g', '/m/021bp2', '/m/022dc6', - '/m/03hf_rm', '/m/06ntj', '/m/0jm_', '/m/018jz', '/m/018w8', - '/m/01cgz', '/m/09xp_', '/m/02vx4', '/m/037hz', '/m/03tmr', - '/m/01h7lh', '/m/0410tth', '/m/07bs0', '/m/07_53', '/m/02jjt', - '/m/09kqc', '/m/02vxn', '/m/05qjc', '/m/066wd', '/m/0f2f9', - '/m/019_rr', '/m/032tl', '/m/027x7n', '/m/02wbm', '/m/03glg', - '/m/068hy', '/m/041xxh', '/m/07c1v', '/m/07bxq', '/m/07yv9', - '/m/098wr', '/m/09s1f', '/m/0kt51', '/m/01h6rj', '/m/05qt0', - '/m/06bvp', '/m/01k8wb' + "/m/04rlf", + "/m/02mscn", + "/m/0ggq0m", + "/m/01lyv", + "/m/02lkt", + "/m/0glt670", + "/m/05rwpb", + "/m/03_d0", + "/m/028sqc", + "/m/0g293", + "/m/064t9", + "/m/06cqb", + "/m/06j6l", + "/m/06by7", + "/m/0gywn", + "/m/0bzvm2", + "/m/025zzc", + "/m/02ntfj", + "/m/0b1vjn", + "/m/02hygl", + "/m/04q1x3q", + "/m/01sjng", + "/m/0403l3g", + "/m/021bp2", + "/m/022dc6", + "/m/03hf_rm", + "/m/06ntj", + "/m/0jm_", + "/m/018jz", + "/m/018w8", + "/m/01cgz", + "/m/09xp_", + "/m/02vx4", + "/m/037hz", + "/m/03tmr", + "/m/01h7lh", + "/m/0410tth", + "/m/07bs0", + "/m/07_53", + "/m/02jjt", + "/m/09kqc", + "/m/02vxn", + "/m/05qjc", + "/m/066wd", + "/m/0f2f9", + "/m/019_rr", + "/m/032tl", + "/m/027x7n", + "/m/02wbm", + "/m/03glg", + "/m/068hy", + "/m/041xxh", + "/m/07c1v", + "/m/07bxq", + "/m/07yv9", + "/m/098wr", + "/m/09s1f", + "/m/0kt51", + "/m/01h6rj", + "/m/05qt0", + "/m/06bvp", + "/m/01k8wb", }, - - type={'channel', 'playlist', 'video'}, - - videoCaption={'any', 'closedCaption', 'none'}, - + type={"channel", "playlist", "video"}, + videoCaption={"any", "closedCaption", "none"}, videoCategoryId={ - '1', '2', '10', '15', '17', '18', '19', '20', '21', '22', '23', - '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', - '35', '36', '37', '38', '39', '40', '41', '42', '43', '44' + "1", + "2", + "10", + "15", + "17", + "18", + "19", + "20", + "21", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "41", + "42", + "43", + "44", }, - - videoDefinition={'any', 'high', 'standard'}, - - videoDimension={'2d', '3d', 'any'}, - - videoDuration={'any', 'long', 'medium', 'short'}, - - videoEmbeddable={'any', True, 'true'}, - - videoLicense={'any', 'creativeCommon', 'youtube'}, - - videoSyndicated={'any', True, 'true'}, - - videoType={'any', 'episode', 'movie'}, + videoDefinition={"any", "high", "standard"}, + videoDimension={"2d", "3d", "any"}, + videoDuration={"any", "long", "medium", "short"}, + videoEmbeddable={"any", True, "true"}, + videoLicense={"any", "creativeCommon", "youtube"}, + videoSyndicated={"any", True, "true"}, + videoType={"any", "episode", "movie"}, ) def _split_by_comma(s, length=50): """Group a comma-separated string into a list of at-most ``length``-length words each.""" - str_split = s.split(',') + str_split = s.split(",") str_list = [] for i in range(0, len(str_split) + length, length): - temp_str = ','.join(str_split[i:i+length]) + temp_str = ",".join(str_split[i : i + length]) if temp_str: str_list.append(temp_str) return str_list @@ -456,28 +1353,31 @@ def youtube_video_details(key, vid_ids): """Return details of videos for which the ids are given. Assumes ``ids`` is a comma-separated list of video ids with no spaces.""" - base_url = ('https://www.googleapis.com/youtube/v3/videos?part=' - 'contentDetails,id,liveStreamingDetails,localizations,player,' - 'recordingDetails,snippet,statistics,status,topicDetails') + base_url = ( + "https://www.googleapis.com/youtube/v3/videos?part=" + "contentDetails,id,liveStreamingDetails,localizations,player," + "recordingDetails,snippet,statistics,status,topicDetails" + ) vid_ids = _split_by_comma(vid_ids, length=50) final_df = pd.DataFrame() for vid_id in vid_ids: - params = {'id': vid_id, 'key': key} - logging.info(msg='Requesting: ' + 'video details') + params = {"id": vid_id, "key": key} + logging.info(msg="Requesting: " + "video details") video_resp = requests.get(base_url, params=params) if video_resp.status_code >= 400: raise Exception(video_resp.json()) - items_df = pd.DataFrame(video_resp.json()['items']) - details = ['snippet', 'topicDetails', 'statistics', - 'status', 'contentDetails'] + items_df = pd.DataFrame(video_resp.json()["items"]) + details = ["snippet", "topicDetails", "statistics", "status", "contentDetails"] detail_df = pd.DataFrame() for detail in details: try: - detail_df = pd.concat([ - detail_df, - pd.DataFrame([x[detail] for x in - video_resp.json()['items']]) - ], axis=1) + detail_df = pd.concat( + [ + detail_df, + pd.DataFrame([x[detail] for x in video_resp.json()["items"]]), + ], + axis=1, + ) except KeyError: continue temp_df = pd.concat([items_df, detail_df], axis=1) @@ -489,26 +1389,30 @@ def youtube_channel_details(key, channel_ids): """Return details of channels for which the ids are given. Assumes ``ids`` is a comma-separated list of channel ids with no spaces.""" - base_url = ('https://www.googleapis.com/youtube/v3/channels?part=' - 'snippet,contentDetails,statistics') + base_url = ( + "https://www.googleapis.com/youtube/v3/channels?part=" + "snippet,contentDetails,statistics" + ) channel_ids = _split_by_comma(channel_ids, length=50) final_df = pd.DataFrame() for channel_id in channel_ids: - params = {'id': channel_id, 'key': key} - logging.info(msg='Requesting: ' + 'channel details') + params = {"id": channel_id, "key": key} + logging.info(msg="Requesting: " + "channel details") channel_resp = requests.get(base_url, params=params) if channel_resp.status_code >= 400: raise Exception(channel_resp.json()) - items_df = pd.DataFrame(channel_resp.json()['items']) - details = ['snippet', 'statistics', 'contentDetails'] + items_df = pd.DataFrame(channel_resp.json()["items"]) + details = ["snippet", "statistics", "contentDetails"] detail_df = pd.DataFrame() for detail in details: try: - detail_df = pd.concat([ - detail_df, - pd.DataFrame([x[detail] for x in - channel_resp.json()['items']]) - ], axis=1) + detail_df = pd.concat( + [ + detail_df, + pd.DataFrame([x[detail] for x in channel_resp.json()["items"]]), + ], + axis=1, + ) except KeyError: continue temp_df = pd.concat([items_df, detail_df], axis=1) @@ -541,14 +1445,38 @@ def _dict_product(d): return dicts -def serp_goog(q, cx, key, c2coff=None, cr=None, - dateRestrict=None, exactTerms=None, excludeTerms=None, - fileType=None, filter=None, gl=None, highRange=None, - hl=None, hq=None, imgColorType=None, imgDominantColor=None, - imgSize=None, imgType=None, linkSite=None, lowRange=None, - lr=None, num=None, orTerms=None, relatedSite=None, - rights=None, safe=None, searchType=None, siteSearch=None, - siteSearchFilter=None, sort=None, start=None): +def serp_goog( + q, + cx, + key, + c2coff=None, + cr=None, + dateRestrict=None, + exactTerms=None, + excludeTerms=None, + fileType=None, + filter=None, + gl=None, + highRange=None, + hl=None, + hq=None, + imgColorType=None, + imgDominantColor=None, + imgSize=None, + imgType=None, + linkSite=None, + lowRange=None, + lr=None, + num=None, + orTerms=None, + rights=None, + safe=None, + searchType=None, + siteSearch=None, + siteSearchFilter=None, + sort=None, + start=None, +): """Query Google and get search results in a DataFrame. For each parameter, you can supply single or multiple values / arguments. @@ -656,8 +1584,6 @@ def serp_goog(q, cx, key, c2coff=None, cr=None, check for in a document, where each document in the search results must contain at least one of the additional search terms. - :param relatedSite: Specifies that all search results - should be pages that are related to the specified URL. :param rights: Filters based on licensing. Supported values include: cc_publicdomain, cc_attribute, cc_sharealike, cc_noncommercial, cc_nonderived, and @@ -714,107 +1640,136 @@ def serp_goog(q, cx, key, c2coff=None, cr=None, for p in supplied_params: if p in SERP_GOOG_VALID_VALS: if not set(supplied_params[p]).issubset(SERP_GOOG_VALID_VALS[p]): - raise ValueError('Please make sure you provide a' - ' valid value for "{}", valid values:\n' - '{}'.format(p, - sorted(SERP_GOOG_VALID_VALS[p]))) + raise ValueError( + "Please make sure you provide a" + ' valid value for "{}", valid values:\n' + "{}".format(p, sorted(SERP_GOOG_VALID_VALS[p])) + ) params_list = _dict_product(supplied_params) - base_url = 'https://www.googleapis.com/customsearch/v1?' - specified_cols = ['searchTerms', 'rank', 'title', 'snippet', - 'displayLink', 'link', 'queryTime', 'totalResults'] + base_url = "https://www.googleapis.com/customsearch/v1?" + specified_cols = [ + "searchTerms", + "rank", + "title", + "snippet", + "displayLink", + "link", + "queryTime", + "totalResults", + ] responses = [] for param in params_list: - param_log = ', '.join([k + '=' + str(v) for k, v in param.items()]) - logging.info(msg='Requesting: ' + param_log) + param_log = ", ".join([k + "=" + str(v) for k, v in param.items()]) + logging.info(msg="Requesting: " + param_log) resp = requests.get(base_url, params=param) if resp.status_code >= 400: raise Exception(resp.json()) responses.append(resp) result_df = pd.DataFrame() for i, resp in enumerate(responses): - request_metadata = resp.json()['queries']['request'][0] - del request_metadata['title'] - search_info = resp.json()['searchInformation'] - if int(search_info['totalResults']) == 0: + request_metadata = resp.json()["queries"]["request"][0] + del request_metadata["title"] + search_info = resp.json()["searchInformation"] + if int(search_info["totalResults"]) == 0: df = pd.DataFrame(columns=specified_cols, index=range(1)) - df['searchTerms'] = request_metadata['searchTerms'] + df["searchTerms"] = request_metadata["searchTerms"] # These keys don't appear in the response so they have to be # added manually - for missing in ['lr', 'num', 'start', 'c2coff']: + for missing in ["lr", "num", "start", "c2coff"]: if missing in params_list[i]: df[missing] = params_list[i][missing] else: - df = pd.DataFrame(resp.json()['items']) - df['cseName'] = resp.json()['context']['title'] - start_idx = request_metadata['startIndex'] - df['rank'] = range(start_idx, start_idx + len(df)) - for missing in ['lr', 'num', 'start', 'c2coff']: + df = pd.DataFrame(resp.json()["items"]) + df["cseName"] = resp.json()["context"]["title"] + start_idx = request_metadata["startIndex"] + df["rank"] = range(start_idx, start_idx + len(df)) + for missing in ["lr", "num", "start", "c2coff"]: if missing in params_list[i]: df[missing] = params_list[i][missing] meta_columns = {**request_metadata, **search_info} df = df.assign(**meta_columns) - df['queryTime'] = datetime.datetime.now(tz=datetime.timezone.utc) - df['queryTime'] = pd.to_datetime(df['queryTime']) - if 'image' in df: - img_df = json_normalize(df['image']) - img_df.columns = ['image.' + c for c in img_df.columns] + df["queryTime"] = datetime.datetime.now(tz=datetime.timezone.utc) + df["queryTime"] = pd.to_datetime(df["queryTime"]) + if "image" in df: + img_df = json_normalize(df["image"]) + img_df.columns = ["image." + c for c in img_df.columns] df = pd.concat([df, img_df], axis=1) result_df = pd.concat([result_df, df], sort=False, ignore_index=True) - ordered_cols = (list(set(params_list[i]).difference({'q', 'key', 'cx'})) + - specified_cols) + ordered_cols = ( + list(set(params_list[i]).difference({"q", "key", "cx"})) + specified_cols + ) non_ordered = result_df.columns.difference(set(ordered_cols)) final_df = result_df[ordered_cols + list(non_ordered)] - if 'pagemap' in final_df: + if "pagemap" in final_df: pagemap_df = pd.DataFrame() - for p in final_df['pagemap']: + for p in final_df["pagemap"]: try: temp_pagemap_df = json_normalize(p) pagemap_df = pd.concat([pagemap_df, temp_pagemap_df], sort=False) except Exception as e: - temp_pagemap_df = pd.DataFrame({'delete_me': None}, - index=range(1)) + temp_pagemap_df = pd.DataFrame({"delete_me": None}, index=range(1)) pagemap_df = pd.concat([pagemap_df, temp_pagemap_df], sort=False) pagemap_df = pagemap_df.reset_index(drop=True) - if 'delete_me' in pagemap_df: - del pagemap_df['delete_me'] + if "delete_me" in pagemap_df: + del pagemap_df["delete_me"] for col in pagemap_df: if col in final_df: - pagemap_df = pagemap_df.rename(columns={col: 'pagemap_' + col}) + pagemap_df = pagemap_df.rename(columns={col: "pagemap_" + col}) final_df = pd.concat([final_df, pagemap_df], axis=1) - if 'metatags' in pagemap_df: + if "metatags" in pagemap_df: metatag_df = pd.DataFrame() - for m in pagemap_df['metatags']: + for m in pagemap_df["metatags"]: try: temp_metatags_df = json_normalize(m) - metatag_df = pd.concat([metatag_df, temp_metatags_df], - sort=False) + metatag_df = pd.concat([metatag_df, temp_metatags_df], sort=False) except Exception as e: - temp_metatags_df = pd.DataFrame({'delete_me': None}, - index=range(1)) - metatag_df = pd.concat([metatag_df, temp_metatags_df], - sort=False) + temp_metatags_df = pd.DataFrame({"delete_me": None}, index=range(1)) + metatag_df = pd.concat([metatag_df, temp_metatags_df], sort=False) metatag_df = metatag_df.reset_index(drop=True) - if 'delete_me' in metatag_df: - del metatag_df['delete_me'] + if "delete_me" in metatag_df: + del metatag_df["delete_me"] for col in metatag_df: if col in final_df: - metatag_df = metatag_df.rename(columns={col: 'metatag_' + col}) + metatag_df = metatag_df.rename(columns={col: "metatag_" + col}) final_df = pd.concat([final_df, metatag_df], axis=1) return final_df -def serp_youtube(key, q=None, channelId=None, channelType=None, eventType=None, - forContentOwner=None, forDeveloper=None, forMine=None, - location=None, locationRadius=None, maxResults=None, - onBehalfOfContentOwner=None, order=None, pageToken=None, - publishedAfter=None, publishedBefore=None, regionCode=None, - relatedToVideoId=None, relevanceLanguage=None, - safeSearch=None, topicId=None, type=None, videoCaption=None, - videoCategoryId=None, videoDefinition=None, - videoDimension=None, videoDuration=None, videoEmbeddable=None, - videoLicense=None, videoSyndicated=None, videoType=None): +def serp_youtube( + key, + q=None, + channelId=None, + channelType=None, + eventType=None, + forContentOwner=None, + forDeveloper=None, + forMine=None, + location=None, + locationRadius=None, + maxResults=None, + onBehalfOfContentOwner=None, + order=None, + pageToken=None, + publishedAfter=None, + publishedBefore=None, + regionCode=None, + relatedToVideoId=None, + relevanceLanguage=None, + safeSearch=None, + topicId=None, + type=None, + videoCaption=None, + videoCategoryId=None, + videoDefinition=None, + videoDimension=None, + videoDuration=None, + videoEmbeddable=None, + videoLicense=None, + videoSyndicated=None, + videoType=None, +): """Query the YouTube API and get search results in a DataFrame. For each parameter you can supply a single or multiple value(s). Looping and merging results is handled automatically in case of multiple @@ -1130,16 +2085,29 @@ def serp_youtube(key, q=None, channelId=None, channelType=None, eventType=None, params = locals() supplied_params = {k: v for k, v in params.items() if params[k]} - type_vid_params = {'eventType', 'relatedToVideoId', 'videoCaption', - 'videoCategoryId', 'videoDefinition', 'videoDimension', - 'videoDuration', 'videoEmbeddable', 'videoLicense', - 'videoSyndicated', 'videoType', 'forMine', - 'forContentOwner'} - - if (supplied_params.get('type') != 'video' and - type_vid_params.intersection(set(supplied_params.keys()))): - raise ValueError('You need to set type="video" if you want to set' - ' any of the following:' + str(type_vid_params)) + type_vid_params = { + "eventType", + "relatedToVideoId", + "videoCaption", + "videoCategoryId", + "videoDefinition", + "videoDimension", + "videoDuration", + "videoEmbeddable", + "videoLicense", + "videoSyndicated", + "videoType", + "forMine", + "forContentOwner", + } + + if supplied_params.get("type") != "video" and type_vid_params.intersection( + set(supplied_params.keys()) + ): + raise ValueError( + 'You need to set type="video" if you want to set' + " any of the following:" + str(type_vid_params) + ) for p in supplied_params: if isinstance(supplied_params[p], (str, int)): @@ -1148,19 +2116,20 @@ def serp_youtube(key, q=None, channelId=None, channelType=None, eventType=None, for p in supplied_params: if p in SERP_YTUBE_VALID_VALS: if not set(supplied_params[p]).issubset(SERP_YTUBE_VALID_VALS[p]): - raise ValueError('Please make sure you provide a' - ' valid value for "{}", valid values:\n{}' - .format(p, - sorted([str(x) for x in - SERP_YTUBE_VALID_VALS[p]]))) + raise ValueError( + "Please make sure you provide a" + ' valid value for "{}", valid values:\n{}'.format( + p, sorted([str(x) for x in SERP_YTUBE_VALID_VALS[p]]) + ) + ) params_list = _dict_product(supplied_params) base_url = "https://www.googleapis.com/youtube/v3/search?part=snippet" responses = [] for param in params_list: - param_log = ', '.join([k + '=' + str(v) for k, v in param.items()]) - logging.info(msg='Requesting: ' + param_log) + param_log = ", ".join([k + "=" + str(v) for k, v in param.items()]) + logging.info(msg="Requesting: " + param_log) resp = requests.get(base_url, params=param) if resp.status_code >= 400: raise Exception(resp.json()) @@ -1168,61 +2137,78 @@ def serp_youtube(key, q=None, channelId=None, channelType=None, eventType=None, result_df = pd.DataFrame() for i, resp in enumerate(responses): - snippet_df = pd.DataFrame([x['snippet'] for x in resp.json()['items']]) - id_df = pd.DataFrame([x['id'] for x in resp.json()['items']]) - if 'channelId' in id_df: - id_df = id_df.drop('channelId', axis=1) + snippet_df = pd.DataFrame([x["snippet"] for x in resp.json()["items"]]) + id_df = pd.DataFrame([x["id"] for x in resp.json()["items"]]) + if "channelId" in id_df: + id_df = id_df.drop("channelId", axis=1) - if 'thumbnails' in snippet_df: - thumb_df = json_normalize(snippet_df['thumbnails']) + if "thumbnails" in snippet_df: + thumb_df = json_normalize(snippet_df["thumbnails"]) else: thumb_df = pd.DataFrame() - page_info = resp.json()['pageInfo'] - temp_df = pd.concat([snippet_df, id_df, thumb_df], - axis=1).assign(**page_info) - temp_df['rank'] = range(1, len(temp_df)+1) + page_info = resp.json()["pageInfo"] + temp_df = pd.concat([snippet_df, id_df, thumb_df], axis=1).assign(**page_info) + temp_df["rank"] = range(1, len(temp_df) + 1) if len(temp_df) == 0: - empty_df_cols = ['title', 'description', 'publishedAt', - 'channelTitle', 'kind', 'videoId', 'channelId'] - temp_df = temp_df.assign(q=[params_list[i]['q']]) + empty_df_cols = [ + "title", + "description", + "publishedAt", + "channelTitle", + "kind", + "videoId", + "channelId", + ] + temp_df = temp_df.assign(q=[params_list[i]["q"]]) temp_df = temp_df.assign(**dict.fromkeys(empty_df_cols)) temp_df = temp_df.assign(**page_info) - del params_list[i]['key'] + del params_list[i]["key"] temp_df = temp_df.assign(**params_list[i]) - temp_df['nextPageToken'] = resp.json().get('nextPageToken') - result_df = pd.concat([result_df, temp_df], sort=False, - ignore_index=True) - - result_df['queryTime'] = datetime.datetime.now(tz=datetime.timezone.utc) - result_df['queryTime'] = pd.to_datetime(result_df['queryTime']) - - specified_cols = ['queryTime', 'rank', 'title', 'description', - 'publishedAt', 'channelTitle', 'totalResults', - 'kind'] + temp_df["nextPageToken"] = resp.json().get("nextPageToken") + result_df = pd.concat([result_df, temp_df], sort=False, ignore_index=True) + + result_df["queryTime"] = datetime.datetime.now(tz=datetime.timezone.utc) + result_df["queryTime"] = pd.to_datetime(result_df["queryTime"]) + + specified_cols = [ + "queryTime", + "rank", + "title", + "description", + "publishedAt", + "channelTitle", + "totalResults", + "kind", + ] ordered_cols = list(params_list[i].keys()) + specified_cols non_ordered = result_df.columns.difference(set(ordered_cols)) final_df = result_df[ordered_cols + list(non_ordered)] - vid_ids = ','.join(final_df['videoId'].dropna()) + vid_ids = ",".join(final_df["videoId"].dropna()) if vid_ids: vid_details_df = youtube_video_details(vid_ids=vid_ids, key=key) - vid_details_df.columns = ['video.' + x for x in vid_details_df.columns] - final_df = pd.merge(final_df, vid_details_df, - how='left', left_on='videoId', right_on='video.id') + vid_details_df.columns = ["video." + x for x in vid_details_df.columns] + final_df = pd.merge( + final_df, vid_details_df, how="left", left_on="videoId", right_on="video.id" + ) - channel_ids = ','.join(final_df['channelId'].dropna()) + channel_ids = ",".join(final_df["channelId"].dropna()) if channel_ids: - channel_details_df = youtube_channel_details(channel_ids=channel_ids, - key=key) - channel_details_df.columns = ['channel.' + x for x in - channel_details_df.columns] - - final_df = pd.merge(final_df, channel_details_df, - how='left', left_on='channelId', - right_on='channel.id') - final_df = final_df.drop_duplicates(subset=['videoId']) + channel_details_df = youtube_channel_details(channel_ids=channel_ids, key=key) + channel_details_df.columns = [ + "channel." + x for x in channel_details_df.columns + ] + + final_df = pd.merge( + final_df, + channel_details_df, + how="left", + left_on="channelId", + right_on="channel.id", + ) + final_df = final_df.drop_duplicates(subset=["videoId"]) return final_df.reset_index(drop=True) @@ -1232,13 +2218,25 @@ def set_logging_level(level_or_name): 'NOTSET', 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'] """ - lvl_names_values = [0, 10, 20, 30, 40, 50, - 'NOTSET', 'DEBUG', 'INFO', - 'WARNING', 'ERROR', 'CRITICAL'] + lvl_names_values = [ + 0, + 10, + 20, + 30, + 40, + 50, + "NOTSET", + "DEBUG", + "INFO", + "WARNING", + "ERROR", + "CRITICAL", + ] if level_or_name not in lvl_names_values: - raise ValueError('Please make sure you supply' - ' a value from: {}'.format(lvl_names_values)) + raise ValueError( + "Please make sure you supply" " a value from: {}".format(lvl_names_values) + ) logging.getLogger().setLevel(level_or_name) -logging.getLogger().setLevel('INFO') +logging.getLogger().setLevel("INFO")