### Test the environment and create a folder to store results

In [1]:
import os, pathlib
from dotenv import load_dotenv
from tornado.gen import multi

# Make sure the results folder exists; an already-present folder is fine.
os.makedirs("Out", exist_ok=True)
print("Out/ ready.")

# Load the .env file, then report only whether API_KEY is set so the secret
# itself never appears in the cell output.
load_dotenv()
print("Env loaded. API_KEY present? ", bool(os.environ.get("API_KEY")))
print("Kernel OK.")

Out/ ready.
Env loaded. API_KEY present?  True
Kernel OK.


### Test API Key and Search Engine ID

In [2]:
# Read both credentials from the environment in one pass and stop early,
# with a hint, when either one is missing or empty.
API_KEY, CX_ID = (os.environ.get(name) for name in ("API_KEY", "CX_ID"))
assert all((API_KEY, CX_ID)), "Set API_KEY and CX_ID in your .env file"
print("OK, keys loaded.")

OK, keys loaded.


### Setup Queries and Pages

In [3]:
import json


# Settings read from the environment; each is a string, or None when unset.
PAGES = os.environ.get("PAGES")
FOLLOWER_MIN = os.environ.get("FOLLOWER_MIN")

# Paths to the JSON files under queries/. VIEWS and PATTERNS are not read by
# any of the cells visible here.
SITES = pathlib.Path("queries", "sites.json")
CITIES = pathlib.Path("queries", "cities.json")
VIEWS = pathlib.Path("queries", "views.json")
PATTERNS = pathlib.Path("queries", "patterns.json")
KEYWORDS = pathlib.Path("queries", "keywords.json")

def create_queries(
    site_group: str = "social_media",
    keyword_group: str = "nail",
    city_group: str = "top_major_cities",
) -> dict[str, list[str]]:
    """Build the search strings for every site in one site group.

    Reads the JSON files at SITES, CITIES and KEYWORDS and, for each site,
    pairs every keyword with every city as ``"<site> <keyword> <city>"``.
    Keywords vary slowest and cities fastest, so all cities for the first
    keyword come before the second keyword starts.

    :param site_group: key in the sites file listing the sites to query
    :param keyword_group: key in the keywords file listing the keywords
    :param city_group: key in the cities file listing the cities
    :return: mapping of site -> list of query strings for that site
    :raises KeyError: when a requested group is missing from its file
    """

    def _load(path):
        # JSON text is UTF-8; without an explicit encoding, open() falls back
        # to the locale default and can mis-decode non-ASCII names.
        with open(path, encoding="utf-8") as handle:
            return json.load(handle)

    sites = _load(SITES)[site_group]
    cities = _load(CITIES)[city_group]
    keywords = _load(KEYWORDS)[keyword_group]

    # The keyword/city product is identical for every site, so build it once.
    suffixes = [f"{keyword} {city}" for keyword in keywords for city in cities]

    # Change the group arguments to get different filter combos.
    return {site: [f"{site} {suffix}" for suffix in suffixes] for site in sites}

# Build the site -> query-list mapping once, using create_queries() defaults.
QUERY = create_queries()

In [4]:
# Show each site next to the list of queries generated for it.
for site, site_queries in QUERY.items():
    print(site, site_queries)

instagram.com ['instagram.com nail Tokyo', 'instagram.com nail Osaka', 'instagram.com nail Seoul', 'instagram.com nail Shanghai', 'instagram.com nail Hong Kong', 'instagram.com nail Singapore', 'instagram.com nail Bangkok', 'instagram.com nail New York', 'instagram.com nail Los Angeles', 'instagram.com nail Paris', 'instagram.com nail London', 'instagram.com nails Tokyo', 'instagram.com nails Osaka', 'instagram.com nails Seoul', 'instagram.com nails Shanghai', 'instagram.com nails Hong Kong', 'instagram.com nails Singapore', 'instagram.com nails Bangkok', 'instagram.com nails New York', 'instagram.com nails Los Angeles', 'instagram.com nails Paris', 'instagram.com nails London', 'instagram.com press-on nails Tokyo', 'instagram.com press-on nails Osaka', 'instagram.com press-on nails Seoul', 'instagram.com press-on nails Shanghai', 'instagram.com press-on nails Hong Kong', 'instagram.com press-on nails Singapore', 'instagram.com press-on nails Bangkok', 'instagram.com press-on nails New

### Parse URLs and gather data with patterns and queries

In [10]:
import re
import requests
from urllib.parse import urlparse


def customized_search(query: str, start: int = 1):
    """Send one request to the Custom Search endpoint and return its JSON body.

    :param query: search text sent as the ``q`` parameter
    :param start: value sent as the ``start`` parameter (defaults to 1)
    :return: the response body decoded from JSON
    :raises requests.HTTPError: raised by ``raise_for_status`` for an error reply
    """
    search_params = {"key": API_KEY, "cx": CX_ID, "q": query, "start": start}
    response = requests.get(
        "https://www.googleapis.com/customsearch/v1",
        params=search_params,
        timeout=30,
    )
    # Fail loudly on an HTTP error instead of parsing an error payload.
    response.raise_for_status()
    return response.json()



def gather_raw_data(query: dict[str, list[str]], search=None) -> dict[str, list[dict]]:
    """Run every query of every site and collect the returned result items.

    Each query string is searched on its own; previously the whole list was
    handed to a single search call as one ``q`` value.

    :param query: mapping of site -> list of query strings (a bare string is
        treated as a one-element list)
    :param search: optional callable ``search(query_text) -> dict`` used in
        place of ``customized_search``, e.g. to request another result page
        or to stub out the network
    :return: mapping of site -> result items from all of that site's queries,
        in query order; a response without an ``"items"`` key adds nothing
    """
    run_search = customized_search if search is None else search

    raw_data = {}
    for site, site_queries in query.items():
        if isinstance(site_queries, str):
            # Iterating a bare string would search character by character.
            site_queries = [site_queries]
        items = []
        for query_text in site_queries:
            items.extend(run_search(query_text).get("items", []))
        raw_data[site] = items
    return raw_data


# Smoke test: run one query for a single site and print the raw result.
print(
    gather_raw_data(
        {"instagram.com": ["instagram.com nail Tokyo followers"]}
    )
)


def _to_int(num_str: str, text: str) -> int:
    s = num_str.replace(',', '')
    try:
        val = float(s)
    except ValueError:
        return 0
    # suffixes in same token or surrounding text
    t = text.lower()
    if re.search(r'\b(\d[\d,]*\.?\d*)\s*[k]\b', text, re.I) and num_str in text:
        return int(val * 1_000)
    if re.search(r'\b(\d[\d,]*\.?\d*)\s*[m]\b', text, re.I) and num_str in text:
        return int(val * 1_000_000)
    # Japanese units
    if '万' in text:
        return int(val * 10_000)
    if '千' in text:
        return int(val * 1_000)
    return int(val)

def parse_ig_snippet(snippet: str) -> dict[str, list[str]]:
    """Decode ``snippet`` as JSON text and return the resulting object.

    :param snippet: text to decode
    :return: whatever ``json.loads`` produces for the text
    :raises json.JSONDecodeError: when the text is not valid JSON
    """
    # NOTE(review): the search-result snippet shown in this notebook's output
    # is plain text ("16K followers · ..."), which json.loads rejects.
    # Confirm what input this function is meant to receive.
    decoded = json.loads(snippet)
    return decoded

def parse_ig_url(url: str) -> tuple[str, str]:
    """Classify an Instagram URL and return its cleaned web address.

    :param url: link to inspect; a link without a scheme is read as https
    :return: ``(address, kind)``. ``kind`` is ``"a"`` for an account page,
        with address ``<scheme>://<host>/<username>/``. It is ``"p"`` for a
        post, with address ``<scheme>://<host>/p/<shortcode>/``. Anything
        else, including a URL with no path, gives ``("none", "none")``.
        Query strings and fragments are dropped from the address.
    """
    if "://" not in url:
        # Without a scheme urlparse puts the host into the path.
        url = "https://" + url.lstrip("/")
    p = urlparse(url)

    # The path starts with "/", so a plain split leaves an empty first
    # element; drop empty pieces so the first segment is the real one.
    segments = [segment for segment in p.path.split("/") if segment]
    if not segments or not p.netloc:
        return "none", "none"

    base = f"{p.scheme}://{p.netloc}"
    head = segments[0].lower()

    # this set might not contain all possible tags, so the result might contain some incorrect links
    if head not in {"p", "reels", "stories", "explore", "tv", "channels", "direct"}:
        return f"{base}/{segments[0]}/", "a"
    if head == "p":
        return base + "/" + "/".join(segments[:2]) + "/", "p"
    return "none", "none"

# NOTE(review): the bare string below holds an unfinished draft of parse_data;
# the quotes keep it from running. As the last expression of the cell it is
# echoed as the cell result. The draft calls parse_ig_username, which none of
# the visible cells define, and stops mid-statement at a dangling `if`.
"""
def parse_data(data: dict[str, list[str]]) -> dict[str, list[str]]:
    items = gather_raw_data(QUERY).get("items", [])
    rows = list()

    for it in items:
        link = it.get("link", "")
        if "instagram.com" not in link: continue
        uname = parse_ig_username(link)
        if not uname: continue

        snippet = (it or {}).get("snippet", "").strip()
        if
"""

{'instagram.com': [{'kind': 'customsearch#result', 'title': 'Tokyo Nails (@tokyonails.chicago) · Chicago, IL', 'htmlTitle': '<b>Tokyo Nails</b> (@<b>tokyonails</b>.chicago) · Chicago, IL', 'link': 'https://www.instagram.com/tokyonails.chicago/?hl=en', 'displayLink': 'www.instagram.com', 'snippet': '16K followers · 1.1K+ posts · Editorial & Design Salon Specializing in Japanese Gel/GelX Hand Painted Art By Appointment Only Acrylic #japanesegel 215 N\xa0...', 'htmlSnippet': '16K <b>followers</b> · 1.1K+ posts · Editorial &amp; Design Salon Specializing in Japanese Gel/GelX Hand Painted Art By Appointment Only Acrylic #japanesegel 215 N&nbsp;...', 'formattedUrl': 'https://www.instagram.com/tokyonails.chicago/?hl=en', 'htmlFormattedUrl': 'https://www.<b>instagram.com</b>/<b>tokyonail</b>s.chicago/?hl=en', 'pagemap': {'cse_thumbnail': [{'src': 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRUfJ8TLL7IqTbcNWBPQYj59UpC6Xv9VFTf_NK3cMP45HVRd9ZxWXNRXcCp&s', 'width': '194', 'height': '259'

'\ndef parse_data(data: dict[str, list[str]]) -> dict[str, list[str]]:\n    items = gather_raw_data(QUERY).get("items", [])\n    rows = list()\n\n    for it in items:\n        link = it.get("link", "")\n        if "instagram.com" not in link: continue\n        uname = parse_ig_username(link)\n        if not uname: continue\n\n        snippet = (it or {}).get("snippet", "").strip()\n        if\n'

In [None]:
# Scratch cell with no execution count yet; it only prints a fixed greeting.
print("Hello World!")
