### Test the environment and create a folder to store results

In [28]:
import os, pathlib
from dotenv import load_dotenv


# Make sure the results folder exists; re-running the cell is harmless.
os.makedirs("Out", exist_ok=True)
print("Out/ ready.")

# load_dotenv() comes from python-dotenv; presumably it reads a local .env
# file into the process environment (confirm the file location it searches).
load_dotenv()
# Only report whether the key is set -- never print the key itself.
print("Env loaded. API_KEY present? ", bool(os.getenv("API_KEY")))
print("Kernel OK.")

Out/ ready.
Env loaded. API_KEY present?  True
Kernel OK.


### Test API Key and Search Engine ID

In [29]:
# Credentials for the search API are read from the environment.
API_KEY, CX_ID = os.getenv("API_KEY"), os.getenv("CX_ID")
# Fail fast here rather than on the first HTTP request.
assert API_KEY and CX_ID, "Set API_KEY and CX_ID in your .env file"
print("OK, keys loaded.")

OK, keys loaded.


### Set Up Queries and Pages

In [26]:
import json


# Tunables read from the environment. os.getenv returns a str, or None when the
# variable is unset, so convert before using these in numeric comparisons.
PAGES, FOLLOWER_MIN = os.getenv("PAGES"), os.getenv("FOLLOWER_MIN")

# JSON inputs used to build search queries; all live in the queries/ folder.
SITES = pathlib.Path("queries", "sites.json")
CITIES = pathlib.Path("queries", "cities.json")
VIEWS = pathlib.Path("queries", "views.json")
PATTERNS = pathlib.Path("queries", "patterns.json")
KEYWORDS = pathlib.Path("queries", "keywords.json")

def create_queries(keyword_group="nail", city_group="top_major_cities",
                   sites_path=None, cities_path=None, keywords_path=None):
    """Build the search strings for every site / keyword / city combination.

    Pass different group names to get different filter combos instead of
    editing the function body.

    Args:
        keyword_group: Key in the keywords JSON whose list of keywords is used.
        city_group: Key in the cities JSON whose list of cities is used.
        sites_path: JSON file of site filters; defaults to the module-level SITES.
        cities_path: JSON file of city groups; defaults to the module-level CITIES.
        keywords_path: JSON file of keyword groups; defaults to the
            module-level KEYWORDS.

    Returns:
        dict mapping each site entry to a list of "{site} {keyword} {city}"
        strings, keywords varying slowest and cities fastest.

    Raises:
        OSError: if one of the JSON files cannot be opened.
        KeyError: if keyword_group or city_group is missing from its file.
    """
    def _load_json(path):
        # Read as UTF-8 explicitly: the platform default encoding (e.g. cp1252
        # on Windows) cannot decode non-ASCII city names or keywords.
        with open(path, encoding="utf-8") as handle:
            return json.load(handle)

    # Defaults are resolved at call time so the module constants stay the
    # single source of truth for the file locations.
    sites = _load_json(SITES if sites_path is None else sites_path)
    cities = _load_json(CITIES if cities_path is None else cities_path)
    keywords = _load_json(KEYWORDS if keywords_path is None else keywords_path)

    return {
        site: [
            f"{site} {keyword} {city}"
            for keyword in keywords[keyword_group]
            for city in cities[city_group]
        ]
        for site in sites
    }

# Mapping of each site entry -> list of query strings, built once when this
# cell runs. Note this is a dict, not a single query string.
QUERY = create_queries()

In [27]:
import re
from urllib.parse import urlparse


# First path segments that denote site sections rather than accounts.
_NON_PROFILE_SEGMENTS = frozenset({
    "p", "reel", "reels", "stories", "explore", "tv",
    "channels", "direct", "accounts",
})


def username_from_url(url: str):
    """
    Extract the account name from a profile-style URL.

    Only the first path segment is inspected: it must not be a reserved
    section (posts, reels, stories, ...) and must look like a username.
    The host is not checked here; callers filter on the domain themselves.

    Args:
        url: Link to inspect, e.g. "https://www.instagram.com/nail.sucre/".
            Empty or None input yields None instead of raising.

    Returns:
        The first path segment when it consists solely of letters, digits,
        "." and "_"; otherwise None.
    """
    if not url:
        return None

    segments = [seg for seg in urlparse(url).path.split("/") if seg]
    if not segments:
        return None

    candidate = segments[0]
    if candidate.lower() in _NON_PROFILE_SEGMENTS:
        return None
    return candidate if re.fullmatch(r"[A-Za-z0-9._]+", candidate) else None


# Handles: "12,345 followers", "2.1k followers", "3m followers"
# and JP formats like "フォロワー 1.2万"
# Every pattern captures the number in group 1 and an optional unit in group 2.
FOLLOWER_PATTERNS = [
    r"([\d.,]+)\s*(k|m|b)?\s+followers",
    r"followers:\s*([\d.,]+)\s*(k|m|b)?",
    r"フォロワー\s*([\d.,]+)\s*(万|千)?",
]

# Multiplier for each unit suffix the patterns can capture ("" = no suffix).
_UNIT_MULTIPLIERS = {
    "": 1,
    "k": 1_000,
    "千": 1_000,
    "万": 10_000,
    "m": 1_000_000,
    "b": 1_000_000_000,
}


def _to_count(number_text, unit):
    """Turn a captured number/unit pair into an int, or None if not numeric."""
    try:
        # Commas are treated as thousands separators and dropped.
        value = float(number_text.replace(",", ""))
    except ValueError:
        # The number class also matches bare punctuation such as "," or "1.2.3".
        return None
    return int(value * _UNIT_MULTIPLIERS.get(unit, 1))


def parse_followers(snippet: str):
    """Pull a follower count out of a search-result snippet.

    Args:
        snippet: Free text such as "94K followers · 22 following"; None and
            empty strings are accepted.

    Returns:
        The follower count as an int, or None when nothing in the text parses
        as a count. Matching is case-insensitive. FOLLOWER_PATTERNS are tried
        in order first; if none yields a number, a loose fallback accepts a
        bare "<number><k|m|b>" token without the word "followers".
    """
    text = (snippet or "").lower()

    for pat in FOLLOWER_PATTERNS:
        m = re.search(pat, text)
        if not m:
            continue
        count = _to_count(m.group(1), m.group(2) or "")
        if count is not None:
            return count

    # Loose fallback like "12k" without the word "followers". The number class
    # can match a lone comma or dot (e.g. "nails, k-beauty"), so unparsable
    # candidates are skipped instead of raising ValueError.
    for m2 in re.finditer(r"\b([\d.,]+)\s*(k|m|b)\b", text):
        count = _to_count(m2.group(1), m2.group(2))
        if count is not None:
            return count
    return None

In [24]:
import requests


def google_page(query: str, start: int = 1):
    """Fetch one page of Custom Search results and return the decoded JSON.

    Args:
        query: Search string sent as the "q" parameter.
        start: Value sent as the "start" parameter (defaults to 1).

    Returns:
        The parsed JSON body of the response.

    Raises:
        requests.HTTPError: via raise_for_status() on a 4xx/5xx response.
            NOTE(review): the request URL carries the API key as a query
            parameter; check tracebacks before sharing notebook output.
    """
    search_params = {"key": API_KEY, "cx": CX_ID, "q": query, "start": start}
    response = requests.get(
        "https://www.googleapis.com/customsearch/v1",
        params=search_params,
        timeout=30,
    )
    response.raise_for_status()
    return response.json()

# QUERY is a dict of site -> list of query strings, while google_page expects
# one search string, so smoke-test the API with the first generated query.
sample_query = next((q for queries in QUERY.values() for q in queries), None)
assert sample_query, "No queries were generated; check the JSON files in queries/"
data = google_page(sample_query, start=1)
len(data.get("items", []) or [])

10

In [25]:
# Convert the raw search hits into one record per profile link.
items = data.get("items", []) or []
rows = []

for it in items:
    link = it.get("link", "")
    # Only links that mention instagram.com are considered at all.
    if "instagram.com" in link:
        uname = username_from_url(link)
        # Post/reel/story links yield no username and are dropped.
        if uname:
            snippet = (it.get("snippet") or "").strip()
            followers = parse_followers(snippet)
            rows.append(dict(
                username=uname,
                followers=followers,  # None when the snippet has no count
                bio=snippet,
                profile_url=f"https://www.instagram.com/{uname}/",
            ))

rows[:]

test p from urlparse: ('https', 'www.instagram.com', '/nail.sucre/', '', '', '')
test p from urlparse: ('https', 'www.instagram.com', '/hanna_nailwizard/', '', '', '')
test p from urlparse: ('https', 'www.instagram.com', '/reina.tokyonailart/', '', '', '')
test p from urlparse: ('https', 'www.instagram.com', '/chirin._chan/', '', '', '')
test p from urlparse: ('https', 'www.instagram.com', '/topcoat.tokyo/', '', '', '')
test p from urlparse: ('https', 'www.instagram.com', '/japanesenails_tokyo/', '', '', '')
test p from urlparse: ('https', 'www.instagram.com', '/tokyonails.chicago/', '', '', '')
test p from urlparse: ('https', 'www.instagram.com', '/p/DFHCobhzHso/', '', '', '')
test p from urlparse: ('https', 'www.instagram.com', '/p/B9inKeen0bp/', '', '', '')
test p from urlparse: ('https', 'www.instagram.com', '/lacielo.jp/', '', '', '')


[{'username': 'nail.sucre',
  'followers': 94000,
  'bio': '94K followers · 22 following · 10012 posts · @nail.sucre: “新宿区新宿3丁目3-9新宿伍名館204 TOKYO.SHINJUKU @nail.sucreamer @sucre.newyork sister store No\xa0...',
  'profile_url': 'https://www.instagram.com/nail.sucre/'},
 {'username': 'hanna_nailwizard',
  'followers': None,
  'bio': 'ACCEPTING AUG/SEP BOOKING LA BASED Currently in TOKYO Setagaya English/Jpnse speaking NailTech DM for appointment #englishspeakingnailsalon. Follow.',
  'profile_url': 'https://www.instagram.com/hanna_nailwizard/'},
 {'username': 'reina.tokyonailart',
  'followers': None,
  'bio': "Japanese nail tech ߉ #SomervilleNJ. Life's too short for boring nails.✶ Nail reservation,collab ▶︎DM me Founder of @hitomebore.tokyo · reinatokyonailart.",
  'profile_url': 'https://www.instagram.com/reina.tokyonailart/'},
 {'username': 'chirin._chan',
  'followers': None,
  'bio': "I'm Chirin, a nail artist at Hakusan Station, I'm happy to create beautiful, customized nail art fo

In [None]:
# Scratch cell: prints a fixed greeting; not part of the scraping flow.
print("Hello World!")
