### Test environment, Create a folder to store result

In [7]:
import os, pathlib
from dotenv import load_dotenv

pathlib.Path("Out").mkdir(exist_ok=True)
print("Out/ ready.")

load_dotenv()
print("Env loaded. API_KEY present? ", bool(os.getenv("API_KEY")))
print("Kernel OK.")

Out/ ready.
Env loaded. API_KEY present?  True
Kernel OK.


### Test API Key and Search Engine ID

In [2]:
API_KEY = os.getenv("API_KEY")
CX_ID = os.getenv("CX_ID")
GOOGLE_API = 'https://www.googleapis.com/customsearch/v1'
assert API_KEY and CX_ID, "Set API_KEY and CX_ID in your .env file"
print("OK, keys loaded.")

OK, keys loaded.


### Setup Queries and Pages

In [3]:
import json


PAGES = os.getenv("PAGES")
FOLLOWER_MIN=os.getenv("FOLLOWER_MIN")
SITES = pathlib.Path("queries") / "sites.json"
CITIES = pathlib.Path("queries") / "cities.json"
VIEWS = pathlib.Path("queries") / "views.json"
PATTERNS = pathlib.Path("queries") / "patterns.json"
KEYWORDS = pathlib.Path("queries") / "keywords.json"

def create_queries() -> dict[str, list[str]]:
    with open(SITES) as sites_file:
        sites = json.load(sites_file)
    with open(CITIES) as cities_file:
        cities = json.load(cities_file)
    with open(KEYWORDS) as keywords_file:
        keywords = json.load(keywords_file)

    q = dict()
    for site in sites["social_media"]:
        # modify this part to get different filter combos
        q[site] = [f"{site} {keyword} {city}" for keyword in keywords["nail"] for city in cities["top_major_cities"]]

    return q

QUERY = create_queries()

In [4]:
for i in QUERY:
    print(i, QUERY[i])

instagram.com ['instagram.com nail Tokyo', 'instagram.com nail Osaka', 'instagram.com nail Seoul', 'instagram.com nail Shanghai', 'instagram.com nail Hong Kong', 'instagram.com nail Singapore', 'instagram.com nail Bangkok', 'instagram.com nail New York', 'instagram.com nail Los Angeles', 'instagram.com nail Paris', 'instagram.com nail London', 'instagram.com nails Tokyo', 'instagram.com nails Osaka', 'instagram.com nails Seoul', 'instagram.com nails Shanghai', 'instagram.com nails Hong Kong', 'instagram.com nails Singapore', 'instagram.com nails Bangkok', 'instagram.com nails New York', 'instagram.com nails Los Angeles', 'instagram.com nails Paris', 'instagram.com nails London', 'instagram.com press-on nails Tokyo', 'instagram.com press-on nails Osaka', 'instagram.com press-on nails Seoul', 'instagram.com press-on nails Shanghai', 'instagram.com press-on nails Hong Kong', 'instagram.com press-on nails Singapore', 'instagram.com press-on nails Bangkok', 'instagram.com press-on nails New

### Parse url and gather data with patterns and queries

In [5]:
import re
import requests
from urllib.parse import urlparse

def customized_search(query: str, start: int = 1):
    r = requests.get(GOOGLE_API, params={
        "key": API_KEY,
        "cx": CX_ID,
        "q": query,
        "start": start
    }, timeout=30)
    r.raise_for_status()
    return r.json()

def gather_raw_data(query: dict[str, list[str]]) -> dict[str, list[str]]:
    raw_data = dict()
    for site, fil in query.items():
        raw_data[site] = customized_search(fil).get("items", [])
    return raw_data


print(gather_raw_data({'instagram.com': ['instagram.com nail Tokyo followers']}))

{'instagram.com': [{'kind': 'customsearch#result', 'title': 'NAILS BY MEI⁖ ˖ (@nailsbymei) • Instagram photos and videos', 'htmlTitle': '<b>NAILS</b> BY MEI⁖   ˖ (@<b>nailsbymei</b>) • <b>Instagram</b> photos and videos', 'link': 'https://www.instagram.com/nailsbymei/?hl=en', 'displayLink': 'www.instagram.com', 'snippet': '348K followers · 2.4K+ following · 8229 posts · @nailsbymei\u200e: “\u200e ... TOKYO ✨. Follow. suki_nailz. Nail salon East london. Follow. roho7nailz. Follow.', 'htmlSnippet': '348K <b>followers</b> · 2.4K+ following · 8229 posts · @nailsbymei\u200e: “\u200e ... <b>TOKYO</b> ✨. Follow. suki_nailz. <b>Nail</b> salon East london. Follow. roho7nailz. Follow.', 'formattedUrl': 'https://www.instagram.com/nailsbymei/?hl=en', 'htmlFormattedUrl': 'https://www.<b>instagram.com</b>/<b>nail</b>sbymei/?hl=en', 'pagemap': {'cse_thumbnail': [{'src': 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQ9Dw7m8_hEcZaD8wFEkItN06W_E164vfhoFi58YLAKMY63PFUqNHrBSmYU&s', 'width': '201'

In [8]:
def parse_ig_url(url: str) -> tuple[str, str]:
    """
    Given a URL, return its username.
    :param url:
    :return: account web address
    """
    p = urlparse(url)
    url_segments = p.path.split("/")
    if url_segments[0].lower() not in {"p", "reels", "stories", "explore", "tv", "channels", "direct"}:
        return url, "a"
    elif url_segments[0].lower() == "p":
        return url, "p"
    return "none", "none"


def parse_nums(m):
    if not m:
        return 0

    num_str, suffix = m.groups()
    num = float(num_str.replace(',', ''))

    mult = {
        None: 1,
        '': 1,
        'K': 1_000,
        'M': 1_000_000,
        'B': 1_000_000_000
    }
    return int(num * mult.get(suffix.upper() if suffix else '', 1))


def parse_ig_followers(raw):
    pattern = '(\\d[\\d,.]*)\\s*(K|M|B)?\\s+followers'
    m = re.search(pattern, raw, re.I)
    return parse_nums(m)


def parse_ig_posts(raw):
    pattern = '(\\d[\\d,.]*)\\s*(K|M|B)?\\s+posts'
    m = re.search(pattern, raw, re.I)
    return parse_nums(m)

def parse_ig_following(raw):
    pattern = '(\\d[\\d,.]*)\\s*(K|M|B)?\\s+following'
    m = re.search(pattern, raw, re.I)
    return parse_nums(m)

def trim_after_posts(raw: str) -> str:
    if not raw:
        return ""

    # Normalize common invisible spaces
    s = raw.replace("\xa0", " ")

    # Find 'posts' (or 'post') as a whole word
    m = re.search(r'\bposts?\b', s, flags=re.IGNORECASE)
    if not m:
        return s.strip()

    # Everything after the word 'posts'
    tail = s[m.end():]

    # Strip leading punctuation/delimiters often seen after 'Posts'
    # (space, dots, hyphen, en/em dash, colon, bullets, pipes)
    tail = re.sub(r'^[\s\.\-–—:•·|]+', '', tail)

    return tail.strip()

def clean_instagram_title(title: str) -> str:
    """
    Strip trailing '• Instagram …' (any suffix starting with a separator + 'Instagram')
    from Google CSE titles, even if truncated with … or '...'.
    """
    if not title:
        return ""

    t = title.replace("\xa0", " ").strip()
    t = re.sub(r"\s+", " ", t)  # normalize whitespace

    # Remove: [separator] Instagram ... [to end]
    # separators seen: • · - – — | (&bull; sometimes appears)
    t = re.sub(
        r'\s*(?:[•·\-\–\—|]|&bull;)\s*Instagram\b.*$',
        '',
        t,
        flags=re.IGNORECASE
    )

    # Also handle rare '(Instagram)' suffixes
    t = re.sub(r'\s*\(\s*Instagram\s*\)\s*$', '', t, flags=re.IGNORECASE)

    # Trim dangling punctuation/spaces
    t = t.strip(' -–—|·•').strip()
    return t


FOLLOWER_THRESHOLD = 2000


def parse_and_filter_ig_data(raw_data):
    result = dict()

    for value in raw_data['instagram.com']:
        d = dict()
        raw_title = clean_instagram_title(value['title'])

        raw_url = value['link']
        raw_snippet = value['snippet']

        parsed_url = parse_ig_url(raw_url)
        if parsed_url[1] == 'p':
            # result is a post, temporarily skip this result
            continue
        elif parsed_url[1] == 'a':
            # result is an account
            follower = parse_ig_followers(raw_snippet)
            if follower < FOLLOWER_THRESHOLD:
                # we don't care followers < 2k accounts
                continue
            else:
                following = parse_ig_following(raw_snippet)
                post = parse_ig_posts(raw_snippet)
                description = trim_after_posts(raw_snippet)
        else:
            # dont handle this case for now
            continue
        d['type'] = 'a'
        d['url'] = parsed_url[0]
        d['follower'] = follower
        d['following'] = following
        d['post'] = post
        d['description'] = description
        result[raw_title] = d
    return result

data = parse_and_filter_ig_data(gather_raw_data({'instagram.com': ['instagram.com nail Tokyo followers']}))
print(data)

{'NAILS BY MEI⁖ ˖ (@nailsbymei)': {'type': 'a', 'url': 'https://www.instagram.com/nailsbymei/?hl=en', 'follower': 348000, 'following': 0, 'post': 8229, 'description': '@nailsbymei\u200e: “\u200e ... TOKYO ✨. Follow. suki_nailz. Nail salon East london. Follow. roho7nailz. Follow.'}, 'Tokyo Nails (@tokyonails.chicago) · Chicago, IL': {'type': 'a', 'url': 'https://www.instagram.com/tokyonails.chicago/?hl=en', 'follower': 16000, 'following': 0, 'post': 0, 'description': 'Editorial & Design Salon Specializing in Japanese Gel/GelX Hand Painted Art By Appointment Only Acrylic #japanesegel 215 N ...'}, 'SIMONE BILES (@simonebiles)': {'type': 'a', 'url': 'https://www.instagram.com/simonebiles/?hl=en', 'follower': 12000000, 'following': 291, 'post': 2987, 'description': "SIMONE BILES (@simonebiles) on ... nails's profile picture. nails. madrid's profile picture. madrid. bahamas's ..."}, 'Khloé Kardashian (@khloekardashian)': {'type': 'a', 'url': 'https://www.instagram.com/khloekardashian/?hl=en'

In [9]:
import pandas as pd


df = pd.DataFrame.from_dict(data, orient="index")
print(df)

                                                type  \
NAILS BY MEI⁖ ˖ (@nailsbymei)                      a   
Tokyo Nails (@tokyonails.chicago) · Chicago, IL    a   
SIMONE BILES (@simonebiles)                        a   
Khloé Kardashian (@khloekardashian)                a   
SUCRE (@nail.sucre)                                a   
Sam is Home | Hong Kong (@samishome)               a   
Úrsula Corberó (@ursulolita)                       a   
Rina Sawayama (@rinasonline)                       a   
Kim Kardashian (@kimkardashian)                    a   
Ilona Maher (@ilonamaher)                          a   

                                                                                               url  \
NAILS BY MEI⁖ ˖ (@nailsbymei)                          https://www.instagram.com/nailsbymei/?hl=en   
Tokyo Nails (@tokyonails.chicago) · Chicago, IL  https://www.instagram.com/tokyonails.chicago/?...   
SIMONE BILES (@simonebiles)                           https://www.instagram.c

In [10]:
import csv

def save_profiles_to_csv(data: dict, path: str = "instagram_profiles.csv"):
    """
    Save your {title: {...fields...}} dict to a CSV.
    """
    # Desired column order (others will be appended if present)
    base_fields = ["title", "type", "url", "follower", "following", "post", "description"]

    # Build rows and collect any extra keys that appear
    rows, extra_keys = [], set()
    for title, info in data.items():
        row = {"title": title}
        row.update(info or {})
        rows.append(row)
        extra_keys.update(k for k in row.keys() if k not in base_fields)

    # Final field order
    fieldnames = base_fields + sorted(extra_keys)

    with open(path, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=fieldnames)
        w.writeheader()
        for r in rows:
            # ensure all fields exist (avoid KeyError)
            for k in fieldnames:
                r.setdefault(k, "")
            w.writerow(r)

# Example:
save_profiles_to_csv(data, "profiles.csv")

In [None]:
print("Hello World!")
