In [1]:
import json
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time
import re
from typing import Dict, Optional

In [2]:
def json_to_df(json_file_path):
    # Read JSON file
    with open(json_file_path, 'r') as file:
        data = json.load(file)
    
    # Create a list of dictionaries with the desired columns
    transformed_data = []
    for name, info in data.items():
        transformed_data.append({
            'NAME': name,
            'INS': info['instagram'] if info['instagram'] != '**null**' else None
        })
    
    # Convert to DataFrame
    df = pd.DataFrame(transformed_data)
    
    return df

# Example usage
# file_path = 'data.json'
# result_df = json_to_df(file_path)
# print(result_df)

In [3]:
with open('artist_instagram_id.json', 'r') as file:
        data = json.load(file)

In [4]:
# Create a list of dictionaries with the desired columns
transformed_data = []
for name, info in data.items():
    transformed_data.append({
        'NAME': name,
        'INS': info['instagram'] if info['instagram'] != '**null**' else None
    })

# Convert to DataFrame
df = pd.DataFrame(transformed_data)

In [5]:
df

Unnamed: 0,NAME,INS
0,Tony Touch,
1,Martha Redbone,
2,Jonny Pierce,https://www.instagram.com/jonnypierce
3,Quarter Water,https://www.instagram.com/quarterwatermusic
4,Alexander 23,https://www.instagram.com/alexander23
...,...,...
12713,Desert Dwellers,https://www.instagram.com/desert_dwellers_music
12714,Nivrana,
12715,Sir Please,https://www.instagram.com/sirpleaseband
12716,Jason Crabb,https://www.instagram.com/jasoncrabbmusic


In [6]:
(df['INS'].isna().sum() / len(df)) * 100

28.79383550872779

In [7]:
df['INS'].isna().sum()

3662

In [8]:
df[~df['INS'].isna()]

Unnamed: 0,NAME,INS
2,Jonny Pierce,https://www.instagram.com/jonnypierce
3,Quarter Water,https://www.instagram.com/quarterwatermusic
4,Alexander 23,https://www.instagram.com/alexander23
5,Chris Schweizer,https://www.instagram.com/schweizerchris
6,Tonstartssbandht,https://www.instagram.com/tonstartssbandht
...,...,...
12712,Jake Wesley Rogers,https://www.instagram.com/jakewesleyrogers
12713,Desert Dwellers,https://www.instagram.com/desert_dwellers_music
12715,Sir Please,https://www.instagram.com/sirpleaseband
12716,Jason Crabb,https://www.instagram.com/jasoncrabbmusic


In [9]:
df.to_csv('artist_instagram_id.csv', index=False)

In [10]:
df1 = df[~df['INS'].isna()]

In [11]:
df1 = df1.rename(columns={"INS": "URL"})

In [12]:
df1

Unnamed: 0,NAME,URL
2,Jonny Pierce,https://www.instagram.com/jonnypierce
3,Quarter Water,https://www.instagram.com/quarterwatermusic
4,Alexander 23,https://www.instagram.com/alexander23
5,Chris Schweizer,https://www.instagram.com/schweizerchris
6,Tonstartssbandht,https://www.instagram.com/tonstartssbandht
...,...,...
12712,Jake Wesley Rogers,https://www.instagram.com/jakewesleyrogers
12713,Desert Dwellers,https://www.instagram.com/desert_dwellers_music
12715,Sir Please,https://www.instagram.com/sirpleaseband
12716,Jason Crabb,https://www.instagram.com/jasoncrabbmusic


In [13]:
def setup_driver():
    """Setup Chrome driver with appropriate options"""
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--disable-notifications')
    chrome_options.add_argument('--disable-infobars')
    chrome_options.add_argument('--disable-extensions')
    chrome_options.add_argument('--log-level=3')
    return webdriver.Chrome(options=chrome_options)

def parse_follower_count(text: str) -> Optional[int]:
    """Convert Instagram's follower count format to integer"""
    if not text:
        return None
    
    text = text.lower().replace(',', '')
    if 'k' in text:
        return int(float(text.replace('k', '')) * 1000)
    elif 'm' in text:
        return int(float(text.replace('m', '')) * 1000000)
    elif 'b' in text:
        return int(float(text.replace('b', '')) * 1000000000)
    else:
        try:
            return int(text)
        except ValueError:
            return None

def get_instagram_followers(driver, url: str) -> Optional[int]:
    """Get follower count for an Instagram profile using Selenium"""
    try:
        username = url.strip('/').split('/')[-1]
        
        driver.get(url)
        wait = WebDriverWait(driver, 10)
        meta_section = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "meta[property='og:description']"))
        )
        
        content = meta_section.get_attribute('content')
        match = re.search(r'([\d,.]+[KMB]?) Followers', content, re.IGNORECASE)
        
        if match:
            follower_count = match.group(1)
            return parse_follower_count(follower_count)
        
        return None
            
    except Exception:
        return None

def process_dataframe_urls(df: pd.DataFrame) -> pd.DataFrame:
    """Process URLs from DataFrame and add follower counts"""
    driver = setup_driver()
    
    try:
        follower_counts = []
        
        # Process each URL and show progress
        for index, row in df.iterrows():
            time.sleep(3)  # Delay to avoid rate limiting
            url = row['URL']
            name = row['NAME']
            
            followers = get_instagram_followers(driver, url)
            follower_counts.append(followers)
            
            # Print progress message
            if followers is not None:
                print(f"Row {index + 1}: {name} processed successfully ({followers:,} followers)")
            else:
                print(f"Row {index + 1}: {name} processed but no follower count found")
        
        df['INS_COUNT'] = follower_counts
        
    finally:
        driver.quit()
    
    return df

In [14]:
df1['INS_COUNT'] = None

In [15]:
df1

Unnamed: 0,NAME,URL,INS_COUNT
2,Jonny Pierce,https://www.instagram.com/jonnypierce,
3,Quarter Water,https://www.instagram.com/quarterwatermusic,
4,Alexander 23,https://www.instagram.com/alexander23,
5,Chris Schweizer,https://www.instagram.com/schweizerchris,
6,Tonstartssbandht,https://www.instagram.com/tonstartssbandht,
...,...,...,...
12712,Jake Wesley Rogers,https://www.instagram.com/jakewesleyrogers,
12713,Desert Dwellers,https://www.instagram.com/desert_dwellers_music,
12715,Sir Please,https://www.instagram.com/sirpleaseband,
12716,Jason Crabb,https://www.instagram.com/jasoncrabbmusic,


In [16]:
df1.loc[11264]

NAME                                           Aidonia
URL          https://www.instagram.com/aidonia4thgenna
INS_COUNT                                         None
Name: 11264, dtype: object

In [17]:
df_subset = df1.loc[11264:].copy()

In [18]:
df_subset

Unnamed: 0,NAME,URL,INS_COUNT
11264,Aidonia,https://www.instagram.com/aidonia4thgenna,
11265,Eliminate,https://www.instagram.com/eliminatemusic,
11266,Akiko Yano,https://www.instagram.com/akikoyano_staff,
11267,American Crush,https://www.instagram.com/americancrush_official,
11269,Poppy,https://www.instagram.com/impoppy,
...,...,...,...
12712,Jake Wesley Rogers,https://www.instagram.com/jakewesleyrogers,
12713,Desert Dwellers,https://www.instagram.com/desert_dwellers_music,
12715,Sir Please,https://www.instagram.com/sirpleaseband,
12716,Jason Crabb,https://www.instagram.com/jasoncrabbmusic,


In [20]:
process_dataframe_urls(df_subset.head(1))

Row 11265: Aidonia processed successfully (829,000 followers)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['INS_COUNT'] = follower_counts


Unnamed: 0,NAME,URL,INS_COUNT
11264,Aidonia,https://www.instagram.com/aidonia4thgenna,829000


In [21]:
process_dataframe_urls(df_subset)

Row 11265: Aidonia processed successfully (829,000 followers)
Row 11266: Eliminate processed successfully (163,000 followers)
Row 11267: Akiko Yano processed successfully (9,497 followers)
Row 11268: American Crush processed successfully (195 followers)
Row 11270: Poppy processed successfully (1,000,000 followers)
Row 11271: Eric Clapton processed successfully (977,000 followers)
Row 11273: Philip Sayce processed successfully (150,000 followers)
Row 11274: Bill Frisell processed successfully (107,000 followers)
Row 11275: Flwr Chyld processed successfully (15,000 followers)
Row 11277: Diamante processed successfully (113,000 followers)
Row 11278: DJ Kane processed successfully (32,000 followers)
Row 11280: Daniel Champagne processed successfully (30,000 followers)
Row 11281: Stanley Clarke processed successfully (96,000 followers)
Row 11282: Band Of Horses processed successfully (153,000 followers)
Row 11283: Lord Finesse processed successfully (248,000 followers)
Row 11285: Olivia Jea

Unnamed: 0,NAME,URL,INS_COUNT
11264,Aidonia,https://www.instagram.com/aidonia4thgenna,829000.0
11265,Eliminate,https://www.instagram.com/eliminatemusic,163000.0
11266,Akiko Yano,https://www.instagram.com/akikoyano_staff,9497.0
11267,American Crush,https://www.instagram.com/americancrush_official,195.0
11269,Poppy,https://www.instagram.com/impoppy,1000000.0
...,...,...,...
12712,Jake Wesley Rogers,https://www.instagram.com/jakewesleyrogers,185000.0
12713,Desert Dwellers,https://www.instagram.com/desert_dwellers_music,41000.0
12715,Sir Please,https://www.instagram.com/sirpleaseband,4526.0
12716,Jason Crabb,https://www.instagram.com/jasoncrabbmusic,116000.0


In [None]:
result_subset = process_dataframe_urls(df_subset)

In [49]:
final_df = pd.read_csv('final_df.csv')

In [44]:
final_df.rename(columns={'Unnamed: 0': 'ROW_NUM'}, inplace = True)

In [37]:
final_df.to_csv('final_df.csv')