# Taylor Swift Artist Functions Test
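This notebook exercises the artist profile scrapers from `socials_tracker.ipynb` (Instagram, Twitter/X, Spotify and StubHub), using Taylor Swift as the test artist.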

In [1]:
# Import required libraries
import os
import re
import json
import time
import requests
import httpx
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from googlesearch import search
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
try:
    import google.generativeai as genai
    HAS_GEMINI = True
except ImportError:
    HAS_GEMINI = False

print("üì¶ Libraries loaded (v4.0 Foundation).")

üì¶ Libraries loaded (v4.0 Foundation).


  from .autonotebook import tqdm as notebook_tqdm

All support for the `google.generativeai` package has ended. It will no longer be receiving 
updates or bug fixes. Please switch to the `google.genai` package as soon as possible.
See README for more details:

https://github.com/google-gemini/deprecated-generative-ai-python/blob/main/README.md

  import google.generativeai as genai


In [2]:
# Helper function to convert string numbers (e.g., "1.5M") to integers
def convert_string_to_number(s):
    """Convert a human-readable count such as "1,234", "62.3K" or "1.5M" to an int.

    Commas are treated as thousands separators and removed. A single trailing
    K / M / B suffix (case-insensitive) multiplies the value by 1e3 / 1e6 / 1e9.
    Commas and a suffix may appear together ("1,234.5K"). The arithmetic uses
    ``Decimal`` so that inputs such as "1.005K" are not shifted by binary
    floating-point rounding; any remaining fraction is truncated toward zero.

    Args:
        s: Text (or any object ``str()`` can render) containing the number.

    Returns:
        The parsed value as an ``int``.

    Raises:
        ValueError: If ``s`` does not contain a parseable number.
    """
    from decimal import Decimal, InvalidOperation

    multipliers = {'k': 1_000, 'm': 1_000_000, 'b': 1_000_000_000}

    text = str(s).lower().strip().replace(',', '')
    scale = 1
    # Only a *trailing* letter is a magnitude suffix.
    if text and text[-1] in multipliers:
        scale = multipliers[text[-1]]
        text = text[:-1].strip()

    try:
        return int(Decimal(text) * scale)
    except InvalidOperation as exc:
        # Keep the exception type the float()/int() based version raised.
        raise ValueError(f"could not convert {s!r} to a number") from exc

In [3]:
# Initialize Selenium WebDriver
# One module-level Chrome session is created here; the search helper and the
# profile classes defined in later cells use this global `driver` directly.
options = webdriver.ChromeOptions()
# options.add_argument('--headless')  # Uncomment to run in headless mode
driver = webdriver.Chrome(options=options)

In [4]:
def get_first_search_result(query):
    """
    Tiered Search Strategy: Google -> Bing -> Yahoo -> Python Lib

    Each of the first three engines is loaded in the module-level Selenium
    ``driver``; the page source is parsed and the link inside the first
    result container is returned. If none of them yields a link, the
    ``search`` function imported from ``googlesearch`` is used as a last
    resort. A failure in one tier never prevents the next tier from running.

    Args:
        query: Free-text search terms. The text is URL-encoded before being
            placed in the query string, so characters such as "&" or "+" in
            an artist name do not corrupt the request.

    Returns:
        The URL of the first result found, or ``None`` if every tier fails.
    """
    from urllib.parse import quote_plus

    encoded_query = quote_plus(str(query))

    # (URL template, result-container tag, result-container class) per engine,
    # listed in priority order.
    engines = (
        ("https://www.google.com/search?q={}", 'div', 'g'),
        ("https://www.bing.com/search?q={}", 'li', 'b_algo'),
        ("https://search.yahoo.com/search?p={}", 'div', re.compile(r'algo-sr|dd\s+algo')),
    )

    for url_template, tag_name, css_class in engines:
        try:
            driver.get(url_template.format(encoded_query))
            time.sleep(2)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            container = soup.find(tag_name, class_=css_class)
            link = container.find('a') if container else None
            href = link.get('href') if link else None
            if href:
                return href
        except Exception:
            # Best effort: fall through to the next engine.
            continue

    # 4. Standard Library Fallback
    try:
        results = list(search(query, num_results=1))
        if results:
            return results[0]
    except Exception:
        pass

    return None

In [5]:
def init_gemini():
    """Create the Gemini model used as a last-resort follower-count source.

    Reads ``gemini_credentials.json`` from the working directory, passes its
    ``api_key`` entry to ``genai.configure`` and builds a ``gemini-pro``
    ``GenerativeModel``.

    Returns:
        The configured model, or ``None`` when the ``google.generativeai``
        import failed (``HAS_GEMINI`` is False) or when the credentials file
        cannot be read, parsed or used.
    """
    if not HAS_GEMINI:
        return None
    try:
        with open('gemini_credentials.json', 'r', encoding='utf-8') as f:
            creds = json.load(f)
        genai.configure(api_key=creds.get('api_key'))
        return genai.GenerativeModel('gemini-pro')
    except Exception:
        # Gemini is optional: any setup problem just disables it. Catching
        # Exception (not a bare except) lets Ctrl-C still interrupt the cell.
        return None

# Build the shared Gemini model once. `model` is None when the library or the
# credentials are unavailable; the profile classes check it before using it.
model = init_gemini()
if model: print("‚ú® Gemini AI Initialized.")
else: print("‚ö†Ô∏è Gemini AI skipped (No creds or lib).")

‚ú® Gemini AI Initialized.


In [6]:
# Spotify API credentials (optional - web scraping will work without them)
# `headers` stays None unless a token is obtained; SpotifyProfile checks it
# before calling the Web API and otherwise relies on scraping.
headers = None
access_token = None

try:
    # Try current directory first, then parent
    creds_path = "spotify_credentials.json"
    try:
        with open(creds_path, "r") as f:
            credentials = json.load(f)
    except FileNotFoundError:
        creds_path = "../spotify_credentials.json"
        with open(creds_path, "r") as f:
            credentials = json.load(f)

    client_id = credentials["client_id"]
    client_secret = credentials["client_secret"]
    auth_url = credentials["auth_url"]

    # Client-credentials grant. The timeout keeps an unreachable auth server
    # from hanging the cell; a timeout falls into the except branch below.
    response = requests.post(
        auth_url,
        headers={"Content-Type": "application/x-www-form-urlencoded"},
        data={
            "grant_type": "client_credentials",
            "client_id": client_id,
            "client_secret": client_secret
        },
        timeout=10
    )

    if response.status_code == 200:
        token_info = response.json()
        access_token = token_info['access_token']
        headers = {"Authorization": f"Bearer {access_token}"}
        print("Spotify API credentials loaded successfully!")
    else:
        print("Failed to retrieve access token, will use web scraping only")
except Exception as e:
    print(f"No Spotify credentials found or error loading them: {e} - will use web scraping only")

Spotify API credentials loaded successfully!


## Profile Class Definitions (v4.0)

In [7]:
class InstagramProfile:
    """Resolve an artist's Instagram handle and follower count.

    ``get_all`` tries the sources in order and stops at the first success:
    Instagram's web-profile JSON endpoint, third-party live-counter sites,
    the rendered profile page, and finally the Gemini ``model`` (when one was
    initialised). The class relies on the module-level ``driver``, ``model``,
    ``get_first_search_result`` and ``convert_string_to_number``.

    Attributes are plain instance fields: ``artist`` (display name),
    ``username`` (handle, may be None until resolved) and ``follower_count``
    (0 until a source succeeds).
    """

    def __init__(self, artist, username=None):
        self.artist = artist
        self.username = username
        self.follower_count = 0

    def get_username(self):
        """Return the handle, searching the web for it if none was supplied.

        Returns:
            The username, or ``None`` if the first search result is not an
            Instagram profile URL.
        """
        if self.username:
            return self.username
        url = get_first_search_result(f'instagram {self.artist} official')
        if url:
            match = re.search(r'instagram\.com/([^/?]+)', url)
            # Skip path segments that are content sections, not account names.
            if match and match.group(1) not in ['p', 'reels', 'stories']:
                self.username = match.group(1)
        return self.username

    def _try_api(self):
        """Read the follower count from the web-profile JSON endpoint.

        Returns:
            True if the count was read and stored, False otherwise.
        """
        try:
            url = f'https://i.instagram.com/api/v1/users/web_profile_info/?username={self.username}'
            h = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', 'x-ig-app-id': '936619743392459'}
            r = requests.get(url, headers=h, timeout=10)
            if r.status_code == 200:
                self.follower_count = r.json()['data']['user']['edge_followed_by']['count']
                return True
        except Exception:
            pass
        return False

    def _try_specialized(self):
        """Read the count from live-counter sites that render an odometer.

        The odometer is sampled up to five times, two seconds apart. Readings
        outside (1000, 1e9) are treated as animation artifacts and dropped.
        Two consecutive kept readings within 1% of each other are accepted
        immediately; otherwise the mean of the kept readings is used.

        Returns:
            True if a count was stored, False if no site produced a reading.
        """
        # LiveCounts.nl & InstaStatistics
        sites = [
            f'https://livecounts.nl/instagram-realtime/?u={self.username}',
            f'https://instastatistics.com/{self.username}'
        ]
        for url in sites:
            try:
                driver.get(url)
                time.sleep(7)  # Extended wait for initial settle

                valid_readings = []
                for _ in range(5):
                    try:
                        # Try .odometer-inside first, then .odometer
                        try:
                            el = driver.find_element(By.CSS_SELECTOR, '.odometer-inside')
                        except Exception:
                            el = driver.find_element(By.CSS_SELECTOR, '.odometer')

                        if el:
                            txt = el.text
                            val = convert_string_to_number(re.sub(r'[^0-9KMBkm.]', '', txt))

                            # Filter out animation glitches (usually extremely large or small)
                            if 1000 < val < 1000000000:
                                valid_readings.append(val)

                            # If we have 2 consistent readings, we're done
                            if len(valid_readings) >= 2:
                                if abs(valid_readings[-1] - valid_readings[-2]) < (valid_readings[-1] * 0.01):
                                    self.follower_count = valid_readings[-1]
                                    return True
                    except Exception:
                        # A failed sample (element missing, unparseable text)
                        # should not stop the remaining samples.
                        pass
                    time.sleep(2)

                if valid_readings:
                    self.follower_count = int(sum(valid_readings) / len(valid_readings))
                    return True
            except Exception:
                pass
        return False

    def _try_selenium(self):
        """Scrape the rendered profile page for the follower count.

        First looks at the ``og:description`` meta tag, then at any text node
        mentioning "Followers". Tokens that cannot be parsed as numbers are
        skipped so one stray match does not abort the whole search.

        Returns:
            True if ``follower_count`` is positive afterwards, else False.
        """
        try:
            driver.get(f'https://www.instagram.com/{self.username}/')
            time.sleep(5)
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            meta = soup.find('meta', attrs={'property': 'og:description'})
            if meta:
                content = meta.get('content', '') or ''
                try:
                    match = re.search(r'([\d,.]+[KMB]?)\s*Followers', content, re.I)
                    if match:
                        val = convert_string_to_number(match.group(1))
                        if 0 < val < 2000000000:
                            self.follower_count = val
                            return True
                except Exception:
                    pass

            if self.follower_count == 0:
                texts = soup.find_all(string=re.compile(r'Followers', re.I))
                for t in texts:
                    container = t.parent
                    full_text = container.get_text()
                    for token in re.findall(r'([\d,.]+[KMB]?)', full_text):
                        try:
                            v = convert_string_to_number(token)
                        except (ValueError, ArithmeticError):
                            # e.g. a lone "." or "," matched by the pattern
                            continue
                        if 1000 < v < 2000000000:
                            self.follower_count = v
                            return True
            return self.follower_count > 0
        except Exception:
            pass
        return False

    def _try_gemini(self):
        """Ask the Gemini model for the count (last resort).

        The prompt contains only the artist and handle. It deliberately
        carries no expected-value hint, since a fixed hint would be wrong for
        every artist but one and would bias the answer.

        Returns:
            True if the reply contained digits and they were stored.
        """
        if not model:
            return False
        try:
            prompt = f'Current Instagram follower count for {self.artist} (@{self.username})? Reply with ONE integer only.'
            r = model.generate_content(prompt)
            num = re.sub(r'\D', '', r.text)
            if num:
                self.follower_count = int(num)
                return True
        except Exception:
            pass
        return False

    def get_all(self):
        """Resolve the handle, then try each source until one succeeds.

        Returns:
            ``(username, follower_count)``; ``(None, 0)`` when no handle
            could be resolved.
        """
        if not self.get_username(): return None, 0
        if self._try_api(): return self.username, self.follower_count
        if self._try_specialized(): return self.username, self.follower_count
        if self._try_selenium(): return self.username, self.follower_count
        self._try_gemini()
        return self.username, self.follower_count

    def __str__(self):
        return f"Artist: {self.artist}\nInstagram Username: {self.username}\nFollowers: {self.follower_count:,}"


In [8]:
class TwitterProfile:
    """Resolve an artist's Twitter/X handle and follower count.

    ``get_all`` tries, in order: the profile's ``verified_followers`` page,
    third-party live-counter sites, the rendered profile page, a Google
    results page, and finally the Gemini ``model`` (when one was
    initialised). The class relies on the module-level ``driver``, ``model``,
    ``get_first_search_result`` and ``convert_string_to_number``.
    """

    # A count read from the verified_followers page is only trusted outright
    # when it exceeds this value; smaller counts fall through to the other
    # sources. The value was chosen for this notebook's test artist, so
    # override it on a subclass or instance for other artists.
    MIN_TRUSTED_VERIFIED_COUNT = 50000000

    def __init__(self, artist, username=None):
        self.artist = artist
        self.username = username
        self.follower_count = 0

    def get_username(self):
        """Return the handle, searching the web for it if none was supplied.

        Returns:
            The username, or ``None`` if the first search result is not a
            twitter.com / x.com profile URL.
        """
        if self.username:
            return self.username
        url = get_first_search_result(f'twitter {self.artist} official')
        if url:
            match = re.search(r'(?:twitter|x)\.com/([^/?]+)', url)
            # Skip path segments that are site features, not account names.
            if match and match.group(1) not in ['intent', 'share', 'search', 'i', 'x']:
                self.username = match.group(1)
        return self.username

    def _try_verified(self):
        """Read the count from links pointing at ``/verified_followers``.

        Returns:
            True if a value in (0, 200M) was parsed and stored.
        """
        try:
            driver.get(f'https://x.com/{self.username}/verified_followers')
            time.sleep(5)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            els = soup.find_all('a', href=re.compile(r'/verified_followers$'))
            for el in els:
                label = el.get_text()
                if 'Follower' not in label:
                    continue
                match = re.search(r'([\d,.]+[KMB]?)', label, re.I)
                if not match:
                    continue
                try:
                    val = convert_string_to_number(match.group(1))
                except (ValueError, ArithmeticError):
                    # Unparseable token in this link; try the next one.
                    continue
                # Sanity Check: Twitter max ~170M
                if 0 < val < 200000000:
                    self.follower_count = val
                    return True
        except Exception:
            pass
        return False

    def _try_specialized(self):
        """Read the count from live-counter sites that render an odometer.

        The odometer is sampled up to five times, two seconds apart. Readings
        outside (1000, 300M) are dropped. Two consecutive kept readings
        within 1% of each other are accepted immediately; otherwise the mean
        of the kept readings is used.

        Returns:
            True if a count was stored, False if no site produced a reading.
        """
        sites = [
            f'https://livecounts.nl/twitter-realtime/?u={self.username}',
            f'https://livecounts.io/twitter-live-follower-counter/{self.username}'
        ]
        for url in sites:
            try:
                driver.get(url)
                time.sleep(7)

                valid_readings = []
                for _ in range(5):
                    try:
                        try:
                            el = driver.find_element(By.CSS_SELECTOR, '.odometer-inside')
                        except Exception:
                            el = driver.find_element(By.CSS_SELECTOR, '.followers-odometer, .odometer')

                        if el:
                            val = convert_string_to_number(re.sub(r'[^0-9KMBkm.]', '', el.text))

                            if 1000 < val < 300000000:
                                valid_readings.append(val)

                            if len(valid_readings) >= 2:
                                if abs(valid_readings[-1] - valid_readings[-2]) < (valid_readings[-1] * 0.01):
                                    self.follower_count = valid_readings[-1]
                                    return True
                    except Exception:
                        # A failed sample should not stop the remaining ones.
                        pass
                    time.sleep(2)

                if valid_readings:
                    self.follower_count = int(sum(valid_readings) / len(valid_readings))
                    return True
            except Exception:
                pass
        return False

    def _try_selenium_profile(self):
        """Scrape the rendered profile page for "<number> Followers" text.

        All matches are collected and the largest plausible value is kept
        (likely the total followers rather than a mutuals count). Tokens that
        cannot be parsed are skipped instead of aborting the scan.

        Returns:
            True if a value in (1000, 200M) was found and stored.
        """
        try:
            driver.get(f'https://x.com/{self.username}')
            time.sleep(5)
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            txt = soup.get_text()
            candidates = []
            for token in re.findall(r'([\d,.]+[KMB]?)\s*Followers', txt, re.I):
                try:
                    val = convert_string_to_number(token)
                except (ValueError, ArithmeticError):
                    # e.g. a sentence-ending "." directly before "Followers"
                    continue
                if 1000 < val < 200000000:
                    candidates.append(val)

            # Pick largest candidate (likely the total followers vs mutuals)
            if candidates:
                self.follower_count = max(candidates)
                return True
        except Exception:
            pass
        return False

    def _try_google_snippet(self):
        """Fallback: look for "<number> Followers" in a Google results page.

        Returns:
            True if the first match parses to a value in (1000, 200M).
        """
        try:
            u = f'https://www.google.com/search?q=twitter+{self.username}+followers'
            driver.get(u)
            time.sleep(2)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            txt = soup.get_text()
            match = re.search(r'([\d,.]+[KMB]?)\s*Followers', txt, re.I)
            if match:
                val = convert_string_to_number(match.group(1))
                if 1000 < val < 200000000:
                    self.follower_count = val
                    return True
        except Exception:
            pass
        return False

    def _try_gemini(self):
        """Ask the Gemini model for the count (last resort).

        The prompt contains only the artist and handle. It deliberately
        carries no expected-value hint, since a fixed hint would be wrong for
        every artist but one and would bias the answer.

        Returns:
            True if the reply contained digits and they were stored.
        """
        if not model:
            return False
        try:
            prompt = f'Current Twitter follower count for {self.artist} (@{self.username})? Reply with ONE integer only.'
            r = model.generate_content(prompt)
            num = re.sub(r'\D', '', r.text)
            if num:
                self.follower_count = int(num)
                return True
        except Exception:
            pass
        return False

    def get_all(self):
        """Resolve the handle, then try each source until one succeeds.

        A verified-page reading at or below ``MIN_TRUSTED_VERIFIED_COUNT`` is
        not returned immediately; the remaining sources are still consulted
        and may overwrite it.

        Returns:
            ``(username, follower_count)``; ``(None, 0)`` when no handle
            could be resolved.
        """
        if not self.get_username(): return None, 0
        if self._try_verified() and self.follower_count > self.MIN_TRUSTED_VERIFIED_COUNT:
            return self.username, self.follower_count

        if self._try_specialized(): return self.username, self.follower_count
        if self._try_selenium_profile(): return self.username, self.follower_count
        if self._try_google_snippet(): return self.username, self.follower_count

        self._try_gemini()
        return self.username, self.follower_count

    def __str__(self):
        return f"Artist: {self.artist}\nTwitter Username: {self.username}\nFollowers: {self.follower_count:,}"


In [9]:
class SpotifyProfile:
    """Resolve an artist's Spotify ID, followers, popularity, genre and listeners.

    Followers, popularity and genre come from the Spotify Web API when the
    module-level ``headers`` holds a bearer token. Monthly listeners are
    scraped from the artist page, first with ``requests`` and then with the
    module-level Selenium ``driver``. The class also relies on
    ``get_first_search_result`` and ``convert_string_to_number``.
    """

    def __init__(self, artist, spotifyID=None, genre=None):
        self.artist = artist
        self.spotifyID = spotifyID
        self.genre = genre
        self.followers = 0
        self.popularity = 0
        self.listens = 0
        self.url = None

    def get_id(self):
        """Populate ``spotifyID`` if it is not already set.

        Tries the Web API search endpoint when ``headers`` is available, then
        falls back to extracting the ID from the first web search result.
        Returns nothing; the result is stored on the instance.
        """
        if self.spotifyID:
            return
        # Try API if headers available
        if headers:
            try:
                # Pass the query through `params` so requests URL-encodes the
                # artist name (names containing "&", "+" or "#" would
                # otherwise corrupt the query string).
                r = requests.get(
                    'https://api.spotify.com/v1/search',
                    params={'q': f'artist:{self.artist}', 'type': 'artist', 'limit': 1},
                    headers=headers,
                    timeout=10,
                )
                if r.status_code == 200:
                    items = r.json()['artists']['items']
                    if items:
                        self.spotifyID = items[0]['id']
            except Exception:
                pass
        # Fallback to search
        if not self.spotifyID:
            u = get_first_search_result(f'spotify artist {self.artist}')
            if u:
                m = re.search(r'artist/([a-zA-Z0-9]+)', u)
                if m:
                    self.spotifyID = m.group(1)

    def get_stats(self):
        """Populate followers, popularity, genre, url and monthly listeners.

        Does nothing when ``spotifyID`` is unset. Each source is best-effort:
        a failure leaves the corresponding fields at their previous values.
        """
        if not self.spotifyID:
            return
        # API (only if headers available)
        if headers:
            try:
                u = f'https://api.spotify.com/v1/artists/{self.spotifyID}'
                r = requests.get(u, headers=headers, timeout=10)
                if r.status_code == 200:
                    res = r.json()
                    self.followers = res['followers']['total']
                    self.popularity = res['popularity']
                    # User preferred logic for genre: first API genre wins;
                    # otherwise keep a caller-supplied genre.
                    if res.get('genres'):
                        self.genre = res['genres'][0]
                    elif not self.genre:
                        # NOTE(review): placeholder default kept from the
                        # original ("Default for testing"); it is not derived
                        # from any data source.
                        self.genre = 'Pop'
                    # Capture URL for scraping
                    if 'external_urls' in res:
                        self.url = res['external_urls'].get('spotify')
            except Exception:
                pass

        if self.url and not self.url.startswith('http'):
            self.url = 'https://' + self.url

        # Ensure we have a URL to scrape
        if not self.url:
            self.url = f'https://open.spotify.com/artist/{self.spotifyID}'

        # Scrape Listeners using the specific URL
        # 1. Requests (With Headers!)
        try:
            h = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
            r = requests.get(self.url, headers=h, timeout=10)
            soup = BeautifulSoup(r.content, 'html.parser')
            meta = soup.find('meta', attrs={'property': 'og:description'})
            if meta:
                # A meta tag without a content attribute yields None.
                content = meta.get('content') or ''
                m = re.search(r'([\d,.]+[KMB]?)\s*monthly listeners', content, re.I)
                if m:
                    self.listens = convert_string_to_number(m.group(1))
        except Exception:
            pass

        # 2. Selenium Fallback if requests failed
        if self.listens == 0:
            try:
                driver.get(self.url)
                time.sleep(5)  # Increased wait
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                # Try meta tag again from rendered source
                meta = soup.find('meta', attrs={'property': 'og:description'})
                if meta:
                    m = re.search(r'([\d,.]+[KMB]?)\s*monthly listeners', meta.get('content', '') or '', re.I)
                    if m:
                        self.listens = convert_string_to_number(m.group(1))

                # Try body text if meta failed
                if self.listens == 0:
                    page_text = soup.get_text()
                    # More specific search to avoid garbage
                    m = re.search(r'Monthly Listeners\s*:\s*([\d,.]+[KMB]?)', page_text, re.I)
                    if not m:
                        m = re.search(r'([\d,.]+[KMB]?)\s*monthly listeners', page_text, re.I)
                    if m:
                        self.listens = convert_string_to_number(m.group(1))
            except Exception:
                pass

    def get_all(self):
        """Run ``get_id`` then ``get_stats``.

        Returns:
            ``(spotifyID, genre, followers, popularity, listens)``.
        """
        self.get_id()
        self.get_stats()
        return self.spotifyID, self.genre, self.followers, self.popularity, self.listens

    def __str__(self):
        return f"Artist: {self.artist}\nSpotify ID: {self.spotifyID}\nGenre: {self.genre}\nFollowers: {self.followers:,}\nPopularity: {self.popularity}\nMonthly Listeners: {self.listens:,}"



In [10]:
class StubhubProfile:
    """
    Stubhub Profile scraper - Refined Jan 2026
    Prioritizes: SVG Heart -> Text Pattern -> JSON Data

    Each strategy runs independently against the parsed page: a strategy that
    raises or finds nothing simply hands over to the next one. The class
    relies on the module-level ``driver``, ``get_first_search_result`` and
    ``convert_string_to_number``.
    """
    def __init__(self, artist, url=None):
        self.artist = artist
        self.url = url
        self.favourites = 0

    def get_url(self):
        """Return the performer URL, searching the web if none was supplied.

        A URL found via search is stored as a site-relative path (leading
        "/"); a URL passed to the constructor is kept as given.

        Returns:
            The URL/path, or ``None`` if the first search result is not a
            stubhub.ca / stubhub.com link.
        """
        if self.url:
            return self.url
        u = get_first_search_result(f'stubhub {self.artist} tickets performer')
        if u:
            match = re.search(r'stubhub\.(ca|com)/([^?\s]+)', u)
            if match:
                self.url = '/' + match.group(2)
        return self.url

    def _from_heart_counter(self, soup):
        """Strategy 1: a bare number (e.g. "62.3K") located near an SVG icon."""
        candidates = soup.find_all(string=re.compile(r'^\s*\d+(?:\.\d+)?[KMB]?\s*$'))
        for candidate in candidates:
            # Check parent and up to three further ancestors for an SVG (heart)
            curr = candidate.parent
            found_heart = False
            for _ in range(4):
                if not curr:
                    break
                if curr.find('svg') or curr.find('path'):
                    found_heart = True
                    break
                curr = curr.parent

            if found_heart:
                try:
                    val = convert_string_to_number(candidate.strip())
                except (ValueError, ArithmeticError):
                    continue
                if val > 0:
                    return val
        return 0

    def _from_favourites_label(self, soup):
        """Strategy 2: the first parseable number near "Favorites"/"Favourites" text."""
        tag = soup.find(string=re.compile(r'Favorites|Favourites', re.I))
        if not tag:
            return 0
        container = tag.parent
        text = container.get_text() + ' ' + (container.parent.get_text() if container.parent else '')
        for token in re.findall(r'([\d,.]+[KMB]?)', text):
            try:
                return convert_string_to_number(token)
            except (ValueError, ArithmeticError):
                # The pattern also matches lone punctuation; skip it.
                continue
        return 0

    def _from_index_json(self, soup):
        """Strategy 3: the ``favorites`` field of the embedded index-data JSON."""
        script = soup.find('script', {'id': 'index-data', 'type': 'application/json'})
        if not (script and script.string):
            return 0
        data = json.loads(script.string)
        val = data.get('performer', {}).get('favorites', 0) or data.get('performerSummary', {}).get('favorites', 0)
        if isinstance(val, (int, float)) and not isinstance(val, bool):
            return val
        return 0

    def _scrape(self):
        """Load the performer page and return the favourites count.

        An absolute ``self.url`` is used as is; a relative one is tried on
        stubhub.ca and then stubhub.com.

        Returns:
            The first positive count any strategy produces, or 0.
        """
        # Handle absolute or relative URLs
        if self.url and self.url.startswith('http'):
            target_urls = [self.url]
        else:
            # Try both .ca and .com if relative
            target_urls = [f'https://www.{d}{self.url}' for d in ['stubhub.ca', 'stubhub.com']]

        strategies = (
            self._from_heart_counter,
            self._from_favourites_label,
            self._from_index_json,
        )

        for u in target_urls:
            try:
                driver.get(u)
                time.sleep(5)  # Wait for load
                soup = BeautifulSoup(driver.page_source, 'html.parser')
            except Exception:
                # Page could not be loaded or parsed; try the next domain.
                continue

            for strategy in strategies:
                try:
                    val = strategy(soup)
                except Exception:
                    # One failing strategy must not skip the remaining ones.
                    continue
                if val > 0:
                    return val
        return 0

    def get_all(self):
        """Resolve the URL, scrape it and store the result in ``favourites``.

        Returns:
            ``(url, favourites)``; ``(None, 0)`` when no URL could be found.
        """
        if not self.get_url(): return None, 0
        self.favourites = self._scrape()
        return self.url, self.favourites

    def __str__(self):
        return f"Artist: {self.artist}\nStubhub URL: {self.url}\nFavourites: {self.favourites:,}"


## Test Execution

In [11]:
def run_full_test(artist_name, ig_user=None, tw_user=None, spot_id=None, stub_url=None):
    """Run all four profile scrapers for one artist and print each result.

    Args:
        artist_name: Display name handed to every profile class.
        ig_user: Optional known Instagram handle.
        tw_user: Optional known Twitter/X handle.
        spot_id: Optional known Spotify artist ID.
        stub_url: Optional known StubHub performer URL.

    Returns:
        The populated ``(InstagramProfile, TwitterProfile, SpotifyProfile,
        StubhubProfile)`` instances, in that order.
    """
    banner = '=' * 50
    print(f"\n{banner}")
    print(f"üöÄ TESTING: {artist_name}")
    print(f"{banner}\n")

    # 1. Instagram
    insta_profile = InstagramProfile(artist_name, username=ig_user)
    insta_profile.get_all()
    print(f"\nüì∏ Instagram Result: {insta_profile.username} | {insta_profile.follower_count:,} followers")

    # 2. Twitter
    twitter_profile = TwitterProfile(artist_name, username=tw_user)
    twitter_profile.get_all()
    print(f"\nüê¶ Twitter Result: {twitter_profile.username} | {twitter_profile.follower_count:,} followers")

    # 3. Spotify
    spotify_profile = SpotifyProfile(artist_name, spotifyID=spot_id)
    spotify_profile.get_all()
    print(
        f"\nüéß Spotify Result: {spotify_profile.spotifyID} | {spotify_profile.followers:,} followers"
        f" | Listeners: {spotify_profile.listens:,} | Popularity: {spotify_profile.popularity}"
        f" | Genre: {spotify_profile.genre}"
    )

    # 4. Stubhub
    stubhub_profile = StubhubProfile(artist_name, url=stub_url)
    stubhub_profile.get_all()
    print(f"\nüéüÔ∏è Stubhub Result: {stubhub_profile.url} | {stubhub_profile.favourites:,} favourites")

    return insta_profile, twitter_profile, spotify_profile, stubhub_profile

In [12]:
# Execute Test for Taylor Swift
# Known identifiers are supplied for every platform, so no web search is
# needed to resolve handles, IDs or URLs.
instagram, twitter, spotify, stubhub = run_full_test(
    "Taylor Swift",
    ig_user="taylorswift",
    tw_user="taylorswift13",
    spot_id="06HL4z0CvFAxyc27GXpf02",
    stub_url="https://www.stubhub.com/taylor-swift-tickets/performer/136034",
)

# Reference figures to compare against the scraped results printed above.
print(
    "\nExpected values (as of Jan 2026):",
    "  Instagram: ~281M followers",
    "  Twitter: ~75M followers",
    "  Spotify: ~106M monthly listeners, ~150M followers",
    "  Stubhub: ~62K favourites",
    sep="\n",
)



üöÄ TESTING: Taylor Swift


üì∏ Instagram Result: taylorswift | 280,942,001 followers

üê¶ Twitter Result: taylorswift13 | 78,800,000 followers

üéß Spotify Result: 06HL4z0CvFAxyc27GXpf02 | 150,801,542 followers | Listeners: 105,900,000 | Popularity: 100 | Genre: Pop

üéüÔ∏è Stubhub Result: https://www.stubhub.com/taylor-swift-tickets/performer/136034 | 62,300 favourites

Expected values (as of Jan 2026):
  Instagram: ~281M followers
  Twitter: ~75M followers
  Spotify: ~106M monthly listeners, ~150M followers
  Stubhub: ~62K favourites


## Summary - All Profiles for Taylor Swift

In [13]:
# Summary of all Taylor Swift social media data
# Each profile object is rendered through its own __str__; a blank line
# follows every section and a rule closes the report.
print(
    "=" * 50,
    "TAYLOR SWIFT - SOCIAL MEDIA SUMMARY",
    "=" * 50,
    "",
    "INSTAGRAM:",
    instagram,
    "",
    "TWITTER/X:",
    twitter,
    "",
    "SPOTIFY:",
    spotify,
    "",
    "STUBHUB:",
    stubhub,
    "",
    "=" * 50,
    sep="\n",
)

TAYLOR SWIFT - SOCIAL MEDIA SUMMARY

INSTAGRAM:
Artist: Taylor Swift
Instagram Username: taylorswift
Followers: 280,942,001

TWITTER/X:
Artist: Taylor Swift
Twitter Username: taylorswift13
Followers: 78,800,000

SPOTIFY:
Artist: Taylor Swift
Spotify ID: 06HL4z0CvFAxyc27GXpf02
Genre: Pop
Followers: 150,801,542
Popularity: 100
Monthly Listeners: 105,900,000

STUBHUB:
Artist: Taylor Swift
Stubhub URL: https://www.stubhub.com/taylor-swift-tickets/performer/136034
Favourites: 62,300



In [14]:
# Clean up - close the browser
# Calls quit() on the module-level Selenium `driver` created near the top of
# the notebook; re-run that cell before scraping again.
driver.quit()
print("Browser closed.")

Browser closed.
