# Hashtag Culture Analysis

A hashtag (#) is a type of metadata tag used on social networks such as Twitter and other microblogging services. It lets users apply dynamic, user-generated tagging that helps other users easily find messages with a specific theme or content. We can borrow some basic principles from Network Science and graph theory to understand how hashtags on Instagram are connected.

# What aspects of Graph Theory can we use in our analysis?
<ul>
    <li><b>Community Detection</b>: We can use algorithms to identify and label clusters of topics/themes</li>
    <li><b>Degree Centrality/ Betweenness Centrality</b>: We can calculate what hashtags in the network are particularly important in linking the whole network.</li>
    <li><b>Visualization</b>: If we plot the network using scatterplots, it’s a very compelling way to visualise a huge amount of information about hashtags that would be cumbersome to do otherwise</li>
</ul>
    

In [77]:
import pandas as pd
import requests
import time
import re
import json
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options

# Logging in Instagram

In [37]:
def instagram_login(driver, username, password):
        """
        Returns driver logged in to instagram
        """

        # Login url
        driver.get('https://www.instagram.com/accounts/login/?source=auth_switcher')

        # Wait 3 seconds to make instagram think I'm a human
        time.sleep(3)

        # Find username field
        username_input = driver.find_element_by_css_selector("input[name='username']")

        # Click on username field
        driver.execute_script("arguments[0].click();", username_input)

        # Send username
        username_input.send_keys(username)
        
        time.sleep(3)
        
        # Find password field
        try:
            password_input = driver.find_element_by_css_selector("input[name='password']")

        except:
            password_input = driver.find_element_by_xpath('//*[@id="react-root"]/section/main/div/article/div/div[1]/div/form/div[4]/div/label/input')

        # Click on password field
        driver.execute_script("arguments[0].click();", password_input)
        
        # Send password
        password_input.send_keys(password)
        
        time.sleep(3)

        # Find and click log in button
        login_button = driver.find_element_by_xpath("//button[@type='submit']")
        driver.execute_script("arguments[0].click();", login_button)
    
        time.sleep(10)
        
        return driver, True


# Getting url of posts with hashtag

In [38]:
def scrape_links(hashtag, driver):
    """
    Scrape unique post links from instagram using Selenium
    Returns links of posts containing hashtag
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")
    
    hashtag_url = f"https://www.instagram.com/explore/tags/{hashtag}/"
    driver.get(hashtag_url)
    
    # Gets scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")
    
    # List for unique instagram links
    unique_links = []
    
    # Loop until page ends
    while True:
        time.sleep(10)
        page_source = driver.page_source
        page_data = BeautifulSoup(page_source, "html.parser")
        data_body = page_data.find("body")
        
        for unique_link in data_body.findAll("a"):
            if re.match("/p", unique_link.get('href')):
                unique_links.append(f"https://www.instagram.com{unique_link.get('href')}")
            
        # Scroll to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        
        time.sleep(10)
        
        # If new height equal to previous screen height, break because no more content
        new_height = driver.execute_script("return document.body.scrollHeight")
        time.sleep(10)
        if last_height == new_height:
            break
        else:
            last_height = new_height
        
        # Update on scraping
        
        print (f"Scraped {len(unique_links)} links, {len(set(unique_links))} unique links")
        
    print(f"Finished scraping. Scraped {len(unique_links)} links, {len(set(unique_links))} unique links")
    print("\n")
    print("Closing driver.")
    driver.quit()
    unique_links = list(set(unique_links))
    return unique_links

In [39]:
driver = webdriver.Chrome('../util/chromedriver.exe')
driver, status = instagram_login(driver, "IS434_G1T5", "IS434@g1t5")
hashtag = "hawkerculturesg"
if status:
    unique_links = scrape_links(hashtag, driver)
else:
    print("Error logging in")

Scraped 21 links, 21 unique links
Scraped 54 links, 33 unique links
Scraped 93 links, 39 unique links
Scraped 141 links, 51 unique links
Scraped 189 links, 63 unique links
Scraped 228 links, 66 unique links
Scraped 276 links, 78 unique links
Scraped 324 links, 90 unique links
Scraped 366 links, 96 unique links
Scraped 414 links, 108 unique links
Scraped 462 links, 120 unique links
Scraped 504 links, 126 unique links
Scraped 552 links, 138 unique links
Scraped 600 links, 150 unique links
Scraped 642 links, 156 unique links
Scraped 690 links, 168 unique links
Scraped 738 links, 180 unique links
Scraped 777 links, 183 unique links
Scraped 828 links, 198 unique links
Scraped 870 links, 204 unique links
Scraped 921 links, 219 unique links
Scraped 969 links, 231 unique links
Scraped 1008 links, 234 unique links
Scraped 1059 links, 249 unique links
Scraped 1107 links, 261 unique links
Scraped 1161 links, 279 unique links
Scraped 1209 links, 291 unique links
Scraped 1263 links, 309 unique link

In [102]:
def get_hashtags(url):
    """
    Takes in instagram post URLs
    Returns hashtags in instagram post, seperated by comma
    """
    page = requests.get(url)
    data = BeautifulSoup(page.content, "html.parser")
    body_data = data.find("body")
    script = body_data.find("script")

    raw_data = script.text.strip().replace('window._sharedData =', '').replace(';', '')
    json_data = json.loads(raw_data)
    words_from_post = json_data['entry_data']['PostPage'][0]['graphql']['shortcode_media']['edge_media_to_caption']['edges'][0]['node']['text'].split()
    
    post_hashtags = []
    for word in words_from_post:
        if word[0] == "#":
            post_hashtags.append(word)
    post_hashtags = ", ".join(post_hashtags)
    return post_hashtags


# Loop through instagram url and append hashtags into list
hashtags_list = []
for index,url in enumerate(unique_links):
    post_hashtags = get_hashtags(url)
    hashtags_list.append(post_hashtags)
    print(index)

KeyError: 'PostPage'

In [86]:
# Convert to dataframe and export as csv
hashtags_df = pd.DataFrame(list(zip(unique_links, hashtags_list)),
               columns =['post_url', 'hashtags'])
hashtags_df.to_csv("../data/instagram_hashtag_posts.csv")

In [108]:
unique_links[0]
page = requests.get(unique_links[0])
data = BeautifulSoup(page.content, "html.parser")
body_data = data.find("body")
script = body_data.find("script")

In [110]:
raw_data = script.text.strip().replace('window._sharedData =', '').replace(';', '')
json_data = json.loads(raw_data)

In [113]:
json_data

{'config': {'csrf_token': 'mkB0KgFmbJBvHCO8DP0ODoXWAO6PmAaD',
  'viewer': None,
  'viewerId': None},
 'country_code': 'SG',
 'language_code': 'en',
 'locale': 'en_US',
 'entry_data': {'LoginAndSignupPage': [{'captcha': {'enabled': False,
     'key': ''},
    'gdpr_required': False,
    'tos_version': 'row',
    'username_hint': ''}]},
 'hostname': 'www.instagram.com',
 'is_whitelisted_crawl_bot': False,
 'connection_quality_rating': 'EXCELLENT',
 'deployment_stage': 'c2',
 'platform': 'web',
 'nonce': 'I2pnnibXwO4CkN/GrxwWOA==',
 'mid_pct': 98.40663,
 'zero_data': {},
 'cache_schema_version': 3,
 'server_checks': {},
 'knobx': {'070bc16ba2d873c073001690561934e3': True,
  '086c12b43fda5eee54ed0fa85f2bbea8': 25000,
  '17aeb9de94ea257e02570f12cdb2812f': False,
  '27e1c3d9ed3e05886fb474b960e3baa4': False,
  '3c50bdecc6078abf9e53f13d9246d9e2': True,
  '417a8e79ba5d5da0284a8efb2178791a': True,
  '5a00d32f3b18ef1b85a8d6af5be1ad47': True,
  '5f14c608e32ae0b85932fb93091c4546': False,
  '624aa9c

In [101]:
json_data["entry_data"]

{'PostPage': [{'graphql': {'shortcode_media': {'__typename': 'GraphSidecar',
     'id': '2611798928200300732',
     'shortcode': 'CQ--UmMH_i8',
     'dimensions': {'height': 1350, 'width': 1080},
     'gating_info': None,
     'fact_check_overall_rating': None,
     'fact_check_information': None,
     'sensitivity_friction_info': None,
     'sharing_friction_info': {'should_have_sharing_friction': False,
      'bloks_app_url': None},
     'media_overlay_info': None,
     'media_preview': None,
     'display_url': 'https://instagram.fsin9-1.fna.fbcdn.net/v/t51.2885-15/e35/p1080x1080/212053481_2197914870349828_1133636955074255064_n.jpg?_nc_ht=instagram.fsin9-1.fna.fbcdn.net&_nc_cat=104&_nc_ohc=G1vbXnJYT14AX8zMVyE&edm=AABBvjUBAAAA&ccb=7-4&oh=00_AT-eQJDlcVdb1XQGaGrz7be4H2FyGxJ5tJQsZ2ia7w4GbA&oe=620E8B0B&_nc_sid=83d603',
     'display_resources': [{'src': 'https://instagram.fsin9-1.fna.fbcdn.net/v/t51.2885-15/sh0.08/e35/p640x640/212053481_2197914870349828_1133636955074255064_n.jpg?_nc_ht=i