# Hashtag Culture Analysis

A hashtag (#) is a type of metadata tag used on social networks such as Twitter and other microblogging services. It lets users apply dynamic, user-generated tagging that helps other users easily find messages with a specific theme or content. We can borrow some basic principles from Network Science and graph theory to understand how hashtags on Instagram are connected.

# What aspects of Graph Theory can we use in our analysis?
<ul>
    <li><b>Community Detection</b>: We can use algorithms to identify and label clusters of topics/themes</li>
    <li><b>Degree Centrality / Betweenness Centrality</b>: We can calculate which hashtags are particularly important in linking the whole network together.</li>
    <li><b>Visualization</b>: Plotting the network using scatterplots is a compelling way to visualise a huge amount of information about hashtags that would otherwise be cumbersome to present</li>
</ul>
    

In [1]:
import pandas as pd
import requests
import time
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options

In [2]:
def instagramLogin(driver, username, password):
    """
    Log the given Selenium driver in to Instagram and return it.

    Opens the Instagram login page, types the credentials into the form and
    submits it. The function does not verify that the login succeeded.

    Args:
        driver: Selenium WebDriver used to load and drive the login page.
        username: Account name typed into the username field.
        password: Password typed into the password field.

    Returns:
        The same driver object, after the login form has been submitted.

    Raises:
        NoSuchElementException: If the username field, the password field
            (by either locator) or the submit button cannot be located.
    """
    # Local import keeps this block self-contained; selenium is already a
    # dependency of this file.
    from selenium.common.exceptions import NoSuchElementException

    # Login url
    driver.get('https://www.instagram.com/accounts/login/?source=auth_switcher')

    # Pause between steps so the interaction looks less like a bot
    time.sleep(3)

    # find_element(By..., ...) replaces the find_element_by_* helpers, which
    # newer Selenium releases no longer provide.
    username_input = driver.find_element(By.CSS_SELECTOR, "input[name='username']")

    # Focus the field with a JavaScript click, then type the username
    driver.execute_script("arguments[0].click();", username_input)
    username_input.send_keys(username)

    time.sleep(3)

    try:
        password_input = driver.find_element(By.CSS_SELECTOR, "input[name='password']")
    except NoSuchElementException:
        # Fall back to the absolute XPath when the name-based selector
        # does not match anything.
        password_input = driver.find_element(
            By.XPATH,
            '//*[@id="react-root"]/section/main/div/article/div/div[1]/div/form/div[4]/div/label/input',
        )

    # Focus the field with a JavaScript click, then type the password
    driver.execute_script("arguments[0].click();", password_input)
    password_input.send_keys(password)

    time.sleep(3)

    # Find and click log in button
    login_button = driver.find_element(By.XPATH, "//button[@type='submit']")
    driver.execute_script("arguments[0].click();", login_button)

    # Give the page a moment after submitting before handing the driver back
    time.sleep(3)
    return driver
#         #locate floating window to click and close
#         floating_window = driver.find_element_by_class_name('piCib')

#         button = floating_window.find_element_by_class_name('mt3GC')

#         not_now = button.find_element_by_xpath('/html/body/div[4]/div/div/div[3]/button[2]')

#         driver.execute_script("arguments[0].click();", not_now)

#         return driver

In [3]:
import os
from getpass import getpass

# Account credentials are read from the environment, with an interactive
# prompt as fallback, so that no secrets are stored in the notebook itself.
instagram_username = os.environ.get("INSTAGRAM_USERNAME") or input("Instagram username: ")
instagram_password = os.environ.get("INSTAGRAM_PASSWORD") or getpass("Instagram password: ")

# Start Chrome through the bundled chromedriver and log in to Instagram
driver = webdriver.Chrome('../util/chromedriver.exe')
driver = instagramLogin(driver, instagram_username, instagram_password)

In [4]:
def scrape_links(hashtag, driver, scroll_pause=3):
    """
    Scrape unique post links for a hashtag from Instagram using Selenium.

    Loads the hashtag's explore page, then repeatedly parses the rendered
    HTML for post anchors and scrolls to the bottom of the page, stopping
    once the page height no longer grows after a scroll.

    Args:
        hashtag: Hashtag text inserted into the explore URL (no leading '#').
        driver: Selenium WebDriver to browse with. It is quit before this
            function returns, so it cannot be reused afterwards.
        scroll_pause: Seconds to sleep before parsing the page and again
            after each scroll, giving new content time to load.

    Returns:
        A list of absolute post URLs with duplicates removed, in the order
        they were first seen.
    """
    hashtag_url = f"https://www.instagram.com/explore/tags/{hashtag}/"
    driver.get(hashtag_url)

    # Gets scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")

    # Dict keys drop repeats while keeping first-seen order; anchors that are
    # still on the page get parsed again on later passes.
    unique_links = {}
    # Running count of every matching anchor seen, repeats included
    total_links = 0

    # Loop until page ends
    while True:
        print("Iterative while loop")
        time.sleep(scroll_pause)
        page_data = BeautifulSoup(driver.page_source, "html.parser")
        data_body = page_data.find("body")
        if data_body is None:
            # No <body> in the parsed source: search the whole document instead
            data_body = page_data

        # href=True skips anchors without an href attribute
        for anchor in data_body.find_all("a", href=True):
            href = anchor["href"]
            # Post permalinks live under /p/; a bare "/p" prefix would also
            # accept unrelated paths that merely start with the letter p.
            if href.startswith("/p/"):
                total_links += 1
                unique_links[f"https://www.instagram.com{href}"] = None

        # Scroll to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        time.sleep(scroll_pause)

        # If new height equal to previous screen height, break because no more content
        new_height = driver.execute_script("return document.body.scrollHeight")
        if last_height == new_height:
            break
        last_height = new_height

        # Update on scraping
        print(f"Scraped {total_links} links, {len(unique_links)} unique links")

    print(f"Finished scraping. Scraped {total_links} links, {len(unique_links)} unique links")
    print("\n")
    print("Closing driver.")
    driver.quit()

    return list(unique_links)

In [5]:
# Hashtag to crawl, written without the leading '#'
hashtag = "hawkerculturesg"

# Collect the post links for that hashtag; scrape_links quits the driver
# once it has finished scrolling.
unique_links = scrape_links(hashtag=hashtag, driver=driver)

Iterative while loop
Scraped 33 links, 33 unique links
Iterative while loop
Scraped 72 links, 39 unique links
Iterative while loop
Scraped 123 links, 51 unique links
Iterative while loop
Scraped 177 links, 63 unique links
Iterative while loop
Scraped 222 links, 66 unique links
Iterative while loop
Scraped 276 links, 78 unique links
Iterative while loop
Scraped 330 links, 90 unique links
Iterative while loop
Scraped 378 links, 96 unique links
Iterative while loop
Scraped 432 links, 108 unique links
Iterative while loop
Scraped 486 links, 120 unique links
Iterative while loop
Scraped 534 links, 126 unique links
Iterative while loop
Scraped 588 links, 138 unique links
Iterative while loop
Scraped 642 links, 150 unique links
Iterative while loop
Finished scraping. Scraped 690 links, 156 unique links


Closing driver.


In [8]:
# Number of distinct links among those returned by the scrape
len(set(unique_links))

156

In [None]:
# Pause for three seconds (the cell has no execution count, so it appears not to have been run)
time.sleep(3)