In [1]:
import matplotlib
import seaborn as sns
import requests
import calendar
import dill
import re
from bs4 import BeautifulSoup
from datetime import datetime
import calendar
import numpy as np
import pandas as pd
import spacy

# Outline

In this project, we are going to be crawling through the [Party Picture Archive](https://web.archive.org/web/20150913224145/http://www.newyorksocialdiary.com/party-pictures).

We outline the steps in this project as follows.

1. Visiting [this url](https://web.archive.org/web/20150913224145/http://www.newyorksocialdiary.com/party-pictures) returns a list of events, with each linking to a seperate URL detailing each event. We will scrap the page of those URLS. Note there are 25 pages in total.
2. Each event contains several pictures, with associated captions. We shall scrap those captions for names.
3. From those names, we shall build a graph. The nodes (or vertices) are names, the (weighted) edges refer to how many photos two individuals appeared together.

From there, we use various elements of graph theory to study relationships between individuals.

# Processing Index


### Core Functions
We need to script a means of processing each page. Let's start with the first page.

In [2]:
url = 'https://web.archive.org/web/20150913224145/http://www.newyorksocialdiary.com/party-pictures'
page = requests.get(url) # Use requests.get to download the page.
soup = BeautifulSoup(page.text, "lxml")

We need to grab the links in each page.

In [3]:
party_re_pattern = re.compile('.*/party-pictures/\d{4}/.*')

links = soup.find_all('a', href=True)
links = [link for link in links if re.search(party_re_pattern, link['href'])]

In [4]:
link = links[0]

In [5]:
month_name_to_number = dict()
i = 0

for month in calendar.month_name:
    month_name_to_number[month] = i
    i += 1


def get_link_date(el):
    """
    Keyword Args:
    el - BS4 tagged linked from the Party Picture Archive.
    
    Returns:
    - URL of link.
    - Date link tagged as.
    """
    url = el['href']
    date_pattern = re.compile('(.*), (.*) (\d{1,2}), (\d{4})')
    date_match = re.match(date_pattern, list(el.parent.parent.parent.children)[3].text)

    _, month, day, year = date_match[1], date_match[2], date_match[3], date_match[4]
    month = month_name_to_number[month]
    day = int(day)
    year = int(year)
    date = datetime(year, month, day)
    return url, date

In [6]:
get_link_date(link)

('/web/20150913224145/http://www.newyorksocialdiary.com/party-pictures/2015/kicks-offs-sing-offs-and-pro-ams',
 datetime.datetime(2015, 9, 11, 0, 0))

In [7]:
def get_links(response):
    """
    Used to parse all (url, date) pairs inside a given given address.
    
    Keyword Arguments:
    response - Either a URL or requests response of an index page.
    
    Returns:
    - A list of (url, date) pairs linked to inside the address.
    """
    if isinstance(response, str):
        response = requests.get(response)
    soup = BeautifulSoup(response.text, "lxml")
    links = soup.find_all('a', href=True)
    links = [link for link in links if re.search(party_re_pattern, link['href'])]
    return [get_link_date(link) for link in links]

In [8]:
def filter_by_date(links, cutoff=datetime(2014, 12, 1)):
    """
    Given a list of (url, date) pairs, cuts off all those before a certain period.
    """
    return [link for link in links if link[1] <= cutoff]

### Iterating Through Each Page

Now, we need to go through and process each page one at a time.

In [9]:
page_indices = [''] + ['?page={}'.format(i) for i in list(range(1, 25))]
urls = [url + page_index for page_index in page_indices]
pages = [requests.get(url) for url in urls]

In [11]:
link_list = []
addresses = []

for page in pages:
    links = get_links(page)
    links = filter_by_date(links)
    for link in links:
        # Note that the first 20 characters are used for web.archive.org.
        if link[0][20:] not in addresses:
            addresses.append(link[0][20:])
            link_list.append(link)

In [12]:
# dill.dump(link, open('graph_checkpoints/link_list.pkd', 'wb'))

In [10]:
link = dill.load(open('graph_checkpoints/link_list.pkd', 'rb'))

# Analyze Event

So, we have multiple links. Each link contains several photos. We're going to go through each page, and grab the names from the captions.

### Grabbing Names from Caption

First, we need to look at one is in each caption. Let us start with a simple test page.

In [11]:
url = 'https://web.archive.org/web/20151114014941/http://www.newyorksocialdiary.com/party-pictures/2015/celebrating-the-neighborhood'
page = requests.get(url)
soup = BeautifulSoup(page.text, "lxml")

Let's grab the first caption.

In [12]:
caption = soup.find_all(attrs={'class': 'photocaption'})[0].text

In [13]:
print(caption)

Glenn Adamson, Simon Doonan, Victoire de Castellane, Craig Leavitt, Jerome Chazen, Andi Potamkin, Ralph Pucci, Kirsten Bailey, Edwin Hathaway, and Dennis Freedman at the Museum of Art and Design's annual MAD BALL. 


The easiest solution to grab all the names is to use a library such as spaCy to parse each caption. Note that we use the large English model for increased accuracy.

In [14]:
nlp = spacy.load('en_core_web_lg')

In [15]:
def get_names(text):
    """
    Grabs the names located inside some caption text.
    
    Keyword Argument:
    text - Text we wish to analyze.
    
    Returns:
    - List of names inside text.
    """
    doc = nlp(text)
    names = []
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            names.append(ent.text.strip())
            
    return names

In [16]:
print(get_names(caption))

['Glenn Adamson', 'Simon Doonan', 'Victoire de Castellane', 'Craig Leavitt', 'Jerome Chazen', 'Andi Potamkin', 'Ralph Pucci', 'Kirsten Bailey', 'Edwin Hathaway', 'Dennis Freedman']


Now, we just need to go through the link, listing out all the names associated with each photo.

### Iterating Through our Link List

In [17]:
def get_captions(path):
    page = requests.get(path)
    soup = BeautifulSoup(page.text, "lxml")

    captions = []
    for caption in soup.find_all(attrs={'class': 'photocaption'}):
        caption = caption.text
        captions.append(caption)
                
    return captions

Note this may take a considerable amount of time.

In [25]:
captions = []

for link in link_list:
    captions.extend(get_captions('https://web.archive.org' + link[0]))

Due to the time this took, we will cache our results.

In [27]:
dill.dump(captions, open('graph_checkpoints/captions.pkd', 'wb'))

To load our results, simply run the following command.

In [18]:
captions = dill.load(open('graph_checkpoints/captions.pkd', 'rb'))

Now, we only need names out of this text. Note that we could save time by combining this with the previous loop. I nonetheless wished to do them separately to by able to save the results, in case changes needed to be made.

In [19]:
captions = [get_names(caption) for caption in captions]

Note this list contains multiple instances of empty lists, which contain no valuable information. Let us remove these to speed things up a little.

In [33]:
dill.dump(captions, open('graph_checkpoints/caption_names.pkd', 'wb'))

### Cleaning Up

Now, there are a couple things that need to be done to clean up. First of all, some captions contain no names. Instead of just ignoring them, Python designates that to be the empty list. We should remove these to speed things up as well as save on RAM.

In [21]:
while True:
    try:
        captions.remove([])
    except:
        break

Next, we have cases where the caption reads 'John and Bonnie Williamson.' These are translated as 'John' and 'Bonnie Williamson', where 'John' implicitly has the last name 'Williamson.' Let's fix that.

In [36]:
for caption in captions:
    for i, name in enumerate(caption):
        if len(name.split(' ')) == 1:
            try:
                caption[i] = name + ' ' + caption[i + 1].split(' ')[-1]
            except:
                caption.remove(name)

Lastly, note one of the most popular names, 'Patrick McMullan,' is actually the photographer. We need to remove his name from all the captions, as they influence results.

In [42]:
for i, caption in enumerate(captions):
    try:
        caption.remove('Patrick McMullan')
        captions[i] = caption
    except:
        pass

# Graph Construction

Now, we wish to construct a graph which we could analyze our list.

In [43]:
from itertools import combinations
import networkx as nx

In [44]:
G = nx.Graph()

for caption in captions:
    for n, m in combinations(caption, 2):
        if G.has_edge(n, m):
            G[n][m]['weight'] += 1
        else:
            G.add_edge(n, m, weight=1)

# Analysis

### Who is the most popular?

First, let us define what I mean in this case by popular. I am simply referring to who has the largest number of edges, or who is found next to the largest number of people.

In [45]:
people = dict()

for name in G.nodes():
    people[name] = G.degree(name)
    
people = [
    (person, people[person]) for person in people.keys()
]

people = sorted(people, key=lambda x: x[1], reverse=True)

Let's see who some of the most popular people are.

In [46]:
people[:50]

[('Jean Shafiroff', 453),
 ('Mark Gilbertson', 352),
 ('Alexandra Lebenthal', 245),
 ('Gillian Miniter', 239),
 ('Geoffrey Bradfield', 228),
 ('Mario Buatta', 224),
 ('Michael Bloomberg', 205),
 ('Eleanora Kennedy', 203),
 ('Kamie Lightburn', 201),
 ('Yaz Hernandez', 200),
 ('Alina Cho', 185),
 ('Somers Farkas', 184),
 ('Sharon Bush', 181),
 ('Lucia Hwong Gordon', 181),
 ('Andrew Saffir', 181),
 ('Debbie Bancroft', 167),
 ('Jamee Gregory', 161),
 ('Liliana Cavendish', 161),
 ('Barbara Tober', 160),
 ('Allison Aston', 159),
 ('Bettina Zilkha', 148),
 ('Amy Fine Collins', 145),
 ('Donna Karan', 145),
 ('Karen Klopp', 142),
 ('Leonard Lauder', 142),
 ('Karen LeFrak', 141),
 ('Martha Stewart', 139),
 ('Christopher Hyland', 139),
 ('Deborah Norville', 137),
 ('Diana Taylor', 137),
 ('Ellen V. Futter', 132),
 ('Jennifer Creel', 130),
 ('Grace Meigher', 129),
 ('Margo Langenberg', 129),
 ('Paula Zahn', 129),
 ('Lydia Fenet', 129),
 ('Alec Baldwin', 128),
 ('Nicole Miller', 126),
 ('Elizabeth 

Note a couple of names here are unimportant. For instance, having only first names is an issue. Also, some are specifically titles or credidentials, which while they refer to specific people, are not those individuals in question.

### Who the the most connected?

This is definitely one of the most ambiguous statements. So, to rank connectedness, I'm going to using PageRank, which originally brought Google on the map.

While the technical details can be found elsewhere (for instance, Kevin Murphy's text), the basic idea is as follows. At each node, we look at who is connected to that individual. But each connection is weighed based off how connected those individuals are.

In [50]:
pagerank = nx.pagerank(G)

most_connected = [(key, pr[key]) for key in pr.keys()]
most_connected = sorted(most_connected, key=lambda x: x[1], reverse=True)

In [51]:
most_connected[:50]

[('Jean Shafiroff', 0.0007590385181084754),
 ('Mark Gilbertson', 0.0005258709418796361),
 ('Gillian Miniter', 0.0004441257514677958),
 ('Geoffrey Bradfield', 0.0004065273986735837),
 ('Alexandra Lebenthal', 0.0003970781346492939),
 ('Mario Buatta', 0.0003436314606364027),
 ('Yaz Hernandez', 0.0003358716181693452),
 ('Andrew Saffir', 0.00033150251980629025),
 ('Kamie Lightburn', 0.00032632104929191123),
 ('Eleanora Kennedy', 0.00031725877514452144),
 ('Michael Bloomberg', 0.00030075976754423634),
 ('Sharon Bush', 0.0002991035986510989),
 ('Alina Cho', 0.0002934927135324691),
 ('Barbara Tober', 0.0002884279337231575),
 ('Somers Farkas', 0.0002861021252845084),
 ('Debbie Bancroft', 0.0002748729742278528),
 ('Lucia Hwong Gordon', 0.0002654207603430986),
 ('Bonnie Comley', 0.00025970619702183007),
 ('Jamee Gregory', 0.00025473987502650766),
 ('Liliana Cavendish', 0.00024853807256546066),
 ('Christopher Hyland', 0.0002363157830562482),
 ('Karen LeFrak', 0.0002257863228488658),
 ('Donna Karan

Note there is some similarity in the order.

### Who are best friends?

Now let's look at who has the strongest edge weights, which indicates that they commonly interact together.

In [53]:
number_of_connections_between = []

for i, j in G.edges():
    num_connections = G[i][j]['weight']
    if num_connections > 10:
        number_of_connections_between.append(((i, j), num_connections))
        
number_of_connections_between = sorted(number_of_connections_between, key=lambda x: x[1], reverse=True)

In [54]:
number_of_connections_between[:10]

[(('Gillian Miniter', 'Sylvester Miniter'), 117),
 (('Bonnie Comley', 'Stewart Lane'), 83),
 (('Jamee Gregory', 'Peter Gregory'), 77),
 (('Geoffrey Bradfield', 'Roric Tobin'), 69),
 (('Daniel Benedict', 'Andrew Saffir'), 66),
 (('Barbara Tober', 'Donald Tober'), 57),
 (('Jean Shafiroff', 'Martin Shafiroff'), 56),
 (('Eleanora Kennedy', 'Michael Kennedy'), 50),
 (('Alexandra Lebenthal', 'Jay Diamond'), 47),
 (('Peter Regna', 'Barbara Regna'), 46)]

As a side note, most of the highest entries here are referencing a lot of celebraty couples.