In [1]:
import json

# Load the extracted tweets
with open('parsed_data.json', mode='r', encoding='utf-8') as extracted_file:
    extracted_data = json.loads(extracted_file.read())

# Load the user metadata (looked up by lower-cased screen name further down)
with open('gg2013_user_metadata.json', mode='r', encoding='utf-8') as metadata_file:
    user_metadata = json.loads(metadata_file.read())

In [2]:
import datetime

# Find ceremony year: the first tweet's timestamp is in milliseconds, so
# convert it to seconds and read the calendar year in UTC
first_tweet_seconds = extracted_data[0]['timestamp_ms'] / 1000.0
dt = datetime.datetime.fromtimestamp(first_tweet_seconds,
                                     tz=datetime.timezone.utc)
year = dt.year

In [25]:
import csv

# Import IMDB data relevant to the ceremony year
def get_imdb_title_basics(ceremony_year=None, path='./imdb/title.basics.tsv'):
    """Load the IMDb titles that are relevant to a ceremony year.

    A row is kept when one of the following holds:
      * it has no end year and started in the ceremony year or the year before;
      * it has no end year, is a ``tvSeries`` and started before the ceremony
        year;
      * it has an end year, started no later than the ceremony year and ended
        no earlier than the year before it.

    Args:
        ceremony_year: Year to filter on. When omitted, the notebook-level
            ``year`` variable is used, so zero-argument calls behave as before.
        path: Location of the IMDb ``title.basics.tsv`` file.

    Returns:
        A dict mapping each ``titleType`` to a list of
        ``{'title': <primaryTitle>, 'genres': [<genre>, ...]}`` dicts, in file
        order.
    """
    if ceremony_year is None:
        ceremony_year = year  # notebook global derived from the tweets

    def _is_unknown(value):
        # IMDb marks missing values with the literal \N; DictReader yields
        # None for fields missing from a short row.
        return value is None or value == "\\N"

    title_basics = {}

    # The IMDb dump is UTF-8 and tab-separated WITHOUT CSV quoting. Titles may
    # legitimately start with a double quote, so quoting must be disabled or
    # such rows get their columns merged/shifted. newline='' is what the csv
    # module expects for files it reads.
    with open(path, encoding='utf-8', newline='') as imdb_file:
        reader = csv.DictReader(imdb_file, delimiter='\t',
                                quoting=csv.QUOTE_NONE)
        for row in reader:
            # Skip title with unknown start years
            if _is_unknown(row['startYear']):
                continue
            start_year = int(row['startYear'])

            # Save title if it is from the current or previous year, OR if it
            # is a TV series that ran at some point during that time
            if _is_unknown(row['endYear']):
                relevant = (
                    start_year == ceremony_year
                    or start_year == ceremony_year - 1
                    or (row['titleType'] == "tvSeries"
                        and start_year < ceremony_year)
                )
            else:
                relevant = (
                    start_year <= ceremony_year
                    and int(row['endYear']) >= ceremony_year - 1
                )

            if relevant:
                title = {'title': row['primaryTitle'],
                         'genres': row['genres'].split(',')}
                title_basics.setdefault(row['titleType'], []).append(title)

    return title_basics

In [26]:
# Load the IMDb titles once; the result is a dict keyed by titleType
# (read from ./imdb/title.basics.tsv and filtered on the ceremony year)
title_basics = get_imdb_title_basics()

In [27]:
# Set up results object: filled in by the aggregation step ('hosts' key) and
# written to gg2013_results.json at the end of the notebook
results = {}

In [28]:
from collections import Counter

# Ranking: every tweet casts a weighted vote for the hosts it resolved
possible_hosts = Counter()
for tweet in extracted_data:
    # Tweet weight = tweet reliability (its retweets) + user reliability
    # (the author's rt_average); each term is offset by 1
    user = tweet['user']['screen_name'].lower()
    tweet_part = tweet['retweets'] + 1
    user_part = user_metadata[user]['rt_average'] + 1
    t_weight = tweet_part + user_part

    # Host: tweets without a host resolution contribute nothing
    if 'host_resolutions' not in tweet:
        continue
    hr = tweet['host_resolutions']
    for host in hr['hosts']:
        # Scale the vote by how confident the resolution was
        possible_hosts[host.lower()] += t_weight * hr['confidence']

In [29]:
import difflib

# Combine like name categories
def combine_names(name, possible_names, orig_vals, len_penalty, diff):
    """Fold ``name`` together with a matching entry of ``possible_names``.

    ``possible_names`` is modified in place. ``name`` is first compared with
    every other key for a substring relation; if none is found it is compared
    with its closest fuzzy match (to catch misspellings). When a match is
    found the two scores are summed under a single surviving key; otherwise
    ``name`` is left in place with its score unchanged.

    Args:
        name: Key to try to merge. Nothing happens if it is no longer present.
        possible_names: Mutable mapping (e.g. ``Counter``) of name -> current
            score.
        orig_vals: Mapping of name -> score before any merging; used to decide
            which of two matching names survives.
        len_penalty: Factor applied to the shorter name's original score when
            deciding whether the longer name should win a substring match.
        diff: Similarity cutoff in [0, 1] passed to
            ``difflib.get_close_matches`` for the misspelling check.

    Returns:
        None.
    """
    if name not in possible_names:
        return

    # Temporarily remove current name to avoid comparing it to itself
    temp = possible_names.pop(name)

    # Check for direct substrings
    for other_name in possible_names:
        name_is_shorter = name in other_name
        other_is_shorter = other_name in name
        if not (name_is_shorter or other_is_shorter):
            continue

        # Prioritize longer string, unless large popularity difference
        other_wins = (
            (name_is_shorter
             and orig_vals[name] * len_penalty < orig_vals[other_name])
            or (other_is_shorter
                and orig_vals[other_name] * len_penalty >= orig_vals[name])
        )
        if other_wins:
            possible_names[other_name] += temp
        else:
            possible_names[name] = temp + possible_names[other_name]
            del possible_names[other_name]
        # Only the first substring match is merged; returning here also stops
        # the iteration before the mapping's size change can matter.
        return

    # If not a direct substring, check for misspellings. ``name`` has been
    # popped above, so it cannot match itself here. (This check used to be
    # guarded by ``if name in possible_names``, which is always false at this
    # point and made the whole branch unreachable.)
    closest = difflib.get_close_matches(name, possible_names.keys(), 1, diff)
    if closest:
        match = closest[0]
        if orig_vals[name] > orig_vals[match]:
            # The current name was originally more popular: it survives
            possible_names[name] = temp + possible_names[match]
            del possible_names[match]
        else:
            possible_names[match] += temp
        return

    # Reinsert current name if it is distinct
    possible_names[name] = temp

# Retrieve the most likely combination of names from a given set
def get_names(possible_names, threshold):
    """Pick the leading names whose score clears a relative cutoff.

    Args:
        possible_names: Sequence of ``(name, score)`` pairs, best first.
        threshold: Fraction of the first pair's score that a score must
            strictly exceed for its name to be selected.

    Returns:
        List of selected names in input order. Scanning stops at the first
        pair that does not clear the cutoff; an empty input gives ``[]``.
    """
    if not possible_names:
        return []

    # The cutoff is relative to the first (highest-ranked) score
    cutoff = possible_names[0][1] * threshold

    selected = []
    for candidate, score in possible_names:
        # Stop at the first entry that is not strictly above the cutoff
        if not score > cutoff:
            break
        selected.append(candidate)
    return selected

In [30]:
# Aggregation: merge matching names, visiting the lowest-scored ones first
hosts_in_order = list(reversed(possible_hosts.most_common()))
# Snapshot of the pre-merge scores, in the same (ascending) order
orig_vals = {candidate: score for candidate, score in hosts_in_order}
for name in orig_vals:
    combine_names(name, possible_hosts, orig_vals, 0.8, 0.8)

# Keep every host scoring above 80% of the top merged score
results['hosts'] = get_names(possible_hosts.most_common(), 0.8)

In [31]:
# Output: write the results object to disk as indented JSON
with open('gg2013_results.json', mode='w', encoding='utf-8') as output_file:
    output_file.write(json.dumps(results, indent=6))