In [1]:
import json

# Read the extracted tweets
with open('parsed_data.json', mode='r', encoding='utf-8') as extracted_file:
    extracted_data = json.loads(extracted_file.read())

# Read the per-user metadata
with open('gg2013_user_metadata.json', mode='r', encoding='utf-8') as metadata_file:
    user_metadata = json.loads(metadata_file.read())

In [2]:
import datetime

# Take the ceremony year from the first tweet: its 'timestamp_ms' value is
# divided by 1000 to get seconds and interpreted as a UTC datetime
dt = datetime.datetime.fromtimestamp(
    extracted_data[0]['timestamp_ms'] / 1000.0, datetime.timezone.utc)
year = dt.year

In [3]:
import csv

# Import IMDB data relevant to the ceremony year
def get_imdb_title_basics(path='./imdb/title.basics.tsv', ceremony_year=None):
    """Collect IMDB primary titles relevant to a ceremony year, grouped by type.

    A title is kept when it is not a ``tvEpisode`` or ``tvSpecial`` and either

    * it has no end year and started in the ceremony year or the year before,
      or it is a ``tvSeries`` that started before the ceremony year; or
    * it has an end year, started no later than the ceremony year, and ended
      no earlier than the year before the ceremony.

    Rows whose ``startYear`` is the ``\\N`` placeholder are skipped.

    Args:
        path: Location of the tab-separated ``title.basics`` file.
        ceremony_year: Year to filter on. When ``None`` the notebook-level
            ``year`` variable is used.

    Returns:
        dict mapping the lower-cased ``titleType`` to a list of
        ``primaryTitle`` strings, in file order.
    """
    if ceremony_year is None:
        # Fall back to the notebook-level value computed from the tweets
        ceremony_year = year

    excluded_types = ("tvEpisode", "tvSpecial")
    title_basics = {}

    # Read explicitly as UTF-8 instead of the platform default, and disable
    # quote handling: titles may contain literal double quotes, and treating
    # them as field quoting lets an unclosed quote swallow the rows after it.
    with open(path, encoding='utf-8', newline='') as imdb_file:
        reader = csv.DictReader(imdb_file, delimiter='\t',
                                quoting=csv.QUOTE_NONE)
        for row in reader:
            # Skip title with unknown start years
            if row['startYear'] == "\\N":
                continue

            title_type = row['titleType']
            if title_type in excluded_types:
                continue

            start_year = int(row['startYear'])
            end_year = row['endYear']
            if end_year == "\\N":
                # No end year: released in the window, or a series that
                # started before the ceremony year
                relevant = (
                    start_year in (ceremony_year, ceremony_year - 1)
                    or (title_type == "tvSeries"
                        and start_year < ceremony_year))
            else:
                # Known end year: the run must overlap the window
                relevant = (start_year <= ceremony_year
                            and int(end_year) >= ceremony_year - 1)

            if relevant:
                title_basics.setdefault(title_type.lower(), []).append(
                    row['primaryTitle'])

    return title_basics

In [4]:
title_basics = get_imdb_title_basics()

# IMDB genre tags, minus 'Short' and 'Adult'
genres = [
    'Comedy', 'Music', 'Crime', 'Drama', 'Game-Show',
    'Talk-Show', 'Family', 'Mystery', 'Sport', 'Horror',
    'Western', 'Adventure', 'News', 'Action', 'Documentary',
    'Reality-TV', 'Sci-Fi', 'Thriller', 'Animation', 'War',
    'Musical', 'Romance', 'Fantasy', 'Biography', 'History',
]

In [87]:
# Set up results object. Later cells store a list under 'hosts' and one dict
# per detected award name (with 'winner', 'nominees' and 'presenters' entries).
results = {}

In [88]:
from collections import Counter

# Ranking
# Accumulate a weighted score for every host name and award name mentioned in
# the tweets. Keys are lower-cased so differently-cased mentions share a tally.
possible_hosts = Counter()
possible_awards = Counter()
for tweet in extracted_data:
    # Calculate tweet weight based on user + tweet reliability:
    # (retweets of this tweet + 1) + (author's 'rt_average' + 1).
    # The weight is stored on the tweet dict itself; the per-award ranking
    # further down reads tweet['weight'] again.
    user = tweet['user']['screen_name'].lower()
    tweet['weight'] = (tweet['retweets'] + 1) \
        + (user_metadata[user]['rt_average'] + 1)

    # Host
    # Each listed host gets weight * confidence added to its tally
    if 'host_resolutions' in tweet:
        hr = tweet['host_resolutions']
        for host in hr['hosts']:
            possible_hosts[host.lower()] += tweet['weight'] * hr['confidence']

    # Award
    # NOTE(review): wr['award'] is iterated, while nr['award'] and pr['award']
    # are used as single strings -- presumably win resolutions carry a list of
    # award names; verify against the extraction step.
    if 'win_resolutions' in tweet:
        wr = tweet['win_resolutions']
        for award in wr['award']:
            possible_awards[award.lower()] += tweet['weight'] * wr['confidence']
    if 'nominee_resolutions' in tweet:
        nr = tweet['nominee_resolutions']
        possible_awards[nr['award'].lower()] += tweet['weight'] * nr['confidence']
    if 'present_resolutions' in tweet:
        pr = tweet['present_resolutions']
        possible_awards[pr['award'].lower()] += tweet['weight'] * pr['confidence']

In [89]:
import difflib
from nltk.metrics import distance

# Find closest match by averaging distance calculations
def closest_match(names, possible_names, diff):
    """Return the candidate most similar to any of ``names``.

    Similarity is the mean of two case-insensitive measures: difflib's
    ``SequenceMatcher.ratio()`` and ``1 - edit_distance / longer_length``.

    Args:
        names: A single string or a list of strings to look up.
        possible_names: Iterable of candidate strings. It is materialised
            once, so one-shot iterables are compared against every name.
        diff: Minimum similarity; the best score must be strictly greater.

    Returns:
        The lower-cased best candidate, or an empty list when there are no
        candidates or none scores above ``diff``.
    """
    if not isinstance(names, list):
        names = [names]

    # Lower-case the candidates once rather than once per query name
    candidates = [candidate.lower() for candidate in possible_names]

    best_name = None
    best_ratio = None
    for name in names:
        name = name.lower()
        matcher = difflib.SequenceMatcher(b=name)
        for candidate in candidates:
            longest = max(len(name), len(candidate))
            if longest == 0:
                # Two empty strings are identical; avoid dividing by zero
                edit_ratio = 1.0
            else:
                edit_ratio = 1 - distance.edit_distance(name, candidate) / longest
            matcher.set_seq1(candidate)
            ratio = (matcher.ratio() + edit_ratio) / 2.0
            # Strict comparison: on a tie the earliest candidate is kept
            if best_ratio is None or ratio > best_ratio:
                best_name, best_ratio = candidate, ratio

    if best_ratio is not None and best_ratio > diff:
        return best_name
    return []

# Merge one name's score with an overlapping or similar entry, if any
def combine_names(name, possible_names, orig_vals, len_penalty, diff):
    """Fold ``name`` together with at most one other key of ``possible_names``.

    ``possible_names`` is modified in place; nothing is returned. If ``name``
    is no longer a key the call does nothing.

    The first other key that contains ``name``, or is contained in it, is
    merged with it. Otherwise the key picked by ``closest_match`` (if any) is
    merged with it. If neither exists, ``name`` keeps its own score.

    Args:
        name: Key to process.
        possible_names: Mapping of name to current score.
        orig_vals: Mapping of name to the score used for the popularity
            comparisons.
        len_penalty: Factor applied to one side's ``orig_vals`` score when
            deciding which of two overlapping names survives.
        diff: Similarity threshold forwarded to ``closest_match``.
    """
    if name not in possible_names:
        return

    # Pull the entry out so it is never compared against itself
    own_score = possible_names.pop(name)

    # Pass 1: keys that overlap as direct substrings
    for candidate in possible_names:
        is_inside = name in candidate
        has_inside = candidate in name
        if not (is_inside or has_inside):
            continue

        # The longer string wins unless the shorter one is clearly more popular
        keep_candidate = (
            (is_inside
             and orig_vals[name] * len_penalty < orig_vals[candidate])
            or (has_inside
                and orig_vals[candidate] * len_penalty >= orig_vals[name]))
        if keep_candidate:
            possible_names[candidate] += own_score
        else:
            possible_names[name] = own_score + possible_names[candidate]
            del possible_names[candidate]
        return

    # Pass 2: likely misspellings
    closest = closest_match(name, possible_names.keys(), diff)
    if not closest:
        # Nothing similar: put the name back unchanged
        possible_names[name] = own_score
        return

    if orig_vals[name] > orig_vals[closest]:
        possible_names[name] = own_score + possible_names[closest]
        del possible_names[closest]
    else:
        possible_names[closest] += own_score

# Pick the leading names whose scores clear the threshold
def get_names(possible_names, threshold):
    """Return the names at the front of a ranked list that score high enough.

    Args:
        possible_names: Sequence of ``(name, score)`` entries, expected in
            descending score order.
        threshold: When ``<= 1`` it is multiplied by the first entry's score
            to obtain the cutoff; otherwise it is the cutoff itself.

    Returns:
        Names of the leading entries whose score is at least the cutoff.
        Scanning stops at the first entry below it. Empty input gives ``[]``.
    """
    if not possible_names:
        return []

    cutoff = threshold * possible_names[0][1] if threshold <= 1 else threshold

    chosen = []
    for entry in possible_names:
        # Stop at the first entry that falls short of the cutoff
        if not entry[1] >= cutoff:
            break
        chosen.append(entry[0])
    return chosen

# Aggregate results for a category
def aggregate_results(possible_options, len_penalty, diff, threshold):
    """Merge near-duplicate names in a tally and return the top-scoring ones.

    Every name is passed to ``combine_names`` in ascending order of its
    original score, then the merged ranking is filtered by ``get_names``.

    The merge runs on a copy, so ``possible_options`` is left untouched and
    calling this again on the same tally (for example when a notebook cell is
    re-run) gives the same result.

    Args:
        possible_options: ``Counter`` mapping name to score.
        len_penalty: Forwarded to ``combine_names``.
        diff: Forwarded to ``combine_names``.
        threshold: Forwarded to ``get_names``.

    Returns:
        Whatever ``get_names`` returns for the merged ranking.
    """
    # Copy first: combine_names pops, adds and deletes keys in place
    merged = possible_options.copy()

    # Snapshot of the pre-merge scores, least popular first
    orig_vals = dict(reversed(merged.most_common()))
    for name in orig_vals:
        combine_names(name, merged, orig_vals, len_penalty, diff)
    return get_names(merged.most_common(), threshold)

In [90]:
## Aggregation

# Hosts
# The numeric arguments are, in order, len_penalty, diff and threshold of
# aggregate_results. A threshold <= 1 is applied as a fraction of the top
# score, so 0.8 keeps every host scoring at least 80% of the best one.
results['hosts'] = aggregate_results(possible_hosts, 0.8, 0.8, 0.8)

# Award names
# threshold 10 is above 1, so it is used as an absolute minimum score.
# Each surviving award name gets an empty dict in `results`, filled in later.
awards = aggregate_results(possible_awards, 0.8, 0.7, 10)
for award in awards:
    results[award] = {}

In [91]:
# Per-award ranking
# One Counter per award for each role; keys are lower-cased candidate names
# and values are accumulated weight * confidence scores.
possible_winners = {}
possible_nominees = {}
possible_presenters = {}

for award in awards:
    possible_winners[award] = Counter()
    possible_nominees[award] = Counter()
    possible_presenters[award] = Counter()

for tweet in extracted_data:
    # Each tweet's award text is mapped onto one of the aggregated award names
    # via closest_match (similarity must exceed 0.8). closest_match returns an
    # empty list when nothing is close enough, so `if award` skips the tweet.
    # tweet['weight'] is the value stored by the earlier ranking cell.

    # Winner
    if 'win_resolutions' in tweet:
        wr = tweet['win_resolutions']
        award = closest_match(wr['award'], awards, 0.8)
        if award:
            for winner in wr['winner']:
                # Skip empty winner entries
                if winner:
                    possible_winners[award][winner.lower()] += \
                        tweet['weight'] * wr['confidence']

    # Nominee
    if 'nominee_resolutions' in tweet:
        nr = tweet['nominee_resolutions']
        award = closest_match(nr['award'], awards, 0.8)
        # A nominee resolution holds a single nominee rather than a list
        if award and nr['nominee']:
            possible_nominees[award][nr['nominee'].lower()] += \
                tweet['weight'] * nr['confidence']

    # Presenter
    if 'present_resolutions' in tweet:
        pr = tweet['present_resolutions']
        award = closest_match(pr['award'], awards, 0.8)
        if award:
            for presenter in pr['presenters']:
                # Skip empty presenter entries
                if presenter:
                    possible_presenters[award][presenter.lower()] += \
                        tweet['weight'] * pr['confidence']

In [92]:
# Dump the raw per-award tallies, one mapping per line, for inspection
print(possible_winners, possible_nominees, possible_presenters, sep='\n')

{'best director': Counter({'ben affleck': 168.90565014862946, 'usa today': 25.600000000000005, 'interesting moment - ben affleck': 16.57894736842105, 'benaffleck': 10.823076923076924, '@benaffleck': 8.866666666666667, 'goldenglobes 2013: ben affleck': 6.329999999999999, 'hope @benaffleck': 5.6, 'usa today [a link in my bio]': 4.800000000000001, 'idiocy of oscarnoms affleck not good enough to even merit a nomination and yet': 3.8499999999999996, 'o sodorinpantatjlo pukpuk': 3.2, 'non-oscar nominated ben affleck': 2.9909090909090907, 'argo': 2.4499999999999997, 'breaking: ben affleck': 2.0999999999999996, 'that nice ben affleck': 1.9249999999999998, 'hfpa gives the academy the finger, and gives affleck a well deserved win': 1.6, 'that movie was excellent! goldenglobes': 1.6, 'usa today ymdlu': 1.6, 'usa today (a link in my bio)': 1.6, 'usa today news': 1.6, 'i hope jared leto': 1.4, 'excited to see who': 1.4, 'hoping @benaffleck': 1.4, "going by django's dominance i wouldn't be surprised

In [93]:
# Resolve winner, nominees and presenters for every award
for award in awards:
    # With threshold 1 only candidates scoring at least the top score are
    # returned; the first of them is the winner, or '' when there is none
    winner = aggregate_results(possible_winners[award], 0.8, 0.8, 1)
    results[award]['winner'] = winner[0] if winner else ''

    # Nominees must reach 80% of the best nominee score, presenters 40%
    results[award]['nominees'] = aggregate_results(
        possible_nominees[award], 0.8, 0.8, 0.8)
    results[award]['presenters'] = aggregate_results(
        possible_presenters[award], 0.8, 0.8, 0.4)

In [94]:
# Output to JSON
# Writes the whole `results` dict, pretty-printed with a six-space indent.
with open('gg2013_results.json', 'w', encoding='utf-8') as output_file:
    json.dump(results, output_file, indent=6)

In [98]:
# Output human-readable to TXT
with open('gg2013_results.txt', 'w', encoding='utf-8') as output_file:
    # Names are usually several words long, so list entries are separated
    # with ", " -- a bare space would make adjacent names run together.
    print('Hosts:', ', '.join(results['hosts']).title(), file=output_file)
    for award in awards:
        print("\nAward: ", award.title(), sep='', file=output_file)
        for category in results[award]:
            value = results[award][category]
            # List-valued categories are flattened to one line;
            # string values are written as they are
            if isinstance(value, list):
                value = ', '.join(value)
            print(category.title(), ": ", value.title(), sep='',
                  file=output_file)