In [1]:
# Import necessary packages
import json

# Load the extracted tweets (parsed from JSON text read in one go)
with open('parsed_data.json', 'r', encoding='utf-8') as extracted_file:
    extracted_data = json.loads(extracted_file.read())
# Load the per-user metadata the same way
with open('gg2013_user_metadata.json', 'r', encoding='utf-8') as metadata_file:
    user_metadata = json.loads(metadata_file.read())

In [2]:
# Empty mapping that later cells fill in and the final cell writes to disk
results = dict()

In [4]:
from collections import Counter

# Ranking: accumulate a weighted score for every candidate host name
possible_hosts = Counter()
for tweet in extracted_data:
    # Weight combines this tweet's retweet count with the author's
    # average retweet count (each offset by one).
    user = tweet['user']['screen_name'].lower()
    tweet_part = tweet['retweets'] + 1
    user_part = user_metadata[user]['rt_average'] + 1
    t_weight = tweet_part + user_part

    # Tweets without a host resolution contribute nothing
    if 'host_resolutions' not in tweet:
        continue

    # Host: credit each resolved name, scaled by the resolution's confidence
    hr = tweet['host_resolutions']
    for host in hr['hosts']:
        key = host.lower()
        possible_hosts[key] = possible_hosts[key] + t_weight * hr['confidence']

In [5]:
import difflib

# Combine like host categories
def combine_hosts(name, possible_hosts, orig_vals, len_penalty, diff):
    """Merge `name` with one similar candidate in `possible_hosts`, in place.

    The entry for `name` is compared against every other key. The first key
    that contains `name` (or is contained in it) is merged with it; if no
    such key exists, the closest fuzzy match (if any) is merged instead.
    When nothing is similar, `name` is left in the mapping with its score.

    Args:
        name: Candidate key to process. Ignored if not in `possible_hosts`.
        possible_hosts: Mutable mapping of candidate name -> current score.
            Modified in place; the losing key of a merge is removed and its
            score is added to the surviving key.
        orig_vals: Mapping of candidate name -> score used only to decide
            which of two names survives a merge. Must contain `name` and any
            key it is compared against.
        len_penalty: Factor applied to the shorter name's score when it is a
            substring of the longer one. The longer name survives unless the
            shorter name's penalised score is still at least as large as the
            longer name's score.
        diff: Similarity cutoff passed to `difflib.get_close_matches`.

    Returns:
        None.

    Raises:
        KeyError: If a compared name is missing from `orig_vals`.
        ValueError: If `diff` is outside [0.0, 1.0] (raised by difflib).
    """
    if name not in possible_hosts:
        return

    # Temporarily remove current name to avoid comparing it to itself
    score = possible_hosts.pop(name)

    # Check for direct substrings
    for other_name in possible_hosts:
        # Prioritize longer string, unless large popularity difference
        if name in other_name:
            merge_into_other = \
                orig_vals[name] * len_penalty < orig_vals[other_name]
        elif other_name in name:
            merge_into_other = \
                orig_vals[other_name] * len_penalty >= orig_vals[name]
        else:
            continue

        if merge_into_other:
            possible_hosts[other_name] += score
        else:
            possible_hosts[name] = score + possible_hosts[other_name]
            del possible_hosts[other_name]
        # The mapping was just mutated, so stop iterating immediately
        return

    # If not a direct substring, check for misspellings.
    # (`name` has been popped above, so this must not be guarded by
    # `name in possible_hosts` -- that test can never be true here.)
    closest = difflib.get_close_matches(name, possible_hosts.keys(), 1, diff)
    if closest:
        match = closest[0]
        if orig_vals[name] > orig_vals[match]:
            # Current name is the more popular spelling: absorb the match
            possible_hosts[name] = score + possible_hosts[match]
            del possible_hosts[match]
        else:
            possible_hosts[match] += score
        return

    # Reinsert current name if it is distinct
    possible_hosts[name] = score

# Retrieve the most likely combination of hosts
def get_hosts(possible_hosts, threshold):
    """Pick the leading candidates whose score clears a relative cutoff.

    Args:
        possible_hosts: Sequence of entries where index 0 is the name and
            index 1 is the score. The first entry's score is used as the
            reference, so the sequence is expected to be sorted by
            descending score (the caller passes `Counter.most_common()`).
        threshold: Multiplier applied to the first entry's score to obtain
            the cutoff.

    Returns:
        List of names, in input order, taken from the front of the sequence
        for as long as each score is strictly greater than the cutoff.
        Empty if `possible_hosts` is empty.
    """
    if not possible_hosts:
        return []

    # Cutoff is relative to the first (top) entry's score
    cutoff = possible_hosts[0][1] * threshold

    selected = []
    for entry in possible_hosts:
        # Stop at the first entry that fails to exceed the cutoff
        if not entry[1] > cutoff:
            break
        selected.append(entry[0])

    return selected

In [6]:
# Aggregation
# Snapshot the scores in reverse most_common() order, then try to merge each
# candidate into a similar one; possible_hosts is modified in place.
hosts_in_order = list(reversed(possible_hosts.most_common()))
orig_vals = dict(hosts_in_order)
for name in orig_vals.keys():
    combine_hosts(name, possible_hosts, orig_vals, len_penalty=0.8, diff=0.8)

# Keep the top-ranked names whose merged score clears the relative cutoff
results['hosts'] = get_hosts(possible_hosts.most_common(), threshold=0.8)

In [7]:
# Output: serialise the results mapping as indented JSON
with open('gg2013_results.json', 'w', encoding='utf-8') as output_file:
    json.dump(obj=results, fp=output_file, indent=6)