In [16]:
import json
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from bs4 import BeautifulSoup
import requests
import re
import operator
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
import time

# Fetch each NLTK resource in turn; nltk reports packages that are already
# up to date, so re-running this cell is cheap.
for resource in ('punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'):
    nltk.download(resource)

# Load the en_core_web_sm pipeline once; get_names() calls this shared `nlp`.
nlp = en_core_web_sm.load()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\casey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\casey\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\casey\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\casey\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


## Graded Functions

Cache all tweets in the global dictionary `ALL_TWEETS`.

In [64]:
# Cache of tweet texts keyed by year; get_tweets() fills an entry the first
# time a year's file is read successfully and returns it on later calls.
ALL_TWEETS = {}

# Prefix that is joined with 'gg<year>.json' to build each dataset file path.
path_to_dataset = "../gg-datasets/"
def convert(seconds):
    """Format a number of seconds as an ``H:MM:SS`` string.

    The value is first reduced modulo one day (24 * 3600 seconds), so the
    hour field is always below 24. Fractional parts are dropped by the
    ``%d`` conversions.
    """
    within_day = seconds % (24 * 3600)
    hours, remainder = divmod(within_day, 3600)
    minutes, secs = divmod(remainder, 60)
    return "%d:%02d:%02d" % (hours, minutes, secs)


def get_tweets(year):
    """Return the list of tweet texts for ``year``, caching it in ALL_TWEETS.

    The file ``path_to_dataset + 'gg<year>.json'`` is tried in two layouts:

    1. a single JSON document that is a list of dicts with a ``'text'`` key;
    2. JSON Lines, i.e. one such dict per line (blank lines are skipped).

    Args:
        year: Value used both as the cache key and, via ``str(year)``, in
            the file name.

    Returns:
        list: The ``'text'`` value of every tweet, or ``[]`` when the file
        cannot be read in either layout. A failed read is not cached.
    """
    if year in ALL_TWEETS:
        return ALL_TWEETS[year]

    filename = path_to_dataset + 'gg' + str(year) + '.json'
    # Failures that mean "missing file or not the layout being attempted".
    # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
    # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
    read_errors = (OSError, ValueError, KeyError, TypeError)

    try:
        # `with` guarantees the handle is closed even when parsing fails.
        with open(filename, encoding='utf8') as f:
            data = json.load(f)

        ALL_TWEETS[year] = [line['text'] for line in data]
        return ALL_TWEETS[year]
    except read_errors:
        print('could not read json in the form of dictionary list')
        print('trying jsonl reading instead')

    try:
        gg_data = []
        with open(filename, encoding='utf8') as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                if line.strip():
                    gg_data.append(json.loads(line)['text'])

        ALL_TWEETS[year] = gg_data
        return ALL_TWEETS[year]
    except read_errors:
        print('could not read json as jsonl')
        print('check json format')
        print('get_tweets returned []')
    return []


def load_all():
    """Load the 2013, 2015 and 2020 tweet files in order, printing timings.

    For each year this prints the start and end clock readings (formatted
    by ``convert``) and the elapsed time, calling ``get_tweets`` in between.
    The final line prints the overall elapsed time as a raw number of seconds.
    """
    init_time = time.time()

    for year in (2013, 2015, 2020):
        label = 'loading gg' + str(year) + '.json'

        start_time = time.time()
        print(label + ' start time', convert(start_time))
        get_tweets(year)
        print(label + ' end time', convert(time.time()))
        print('time elapsed', convert(time.time() - start_time))

    print('total time elapsed', time.time() - init_time)
    
def pre_ceremony():
    '''Run the one-time data loading step for the program.

    Per the assignment template this is the first thing the TA runs when
    grading. Do NOT change the name of this function or what it returns.

    It calls load_all() and then prints a completion message.
    Returns None.
    '''
    load_all()
    print("Pre-ceremony processing complete.")

# Run the loading step right away; pre_ceremony() calls load_all().
pre_ceremony()

loading gg2013.json start time 20:11:10
loading gg2013.json end time 20:11:11
time elapsed 0:00:00
loading gg2015.json start time 20:11:11
loading gg2015.json end time 20:11:20
time elapsed 0:00:08
loading gg2020.json start time 20:11:20
could not read json in the form of dictionary list
trying jsonl reading instead
loading gg2020.json end time 20:11:21
time elapsed 0:00:01
total time elapsed 11.06644868850708
Pre-ceremony processing complete.


In [65]:
# Hard-coded, lower-cased award names used as the fixed award vocabulary.
# NOTE(review): the _1315 / _1819 suffixes presumably mean the 2013-2015 and
# 2018-2019 ceremonies -- confirm before choosing a list for another year.
OFFICIAL_AWARDS_1315 = ['cecil b. demille award', 'best motion picture - drama', 'best performance by an actress in a motion picture - drama', 'best performance by an actor in a motion picture - drama', 'best motion picture - comedy or musical', 'best performance by an actress in a motion picture - comedy or musical', 'best performance by an actor in a motion picture - comedy or musical', 'best animated feature film', 'best foreign language film', 'best performance by an actress in a supporting role in a motion picture', 'best performance by an actor in a supporting role in a motion picture', 'best director - motion picture', 'best screenplay - motion picture', 'best original score - motion picture', 'best original song - motion picture', 'best television series - drama', 'best performance by an actress in a television series - drama', 'best performance by an actor in a television series - drama', 'best television series - comedy or musical', 'best performance by an actress in a television series - comedy or musical', 'best performance by an actor in a television series - comedy or musical', 'best mini-series or motion picture made for television', 'best performance by an actress in a mini-series or motion picture made for television', 'best performance by an actor in a mini-series or motion picture made for television', 'best performance by an actress in a supporting role in a series, mini-series or motion picture made for television', 'best performance by an actor in a supporting role in a series, mini-series or motion picture made for television']
OFFICIAL_AWARDS_1819 = ['best motion picture - drama', 'best motion picture - musical or comedy', 'best performance by an actress in a motion picture - drama', 'best performance by an actor in a motion picture - drama', 'best performance by an actress in a motion picture - musical or comedy', 'best performance by an actor in a motion picture - musical or comedy', 'best performance by an actress in a supporting role in any motion picture', 'best performance by an actor in a supporting role in any motion picture', 'best director - motion picture', 'best screenplay - motion picture', 'best motion picture - animated', 'best motion picture - foreign language', 'best original score - motion picture', 'best original song - motion picture', 'best television series - drama', 'best television series - musical or comedy', 'best television limited series or motion picture made for television', 'best performance by an actress in a limited series or a motion picture made for television', 'best performance by an actor in a limited series or a motion picture made for television', 'best performance by an actress in a television series - drama', 'best performance by an actor in a television series - drama', 'best performance by an actress in a television series - musical or comedy', 'best performance by an actor in a television series - musical or comedy', 'best performance by an actress in a supporting role in a series, limited series or motion picture made for television', 'best performance by an actor in a supporting role in a series, limited series or motion picture made for television', 'cecil b. demille award']

In [3]:
# Prefix joined with 'gg<year>.json' by get_hosts(); this repeats the value
# already assigned in the tweet-caching cell.
path_to_dataset = "../gg-datasets/"

In [6]:
def get_names(text):
    """
    Return the distinct PERSON entity strings found in ``text``.

    The text is run once through the module-level ``nlp`` pipeline (loaded
    from ``en_core_web_sm``, i.e. spaCy rather than NLTK). Entities are keyed
    by their string form, so a repeated entity is reported once, at the
    position of its first occurrence, using the label of its last occurrence.

    Args:
        text: The sentence or tweet text to analyse.

    Returns:
        list: Entity strings whose label is "PERSON".
    """
    # One pipeline run is enough; the result is the same for the same text.
    doc = nlp(text)

    label_by_entity = {}
    for ent in doc.ents:
        label_by_entity[str(ent)] = ent.label_

    return [entity for entity, label in label_by_entity.items() if label == "PERSON"]

def _load_tweet_records(filename):
    '''Read a tweet file as a list of dicts, accepting a JSON array or JSON Lines.'''
    with open(filename, 'r', encoding='utf8') as f:
        raw = f.read()
    try:
        records = json.loads(raw)
    except json.JSONDecodeError:
        # Not a single JSON document: parse one object per non-blank line.
        # Split on '\n' only, so other line-break characters that may sit
        # inside a tweet's text cannot cut a record in half.
        return [json.loads(line) for line in raw.split('\n') if line.strip()]
    # A one-line JSON Lines file parses as a single object; wrap it in a list.
    return [records] if isinstance(records, dict) else records


def get_hosts(year):
    '''Find the person named most often in tweets that mention "host".

    Do NOT change the name of this function or what it returns.

    Steps: load ``gg<year>.json`` from ``path_to_dataset``; keep tweets whose
    text contains "host" (case-insensitive); run ``get_names`` on each kept
    tweet; count every returned name except Golden Globes spellings.

    NOTE(review): the assignment template describes hosts as a list of one
    or more strings, but the existing caller unpacks a 2-tuple, so that
    return shape is kept here.

    Args:
        year: Used via ``str(year)`` to build the dataset file name.

    Returns:
        tuple: ``(max_count_name, gg_data)`` where ``max_count_name`` is the
        name with the highest count (the first one counted wins a tie, and
        it is ``None`` when no name was found) and ``gg_data`` is the list
        of tweet dicts that was loaded.
    '''
    # spaCy sometimes tags the show's own name as a PERSON; never count it.
    ignored_names = {"goldenglobes", "goldenglobe", "golden", "globes",
                     "golden globes", "golden globe"}

    # Load dataset from that year. The file is closed before the slow NER pass.
    filename = path_to_dataset+'gg'+str(year)+'.json'
    gg_data = _load_tweet_records(filename)
    print("json from "+ str(year)+" done loading")

    # Get tweets with keyword 'host'
    tweets_with_host = [tweet for tweet in gg_data
                        if "host" in tweet['text'].lower()]
    print("done filtering for keyword host")

    host_name_count = {}
    for tweet in tweets_with_host:
        for name in get_names(tweet["text"]):
            if name.lower() in ignored_names:
                continue
            host_name_count[name] = host_name_count.get(name, 0) + 1
    print("done applying bag of words on NER")

    # max() raises on an empty mapping, so guard the no-names case explicitly.
    if host_name_count:
        max_count_name = max(host_name_count, key=host_name_count.get)
    else:
        max_count_name = None

    return max_count_name, gg_data



In [7]:
# get_hosts returns (most-counted name, loaded tweet dicts); keep both for 2013.
mcn,data = get_hosts(2013)

json from 2013 done loading
done filtering for keyword host
done applying bag of words on NER


In [53]:
# Show the name that get_hosts counted most often for 2013.
print(mcn)

Will Ferrell


In [4]:
def get_awards(year):
    '''Awards is a list of strings. Do NOT change the name
    of this function or what it returns.

    Not implemented yet: returns an empty list so callers receive the
    documented type instead of a NameError on an undefined name.
    '''
    # TODO: extract award names for `year`.
    awards = []
    return awards

In [5]:
def get_nominees(year):
    '''Nominees is a dictionary with the hard coded award
    names as keys, and each entry a list of strings. Do NOT change
    the name of this function or what it returns.

    Not implemented yet: returns an empty dictionary so callers receive the
    documented type instead of a NameError on an undefined name.
    '''
    # TODO: fill in one list of nominee names per award for `year`.
    nominees = {}
    return nominees


In [6]:
def get_winner(year):
    '''Winners is a dictionary with the hard coded award
    names as keys, and each entry containing a single string.
    Do NOT change the name of this function or what it returns.

    Not implemented yet: returns an empty dictionary so callers receive the
    documented type instead of a NameError on an undefined name.
    '''
    # TODO: fill in the winning name for each award for `year`.
    winners = {}
    return winners


In [7]:
def get_presenters(year):
    '''Presenters is a dictionary with the hard coded award
    names as keys, and each entry a list of strings. Do NOT change the
    name of this function or what it returns.

    Not implemented yet: returns an empty dictionary so callers receive the
    documented type instead of a NameError on an undefined name.
    '''
    # TODO: fill in one list of presenter names per award for `year`.
    presenters = {}
    return presenters


In [8]:
def pre_ceremony():
    '''This function loads/fetches/processes any data your program
    will use, and stores that data in your DB or in a json, csv, or
    plain text file. It is the first thing the TA will run when grading.
    Do NOT change the name of this function or what it returns.

    This definition replaces the earlier pre_ceremony when the notebook is
    run top to bottom, so it must also call load_all(); otherwise a later
    call would report completion without loading anything.
    '''
    load_all()
    print("Pre-ceremony processing complete.")
    return

## Helper Functions

In [36]:
def get_names(text):
    """
    Return the distinct PERSON entity strings found in ``text``.

    The text is run once through the module-level ``nlp`` pipeline (loaded
    from ``en_core_web_sm``, i.e. spaCy rather than NLTK). Entities are keyed
    by their string form, so a repeated entity is reported once, at the
    position of its first occurrence, using the label of its last occurrence.

    Args:
        text: The sentence or tweet text to analyse.

    Returns:
        list: Entity strings whose label is "PERSON".
    """
    # One pipeline run is enough; the result is the same for the same text.
    doc = nlp(text)

    label_by_entity = {}
    for ent in doc.ents:
        label_by_entity[str(ent)] = ent.label_

    return [entity for entity, label in label_by_entity.items() if label == "PERSON"]

In [18]:
def filter_by(tweets, )
    