In [1]:
import json
import re
# from nltk.tokenize import sent_tokenize, word_tokenize


In [2]:
def load_processed_tweets_from_json(file_path):
    """Load the preprocessed tweets stored in a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialized JSON content, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read explicitly as UTF-8: the platform default encoding is not UTF-8
    # everywhere and would fail (or mis-decode) on non-ASCII tweet text.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Load the preprocessed tweets once; the later cells iterate over `data` and
# attach their "*_resolutions" entries to its items in place.
data = load_processed_tweets_from_json("gg2013_preprocessed.json")

In [3]:
def clean_retweet_text(text):
    """Remove a leading 'rt @username:' retweet marker and trim whitespace.

    The marker is removed only when it sits at the very start of the text and
    the colon directly follows the username (optionally separated by
    whitespace). A colon that appears later in the tweet is left alone, so
    text such as "rt @user look: great show" is returned unchanged instead of
    being cut down to "great show". Matching is case-insensitive.

    Args:
        text: Raw tweet text.

    Returns:
        The tweet text without the retweet marker, stripped of leading and
        trailing whitespace.
    """
    stripped = text.strip()
    # count=1 together with the ^ anchor: strip a single leading marker only.
    without_marker = re.sub(r'^rt\s+@\w+\s*:', '', stripped, count=1, flags=re.IGNORECASE)
    return without_marker.strip()

In [4]:

# import nltk
# from nltk.corpus import words
# nltk.download('words')

# Keywords marking a segment as a continuation of the award name
# (e.g. "best actor - drama") rather than as the winner.
award_name_allowlist = ["drama", "musical", "comedy", "animated", "animation", "motion", "television", "series"]

def extract_award_name(sentence):
    """Extract an award name and winner from '<award> - <winner>' style text.

    The sentence is cut at every hyphen and colon. The first segment that
    contains the word "best" or "award" (case-insensitive) is taken as the
    award. If the segment that follows contains an allowlist keyword it is
    appended to the award name ("best actor - drama") and the segment after
    that one, if any, is the winner; otherwise the following segment itself is
    the winner.

    Args:
        sentence: Tweet text to inspect.

    Returns:
        A two-item list ``[award_name, winner]``. Either item is ``None`` when
        it could not be determined; ``[None, None]`` when no award was found.
    """
    split_positions = [i for i, char in enumerate(sentence) if char in "-:"]
    award_name = None
    winner = None

    for index, pos in enumerate(split_positions):
        # Segment between the previous separator (or the start) and this one.
        start = split_positions[index - 1] + 1 if index >= 1 else 0
        left_part = sentence[start:pos].strip()
        right_part = sentence[pos + 1:].strip()

        # Nothing follows this separator, so no winner can follow either.
        if not right_part:
            break

        # Compare lower-cased words so "Best"/"AWARD" are recognised too.
        left_words = [word.lower() for word in left_part.split()]
        if 'best' not in left_words and 'award' not in left_words:
            continue

        has_next_split = index + 1 < len(split_positions)
        if has_next_split:
            next_segment = sentence[pos + 1:split_positions[index + 1]].strip()
        else:
            next_segment = right_part

        lowered_segment = next_segment.lower()
        if any(keyword in lowered_segment for keyword in award_name_allowlist):
            award_name = f"{left_part} - {next_segment}".strip()
            if has_next_split:
                # The winner runs up to the following separator, or to the end
                # of the sentence when it is the last segment.
                winner_start = split_positions[index + 1] + 1
                if index + 2 < len(split_positions):
                    winner_end = split_positions[index + 2]
                else:
                    winner_end = len(sentence)
                winner = sentence[winner_start:winner_end].strip() or None
        else:
            award_name = left_part
            winner = next_segment
        break

    return [award_name, winner]












In [5]:
def extract_winner_info(text):
    """Find a '<winner> wins/receives <... best/award ...>' statement in a tweet.

    The retweet marker is removed with ``clean_retweet_text``, the text is
    split into sentences on '.', '!' and '?', and the first sentence that
    contains the whole word "wins" or "receives" followed later by "best" or
    "award" is used. Matching is case-insensitive.

    Args:
        text: Tweet text to inspect.

    Returns:
        ``[winner_text, award_text]`` for the first matching sentence, where
        ``winner_text`` is everything before the verb and ``award_text`` is
        everything after it (both stripped), or ``None`` when no sentence
        matches.
    """
    text = clean_retweet_text(text)

    # \b around the verb keeps words such as "twins" from matching. The award
    # part is captured as one group so the original text is returned verbatim
    # instead of being re-joined with spaces (which split "awards" into
    # "award s" and left stray leading/trailing blanks).
    pattern = r'^(.*?)\b(?:wins|receives)\b(.*?(?:best|award).*)$'

    for sentence in re.split(r'[.!?]', text):
        match = re.search(pattern, sentence.strip(), re.IGNORECASE)
        if match:
            return [match.group(1).strip(), match.group(2).strip()]

    return None

def get_combinations(input_string):
    """Return every leading word-prefix of ``input_string``, shortest first.

    The string is split on whitespace; the result holds the first word, the
    first two words joined by a single space, and so on up to all words.
    An empty or whitespace-only string yields an empty list.
    """
    tokens = input_string.split()
    return [' '.join(tokens[:end]) for end in range(1, len(tokens) + 1)]

In [6]:
# Attach a "win_resolutions" entry to every tweet that appears to name an
# award winner. Tweets are annotated in place.
for tweet in data:
    cleaned_text = clean_retweet_text(tweet['text'])

    # First choice: an explicit "<winner> wins/receives <award>" sentence.
    verb_result = extract_winner_info(cleaned_text)
    if verb_result:
        tweet["win_resolutions"] = {
            "award": get_combinations(verb_result[1]),
            "winner": [verb_result[0]],
            "confidence": 0.7,
        }
    else:
        # Fallback: "<award> - <winner>" / "<award>: <winner>" layout,
        # recorded with a lower confidence.
        separator_result = extract_award_name(cleaned_text)
        if separator_result[0]:
            tweet["win_resolutions"] = {
                "award": get_combinations(separator_result[0]),
                "winner": [separator_result[1]],
                "confidence": 0.5,
            }

        
        

In [7]:
# Install blis into the environment of the running kernel.
# NOTE(review): blis is not imported by any cell shown here — presumably it is
# needed as a dependency of spaCy; confirm it still has to be installed explicitly.
%pip install blis

Note: you may need to restart the kernel to use updated packages.


In [8]:
# Install langdetect into the environment of the running kernel; a later cell
# imports `detect` and `DetectorFactory` from it.
%pip install langdetect

Note: you may need to restart the kernel to use updated packages.


In [9]:
!python -m spacy download en_core_web_sm


/System/Library/Frameworks/Python.framework/Versions/2.7/Resources/Python.app/Contents/MacOS/Python: No module named spacy


In [10]:
import spacy
from langdetect import detect, DetectorFactory
# Load the en_core_web_sm pipeline once at module level; find_people_from_text
# reuses this `nlp` object for every tweet it analyses.
nlp = spacy.load("en_core_web_sm")
# NOTE(review): the fixed seed is presumably here to make langdetect's results
# repeatable between runs — confirm against the langdetect documentation.
DetectorFactory.seed = 0

def find_people_from_text(text):
    """Collect the people named in an English tweet as host candidates.

    Args:
        text: Tweet text to analyse.

    Returns:
        ``{}`` when language detection raises or reports a language other
        than English. Otherwise ``{'hosts': [...], 'confidence': c}`` where
        ``hosts`` holds the text of every entity labelled ``PERSON`` (the list
        may be empty) and ``c`` is 0.7 when the root token of some sentence
        has the lemma "host", else 0.5.
    """
    # Skip if not English. detect() can raise on text it cannot classify;
    # treat that as "not English". Catch Exception rather than using a bare
    # except so KeyboardInterrupt/SystemExit still stop a long run.
    try:
        if detect(text) != 'en':
            return {}
    except Exception:
        return {}

    doc = nlp(text)

    # Every PERSON entity in the tweet is a potential host.
    hosts = [entity.text for entity in doc.ents if entity.label_ == "PERSON"]

    # Raise the confidence when a sentence is built around the verb "host".
    confidence = 0.5
    for sentence in doc.sents:
        root = next((token for token in sentence if token.dep_ == "ROOT"), None)
        if root is not None and root.lemma_ == "host":
            confidence = 0.7
            break

    return {'hosts': hosts, 'confidence': confidence}

In [11]:
# Indices (into `data`) of the tweets that received a "host_resolutions" entry.
have_host_resolutions = []
for position, tweet in enumerate(data):
    raw_text = tweet['text']

    host_mention = re.search(r'(?:\bhost)|(?:host(?:s|ed|ing)?\b)', raw_text, re.IGNORECASE)
    next_mention = re.search(r'\bnext\b', raw_text, re.IGNORECASE)

    # Only analyse tweets that mention hosting and do not contain the word
    # "next" (treated as talk about next year's show).
    if host_mention is not None and next_mention is None:
        host_resolutions = find_people_from_text(raw_text)
    else:
        host_resolutions = {}

    if host_resolutions:
        have_host_resolutions.append(position)
        tweet["host_resolutions"] = host_resolutions

In [12]:
def split_string_on_and(text: str):
    """Split a string of names on the word "and" or on "&".

    "and" is matched only as a whole word (case-insensitive), so names that
    merely contain those letters, such as "Sandra" or "Amanda", stay intact.

    Args:
        text: Text such as "tina fey and amy poehler".

    Returns:
        The non-empty parts, each stripped of surrounding whitespace.
    """
    parts = re.split(r'\s*(?:\band\b|&)\s*', text, flags=re.IGNORECASE)
    # Drop empty pieces left by leading/trailing separators.
    return [part.strip() for part in parts if part.strip()]

def parse_presenters(tweet: str):
    """Extract an award and its presenters from a '<names> present(s/ing) ... best ...' tweet.

    The tweet is split into sentences on '.', '!' and '?'; the first sentence
    containing "present", "presents" or "presenting" followed later by "best"
    is used (case-insensitive).

    Args:
        tweet: Tweet text to inspect.

    Returns:
        ``[award, presenters]`` where ``award`` starts at the first "best" and
        is cut before the first following "at", "and", "for" or "to", and
        ``presenters`` is the text before the verb split by
        ``split_string_on_and``. ``[None, None]`` when no sentence matches.
    """
    # Define the regex pattern to match the presenters and award pattern
    pattern = r"(.*?)(?<!re)(?:\bpresent\b|\bpresents\b|\bpresenting\b|\bpresenting for\b)\s+(.*?(?:best).*)"

    for sentence in re.split(r'[.!?]', tweet):
        match = re.search(pattern, sentence.strip(), re.IGNORECASE)
        if not match:
            continue

        presenters = match.group(1).strip()
        award = match.group(2).strip()

        # Drop everything before the FIRST "best" only. Without count=1 every
        # later "... best" is collapsed as well, turning
        # "best actor and best actress" into "bestbest actress".
        award = re.sub(r".*?\bbest\b", "best", award, count=1, flags=re.IGNORECASE)
        # Remove any words including and after "at," "and," "for," or "to"
        award = re.sub(r"\s+(at|and|for|to)\b.*", "", award, flags=re.IGNORECASE)

        return [award.strip(), split_string_on_and(presenters)]

    return [None, None]

In [13]:
# Attach a "present_resolutions" entry to every tweet in which
# parse_presenters finds an award. Tweets are annotated in place.
for tweet in data:
    possible_presenters = parse_presenters(clean_retweet_text(tweet['text']))
    if not possible_presenters[0]:
        continue
    tweet["present_resolutions"] = {
        "award": possible_presenters[0],
        "presenters": possible_presenters[1],
        "confidence": 0.7,
    }
      
        

In [14]:
def get_nominees(text: str):
    """Extract a nominee and award from a '<nominee> nominated ... best ...' tweet.

    The text is split into sentences on '.', '!' and '?'. Sentences that
    contain an exclusion word ("not", "should", "present", ...) are skipped.
    The first remaining sentence with "nominee", "nominated" or "nominees"
    followed later by the word "best" is used (case-insensitive).

    Args:
        text: Tweet text to inspect.

    Returns:
        ``[nominee_text, award_name]`` where ``nominee_text`` is the text
        before the last nominee keyword (with any earlier nominee keywords
        removed) and ``award_name`` is the text from the first "best" onwards,
        or ``None`` when no sentence matches.
    """
    # Define the regex pattern to match "nominee/nominated/nominees ... ... best"
    pattern = r"(.*)\b(?:nominee|nominated|nominees)\b(.*?\bbest\b.*)"
    # Define a list of exclusion words/phrases
    exclusions = r"\b(?:not|should've|should have|wasn't|introduce|present|should)\b"
    keyword = r"\b(?:nominee|nominated|nominees)\b"

    for sentence in re.split(r'[.!?]', text):
        # Skip negated, hypothetical or presenter sentences.
        if re.search(exclusions, sentence, re.IGNORECASE):
            continue

        match = re.search(pattern, sentence, re.IGNORECASE)
        if match:
            before_nominee = re.sub(keyword, "", match.group(1), flags=re.IGNORECASE).strip()
            # Drop everything before the FIRST "best" only. Without count=1
            # every later "... best" is collapsed as well, turning
            # "best director and best picture" into "bestbest picture".
            award_name = re.sub(r".*?\bbest\b", "best", match.group(2), count=1, flags=re.IGNORECASE).strip()
            return [before_nominee, award_name]

    return None

In [15]:
# Attach a "nominee_resolutions" entry to every tweet in which get_nominees
# finds a nominee/award pair. Tweets are annotated in place.
for tweet in data:
    tweet_text = clean_retweet_text(tweet['text'])

    possible_nominee = get_nominees(tweet_text)
    if possible_nominee:
        # get_nominees returns [text before the nominee keyword, award name].
        # The nominee must come from this result, not from the
        # `possible_presenters` variable left over from the presenter cell.
        nominee_resolutions = {
            "award": possible_nominee[1],
            "nominee": possible_nominee[0],
            "confidence": 0.5,
        }
        tweet["nominee_resolutions"] = nominee_resolutions
      
      
    
        

In [16]:
filename = 'parsed_data.json'

# Save the tweets, including the "*_resolutions" entries added by the earlier
# cells, as indented JSON.
with open(filename, mode='w') as output_handle:
    json.dump(data, fp=output_handle, indent=4)