In [80]:
import json
import re
# from nltk.tokenize import sent_tokenize, word_tokenize


In [81]:
def load_processed_tweets_from_json(file_path):
    """Load the preprocessed tweets from a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialized content of the file, exactly as `json.load` produces it.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which would mangle or reject non-ASCII tweet text on systems
    # whose default is not UTF-8 (e.g. cp1252 on Windows).
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Load the preprocessed tweets once; the loops in the cells below iterate over
# `data` and attach their results to the individual tweet entries in place.
data = load_processed_tweets_from_json("gg2013_preprocessed.json")

In [82]:
def clean_retweet_text(text):
    """Drop a leading 'rt @username:' marker and return the trimmed remainder.

    Text that does not start with the exact lowercase prefix 'rt @', or that
    starts with it but contains no colon at all, is returned unchanged apart
    from surrounding whitespace being stripped.
    """
    if not text.startswith("rt @"):
        return text.strip()

    # Everything up to and including the first colon is treated as the marker.
    _marker, colon, remainder = text.partition(":")
    return remainder.strip() if colon else text.strip()

In [83]:


import re

# Keywords that mark the segment after an award title as still being part of
# the award name (e.g. "best actor - drama") rather than the winner.
award_name_allowlist = ["drama", "musical", "comedy", "animated", "animation", "motion", "television", "series", "award", "best"]

def extract_award_name(sentence):
    """Extract an award name and its winner from a '-' / ':' separated sentence.

    The sentence is cut at every hyphen and colon. The first segment that
    contains the word "best" or "award" (case-insensitive) is taken as the
    award title. If the following segment contains one of the
    `award_name_allowlist` keywords it is appended to the award name and the
    segment after that is the winner; otherwise the following segment itself
    is the winner. The winner is truncated at its first period.

    Args:
        sentence: The text to analyse.

    Returns:
        A two-element list `[award_name, winner]`. Both are None when the
        sentence has no hyphen/colon, does not mention "best", has nothing
        after a separator reached before any award title was found, or has no
        segment that qualifies as an award title. `winner` alone is None when
        no text follows the award name.
    """
    # Find all hyphen and colon positions
    split_positions = [i for i, char in enumerate(sentence) if char in "-:"]
    if not split_positions or "best" not in sentence.lower():
        return [None, None]

    award_name = None
    winner = None
    last_index = len(split_positions) - 1

    for index, pos in enumerate(split_positions):
        # The candidate award title is the text between the previous separator
        # (or the start of the sentence) and the current one.
        segment_start = split_positions[index - 1] + 1 if index >= 1 else 0
        left_part = sentence[segment_start:pos].strip()
        right_part = sentence[pos + 1:].strip()

        # Nothing follows this separator, so there is nothing left to extract.
        # Always a list, so every exit path has the same return type.
        if not right_part:
            return [None, None]

        # Compare lowercased words so "Best"/"AWARD" are recognised as well.
        left_words = [word.lower() for word in left_part.split()]
        if "best" not in left_words and "award" not in left_words:
            continue

        if index < last_index:
            next_segment = sentence[pos + 1:split_positions[index + 1]].strip()
        else:
            next_segment = right_part

        lowered_segment = next_segment.lower()
        if any(keyword.lower() in lowered_segment for keyword in award_name_allowlist):
            # The next segment qualifies the award ("- drama"), so the winner,
            # if any, is whatever comes after it.
            award_name = f"{left_part} - {next_segment}".strip()
            if index < last_index:
                winner_start = split_positions[index + 1] + 1
                # Read up to the following separator, or to the end of the
                # sentence when the qualifier's separator is the last one.
                if index + 1 < last_index:
                    winner_end = split_positions[index + 2]
                else:
                    winner_end = len(sentence)
                winner = sentence[winner_start:winner_end].strip() or None
        else:
            award_name = left_part
            winner = next_segment
        break

    # Keep only the first sentence of the winner text.
    if winner and '.' in winner:
        winner = winner.split('.', 1)[0].strip()

    return [award_name, winner]

In [84]:
def extract_winner_info(text):
    """Find a '<winner> wins/receives ... best ...' statement in a tweet.

    The retweet prefix is removed first, then the text is split on '.', '!'
    and '?' and each piece is checked in order; the first piece that matches
    decides the result.

    Args:
        text: The tweet text.

    Returns:
        `[winner_part, award_part]` for the first matching piece, where
        `winner_part` is the text before "wins"/"receives" and `award_part`
        is the text after it (which contains "best"); None if nothing matches.
    """
    text = clean_retweet_text(text)

    # Word boundaries keep "twins" from counting as "wins" and "bestie" from
    # counting as "best".
    pattern = r'^(.*?)\b(wins|receives)\b(.*?)\b(best)\b(.*?)$'

    # Check each sentence individually
    for sentence in re.split(r'[.!?]', text):
        match = re.search(pattern, sentence.strip(), re.IGNORECASE)
        if match:
            first_part = match.group(1).strip()
            # Rebuild "<text before best> best <text after best>".
            second_part = " ".join(match.group(i).strip() for i in (3, 4, 5))
            return [first_part, second_part]

    return None

def extract_award_sequences(input_str):
    """Return progressively longer versions of an award name.

    A leading "golden globes for" / "golden globes" / "award for" before
    "best" is removed, the text is narrowed to the part starting at
    "best"/"award" and ending before a "golden globes"/"goldenglobes" mention,
    and the result is cut at every " - " or " for ". The returned list starts
    with the first piece and grows by one separator-plus-piece per entry.
    """
    text = input_str

    leading = re.search(r"(?i)^(?:golden globes for|golden globes|award for)\s+(best)\s+(.+)", text)
    if leading:
        text = leading.group(1) + " " + leading.group(2)

    core = re.search(r"(?i)(best|award)\s+(.+?)(?:\s+golden globes|goldenglobes|$)", text)
    if core:
        text = core.group(1) + " " + core.group(2).strip()

    # The capturing group keeps the separators: [piece, sep, piece, sep, ...]
    pieces = re.split(r'(\s*-\s*|\s+for\s+)', text)

    running = pieces[0].strip()
    prefixes = [running]
    for separator, chunk in zip(pieces[1::2], pieces[2::2]):
        running = f"{running} {separator.strip()} {chunk.strip()}"
        prefixes.append(running)

    return prefixes


In [85]:
# Attach a "win_resolutions" entry to every tweet that names an award winner.
# The "<winner> wins ... best ..." phrasing is tried first (confidence 0.7);
# only when it does not apply is the "award - winner" layout tried (0.8).
for tweet in data:
    cleaned_text = clean_retweet_text(tweet['text'])

    wins_match = extract_winner_info(cleaned_text)
    if wins_match:
        tweet["win_resolutions"] = {
            "award": extract_award_sequences(wins_match[1]),
            "winner": [wins_match[0]],
            "confidence": 0.7,
        }
    else:
        separator_match = extract_award_name(cleaned_text)
        if separator_match[0]:
            tweet["win_resolutions"] = {
                "award": extract_award_sequences(separator_match[0]),
                "winner": [separator_match[1]],
                "confidence": 0.8,
            }
        

        
        

In [86]:
%pip install blis

Note: you may need to restart the kernel to use updated packages.


In [87]:
%pip install langdetect

Note: you may need to restart the kernel to use updated packages.


In [88]:
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
                                              0.0/12.8 MB ? eta -:--:--
                                              0.1/12.8 MB 1.7 MB/s eta 0:00:08
                                              0.2/12.8 MB 2.1 MB/s eta 0:00:07
                                              0.3/12.8 MB 2.2 MB/s eta 0:00:06
     -                                        0.4/12.8 MB 2.2 MB/s eta 0:00:06
     -                                        0.4/12.8 MB 1.9 MB/s eta 0:00:07
     -                                        0.5/12.8 MB 1.8 MB/s eta 0:00:08
     -                                        0.5/12.8 MB 1.8 MB/s eta 0:00:07
     -                                        0.6/12.8 MB 1.6 MB/s eta 0:00:08
     -                                        0.6/12.8 MB 1.6 MB/s eta 0:00:08
     --                                  

In [89]:
import spacy
from langdetect import detect, DetectorFactory
# Load the small English spaCy pipeline once; find_people_from_text below
# reuses this object for every tweet it processes.
nlp = spacy.load("en_core_web_sm")
# langdetect is non-deterministic by default; fixing the seed before the first
# detect() call makes its results repeatable across runs.
DetectorFactory.seed = 0

def find_people_from_text(text):
    """Collect the people named in an English tweet as host candidates.

    Args:
        text: The tweet text.

    Returns:
        An empty dict when the text is not detected as English or language
        detection fails. Otherwise a dict with:
          'hosts': the text of every entity spaCy labels PERSON, and
          'confidence': 0.7 if the ROOT token of any sentence has the lemma
          "host", else 0.5.
    """
    # Skip if not English
    try:
        if detect(text) != 'en':
            return {}
    except Exception:
        # Best effort: text that langdetect cannot classify is skipped.
        # Catching Exception rather than using a bare `except` lets
        # KeyboardInterrupt / SystemExit propagate, so a long run over all
        # tweets can still be interrupted.
        return {}

    doc = nlp(text)

    # Find the people mentioned in the tweet
    hosts = [entity.text for entity in doc.ents if entity.label_ == "PERSON"]

    confidence = 0.5
    for sentence in doc.sents:
        # Find the root verb of the sentence
        root_verb = [token for token in sentence if token.dep_ == "ROOT"]

        # A sentence whose main verb is "host" is stronger evidence.
        if root_verb and root_verb[0].lemma_ == "host":
            confidence = 0.7
            break

    return {'hosts': hosts, 'confidence': confidence}

In [90]:
# Indices (into `data`) of the tweets that received a "host_resolutions" entry.
have_host_resolutions = []
for index, tweet in enumerate(data):
    tweet_text = tweet['text']

    mentions_host = re.search(r'(?:\bhost)|(?:host(?:s|ed|ing)?\b)', tweet_text, re.IGNORECASE) is not None
    # Tweets containing the word "next" are skipped (treated as being about next year).
    mentions_next = mentions_host and re.search(r'\bnext\b', tweet_text, re.IGNORECASE) is not None

    if mentions_host and not mentions_next:
        host_resolutions = find_people_from_text(tweet_text)
    else:
        host_resolutions = {}

    if not host_resolutions:
        continue

    have_host_resolutions.append(index)
    tweet["host_resolutions"] = host_resolutions

In [91]:
def split_string_on_and(text: str):
    """Split a string of names on the word "and" (any casing) or on "&".

    Args:
        text: Text such as "tina fey and amy poehler".

    Returns:
        The non-empty pieces, each stripped of surrounding whitespace. A text
        without "and"/"&" yields a one-element list; empty text yields [].
    """
    # Case-insensitive so "And"/"AND" split too; \b keeps words that merely
    # contain "and" (e.g. "sandy", "anderson") intact.
    parts = re.split(r'\s*(?:\band\b|&)\s*', text, flags=re.IGNORECASE)
    # Filter out any empty strings in the list
    return [part.strip() for part in parts if part.strip()]

def parse_presenters(tweet: str):
    """Find a '<presenters> present(s|ing) ... best ...' statement in a tweet.

    The tweet is split on '.', '!' and '?'; the first piece that matches
    decides the result.

    Args:
        tweet: The tweet text.

    Returns:
        `[award, presenters]` where `award` starts at the first "best" and
        stops before the first following "at"/"and"/"for"/"to", and
        `presenters` is the text before the verb split on "and"/"&".
        `[None, None]` when no piece matches.
    """
    # Define the regex pattern to match the presenters and award pattern
    pattern = r"(.*?)(?<!re)(?:\bpresent\b|\bpresents\b|\bpresenting\b|\bpresenting for\b)\s+(.*?(?:best).*)"

    # Check each sentence individually
    for sentence in re.split(r'[.!?]', tweet):
        match = re.search(pattern, sentence.strip(), re.IGNORECASE)
        if not match:
            continue

        presenters = match.group(1).strip()
        award = match.group(2).strip()

        # Remove any words before the FIRST "best". count=1 matters: without
        # it every later "best" is matched as well and the text between two
        # occurrences is deleted ("best actor and best film" -> "bestbest film").
        award = re.sub(r".*?\bbest\b", "best", award, count=1, flags=re.IGNORECASE)
        # Remove any words including and after "at," "and," "for," or "to"
        award = re.sub(r"\s+\b(at|and|for|to)\b.*", "", award, flags=re.IGNORECASE)

        return [award.strip(), split_string_on_and(presenters)]

    return [None, None]

In [92]:
# Attach a "present_resolutions" entry to every tweet in which an
# award/presenter statement is recognised.
for tweet in data:
    possible_presenters = parse_presenters(clean_retweet_text(tweet['text']))
    award, presenters = possible_presenters
    if not award:
        continue
    tweet["present_resolutions"] = {
        "award": award,
        "presenters": presenters,
        "confidence": 0.7,
    }
      
        

In [93]:
def get_nominees(text: str):
    """Find a '<nominee> nominee/nominated/nominees ... best ...' statement.

    The text is split on '.', '!' and '?'. Pieces containing a negation or
    one of the other excluded words are skipped; the first remaining piece
    that matches decides the result.

    Args:
        text: The tweet text.

    Returns:
        `[nominee_text, award_name]` where `nominee_text` is the text before
        the nomination word and `award_name` is the text from the first
        "best" onward; None when no piece matches.
    """
    # Define the regex pattern to match "nominee/nominated/nominees ... ... best"
    pattern = r"(.*)\b(?:nominee|nominated|nominees)\b(.*?\bbest\b.*)"
    # Define a list of exclusion words/phrases
    exclusions = r"\b(?:not|should've|should have|wasn't|introduce|present|should)\b"

    # Check each sentence individually
    for sentence in re.split(r'[.!?]', text):
        # Check if any exclusion word is present
        if re.search(exclusions, sentence, re.IGNORECASE):
            continue

        match = re.search(pattern, sentence, re.IGNORECASE)
        if match:
            # The leading group is greedy, so it can itself contain an earlier
            # nomination word; strip those out of the nominee text.
            before_nominee = re.sub(r"\b(?:nominee|nominated|nominees)\b", "", match.group(1), flags=re.IGNORECASE).strip()
            # Drop only what precedes the FIRST "best". Without count=1 every
            # later "best" is matched too and the text between two occurrences
            # is deleted ("best picture and best director" -> "bestbest director").
            award_name = re.sub(r".*?\bbest\b", "best", match.group(2), count=1, flags=re.IGNORECASE).strip()

            return [before_nominee, award_name]

    return None

In [94]:
def get_nominees2(text: str) :
    """Find a '<nominee> should have / did not ... win|won ... best ...' statement.

    Someone said to have not won (or to have deserved to win) an award is
    taken as a nominee for it. The text is split on '.', ':', '!' and '?';
    the first piece that matches decides the result.

    Args:
        text: The tweet text.

    Returns:
        `[nominee, award_name]` where `nominee` is the text before the
        negative word (with anything up to a "that"/"if" removed) and
        `award_name` is the text from the first "best" onward; None when no
        piece matches.
    """
    # Define the regex pattern to match a sentence with a negative word, "win/won," and "best"
    pattern = r"(.*?)(\bdoes not\b|\bnot\b|\bshould have\b|\bshould've\b|\bshould\b|\bdidn't\b|\bdid not\b)(.*?\b(?:win|won)\b.*?\bbest\b.*)"

    # Process each sentence individually
    for sentence in re.split(r'[.:!?]', text):
        match = re.search(pattern, sentence.strip(), re.IGNORECASE)
        if not match:
            continue

        # Extract the part before the negative word as the nominee
        nominee = match.group(1).strip()
        # Keep the text from the FIRST "best" onward. Without count=1 every
        # later "best" is matched too and the text between two occurrences is
        # deleted ("best picture and best director" -> "bestbest director").
        award_name = re.sub(r".*?\bbest\b", "best", match.group(3), count=1, flags=re.IGNORECASE).strip()

        # Clean the nominee by removing text before and including "that" or "if"
        nominee = re.sub(r".*\b(that|if)\b", "", nominee, flags=re.IGNORECASE).strip()

        return [nominee, award_name]

    return None


In [95]:
# Attach a "nominee_resolutions" entry to every tweet in which a nominee
# statement is recognised. Both extractors are tried for every tweet; when
# both match, the result of get_nominees2 replaces the one from get_nominees.
for tweet in data:
    tweet_text = clean_retweet_text(tweet['text'])

    for extract_nominee in (get_nominees, get_nominees2):
        possible_nominee = extract_nominee(tweet_text)
        if not possible_nominee:
            continue
        # Each extractor returns [nominee, award]. The nominee must come from
        # this result, not from `possible_presenters`, which is a leftover
        # from the last tweet of the presenter cell.
        tweet["nominee_resolutions"] = {
            "award": possible_nominee[1],
            "nominee": possible_nominee[0],
            "confidence": 0.5,
        }
      
      
    
        

In [96]:
# Save the tweets, including every resolution attached above, as indented JSON.
filename = 'parsed_data.json'

with open(filename, 'w') as json_file:
    serialized = json.dumps(data, indent=4)
    json_file.write(serialized)