In [1]:
import json
import re
from nltk.tokenize import sent_tokenize, word_tokenize


In [3]:
def load_processed_tweets_from_json(file_path):
    """Read a JSON file and return its decoded contents.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the platform's locale encoding (on Windows the default is usually not
    UTF-8, which corrupts or rejects non-ASCII text).

    Args:
        file_path: path of the JSON file to read.

    Returns:
        Whatever value ``json.load`` decodes from the file
        (presumably a list of tweet dicts - confirm against the producer).

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Load the preprocessed tweets once (path is relative to the working directory).
# Later cells iterate over `data`, read each item's 'text' field and annotate
# the items in place.
data = load_processed_tweets_from_json("gg2013_preprocessed.json")

In [7]:

import nltk
from nltk.corpus import words
# Make sure the NLTK 'words' corpus is available locally; this must run before
# the corpus is read below.
nltk.download('words')

# Materialise the corpus vocabulary as a set for the membership test in
# is_dictionary_word. Entries are stored as the corpus provides them (no case
# normalisation is applied here).
english_words = set(words.words())

def is_dictionary_word(string):
    """Return True if ``string`` is a lone hyphen or a known English word.

    A bare "-" token is always accepted so that hyphen separators inside an
    award name do not disqualify it. Any other token is lower-cased and looked
    up in the module-level ``english_words`` set.
    """
    return string == "-" or string.lower() in english_words


def extract_award_name(sentence):
    """Split an "<award> - <winner> - ..." sentence into award and winner.

    The sentence is scanned hyphen by hyphen, left to right. A hyphen is
    accepted as the award/winner separator while the text to its left
    (a) contains the word 'best' or 'award', compared case-insensitively, and
    (b) consists only of tokens accepted by ``is_dictionary_word``.
    Scanning stops at the first hyphen that fails either check, or that has
    only whitespace after it, and the most recently accepted split is returned.

    Args:
        sentence: text to analyse.

    Returns:
        ``[award, winner]`` where ``award`` is the stripped text left of the
        last accepted hyphen and ``winner`` is the raw, unstripped text between
        that hyphen and the next one (or the end of the sentence). ``None``
        when no hyphen is accepted.
    """
    hyphen_positions = [i for i, char in enumerate(sentence) if char == '-']
    res = None

    for index, pos in enumerate(hyphen_positions):
        left_part = sentence[:pos].strip()

        # A hyphen with nothing after it cannot introduce a winner.
        if not sentence[pos + 1:].strip():
            return res

        left_words = left_part.split()

        # Lower-case the sentence words (not the keywords) so that
        # 'Best' / 'AWARD' are recognised as well.
        lowered_words = {word.lower() for word in left_words}
        has_keyword = 'best' in lowered_words or 'award' in lowered_words

        if not (has_keyword and all(is_dictionary_word(word) for word in left_words)):
            # The left side stopped looking like an award name; keep the
            # last split that did.
            return res

        # The winner runs up to the next hyphen, or to the end of the sentence.
        if index < len(hyphen_positions) - 1:
            winner_end = hyphen_positions[index + 1]
        else:
            winner_end = len(sentence)

        res = [left_part, sentence[pos + 1:winner_end]]

    return res


# Quick manual check of extract_award_name on three hand-written sentences.
for example_sentence in (
    "best supporting actor in a motion picture - christoph waltz - django unchained - goldenglobes",
    "best supporting actor in a motion picture - comedy or picture - christoph waltz - django unchained - goldenglobes",
    "best supporting actor in a motion picture - christoph waltz",
):
    print(extract_award_name(example_sentence))







['best supporting actor in a motion picture', ' christoph waltz ']
['best supporting actor in a motion picture - comedy or picture', ' christoph waltz ']
['best supporting actor in a motion picture', ' christoph waltz']


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\18723\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [11]:
def extract_winner_info(sentence):
    """Split a '<winner> wins|receives ... best|award ...' sentence.

    The sentence matches when it contains 'wins' or 'receives' followed
    somewhere later by 'best' or 'award'. Matching is case-insensitive and the
    keywords are matched as plain substrings, so e.g. 'awards' also qualifies.

    Args:
        sentence: text to analyse.

    Returns:
        ``[winner, award]`` where ``winner`` is the stripped text before the
        verb and ``award`` is the stripped text after it, or ``[]`` when the
        sentence does not match.
    """
    pattern = r'^(.*?)(wins|receives)(.*?)(best|award)(.*?)$'
    match = re.search(pattern, sentence, re.IGNORECASE)

    if not match:
        return []

    first_part = match.group(1).strip()
    # Slice the award text straight out of the sentence instead of gluing the
    # stripped groups back together: the latter dropped the space after the
    # keyword ("best actor" -> "bestactor") and added a leading space.
    second_part = sentence[match.end(2):match.end()].strip()
    return [first_part, second_part]

def get_combinations(input_string):
    """Return every leading word-prefix of ``input_string``, shortest first.

    The string is split on whitespace and each prefix is re-joined with single
    spaces, e.g. "best motion picture" gives
    ["best", "best motion", "best motion picture"]. An empty or
    whitespace-only string gives an empty list.
    """
    tokens = input_string.split()
    return [' '.join(tokens[:end]) for end in range(1, len(tokens) + 1)]

In [18]:
# Annotate each tweet in place with a "win_resolutions" entry when one of the
# two extractors recognises it. The verb pattern is tried first and scored
# higher (0.7) than the hyphen-separated pattern (0.5).
for tweet in data:
    text = tweet['text']

    # Pattern 1: "<winner> wins/receives ... best/award ..." -> [winner, award]
    verb_match = extract_winner_info(text)
    if verb_match:
        winner, award = verb_match
        tweet["win_resolutions"] = {
            "award": get_combinations(award),
            "winner": [winner],
            "confidence": 0.7,
        }
        continue

    # Pattern 2: "<award> - <winner> - ..." -> [award, winner]
    hyphen_match = extract_award_name(text)
    if hyphen_match:
        award, winner = hyphen_match
        tweet["win_resolutions"] = {
            "award": get_combinations(award),
            "winner": [winner],
            "confidence": 0.5,
        }

        
        

In [19]:
# Persist the annotated tweets as pretty-printed JSON (4-space indent).
filename = 'parsed_data.json'

with open(filename, mode='w') as out_file:
    json.dump(data, out_file, indent=4)