In [27]:
import re
import json
from collections import defaultdict
from helpers import merge

# Module-level tally: matched phrase -> running (weighted) count.
# find_and_count_patterns and find_categories both add to this dict and never
# reset it, so totals keep growing across calls until this line is run again.
pattern_counts = defaultdict(int)

def clean_text(text):
    """Return a normalized copy of ``text`` for pattern matching.

    The string is lowercased, every character that is neither a word
    character nor whitespace is dropped, and each run of whitespace is
    collapsed to a single space with leading/trailing whitespace removed.
    """
    lowered = text.lower()
    # Strip everything except word characters (letters, digits, "_") and whitespace.
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    # Squash whitespace runs, then trim the ends.
    return re.sub(r'\s+', ' ', without_punctuation).strip()

def find_and_count_patterns(text):
    """Find award-category phrases in one string and tally them.

    ``text`` is normalized with ``clean_text`` and then scanned with each
    category regex in turn. Every match adds 1 to the module-level
    ``pattern_counts`` entry for the matched phrase.

    Returns a list of ``(phrase, running_count)`` tuples, one per match, in
    the order the matches were found; ``running_count`` is the value of
    ``pattern_counts[phrase]`` right after that match was counted.
    """
    category_regexes = (
        r'\bbest\b.*?\b(picture|drama|musical|film)\b',
        r'\bbest (tv|television)\b.*?\b(drama|musical)\b',
        r'\bcecil.*?\baward\b',
        r'\bbest performance by\b.*?\b(musical|picture|tv|television|drama)\b',
    )
    normalized = clean_text(text)

    tallied = []
    for regex in category_regexes:
        for hit in re.finditer(regex, normalized):
            phrase = hit.group(0)
            pattern_counts[phrase] += 1
            tallied.append((phrase, pattern_counts[phrase]))

    return tallied

def _tally_category_matches(texts, patterns, weight, tallies):
    """Count every pattern match in ``texts``, adding ``weight`` per match.

    Each element of ``texts`` is normalized with ``clean_text``. For every
    match, ``weight`` is added to the module-level ``pattern_counts`` entry
    for the matched phrase and the updated running total is stored in
    ``tallies`` under that phrase.
    """
    for raw in texts:
        cleaned = clean_text(raw)
        for pattern in patterns:
            for match in re.finditer(pattern, cleaned):
                phrase = match.group(0)
                pattern_counts[phrase] += weight
                tallies[phrase] = pattern_counts[phrase]


def find_categories(text, retweet, threshold=15, output_path="categories.json"):
    """Extract award-category phrases and keep the frequently seen ones.

    Parameters:
        text: iterable of strings; each pattern match counts 1.
        retweet: iterable of strings; each pattern match counts 2.
        threshold: a phrase is kept only when its merged count is strictly
            greater than this value. Defaults to 15, the value that used to
            be hard-coded.
        output_path: file the result is written to as JSON. Defaults to
            "categories.json", the path that used to be hard-coded.

    Returns:
        list of the category phrases whose merged count exceeds ``threshold``.

    Side effects:
        - Writes ``{"hosts": <categories>}`` to ``output_path``.
        - Adds to the module-level ``pattern_counts`` and never resets it, so
          calling this function again in the same kernel starts from the
          previous totals and can let more phrases pass the threshold.
    """
    patterns = [
        r'\bbest\b.*?\b(picture|drama|musical|film)\b',
        r'\bbest (tv|television)\b.*?\b(drama|musical)\b',
        r'\bcecil.*?\baward\b',
        r'\bbest performance by\b.*?\b(musical|picture|tv|television|drama)\b'
    ]

    matches_dict = {}

    # Same scan for both inputs; a retweet match is worth twice a tweet match.
    _tally_category_matches(text, patterns, 1, matches_dict)
    _tally_category_matches(retweet, patterns, 2, matches_dict)

    # merge.merge comes from the project-local helpers package; presumably it
    # consolidates near-duplicate phrases -- confirm against helpers/merge.
    # Only its .items() of (category, count) pairs is relied on here.
    categories_counter = merge.merge(matches_dict)

    categories = [
        category
        for category, count in categories_counter.items()
        if count > threshold
    ]

    # NOTE(review): the key is "hosts" although the payload is the category
    # list -- looks like a copy-paste leftover. Kept unchanged because readers
    # of this file may depend on it; confirm before renaming.
    answer = {"hosts": categories}

    with open(output_path, 'w') as f:
        json.dump(answer, f)

    return categories

In [28]:
# Load the two JSON inputs that are passed to find_categories below.
with open("text.json", "r") as file:
    text = json.load(file)

with open("retweet.json", "r") as file:
    retweet = json.load(file)

In [29]:
# Besides returning the category list, this call writes categories.json and
# adds to the module-level pattern_counts. Re-running this cell without
# re-running the cell that resets pattern_counts starts from the old totals.
c = find_categories(text, retweet)

In [30]:
# Show the category phrases returned by find_categories.
print(c)

['best motion picture', 'best actress in a tv series drama', 'best picture', 'best actress drama', 'best drama', 'cecil b de mille award', 'best animated feature film', 'best original score motion picture', 'best supporting actress in a motion picture', 'best screenplay motion picture', 'best foreign film', 'best foreign language film', 'best actor in a motion picture', 'best actress in a drama', 'best film', 'best animated film', 'best actress in a comedy or musical', 'best tv series comedy or musical', 'best comedy or musical', 'best tv series drama', 'best actress in a motion picture', 'best actor drama', 'best director motion picture', 'best tv drama', 'best supporting actor motion picture', 'best actor in a drama']


In [31]:
# Number of categories returned.
print(len(c))

26
