In [None]:
import os
from bs4 import BeautifulSoup
import json
# Define a class to store the scraped data
class CodeforcesProblemItem:
    """Plain record holding the fields scraped from one problem page.

    Every attribute starts out as None; the class itself never fills
    them in. Attribute creation order is url, contestId, index, title,
    timeLimit, memoryLimit, inputFile, outputFile, statement,
    inputSpecification, outputSpecification, sampleTests, note.
    """

    def __init__(self):
        # Identification of the problem.
        self.url = self.contestId = self.index = self.title = None
        # Limits and I/O file names.
        self.timeLimit = self.memoryLimit = None
        self.inputFile = self.outputFile = None
        # Statement text and its sections.
        self.statement = None
        self.inputSpecification = self.outputSpecification = None
        # Sample tests and trailing notes.
        self.sampleTests = self.note = None

# Function to parse a single HTML file
def parse_html(file_path):
    """Parse one saved Codeforces problem page into a CodeforcesProblemItem.

    The file name, without directory and extension, must have the form
    ``<contest>-<index>`` (for example ``1220-C.html``). It is used to
    build the problem URL and the contest id; all other fields are read
    from the HTML.

    Args:
        file_path: Path to the HTML file. The file is read as UTF-8.

    Returns:
        A CodeforcesProblemItem. A field whose element is not found in
        the page is left as None.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file name does not contain exactly one "-".
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'html.parser')

    def _header_value(selector):
        # The value is the last child of the header element.
        node = soup.select_one(selector)
        return node.contents[-1].strip() if node is not None else None

    def _inner_html(selector):
        # Inner markup of the first match, or None when nothing matches.
        node = soup.select_one(selector)
        return node.decode_contents() if node is not None else None

    def _inner_html_list(selector):
        # Inner markup of every match, or None when nothing matches.
        return [node.decode_contents() for node in soup.select(selector)] or None

    item = CodeforcesProblemItem()

    # basename/splitext handle both "/" and the platform separator, so
    # paths built with os.path.join work on every OS.
    problem = os.path.splitext(os.path.basename(file_path))[0]
    contest, nr = problem.split("-")
    # The URL is reconstructed from the file name, not read from the page.
    item.url = f"https://codeforces.com/contest/{contest}/problem/{nr}"
    item.contestId = contest

    index_div = soup.find('div', attrs={'problemindex': True})
    item.index = index_div['problemindex'] if index_div is not None else None

    title = soup.select_one('.header .title')
    item.title = title.text.strip() if title is not None else None

    item.timeLimit = _header_value('.header .time-limit')
    item.memoryLimit = _header_value('.header .memory-limit')
    item.inputFile = _header_value('.header .input-file')
    item.outputFile = _header_value('.header .output-file')

    item.statement = _inner_html('.problem-statement > div:nth-of-type(2)')
    item.inputSpecification = _inner_html('.input-specification')
    item.outputSpecification = _inner_html('.output-specification')
    item.sampleTests = _inner_html_list('.sample-tests .sample-test > div')
    item.note = _inner_html_list('.note > p')

    return item

# Function to process all files in a directory
def process_directory(directory_path):
    """Convert every ``.html`` file in a directory into a JSON file.

    Each file is parsed with parse_html and the resulting item's
    attributes are written to ``json/<name>.json`` (relative to the
    current working directory), where ``<name>`` is the HTML file name
    without its ``.html`` suffix. The ``json`` directory is created when
    there is at least one file to convert. A file that fails to parse or
    to be written is reported on stdout and skipped.

    Args:
        directory_path: Directory to scan (not recursive). An empty
            string means the current directory.

    Returns:
        An empty list. Parsed items are written to disk only and are not
        collected; the list is kept so existing callers keep working.

    Raises:
        OSError: If the directory cannot be listed or the ``json``
            output directory cannot be created.
    """
    results = []

    # os.listdir('') raises, while os.path.join('', name) already means
    # "relative to the current directory" -- make the two agree.
    html_files = [
        name
        for name in os.listdir(directory_path or os.curdir)
        if name.endswith('.html')  # Only process HTML files
    ]
    if html_files:
        # Without this every write below fails when "json" is missing.
        os.makedirs("json", exist_ok=True)

    for filename in html_files:
        file_path = os.path.join(directory_path, filename)
        try:
            item = parse_html(file_path)
            with open(os.path.join("json", filename[:-5] + ".json"), "w") as outfile:
                json.dump(vars(item), outfile, indent=4, sort_keys=True)
        except Exception as exc:
            # Best effort: report the failing file and why, then continue.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            print(f"{filename}: {exc!r}")
    return results

# Example usage
if __name__ == '__main__':
    # Path to the directory containing the HTML files. Defaults to the
    # current directory; an empty string would make os.listdir() raise
    # FileNotFoundError.
    directory = '.'
    all_items = process_directory(directory)

In [None]:
def parse_html_interactive(file_path):
    """Parse a saved Codeforces problem page, taking the output section by position.

    All fields are located by CSS class except ``outputSpecification``,
    which is taken from the fourth ``div`` directly under
    ``.problem-statement`` (index 3) rather than from an
    ``.output-specification`` element.
    NOTE(review): presumably meant for interactive problems whose pages
    do not mark that section with the usual class -- confirm.

    The file name, without directory and extension, must have the form
    ``<contest>-<index>``; it is used to build the URL and contest id.

    Args:
        file_path: Path to the HTML file. The file is read as UTF-8.

    Returns:
        A CodeforcesProblemItem. A field whose element is not found in
        the page is left as None.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file name does not contain exactly one "-".
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'html.parser')

    def _header_value(selector):
        # The value is the last child of the header element.
        node = soup.select_one(selector)
        return node.contents[-1].strip() if node is not None else None

    def _inner_html(selector):
        # Inner markup of the first match, or None when nothing matches.
        node = soup.select_one(selector)
        return node.decode_contents() if node is not None else None

    def _inner_html_list(selector):
        # Inner markup of every match, or None when nothing matches.
        return [node.decode_contents() for node in soup.select(selector)] or None

    item = CodeforcesProblemItem()

    # basename/splitext handle both "/" and the platform separator, so
    # paths built with os.path.join work on every OS.
    problem = os.path.splitext(os.path.basename(file_path))[0]
    contest, nr = problem.split("-")
    # The URL is reconstructed from the file name, not read from the page.
    item.url = f"https://codeforces.com/contest/{contest}/problem/{nr}"
    item.contestId = contest

    index_div = soup.find('div', attrs={'problemindex': True})
    item.index = index_div['problemindex'] if index_div is not None else None

    title = soup.select_one('.header .title')
    item.title = title.text.strip() if title is not None else None

    item.timeLimit = _header_value('.header .time-limit')
    item.memoryLimit = _header_value('.header .memory-limit')
    item.inputFile = _header_value('.header .input-file')
    item.outputFile = _header_value('.header .output-file')

    item.statement = _inner_html('.problem-statement > div:nth-of-type(2)')
    item.inputSpecification = _inner_html('.input-specification')

    # Positional lookup: the fourth direct child div of the statement.
    sections = soup.select('.problem-statement > div')
    item.outputSpecification = sections[3].decode_contents() if len(sections) > 3 else None

    item.sampleTests = _inner_html_list('.sample-tests .sample-test > div')
    item.note = _inner_html_list('.note > p')

    return item

In [None]:
def parse_html_interactive_no_input(file_path):
    """Parse a saved Codeforces problem page, taking the output section by position.

    All fields are located by CSS class except ``outputSpecification``,
    which is taken from the third ``div`` directly under
    ``.problem-statement`` (index 2) rather than from an
    ``.output-specification`` element.
    NOTE(review): presumably meant for interactive problems that have no
    input section, which shifts the output section up by one -- confirm.

    The file name, without directory and extension, must have the form
    ``<contest>-<index>``; it is used to build the URL and contest id.

    Args:
        file_path: Path to the HTML file. The file is read as UTF-8.

    Returns:
        A CodeforcesProblemItem. A field whose element is not found in
        the page is left as None.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file name does not contain exactly one "-".
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'html.parser')

    def _header_value(selector):
        # The value is the last child of the header element.
        node = soup.select_one(selector)
        return node.contents[-1].strip() if node is not None else None

    def _inner_html(selector):
        # Inner markup of the first match, or None when nothing matches.
        node = soup.select_one(selector)
        return node.decode_contents() if node is not None else None

    def _inner_html_list(selector):
        # Inner markup of every match, or None when nothing matches.
        return [node.decode_contents() for node in soup.select(selector)] or None

    item = CodeforcesProblemItem()

    # basename/splitext handle both "/" and the platform separator, so
    # paths built with os.path.join work on every OS.
    problem = os.path.splitext(os.path.basename(file_path))[0]
    contest, nr = problem.split("-")
    # The URL is reconstructed from the file name, not read from the page.
    item.url = f"https://codeforces.com/contest/{contest}/problem/{nr}"
    item.contestId = contest

    index_div = soup.find('div', attrs={'problemindex': True})
    item.index = index_div['problemindex'] if index_div is not None else None

    title = soup.select_one('.header .title')
    item.title = title.text.strip() if title is not None else None

    item.timeLimit = _header_value('.header .time-limit')
    item.memoryLimit = _header_value('.header .memory-limit')
    item.inputFile = _header_value('.header .input-file')
    item.outputFile = _header_value('.header .output-file')

    item.statement = _inner_html('.problem-statement > div:nth-of-type(2)')
    item.inputSpecification = _inner_html('.input-specification')

    # Positional lookup: the third direct child div of the statement.
    sections = soup.select('.problem-statement > div')
    item.outputSpecification = sections[2].decode_contents() if len(sections) > 2 else None

    item.sampleTests = _inner_html_list('.sample-tests .sample-test > div')
    item.note = _inner_html_list('.note > p')

    return item

In [None]:
def parse_html_1220_C(file_path):
    """Parse a saved Codeforces problem page with fixed standard I/O names.

    Works like the class-based parser except that ``inputFile`` and
    ``outputFile`` are always set to "standard input" and
    "standard output" instead of being read from the page header.
    NOTE(review): the name suggests a one-off for problem 1220-C, whose
    page presumably lacks usable header entries -- confirm.

    The original name ``parse_html_1220-C`` was not a valid Python
    identifier ("-" is not allowed), so the function could never be
    defined; it is spelled with an underscore here.

    The file name, without directory and extension, must have the form
    ``<contest>-<index>``; it is used to build the URL and contest id.

    Args:
        file_path: Path to the HTML file. The file is read as UTF-8.

    Returns:
        A CodeforcesProblemItem. Apart from the two fixed I/O fields, a
        field whose element is not found in the page is left as None.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file name does not contain exactly one "-".
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'html.parser')

    def _header_value(selector):
        # The value is the last child of the header element.
        node = soup.select_one(selector)
        return node.contents[-1].strip() if node is not None else None

    def _inner_html(selector):
        # Inner markup of the first match, or None when nothing matches.
        node = soup.select_one(selector)
        return node.decode_contents() if node is not None else None

    def _inner_html_list(selector):
        # Inner markup of every match, or None when nothing matches.
        return [node.decode_contents() for node in soup.select(selector)] or None

    item = CodeforcesProblemItem()

    # basename/splitext handle both "/" and the platform separator, so
    # paths built with os.path.join work on every OS.
    problem = os.path.splitext(os.path.basename(file_path))[0]
    contest, nr = problem.split("-")
    # The URL is reconstructed from the file name, not read from the page.
    item.url = f"https://codeforces.com/contest/{contest}/problem/{nr}"
    item.contestId = contest

    index_div = soup.find('div', attrs={'problemindex': True})
    item.index = index_div['problemindex'] if index_div is not None else None

    title = soup.select_one('.header .title')
    item.title = title.text.strip() if title is not None else None

    item.timeLimit = _header_value('.header .time-limit')
    item.memoryLimit = _header_value('.header .memory-limit')

    # Hard-coded rather than scraped from the header.
    item.inputFile = "standard input"
    item.outputFile = "standard output"

    item.statement = _inner_html('.problem-statement > div:nth-of-type(2)')
    item.inputSpecification = _inner_html('.input-specification')
    item.outputSpecification = _inner_html('.output-specification')
    item.sampleTests = _inner_html_list('.sample-tests .sample-test > div')
    item.note = _inner_html_list('.note > p')

    return item