# Packages

In [2]:
from bs4 import BeautifulSoup
import requests
import os
import random
import time as t
import pandas as pd
import re
from tqdm.notebook import tqdm
from fake_useragent import UserAgent
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.service import Service
from stem import Signal
from stem.control import Controller
from subprocess import Popen, PIPE
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options 
from selenium.webdriver.common.keys import Keys
from IPython.core.display import HTML
from IPython.display import clear_output, display


import warnings
warnings.filterwarnings("ignore")

# Functions

In [14]:
def flatten(A):
    """Return a flat list of the items in ``A``, expanding nested lists.

    Only ``list`` instances are expanded (recursively). Every other value,
    including tuples and strings, is kept as a single item. Order is
    preserved.
    """
    flat = []
    for item in A:
        if not isinstance(item, list):
            flat.append(item)
            continue
        flat += flatten(item)
    return flat

In [6]:
def uniquize(seq):
    """Return the items of ``seq`` as a list with duplicates removed.

    The first occurrence of each item is kept and the original order is
    preserved. Items must be hashable because they are tracked in a set.
    """
    already_seen = set()
    unique_items = []
    for item in seq:
        if item in already_seen:
            continue
        already_seen.add(item)
        unique_items.append(item)
    return unique_items

In [46]:
# Location of the Tor executable that start_tor() launches.
tor_path = r'C:\Users\david\Desktop\Tor Browser\Browser\TorBrowser\Tor\tor.exe'
# Path to the ChromeDriver binary handed to Selenium in create_browser().
chrome_driver_path = r'C:\chromedriver.exe'
# Written to the torrc as SocksPort and used as Chrome's SOCKS5 proxy port.
socks_port = 9052
# Written to the torrc as ControlPort; change_tor_identity() connects to it.
control_port = 9051

def start_tor():
    """Write a temporary torrc and launch Tor with it.

    The torrc (``torrc.tmp`` in the current working directory) sets the
    SOCKS and control ports from ``socks_port`` / ``control_port`` and
    disables cookie authentication.

    Returns:
        The ``subprocess.Popen`` handle of the Tor process itself, so that
        ``terminate()`` on the handle stops Tor directly.
    """
    # Function-scope import keeps this block self-contained.
    from subprocess import DEVNULL

    torrc_content = f'''
    SocksPort {socks_port}
    ControlPort {control_port}
    CookieAuthentication 0
    '''
    with open('torrc.tmp', 'w') as f:
        f.write(torrc_content)

    # Launch Tor directly (no shell): with shell=True the handle refers to
    # the shell wrapper, so terminate() would not reach Tor itself.
    # Output is discarded because nothing ever reads it; an undrained PIPE
    # would eventually fill up and block the Tor process.
    tor_process = Popen(
        [tor_path, '-f', 'torrc.tmp'],
        stdout=DEVNULL,
        stderr=DEVNULL,
    )
    t.sleep(1)  # brief pause before callers start talking to Tor
    return tor_process
    
def stop_tor(tor_process):
    """Terminate the given Tor process and block until it has exited.

    Does nothing when ``tor_process`` is falsy (for example ``None``).
    """
    if not tor_process:
        return
    tor_process.terminate()
    tor_process.wait()

def change_tor_identity():
    """Send the NEWNYM signal to Tor through its control port.

    Connects to ``control_port``, authenticates and signals ``NEWNYM``.
    Any exception is caught and reported with ``print`` instead of being
    raised, so callers always continue.
    """
    try:
        with Controller.from_port(port=control_port) as tor_controller:
            # no authentication required with CookieAuthentication 0
            tor_controller.authenticate()
            tor_controller.signal(Signal.NEWNYM)
    except Exception as error:
        print(f"Failed to change Tor identity: {error}")

def create_browser():
    """Create a Chrome WebDriver that routes traffic through the Tor SOCKS proxy.

    The proxy address is ``socks5://localhost:<socks_port>`` and the driver
    binary is taken from ``chrome_driver_path``.

    Returns:
        The new ``webdriver.Chrome`` instance.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument(f'--proxy-server=socks5://localhost:{socks_port}')
    # The browser runs with a visible window: --headless, --disable-gpu and
    # --no-sandbox are not passed.

    driver_service = Service(executable_path=chrome_driver_path)
    return webdriver.Chrome(service=driver_service, options=chrome_options)

# Generate and Scrape

## Generate Data via Meta Kaggle 

In [None]:
data_path = "./"

# Only these host segments are kept.
comps_to_use = ["Featured", "Research", "Recruitment"]

# Load competitions, keep the selected segments, parse the enabled date and
# order the rows from newest to oldest.
competitions_df = (
    pd.read_csv(f"{data_path}Competitions.csv")
    .loc[lambda frame: frame["HostSegmentTitle"].isin(comps_to_use)]
    .assign(
        EnabledDate=lambda frame: pd.to_datetime(
            frame["EnabledDate"], format="%m/%d/%Y %H:%M:%S"
        )
    )
    .sort_values(by="EnabledDate", ascending=False)
    .reset_index(drop=True)
)

# Supporting Meta Kaggle tables used by the helper functions.
forum_topics_df = pd.read_csv(f"{data_path}ForumTopics.csv")
comp_tags_df = pd.read_csv(f"{data_path}CompetitionTags.csv")
tags_df = pd.read_csv(f"{data_path}Tags.csv", usecols=["Id", "Name"])

def get_comp_tags(comp_id):
    """Return the tag names of a competition as one display string.

    Rows of ``comp_tags_df`` for ``comp_id`` are joined with ``tags_df`` on
    the tag id, and the tag names are listed after a ``"Tags : "`` prefix,
    separated by ``", "``. Leading and trailing commas and spaces are
    stripped, so a competition without tags yields ``"Tags :"``.
    """
    comp_rows = comp_tags_df[comp_tags_df["CompetitionId"] == comp_id]
    named_rows = pd.merge(comp_rows, tags_df, left_on="TagId", right_on="Id")
    joined_names = ", ".join(named_rows["Name"])
    return ("Tags : " + joined_names).strip(", ")

def check_solution(topic):
    """Decide whether a forum topic title looks like a solution write-up.

    The check is case-insensitive. A title counts as a solution when it
    contains one of the explicitly included phrases, or when it contains
    "solution" and none of the excluded fragments.
    """
    lowered = topic.lower()

    # Explicit inclusions win over every exclusion.
    always_included = ("2nd place code", '"dance with ensemble" sharing')
    if any(phrase in lowered for phrase in always_included):
        return True

    if "solution" not in lowered:
        return False

    excluded = ("?", "submit", "why", "what", "resolution", "benchmark")
    return not any(fragment in lowered for fragment in excluded)

def get_discussion_results(forum_id, n):
    """Return the top ``n`` solution-like topics of one forum.

    Topics of ``forum_topics_df`` belonging to ``forum_id`` are kept when
    ``check_solution`` accepts their title, then ordered by ``Score`` and
    ``TotalMessages`` (both descending) and truncated to ``n`` rows.

    Returns:
        A DataFrame with a fresh 0-based index and the columns ``Title``,
        ``Id``, ``Score``, ``TotalMessages`` and ``TotalReplies``. It is
        empty when the forum has no matching topics.
    """
    forum_topics = forum_topics_df[forum_topics_df["ForumId"] == forum_id]

    # Build the filter as a standalone boolean Series instead of writing a
    # helper column into the filtered slice, which raises pandas'
    # SettingWithCopyWarning. ``astype(bool)`` keeps the mask a valid
    # boolean indexer even when the forum has no topics at all.
    is_solution = (
        forum_topics["Title"]
        .apply(lambda title: check_solution(str(title)))
        .astype(bool)
    )

    top_topics = (
        forum_topics[is_solution]
        .sort_values(by=["Score", "TotalMessages"], ascending=False)
        .head(n)
        .reset_index(drop=True)
    )
    return top_topics[["Title", "Id", "Score", "TotalMessages", "TotalReplies"]]

def render_html_for_comp(forum_id, comp_id, comp_name, comp_slug, comp_subtitle, n):
    """Render, display and return the HTML summary of one competition.

    The summary has a linked heading, the subtitle, an optional tags
    paragraph and a table of up to ``n`` solution topics returned by
    ``get_discussion_results``.

    Args:
        forum_id: Forum whose topics are searched for solutions.
        comp_id: Competition id used to look up tags.
        comp_name: Title shown in the heading.
        comp_slug: URL slug used to build competition and discussion links.
        comp_subtitle: Short description shown below the heading.
        n: Maximum number of topics to list.

    Returns:
        The HTML string that was displayed, or ``None`` when the
        competition has no matching topics (nothing is displayed then).
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    results_df = get_discussion_results(forum_id, n)
    if len(results_df) < 1:
        return None

    comp_tags = get_comp_tags(comp_id)
    comp_url = "https://www.kaggle.com/c/" + str(comp_slug)

    # Titles, subtitles and tags come from user-authored CSV data, so they
    # are escaped before being placed in markup. The heading link is now
    # closed with </a>, which the original markup omitted.
    parts = [f"""
        <style>
            .rendered_html tr {{font-size: 12px; text-align: left;}}
            th {{
                text-align: left;
            }}
        </style>
        <h3><font color="#1768ea"><a href="{escape(comp_url)}">{escape(str(comp_name))}</a></font></h3>
        <p>{escape(str(comp_subtitle))}</p>
    """]

    # get_comp_tags returns exactly "Tags :" when there are no tags.
    if comp_tags != "Tags :":
        parts.append(f"""
            <p>{escape(comp_tags)}</p>
        """)

    parts.append("""
        <table>
        <tr>
            <th><b>S.No</b></th>
            <th><b>Discussion Title</b></th>
            <th><b>Number of upvotes</b></th>
            <th><b>Total Replies</b></th>
        </tr>""")

    for position, (_, row) in enumerate(results_df.iterrows(), start=1):
        url = f"https://www.kaggle.com/c/{comp_slug}/discussion/{row['Id']}"
        parts.append(f"""
            <tr>
                <td>{position}</td>
                <td><a href="{escape(url)}" target="_blank"><b>{escape(str(row['Title']))}</b></a></td>
                <td>{row['Score']}</td>
                <td>{row['TotalReplies']}</td>
            </tr>""")
    parts.append("</table>")

    hs = "".join(parts)
    display(HTML(hs))

    return hs  # Return the HTML string

# Render every competition and keep the HTML of those that produced output
html_list = []

for _, comp_row in competitions_df.iterrows():
    html_string = render_html_for_comp(
        forum_id=comp_row["ForumId"],
        comp_id=comp_row["Id"],
        comp_name=comp_row["Title"],
        comp_slug=comp_row["Slug"],
        comp_subtitle=comp_row["Subtitle"],
        n=12,
    )
    # Competitions without solution topics return None and are skipped.
    if not html_string:
        continue
    html_list.append(html_string)

## Save Generated HTML as Website

In [None]:
# Wrap all rendered competition fragments in one minimal HTML document
full_html = "".join(["<html><body>", *html_list, "</body></html>"])

# Persist the document so it can be inspected and parsed later
with open("kaggle_winning_solutions.html", mode="w", encoding="utf-8") as file:
    file.write(full_html)

print(
    "HTML file saved. You can now open 'kaggle_winning_solutions.html' "
    "in your web browser to inspect the content."
)

## Parse Website & Scrape

### Data Part 1 (Competitions)

In [10]:
# Read the HTML file content
with open("kaggle_winning_solutions.html", "r", encoding="utf-8") as file:
    html_content = file.read()

soup = BeautifulSoup(html_content, 'html.parser')

# Each competition is one <h3> heading (title + link) followed by a
# subtitle paragraph, an optional tags paragraph and one solutions table.
competition_titles_raw = soup.find_all('h3')
competition_titles = [h3.text for h3 in competition_titles_raw]
competition_links = [h3.find('a', {'href': True})['href'] for h3 in competition_titles_raw]

# Extract short descriptions and tags, one entry per heading so that every
# list stays aligned with competition_titles.
details_small = []
tags_container = []
for h3 in competition_titles_raw:
    # Short description: first paragraph after the heading (None if absent,
    # instead of silently dropping the entry and shifting later ones).
    next_p_tag = h3.find_next('p')
    details_small.append(next_p_tag.text if next_p_tag is not None else None)

    # Collect the paragraphs that directly follow the heading. The walk
    # stops at the first non-<p> sibling (the table), so a competition
    # without a tags paragraph can no longer pick up a paragraph that
    # belongs to the next competition.
    following_paragraphs = []
    sibling = h3.find_next_sibling()
    while sibling and sibling.name == 'p' and len(following_paragraphs) < 2:
        following_paragraphs.append(sibling)
        sibling = sibling.find_next_sibling()

    tag_text = following_paragraphs[1].text if len(following_paragraphs) == 2 else None
    if tag_text is not None and 'Tags' in tag_text:
        tags_container.append(tag_text.replace('Tags : ', ''))
    else:
        tags_container.append(None)

# Extract solutions: every table row except the header row
solutions_container = soup.find_all('table')
solutions_raw = [table.find_all('tr')[1:] for table in solutions_container]

# Store data extracted so far as parallel lists with one entry per solution
# link; the competition-level values are repeated for each of its links.
all_competition_links = []
all_competition_titles = []
all_details_small = []
all_tags = []
all_solution_links = []
for comp_index, competition in enumerate(solutions_raw):
    solution_links = [row.find('a', {'href': True})['href'] for row in competition]
    link_count = len(solution_links)

    all_competition_titles.extend([competition_titles[comp_index]] * link_count)
    all_details_small.extend([details_small[comp_index]] * link_count)
    all_tags.extend([tags_container[comp_index]] * link_count)
    all_competition_links.extend([competition_links[comp_index]] * link_count)
    all_solution_links.extend(solution_links)

In [11]:
# One row per winning-solution link; competition fields repeat per link.
df = pd.DataFrame({
    'Competition Link': all_competition_links,
    'Competition Title': all_competition_titles,
    'Competition Short Description': all_details_small,
    'Competition Tags': all_tags,
    'Winning Solution Links': all_solution_links,
})

### Data Part 2 (Competitions)

In [59]:
# Ensure any previous Tor processes are terminated
os.system("taskkill /f /im tor.exe")

all_overviews = []
all_data = []
previous_link = None
max_retries = 300


def _clean_text(element):
    """Return ``element.text`` with newlines and zero-width spaces removed."""
    return element.text.replace('\n', '').replace('\u200b', '')


def _scrape_competition(browser, link, index):
    """Scrape the overview and data description of one competition.

    Returns:
        An ``(overview, data)`` tuple. Either entry is ``None`` when its
        container div is missing from the page.

    Raises:
        ValueError: when the competition page lacks its marker div. The
            caller treats this as a failed attempt and retries after
            restarting Tor and the browser.
    """
    browser.get(link)
    t.sleep(3)  # fixed wait before reading the page source
    page = BeautifulSoup(browser.page_source, 'html.parser')

    if not page.find('div', {'class': 'sc-kpdYNm bUTszs'}):
        raise ValueError(f'Error processing link {link} at index {index}')

    overview = None
    overview_box = page.find('div', {'class': 'sc-gZPpYW YYkRx'})
    if overview_box:
        overview = ' '.join(_clean_text(p) for p in overview_box.find_all('p'))

    browser.get(link + '/data')
    t.sleep(3)
    page = BeautifulSoup(browser.page_source, 'html.parser')

    data = None
    data_box = page.find('div', {'class': 'sc-ldJDQK inGMtK'})
    if data_box:
        # Prefer paragraphs; fall back to nested divs when there are none.
        data_nodes = data_box.find_all('p') or data_box.find_all('div')
        data = '. '.join(_clean_text(node) for node in data_nodes)

    return overview, data


tor_process = start_tor()
change_tor_identity()
browser = create_browser()

progress_bar = tqdm(all_competition_links)
log_display = display(HTML(""), display_id=True)
try:
    for index, link in enumerate(progress_bar):
        log_display.update(HTML(""))

        # Consecutive rows share the same competition link: reuse the
        # previous result (including a failed one) instead of re-scraping.
        if link == previous_link:
            all_overviews.append(overview)
            all_data.append(data)
            continue

        overview, data = None, None
        for retries in range(1, max_retries + 1):
            try:
                overview, data = _scrape_competition(browser, link, index)
                break
            except Exception as e:
                # `Exception` rather than a bare except, so Ctrl+C still
                # interrupts the run.
                log_display.update(HTML(f"<p>Error processing link {link}: {e}</p>"))
                if retries < max_retries:
                    log_display.update(HTML(f"<p>Retrying with a new IP address... retry number {retries}</p>"))
                    browser.quit()
                    t.sleep(0.5)
                    stop_tor(tor_process)
                    os.system("taskkill /f /im tor.exe")
                    tor_process = start_tor()
                    change_tor_identity()
                    browser = create_browser()
                else:
                    log_display.update(HTML("<p>Max retries reached. Moving to next link.</p>"))

        # Exactly one entry per link is appended, and only once the attempt
        # is final. This keeps both lists aligned with the DataFrame rows
        # even after partial failures or exhausted retries (None entries).
        all_overviews.append(overview)
        all_data.append(data)
        previous_link = link
finally:
    # Always release the browser and Tor, even on error or interruption.
    browser.quit()
    stop_tor(tor_process)
    # start_tor() writes 'torrc.tmp'; the original removed a non-existent 'torrc'.
    if os.path.exists('torrc.tmp'):
        os.remove('torrc.tmp')
    log_display.update(HTML("<p>Cleanup completed.</p>"))

  0%|          | 0/2751 [00:00<?, ?it/s]

In [None]:
# Attach the scraped competition texts as new columns
for column_name, column_values in (
    ('Competition Extended Description', all_overviews),
    ('Competition Data Description', all_data),
):
    df[column_name] = column_values

### Data Part 3 (Solutions)

In [53]:
# Ensure any previous Tor processes are terminated
os.system("taskkill /f /im tor.exe")

all_solutions = []
all_images = []
all_urls = []

progress_bar = tqdm(list(df['Winning Solution Links'].values))
log_display = display(HTML(""), display_id=True)
batch_size = 300


def _scrape_solution(browser, solution_url, index):
    """Scrape one solution post.

    Returns:
        A ``(text, has_image, urls)`` tuple: the joined paragraph text of
        the post, whether it contains an ``<img>``, and the list of its
        link targets (``None`` when it has no links).

    Raises:
        ValueError: when the post container div is missing. The caller
            treats this as a failed attempt and retries.
    """
    browser.get(solution_url)
    t.sleep(random.randint(5, 10))  # Randomize sleep time
    page = BeautifulSoup(browser.page_source, 'html.parser')

    post = page.find('div', {'class': 'sc-fbbrMC eWoIO'})
    if post is None:
        raise ValueError(f'Problem at iteration: {index}')

    text = ' '.join(
        p.text.replace('\n', '').replace('\u200b', '') for p in post.find_all('p')
    )
    has_image = post.find('img') is not None
    urls = [a['href'] for a in post.find_all('a', {'href': True})] or None
    return text, has_image, urls


try:
    # The loop variable is kept separate from the scraped text; the
    # original reused `solution` for both.
    for i, solution_url in enumerate(progress_bar):
        log_display.update(HTML(""))

        # Start a fresh Tor process and browser every `batch_size` links.
        if i % batch_size == 0:
            if i != 0:
                browser.quit()
                stop_tor(tor_process)
                os.system("taskkill /f /im tor.exe")

            tor_process = start_tor()
            change_tor_identity()
            browser = create_browser()

        solution, image, urls = None, None, None
        for retries in range(1, max_retries + 1):
            try:
                solution, image, urls = _scrape_solution(browser, solution_url, i)
                break
            except Exception as e:
                log_display.update(HTML(f"<p>Error processing solution {solution_url}: {e}</p>"))
                if retries < max_retries:
                    log_display.update(HTML(f"<p>Retrying with a new IP address... retry number {retries}</p>"))
                    browser.quit()
                    t.sleep(0.5)
                    stop_tor(tor_process)
                    os.system("taskkill /f /im tor.exe")
                    tor_process = start_tor()
                    change_tor_identity()
                    browser = create_browser()
                else:
                    log_display.update(HTML("<p>Max retries reached. Moving to next link.</p>"))

        # One entry per link in each list, appended together so the three
        # lists can never drift apart; failures are recorded as None.
        all_solutions.append(solution)
        all_images.append(image)
        all_urls.append(urls)
finally:
    # Always release the browser and Tor, even on error or interruption.
    browser.quit()
    stop_tor(tor_process)
    # start_tor() writes 'torrc.tmp'; the original removed a non-existent 'torrc'.
    if os.path.exists('torrc.tmp'):
        os.remove('torrc.tmp')
    log_display.update(HTML("<p>Cleanup completed.</p>"))

  0%|          | 0/638 [00:00<?, ?it/s]

In [64]:
# Attach the scraped solution fields as new columns
for column_name, column_values in (
    ('Solution Description', all_solutions),
    ('Contains Image', all_images),
    ('Internal Links', all_urls),
):
    df[column_name] = column_values

# Store

In [86]:
# Write the assembled dataset to CSV, leaving out the DataFrame index.
df.to_csv('kaggleWinningSolutions.csv', index=False)