# Packages

In [30]:
import pandas as pd
import openai
import ast
import numpy as np
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from fake_useragent import UserAgent
from datetime import datetime
import time as t
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options 
from selenium.webdriver.common.keys import Keys

import logging
import warnings
warnings.filterwarnings("ignore")

# Functions

In [225]:
# Function to read the API key from a text file
def load_api_key(file_path='api_key.txt'):
    """Return the contents of ``file_path`` with surrounding whitespace removed.

    Args:
        file_path: Path of the text file holding the key. Defaults to
            'api_key.txt' in the current working directory.

    Returns:
        The file's text, stripped of leading/trailing whitespace.
    """
    with open(file_path, 'r') as key_file:
        raw_key = key_file.read()
    return raw_key.strip()

In [16]:
def flatten(A):
    """Return a flat list with the items of ``A``, expanding nested lists.

    Only ``list`` instances are expanded (recursively); any other value,
    including tuples and strings, is kept as a single item. Order is preserved.
    """
    flat = []
    for item in A:
        if not isinstance(item, list):
            flat.append(item)
            continue
        # Nested list: splice in its flattened contents.
        flat += flatten(item)
    return flat

In [227]:
def extract_list_from_response(response_content):
    """
    Extract the first Python list literal found in an LLM response.

    Candidates are tried in this order:
      1. every line that, once stripped of surrounding whitespace, starts with
         '[' and ends with ']' (this skips code-fence lines and prose);
      2. the span from the first '[' to the last ']' of the whole response,
         which covers a list that is spread over several lines.

    Each candidate is parsed with ``ast.literal_eval`` (no code is executed).
    The first candidate that parses to a ``list`` is returned.

    Args:
        response_content: Text of the response; ``None`` or '' is accepted.

    Returns:
        The parsed list, or ``[]`` when no candidate parses to a list.
    """
    if not response_content:
        return []

    # Single-line candidates, tolerant of indentation / trailing spaces.
    stripped_lines = (line.strip() for line in response_content.splitlines())
    candidates = [
        line for line in stripped_lines
        if line.startswith('[') and line.endswith(']')
    ]

    # Fallback candidate: the outermost bracketed span (multi-line lists).
    first_bracket = response_content.find('[')
    last_bracket = response_content.rfind(']')
    if first_bracket != -1 and last_bracket > first_bracket:
        candidates.append(response_content[first_bracket:last_bracket + 1])

    for candidate in candidates:
        try:
            # Safely evaluate the candidate as a Python literal
            parsed = ast.literal_eval(candidate)
        except (SyntaxError, ValueError, TypeError):
            continue
        # "[1], [2]" starts with '[' and ends with ']' but evaluates to a tuple.
        if isinstance(parsed, list):
            return parsed
    return []

In [1004]:
# Function to process the dataframe
def process_dataframe(df, api_key, chatgptfunction, NewColNameStr, UseColNameStr):
    """Apply an LLM helper to every cell of one column and store the results.

    Args:
        df: DataFrame to process. It is modified in place (a column is added
            or overwritten) and also returned.
        api_key: Key forwarded unchanged as the second argument of
            ``chatgptfunction``.
        chatgptfunction: Callable ``(cell_value, api_key) -> list``.
        NewColNameStr: Name of the column that receives the results.
        UseColNameStr: Name of the column whose cells are processed.

    Returns:
        The same ``df`` object, with ``NewColNameStr`` populated.

    Cells that carry no data (``None``, NaN, ``[]`` or ``['']``) yield ``[]``
    without calling ``chatgptfunction``. Exceptions raised by
    ``chatgptfunction`` propagate to the caller instead of triggering a
    second, silent call.
    """
    def process_list(x):
        # Missing cell: nothing to send to the model.
        if x is None or (isinstance(x, float) and np.isnan(x)):
            return []
        # Empty list, or the [''] produced by splitting an empty string.
        if isinstance(x, list) and (not x or x == ['']):
            return []
        return chatgptfunction(x, api_key)

    df[NewColNameStr] = df[UseColNameStr].apply(process_list)
    return df

In [231]:
def extract_list_from_response(response_content):
    """
    Extract the first Python list literal found in an LLM response.

    Candidates are tried in this order:
      1. every line that, once stripped of surrounding whitespace, starts with
         '[' and ends with ']' (this skips code-fence lines and prose);
      2. the span from the first '[' to the last ']' of the whole response,
         which covers a list that is spread over several lines.

    Each candidate is parsed with ``ast.literal_eval`` (no code is executed).
    The first candidate that parses to a ``list`` is returned.

    Args:
        response_content: Text of the response; ``None`` or '' is accepted.

    Returns:
        The parsed list, or ``[]`` when no candidate parses to a list.
    """
    if not response_content:
        return []

    # Single-line candidates, tolerant of indentation / trailing spaces.
    stripped_lines = (line.strip() for line in response_content.splitlines())
    candidates = [
        line for line in stripped_lines
        if line.startswith('[') and line.endswith(']')
    ]

    # Fallback candidate: the outermost bracketed span (multi-line lists).
    first_bracket = response_content.find('[')
    last_bracket = response_content.rfind(']')
    if first_bracket != -1 and last_bracket > first_bracket:
        candidates.append(response_content[first_bracket:last_bracket + 1])

    for candidate in candidates:
        try:
            # Safely evaluate the candidate as a Python literal
            parsed = ast.literal_eval(candidate)
        except (SyntaxError, ValueError, TypeError):
            continue
        # "[1], [2]" starts with '[' and ends with ']' but evaluates to a tuple.
        if isinstance(parsed, list):
            return parsed
    return []

In [465]:
# Function to standardize model names
def standardize_and_generalize_model_names(model_list, api_key):
    """Ask the "gpt-4o" chat model to standardize a list of model names.

    Args:
        model_list: Model names to standardize. It is interpolated into the
            prompt via ``str()``; ``['']`` is treated as "nothing to do".
        api_key: OpenAI API key; assigned to the module-level
            ``openai.api_key`` on every call.

    Returns:
        The non-blank string items of the list parsed from the reply, stripped
        and de-duplicated in order of first appearance. ``[]`` when
        ``model_list == ['']`` or when no list can be parsed from the reply.
    """
    openai.api_key = api_key

    if model_list == ['']:
        return []

    standardization_prompt = f"""
    Standardize the following list of machine learning and deep learning model names according to the following rules:
    1. Standardize variations of the same model to a single form. Make sure short forms of model names are changed into long form (e.g., 'xgb' and 'XGBoost' should both be 'xgboost', 'CatBoost' and 'CB' should be 'catboost', LGBM should be lightgbm, etc.).
    2. For deep learning architectures, keep only the base architecture name (e.g., ResNet18 and ResNet50 should both be resnet).
    3. For general architecture types, simplify to the basic form (e.g., '2D CNN' and '3D CNN' should be 'cnn', '2D Unet' should be 'unet', rnn model should be rnn, etc.).
    4. Remove duplicates, ensuring each model name appears only once.

    List: {str(model_list)}

    Return only a final unique Python list of the model names.
    """

    # Use the chat completions endpoint for the standardization step
    generalizing_response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": standardization_prompt}
        ]
    )
    # The message content can be None; fall back to '' so parsing yields [].
    response_text = generalizing_response.choices[0].message.content or ""
    generalized_names = extract_list_from_response(response_text)

    # Keep only non-blank strings (the reply may contain other literals) and
    # de-duplicate while preserving the order of first appearance.
    stripped_names = (name.strip() for name in generalized_names if isinstance(name, str))
    return list(dict.fromkeys(name for name in stripped_names if name))

In [710]:
# Function to filter non-model noise out of a list of model names
def filter_models(model_list, api_key):
    """Ask the "gpt-4o" chat model to drop non-ML/DL entries from a list.

    Args:
        model_list: Names to filter. It is interpolated into the prompt via
            ``str()``; ``['']`` is treated as "nothing to do".
        api_key: OpenAI API key; assigned to the module-level
            ``openai.api_key`` on every call.

    Returns:
        The non-blank string items of the list parsed from the reply, stripped
        and de-duplicated in order of first appearance. ``[]`` when
        ``model_list == ['']`` or when no list can be parsed from the reply.
    """
    openai.api_key = api_key

    if model_list == ['']:
        return []

    filtering_prompt = f"""
    Filter the following list of machine learning (ML) and deep learning (DL) model and method names according to the following rules:
    1. Remove anything none ML or DL related (e.g. Python package and library names, as well as operations such as pairwise ranking, or smoothing)
    2. Remove things like paths, usernames for websites, etc.
    3. Remove anything very general such as the 'deep learning', or machine learning.
    3. Do not remove things such as names of language models, DL models and architecture types.

    List: {str(model_list)}

    Keep the remaining elements of the list with the exactlty as they were originally. Return only a final Python list containing the filtered elements.
    """

    # Use the chat completions endpoint for the filtering step
    filtering_response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": filtering_prompt}
        ]
    )
    # The message content can be None; fall back to '' so parsing yields [].
    response_text = filtering_response.choices[0].message.content or ""
    filtered_names = extract_list_from_response(response_text)

    # Keep only non-blank strings (the reply may contain other literals) and
    # de-duplicate while preserving the order of first appearance.
    stripped_names = (name.strip() for name in filtered_names if isinstance(name, str))
    return list(dict.fromkeys(name for name in stripped_names if name))

In [954]:
# Function to clean competition tags
def clean_tags(tag_list, api_key):
    """Ask the "gpt-4o" chat model to clean a list of competition tags.

    Args:
        tag_list: Tags to clean. It is interpolated into the prompt via
            ``str()``; ``['']`` is treated as "nothing to do".
        api_key: OpenAI API key; assigned to the module-level
            ``openai.api_key`` on every call.

    Returns:
        The non-blank string items of the list parsed from the reply, stripped
        and de-duplicated in order of first appearance. ``[]`` when
        ``tag_list == ['']`` or when no list can be parsed from the reply.
    """
    openai.api_key = api_key
    if tag_list == ['']:
        return []

    cleaning_prompt = f"""
    Clean the following list of machine learning (ML) and deep learning (DL) task tags according to the following rules:
    1. Remove anything none ML or DL task related.
    2. Remove names of general fields such as education, sports, mathematics, robotics, etc. as they are not task specific (task specific examples are e.g image classification).
    3. Standardize the names of the tags into a simple form if neccesary.
    4. If neccessary, merge two tags into one (e.g 'image', 'classification' can be 'image classification').
    5. General ML fields such as computer vision or classical machine learning are fine.

    List: {str(tag_list)}

    Return only a final Python list without duplicates containing the cleaned tags.
    """

    # Use the chat completions endpoint for the cleaning step
    cleaning_response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": cleaning_prompt}
        ]
    )
    # The message content can be None; fall back to '' so parsing yields [].
    response_text = cleaning_response.choices[0].message.content or ""
    cleaned_names = extract_list_from_response(response_text)

    # Keep only non-blank strings (the reply may contain other literals) and
    # de-duplicate while preserving the order of first appearance.
    stripped_names = (name.strip() for name in cleaned_names if isinstance(name, str))
    return list(dict.fromkeys(name for name in stripped_names if name))

In [1060]:
# Function to group models hierarchically 
def group_models(mod_list, api_key):
    """Ask the "gpt-4o" chat model for the broader categories of a model list.

    Args:
        mod_list: Model names to categorize. It is interpolated into the
            prompt via ``str()``; ``['']`` is treated as "nothing to do".
        api_key: OpenAI API key; assigned to the module-level
            ``openai.api_key`` on every call.

    Returns:
        The non-blank string items of the list parsed from the reply, stripped
        and de-duplicated in order of first appearance. ``[]`` when
        ``mod_list == ['']`` or when no list can be parsed from the reply.
    """
    openai.api_key = api_key
    if mod_list == ['']:
        return []

    grouping_prompt = f"""
   Given the following list of machine learning (ML) and deep learning (DL) models/architectures, categorize each model into its broader category or type. For example, if the model is 'efficientnet,' categorize it under 'cnn'. If the model is 'xgboost,' or 'catboost', 'lightgbm' categorize it under 'boosting', etc. For cases where a given model is a classical ML model and no specific category exists for it, you can label it as 'classical ml'. Similarly, group all models based on their broader ML/DL category.

    List: {str(mod_list)}

    Return only a final Python list without duplicates containing the broader category/types of the models within the list.
    """

    # Use the chat completions endpoint for the grouping step
    grouping_response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": grouping_prompt}
        ]
    )
    # The message content can be None; fall back to '' so parsing yields [].
    response_text = grouping_response.choices[0].message.content or ""
    grouped_names = extract_list_from_response(response_text)

    # Keep only non-blank strings (the reply may contain other literals) and
    # de-duplicate while preserving the order of first appearance.
    stripped_names = (name.strip() for name in grouped_names if isinstance(name, str))
    return list(dict.fromkeys(name for name in stripped_names if name))

# Preprocessing

In [None]:
# Read the API key from the default file ('api_key.txt'); it is passed to every LLM helper below.
api_key = load_api_key()

In [235]:
# Load the extracted-solutions table that the rest of the notebook enriches.
data = pd.read_csv('kaggleWinningSolutionsExtracted.csv')

In [237]:
# Turn the stringified lists in 'Extracted Model Names' into Python lists.
# The whole column is reassigned instead of writing through the chained
# `data[col][i] = ...` form: pandas does not guarantee that a chained write
# reaches `data`, and the warning it would emit is silenced by the
# `warnings.filterwarnings("ignore")` call at the top of the notebook.
def _parse_extracted_model_names(value):
    """Split a stringified list into its items; leave NaN cells untouched."""
    if isinstance(value, float) and np.isnan(value):
        return value
    return value.replace('[', '').replace(']', '').replace('\'', '').strip(' ').split(', ')


data['Extracted Model Names'] = data['Extracted Model Names'].apply(_parse_extracted_model_names)

## Standardize Model Names

In [467]:
# Add 'Generalized Model Names' by standardizing each row's extracted model names.
# process_dataframe modifies and returns its input, so `df` and `data` are the same object.
df = process_dataframe(data, api_key, standardize_and_generalize_model_names, 'Generalized Model Names', 'Extracted Model Names')

In [471]:
# Checkpoint the result to Excel (without the index column).
df.to_excel('kaggleWinningSolutionsGeneralized.xlsx', index=False)

## Filter Noise from Models

In [482]:
# Reload the checkpoint written by the standardization step.
data = pd.read_excel('kaggleWinningSolutionsGeneralized.xlsx')

In [718]:
# Add 'Filtered Generalized Model Names' by filtering each row's generalized names.
# NOTE(review): unlike the other steps, the source column is not parsed back into
# lists after read_excel here, so filter_models presumably receives each cell as
# text (it is interpolated into the prompt via str()) - confirm this is intended.
df = process_dataframe(data, api_key, filter_models, 'Filtered Generalized Model Names', 'Generalized Model Names')

In [836]:
# Overwrite the checkpoint with the filtered column included.
df.to_excel('kaggleWinningSolutionsGeneralized.xlsx', index=False)

## Clean Competition Tags

In [838]:
# Reload the checkpoint written by the filtering step.
data = pd.read_excel('kaggleWinningSolutionsGeneralized.xlsx')

In [842]:
# Turn the stringified lists in 'Competition Tags' into Python lists.
# The whole column is reassigned instead of writing through the chained
# `data[col][i] = ...` form: pandas does not guarantee that a chained write
# reaches `data`, and the warning it would emit is silenced by the
# `warnings.filterwarnings("ignore")` call at the top of the notebook.
def _parse_competition_tags(value):
    """Split a stringified tag list into its items; leave NaN cells untouched."""
    if isinstance(value, float) and np.isnan(value):
        return value
    return value.replace('[', '').replace(']', '').replace('"', '').replace('\'', '').split(', ')


data['Competition Tags'] = data['Competition Tags'].apply(_parse_competition_tags)

In [964]:
# Add 'Clean Competition Tags' by cleaning each row's list of competition tags.
df = process_dataframe(data, api_key, clean_tags, 'Clean Competition Tags', 'Competition Tags')

In [968]:
# Overwrite the checkpoint with the cleaned tags included.
df.to_excel('kaggleWinningSolutionsGeneralized.xlsx', index=False)

## Categorize Models Hierarchically

In [971]:
# Reload the checkpoint written by the tag-cleaning step.
data = pd.read_excel('kaggleWinningSolutionsGeneralized.xlsx')

In [988]:
# Turn the stringified lists in 'Filtered Generalized Model Names' into Python lists.
# The whole column is reassigned instead of writing through the chained
# `data[col][i] = ...` form: pandas does not guarantee that a chained write
# reaches `data`, and the warning it would emit is silenced by the
# `warnings.filterwarnings("ignore")` call at the top of the notebook.
def _parse_filtered_model_names(value):
    """Split a stringified list into its items; leave NaN cells untouched."""
    if isinstance(value, float) and np.isnan(value):
        return value
    return value.replace('[', '').replace(']', '').replace('\'', '').strip(' ').split(', ')


data['Filtered Generalized Model Names'] = data['Filtered Generalized Model Names'].apply(_parse_filtered_model_names)

In [1068]:
# Add 'Broader Hierarchical Group' by mapping each row's filtered model names to broader categories.
df = process_dataframe(data, api_key, group_models, 'Broader Hierarchical Group', 'Filtered Generalized Model Names')

In [1070]:
# Overwrite the checkpoint with the hierarchical groups included.
df.to_excel('kaggleWinningSolutionsGeneralized.xlsx', index=False)

# Scrape Timestamps

In [6]:
# Reload the checkpoint; its 'Competition Link' column drives the scraping below.
data = pd.read_excel('kaggleWinningSolutionsGeneralized.xlsx')

In [36]:
# NOTE(review): `ua` is created but not referenced by any visible cell - no
# user-agent is set on `options`; confirm whether that was intended.
ua = UserAgent()
# Chrome options used by the scraping cell: incognito, headless (no window),
# and the 'AutomationControlled' Blink feature disabled.
options = Options()
options.add_argument("--incognito")
options.add_argument("--headless")
options.add_argument('--disable-blink-features=AutomationControlled')

In [54]:
# Dictionary to store unique links and their corresponding dates
# (value is a 'dd-mm-YYYY' string, or None when no date could be extracted).
unique_links_dates = {}

browser = webdriver.Chrome(options=options)
try:
    for link in tqdm(data['Competition Link']):
        # Missing links (NaN) cannot be opened; they stay out of the mapping.
        if not isinstance(link, str):
            continue

        # Check if the link has already been processed
        if link in unique_links_dates:
            continue

        browser.get(link)
        t.sleep(1)  # give the page a moment to render before reading it
        source = browser.page_source
        soup = BeautifulSoup(source, 'html.parser')

        date = None
        # NOTE(review): these class names look auto-generated and may change
        # when the site is redeployed - verify the selectors before re-running.
        test_date = soup.find('div', {'class':'sc-dxmpTp evmcxM'})
        if test_date:
            date_container = test_date.find_all('span', {'class':'sc-bSlUec bdWnNi'})
            # The last matching span is taken as the date; it may be absent.
            if date_container:
                date_raw = date_container[-1].text.strip()
                try:
                    date_obj = datetime.strptime(date_raw, '%b %d, %Y')
                    date = date_obj.strftime('%d-%m-%Y')
                except ValueError:
                    # Text is not in the expected 'Mon DD, YYYY' form.
                    date = None

        # Store the link and its date (None if the date is not found)
        unique_links_dates[link] = date
finally:
    # Always release the browser, even if a page load or parse step fails.
    browser.quit()

The chromedriver version (127.0.6533.72) detected in PATH at C:\chromedriver.exe might not be compatible with the detected chrome version (128.0.6613.113); currently, chromedriver 128.0.6613.86 is recommended for chrome 128.*, so it is advised to delete the driver in PATH and retry


  0%|          | 0/2613 [00:00<?, ?it/s]

In [60]:
# Attach the scraped date to every row by looking up its competition link.
data['Date'] = data['Competition Link'].map(unique_links_dates)

In [64]:
# Overwrite the checkpoint with the 'Date' column included.
data.to_excel('kaggleWinningSolutionsGeneralized.xlsx', index=False)