# Packages

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import pipeline
import torch
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from keybert import KeyBERT
import openai
import ast
import re

import logging
import warnings
warnings.filterwarnings("ignore")

# Functions

In [221]:
# Function to read the API key from a text file
def load_api_key(file_path='api_key.txt'):
    with open(file_path, 'r') as file:
        return file.read().strip()

In [4]:
def extract_list_from_response(response_content):
    """
    This helper function processes the response content to extract the actual Python list.
    It removes any extraneous backticks and code block markers, and then safely evaluates
    the content to return the list.
    """
    # Remove code block markers and extraneous characters
    cleaned_content = [line for line in response_content.splitlines() if line.startswith('[') and line.endswith(']')]
    
    if cleaned_content:
        try:
            # Safely evaluate the cleaned content as a Python list
            return ast.literal_eval(cleaned_content[0])
        except (SyntaxError, ValueError):
            return []
    return []

In [161]:
# Function to extract and filter model names using a two-prompt approach
def extract_and_filter_model_names(description, api_key):
    openai.api_key = api_key

    if pd.isna(description):
        return []

    extraction_prompt = f"""
    Extract only the names of machine learning and deep learning models mentioned in the following solution description, keeping only the names that are used as part of the final solution. List all names in their original format. Do not change the format (if the model name is abbreviated keep it that way, letter case, etc.). Return only a Python list with the names. Do not include any additional text.

    Solution Description:
    {description}
    """

    # Use the chat completions endpoint for chat models like "gpt-4o"
    extraction_response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": extraction_prompt}
        ]
    )
    extracted_names = extract_list_from_response(extraction_response.choices[0].message.content)
    
    if not extracted_names or all(name.strip() == '' for name in extracted_names):
        return []

    filtering_prompt = f"""
    From the extracted list of names, exclude:
    1. Terms related to pooling, activation functions, normalization, and operations.
    2. Dataset names and hyperlinks.
    3. Duplicates, ensuring each model/method name appears only once.
    4. Packages/GitHub repos, mathematical operations.
    5. Additional model information.

    Return only a Python list with the filtered names. Do not include any additional text.

    Extracted list: {extracted_names}

    Provide a unique list of the remaining names.
    """

    # Use the chat completions endpoint for the filtering step
    filtering_response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": filtering_prompt}
        ]
    )

    filtered_names = extract_list_from_response(filtering_response.choices[0].message.content)
    
    return list(set([name.strip() for name in filtered_names if name.strip()]))

In [19]:
# Function to process the dataframe
def process_dataframe(df, api_key):
    df['Extracted Model Names'] = df['Solution Description'].apply(
        lambda x: extract_and_filter_model_names(x, api_key) if pd.notna(x) else []
    )
    return df

In [540]:
def normalize_empty_list(lst):
    """Convert a list with an empty string to an actual empty list."""
    if lst == [""]:
        return []
    return lst
    
def calculate_metrics(df):
    true_names = df['True Model Names']
    extracted_names = df['Extracted Model Names']
    
    f1_scores = []
    jaccard_indices = []
    
    for true, extracted in zip(true_names, extracted_names):
        # Normalize both true and extracted lists
        true_set = set(map(str.lower, normalize_empty_list(true)))
        extracted_set = set(map(str.lower, extracted))
        
        # Handle the case where both sets are empty: perfect match
        if not true_set and not extracted_set:
            f1_scores.append(1.0)
            jaccard_indices.append(1.0)
            continue
        
        # Calculate True Positives (TP), False Positives (FP), and False Negatives (FN)
        tp = len(true_set.intersection(extracted_set))
        fp = len(extracted_set - true_set)
        fn = len(true_set - extracted_set)
        
        # Calculate Precision, Recall, and F1 Score
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        
        f1_scores.append(f1)
        
        # Calculate Jaccard Index (Intersection over Union)
        if len(true_set.union(extracted_set)) > 0:
            jaccard = len(true_set.intersection(extracted_set)) / len(true_set.union(extracted_set))
        else:
            jaccard = 1.0  # This shouldn't be necessary now, but for safety
        
        jaccard_indices.append(jaccard)
    
    df['F1 Score'] = f1_scores
    df['Jaccard Index'] = jaccard_indices
    
    # Calculate final (mean) F1 Score and Jaccard Index
    final_f1_score = sum(f1_scores) / len(f1_scores)
    final_jaccard_index = sum(jaccard_indices) / len(jaccard_indices)
    
    return df, final_f1_score, final_jaccard_index

# Preprocessing

In [9]:
data = pd.read_csv('KaggleWinningSolutionsTraining.csv')

In [11]:
for i, value in enumerate(data['Competition Tags']):
    if isinstance(value, float) and np.isnan(value):
        continue
    else:
        data['Competition Tags'][i] = value.split(', ')

In [None]:
# 95:5 train/test split
train_data, test_data = train_test_split(data, test_size=0.05, random_state=42, shuffle=True)

In [None]:
train_data.to_csv('kaggleWinningSolutionsTraining.csv', index=False)
test_data.to_excel('kaggleWinningSolutionsTesting.xlsx', index=False)

In [480]:
testdata = pd.read_excel('KaggleWinningSolutionsTesting.xlsx')

# Analysis

## Useless Methods

### Pre-trained Models with Hugging Face

In [None]:
# Suppress logging warnings
logging.getLogger("transformers").setLevel(logging.ERROR)

# Check if a GPU is available
device = 0 if torch.cuda.is_available() else -1

# Load pre-trained NER model with device parameter
ner_model = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", device=device)
text = data['Solution Description'][0]

# Perform NER
entities = ner_model(text)

# Extract model names
model_names = [entity['word'] for entity in entities if entity['entity'] in ['MODEL', 'ORG']] # 'ORG' can sometimes capture model names
model_names

### KeyBert

In [None]:
# Load KeyBERT model
kw_model = KeyBERT()
text = data['Solution Description'][2]

# Extract keywords
keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 2), stop_words=None)
keywords

### Text Similarity and Clustering (Spacy)

In [None]:
# Load Spacy model
nlp = spacy.load("en_core_web_md")

# Define a list of known ML models
known_models = [
    "LightGBM", "XGBoost", "CatBoost", "Random Forest", "SVM", "Naive Bayes",
    "KNN", "Logistic Regression", "Gradient Boosting", "Neural Network", "LSTM", "GRU"
]

# Get embeddings for known models
known_model_vectors = np.array([nlp(model).vector for model in known_models])

# Sample text
text = data['Solution Description'][0]

# Process the text
doc = nlp(text)

# Extract potential model names based on similarity
potential_models = []
for token in doc:
    if token.has_vector:
        similarity = cosine_similarity([token.vector], known_model_vectors).max()
        if similarity > 0.6:  # You can adjust the threshold
            potential_models.append(token.text)

print("Potential model names:", set(potential_models))

## ChatGPT API

### Main Data

In [234]:
api_key = load_api_key()

df = process_dataframe(data, api_key)

In [240]:
df.to_csv('kaggleWinningSolutionsExtracted.csv', index=False)

### Testing Set F1 Score & Jaccard Index

In [482]:
for i, value in enumerate(testdata['True Model Names']):
    if isinstance(value, float) and np.isnan(value):
        continue
    else:
        testdata['True Model Names'][i] = value.replace('"', '').replace('[', '').replace(']', '').strip(' ').split(', ')

In [508]:
df_test = process_dataframe(testdata, api_key)

In [512]:
df_test.to_excel('kaggleWinningSolutionsTestingExtracted.xlsx', index=False)

In [558]:
df_test = pd.read_excel('kaggleWinningSolutionsTestingExtracted.xlsx')

In [560]:
testset_with_metrics, final_f1_score, final_jaccard_index = calculate_metrics(df_test)

testset_with_metrics.head()
print(f"Final F1 Score: {round(final_f1_score, 2)}")
print(f"Final Jaccard Index: {round(final_jaccard_index, 2)}")

Final F1 Score: 0.85
Final Jaccard Index: 0.78
