In [14]:
import os
import threading
import time

import numpy as np
import pandas as pd
import requests

from embedding_model import Embeddings
from logistic_regression import LR
from pattern_matching import PatternMatching

# Setup

In [15]:
# Embeddings model
# `targets` is handed to Embeddings together with the loaded tables; its keys
# run 1..5 (presumably one per hint level -- confirm against Embeddings).
targets = dict(enumerate(
    [
        'hints_name',
        'hints_tags',
        'hints_short_description',
        'hints_full_description',
        'naics_label',
    ],
    start=1,
))

# Load every CSV once and keep both the individual frames and a keyed bundle.
business_taxonomy = pd.read_csv('Business_category_taxonomy.csv')
naics_taxonomy = pd.read_csv('Naics3labeltaxonomy.csv')
hints = pd.read_csv('tournament_hints_data.csv')
labels = pd.read_csv('cleaned_naics_codes_final.csv')
files = dict(
    business=business_taxonomy,
    naics=naics_taxonomy,
    hints=hints,
    labels=labels,
)

model_name = 'all-MiniLM-L6-v2'
embeddings = Embeddings(model_name, targets, files)

# Pattern matching
pattern_matching = PatternMatching(labels)

# Logistic Regression
lr = LR('pipeline.joblib')

# NAICS label mapping
# dictionary_labels: NAICS code -> row position in the taxonomy table.
# reversed_label_dict: row position -> NAICS code (used to decode class indices).
dictionary_labels = {}
for i, naics_code in enumerate(naics_taxonomy['naics_code'].to_numpy()):
    dictionary_labels[naics_code] = i
reversed_label_dict = dict(zip(dictionary_labels.values(), dictionary_labels.keys()))

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [3]:
def run():
    """Play a single round: fetch a hint, choose an answer, and submit it.

    Reads the module-level names ``base_url``, ``headers``, ``pattern_matching``,
    ``lr``, ``embeddings``, ``reversed_label_dict`` and ``naics_taxonomy``.

    Flow:
      1. GET ``/evaluate/hint`` and read ``hint`` and ``level`` from the JSON body.
      2. Pick a prediction with a level-dependent strategy (pattern matching,
         logistic regression, embeddings, or the literal ``'abstain'``).
      3. POST the answer to ``/evaluate/answer`` and print the server's verdict.

    Returns:
        None. Progress and failures are reported with ``print``; the function
        returns early when the hint cannot be fetched or parsed, when the level
        is not 1-5, or when the answer cannot be posted.
    """
    # Get a new hint for current company or get the first hint for a new company after calling /evaluate/reset
    try:
        response = requests.get(f"{base_url}/evaluate/hint", headers=headers)
    except requests.RequestException:
        print('GET request failed')
        return

    # The server can answer with an error document (e.g. a 500 body without
    # 'hint'/'level'), so parse defensively instead of crashing the worker thread.
    try:
        response_data = response.json()
        hint = response_data['hint']
        level = response_data['level']
    except (ValueError, KeyError, TypeError) as exc:
        print(f'Unexpected hint response (HTTP {response.status_code}): {exc!r}')
        return
    time.sleep(1)  # presumably pacing between API calls -- TODO confirm

    # Rounds
    # NOTE(review): `confidence` and `abstained` are locals, so they restart at
    # 0.6 / False on every call. The `confidence -= 0.1` updates are therefore
    # never observed, and at level 5 the `else` branch cannot be reached. If the
    # intent was to carry this state across rounds it must live outside run().
    confidence = 0.6
    abstained = False

    if level == 1:
        pred = pattern_matching.predict_naics(hint)
        if pred is None:
            # No pattern match: try the classifier, and fall back to embeddings
            # when its best probability is below the threshold.
            confidence -= 0.1
            threshold = 0.5
            probs = lr.predict_proba(hint)
            best_class_indices = np.argmax(probs, axis=1)
            best_probabilities = np.max(probs, axis=1)
            if best_probabilities[0] >= threshold:
                pred = reversed_label_dict[best_class_indices[0]]
            else:
                confidence -= 0.1
                pred = embeddings(hint, level)
    elif level == 2:
        confidence -= 0.1
        pred = embeddings(hint, level)
    elif level == 3:
        if np.random.rand() > confidence and not abstained:
            pred = 'abstain'
            abstained = True
        else:
            confidence -= 0.1
            pred = embeddings(hint, level)
    elif level == 4:
        if np.random.rand() > confidence and not abstained:
            pred = 'abstain'
            abstained = True
        else:
            confidence -= 0.1
            pred = embeddings(hint, level)
    elif level == 5:
        if not abstained:
            pred = 'abstain'
            abstained = True
        else:
            pred = embeddings(hint, level)
    else:
        # Without this guard `pred` would be unbound below.
        print(f'Unexpected hint level {level!r}; no answer submitted')
        return
    print(f'Round {level}, Hint - {hint}, Pred - {pred}')

    # Fix: the mapping is a dict and must be looked up, not called.
    mapping_dict = dict(zip(naics_taxonomy['naics_code'], naics_taxonomy['naics_label']))
    label = mapping_dict.get(pred)
    # 'abstain' (or a code missing from the taxonomy) has no label; send the
    # bare prediction then. NOTE(review): assumes the API accepts a bare
    # 'abstain' answer -- confirm against the API spec.
    answer = f'{pred} - {label}' if label is not None else f'{pred}'
    data = {
        'answer': answer
    }
    try:
        response = requests.post(f"{base_url}/evaluate/answer", json=data, headers=headers)
    except requests.RequestException:
        print('POST request failed')
        return
    try:
        response_data = response.json()
        print(f"Round {level}, Result={response_data['result']}, Score={response_data['score']}, Answer={response_data['answer']}\n")
    except (ValueError, KeyError, TypeError) as exc:
        # Best effort: the answer was already submitted, so only report the problem.
        print(f'Could not read answer response (HTTP {response.status_code}): {exc!r}')

# Running cell

In [13]:
base_url = 'http://116.202.111.229:8000'
# SECURITY: credentials should not live in the notebook. The key is read from
# the EVAL_API_KEY environment variable; the literal fallback only keeps
# existing runs working and should be rotated and removed.
api_key = os.environ.get('EVAL_API_KEY', 'AjBzyzk6Wg8V6hD48EH8btOOOJOAtYn1')

headers = {
    'x-api-key': api_key
}

# 5 rounds
# for _ in range(5):
# Play the round in a worker thread so a slow prediction can be given up on
# after 4.5 seconds.
thread = threading.Thread(target=run)
thread.start()
thread.join(timeout=4.5)

# Check if thread is still alive after the timeout
if thread.is_alive():
    # Timed out: submit a randomly sampled NAICS code instead.
    # NOTE(review): the worker thread is not cancelled and may still post its
    # own answer later -- confirm the server tolerates a second submission.
    pred = naics_taxonomy['naics_code'].sample(1).iloc[0]
    mapping_dict = dict(zip(naics_taxonomy['naics_code'], naics_taxonomy['naics_label']))
    data = {
        # Fix: the mapping is a dict and must be indexed, not called.
        'answer': f'{pred} - {mapping_dict[pred]}'
    }
    response = requests.post(f"{base_url}/evaluate/answer", json=data, headers=headers)
    try:
        response_data = response.json()
        # 'level' is read with .get because nothing visible here confirms the
        # answer endpoint returns it.
        print(f"Round {response_data.get('level', '?')}, Result={response_data['result']}, Score={response_data['score']}, Answer={response_data['answer']}\n")
    except (ValueError, KeyError, AttributeError) as exc:
        print(f'Could not read fallback answer response (HTTP {response.status_code}): {exc!r}')
else:
    thread.join()
    time.sleep(1)

# Get hints about a new company
# response = requests.get(f"{base_url}/evaluate/reset", headers=headers)
# print(response.status_code, response.json())

Round 5, Hint - Art & Music Publishers, Pred - 513


In [28]:
# Get hints about a new company
# Calls /evaluate/reset and shows the HTTP status next to the decoded JSON body.
response = requests.get(f"{base_url}/evaluate/reset", headers=headers)
status, payload = response.status_code, response.json()
print(status, payload)

500 {'title': '500 Internal Server Error'}
