# LLM Recommender

In [None]:
%pip install singlestoredb openai tiktoken beautifulsoup4 pandas python-dotenv Markdown praw PyGithub --quiet

## Importing dependencies

In [None]:
import re
import json
import openai
import tiktoken
import json
import requests
import pandas as pd
import singlestoredb as s2
from bs4 import BeautifulSoup
from markdown import markdown
from datetime import datetime
from time import time

from github import Github
from github import Auth

import praw

## Variables setup

In [None]:
# Database name; also used to qualify the table name in drop_table().
DB_NAME = 'llm_recommender'
# Connection string handed to s2.connect(); only the database path is given.
# NOTE(review): presumably host and credentials are supplied by the notebook
# environment - confirm before running elsewhere.
DB_CONNECTION_URL = f'/{DB_NAME}'

# OpenAI API key, assigned to openai.api_key in the AI setup cell.
OPENAI_API_KEY = ''

# NOTE(review): not referenced anywhere in the visible notebook - confirm
# whether this token is still needed.
HF_TOKEN = ''

# Credentials passed to praw.Reddit() in the Reddit setup cell.
REDDIT_USERNAME = ''
REDDIT_PASSWORD = ''
REDDIT_CLIENT_ID = ''
REDDIT_CLIENT_SECRET = ''
REDDIT_USER_AGENT = 'llm_recommender_1.0'

# Token passed to github.Auth.Token() in the GitHub setup cell.
GITHUB_ACCESS_TOKEN = ''

# Default max_tokens used by string_into_chunks().
TOKENS_LIMIT = 2047
# Texts at or below this token count are embedded whole; longer texts are
# split into chunks first. Kept 128 tokens under TOKENS_LIMIT.
# The identifier is spelled "TRASHHOLD" (sic, "threshold"); it is referenced
# by several functions, so the name is left as-is.
TOKENS_TRASHHOLD_LIMIT = TOKENS_LIMIT - 128

# Table names interpolated into the SQL statements throughout the notebook.
MODELS_TABLE_NAME = 'models'
MODEL_READMES_TABLE_NAME = 'model_readmes'
MODEL_REDDIT_POSTS_TABLE_NAME = 'model_reddit_posts'
MODEL_GITHUB_REPOS_TABLE_NAME = 'model_github_repos'

## Database setup

In [None]:
# Single module-level database connection, opened once from DB_CONNECTION_URL;
# the database helpers in this notebook obtain their cursors from it.
connection = s2.connect(DB_CONNECTION_URL)


def create_tables():
    """Create the four application tables if they do not exist yet.

    One ``CREATE TABLE IF NOT EXISTS`` statement is issued per table, in this
    order: models, model READMEs, model Reddit posts, model GitHub repos.
    Every statement runs on its own cursor taken from the module-level
    ``connection``.
    """
    def run_statement(statement):
        # A fresh cursor per statement, released as soon as it has executed.
        with connection.cursor() as cur:
            cur.execute(statement)

    # The whitespace inside the SQL literals below is intentionally untouched.

    # Leaderboard entry of a model (benchmark scores, hub stats) + embedding.
    models_ddl = f'''
                CREATE TABLE IF NOT EXISTS {MODELS_TABLE_NAME} (
                    id INT AUTO_INCREMENT PRIMARY KEY,
                    name VARCHAR(512) NOT NULL,
                    author VARCHAR(512) NOT NULL,
                    repo_id VARCHAR(1024) NOT NULL,
                    score DECIMAL(5, 2) NOT NULL,
                    arc DECIMAL(5, 2) NOT NULL,
                    hellaswag DECIMAL(5, 2) NOT NULL,
                    mmlu DECIMAL(5, 2) NOT NULL,
                    truthfulqa DECIMAL(5, 2) NOT NULL,
                    winogrande DECIMAL(5, 2) NOT NULL,
                    gsm8k DECIMAL(5, 2) NOT NULL,
                    link VARCHAR(255) NOT NULL,
                    downloads INT,
                    likes INT,
                    still_on_hub BOOLEAN NOT NULL,
                    created_at TIMESTAMP,
                    embedding BLOB
                )
            '''

    # README text (raw and cleaned) of a model, keyed by the model's repo id.
    readmes_ddl = f'''
                CREATE TABLE IF NOT EXISTS {MODEL_READMES_TABLE_NAME} (
                    id INT AUTO_INCREMENT PRIMARY KEY,
                    model_repo_id VARCHAR(512),
                    text LONGTEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,
                    clean_text LONGTEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,
                    created_at TIMESTAMP,
                    embedding BLOB
                )
            '''

    # Reddit posts found for a model.
    reddit_posts_ddl = f'''
                CREATE TABLE IF NOT EXISTS {MODEL_REDDIT_POSTS_TABLE_NAME} (
                    id INT AUTO_INCREMENT PRIMARY KEY,
                    model_repo_id VARCHAR(512),
                    post_id VARCHAR(256),
                    title VARCHAR(512) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,
                    clean_text LONGTEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,
                    link VARCHAR(256),
                    created_at TIMESTAMP,
                    embedding BLOB
                )
            '''

    # GitHub repositories found for a model.
    github_repos_ddl = f'''
                CREATE TABLE IF NOT EXISTS {MODEL_GITHUB_REPOS_TABLE_NAME} (
                    id INT AUTO_INCREMENT PRIMARY KEY,
                    model_repo_id VARCHAR(512),
                    repo_id INT,
                    name VARCHAR(512) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,
                    description TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,
                    clean_text LONGTEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,
                    link VARCHAR(256),
                    created_at TIMESTAMP,
                    embedding BLOB
                )
            '''

    for ddl in (models_ddl, readmes_ddl, reddit_posts_ddl, github_repos_ddl):
        run_statement(ddl)


def drop_table(table_name: str):
    """Drop ``table_name`` from the ``DB_NAME`` database if it exists.

    Args:
        table_name: Unqualified table name; it is interpolated into the
            statement as ``{DB_NAME}.{table_name}``.
    """
    statement = f'DROP TABLE IF EXISTS {DB_NAME}.{table_name}'

    with connection.cursor() as cur:
        cur.execute(statement)


def get_models(select='*', query='', as_dict=True):
    """Read rows from the models table.

    Args:
        select: Column list placed verbatim after ``SELECT``.
        query: Optional SQL tail (e.g. an ``ORDER BY`` clause) appended
            verbatim after the table name.
        as_dict: When true, each row is returned as a dict keyed by the
            column names reported by the cursor; otherwise the rows are
            returned exactly as the cursor fetched them.

    Returns:
        A list of dicts when ``as_dict`` is true, else the result of
        ``cursor.fetchall()``.
    """
    sql = f'SELECT {select} FROM {MODELS_TABLE_NAME}'
    if query:
        sql += f' {query}'

    with connection.cursor() as cur:
        cur.execute(sql)

        if not as_dict:
            return cur.fetchall()

        # The first item of every description entry is the column name.
        column_names = [column[0] for column in cur.description]
        return [dict(zip(column_names, record)) for record in cur.fetchall()]

## AI setup

In [None]:
# Set the key on the openai module itself so that later openai.* calls
# (see create_embedding) are authenticated with OPENAI_API_KEY.
openai.api_key = OPENAI_API_KEY


def count_tokens(text: str):
    """Return how many ``cl100k_base`` tokens ``text`` encodes to."""
    token_ids = tiktoken.get_encoding('cl100k_base').encode(text)
    return len(token_ids)


def create_embedding(input):
    """Embed ``input`` with the ``text-embedding-ada-002`` model.

    Args:
        input: Value forwarded unchanged as the ``input`` argument of
            ``openai.embeddings.create``.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    response = openai.embeddings.create(input=input, model='text-embedding-ada-002')
    first_item = response.data[0]
    return first_item.embedding

## Utils setup

In [None]:
class JSONEncoder(json.JSONEncoder):
    """JSON encoder that additionally serializes ``datetime`` objects.

    ``datetime`` values are written as ``'%Y-%m-%d %H:%M:%S'`` strings; every
    other unsupported type is delegated to the base class, which raises
    ``TypeError``.
    """

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``."""
        if not isinstance(obj, datetime):
            return super().default(obj)

        return obj.strftime('%Y-%m-%d %H:%M:%S')


def string_into_chunks(string: str, max_tokens=TOKENS_LIMIT):
    """Split ``string`` on spaces into chunks of at most ``max_tokens`` tokens.

    Words are accumulated greedily: a word is added to the current chunk as
    long as the space-joined chunk still fits in ``max_tokens`` (measured
    with ``count_tokens``); otherwise the chunk is closed and the word starts
    a new one.

    Args:
        string: Text to split.
        max_tokens: Token budget per chunk.

    Returns:
        ``[string]`` when the whole text already fits, else the list of
        chunks in original order. A single word that exceeds ``max_tokens``
        on its own cannot be split here and becomes its own (oversize) chunk.
        Empty chunks are never emitted.
    """
    if count_tokens(string) <= max_tokens:
        return [string]

    delimiter = ' '
    chunks = []
    current_chunk = []

    for word in string.split(delimiter):
        if count_tokens(delimiter.join(current_chunk + [word])) <= max_tokens:
            current_chunk.append(word)
            continue

        # Flush only a non-empty chunk: when the very first word (or the word
        # right after an oversize one) does not fit, current_chunk is still
        # empty and flushing it would emit an empty-string chunk.
        if current_chunk:
            chunks.append(delimiter.join(current_chunk))

        current_chunk = [word]

    if current_chunk:
        chunks.append(delimiter.join(current_chunk))

    return chunks


def clean_string(string: str):
    """Reduce Markdown/HTML text to compact, ASCII-only plain text.

    The input is rendered as Markdown, the resulting HTML is reduced to its
    text content and stripped, and then a fixed sequence of regex
    substitutions is applied (see the table below).

    Args:
        string: Markdown or HTML source text.

    Returns:
        The cleaned text.
    """
    # Markdown -> HTML -> visible text only.
    rendered_html = markdown(string)
    text = BeautifulSoup(rendered_html, "html.parser").get_text().strip()

    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    # (pattern, replacement) pairs; the order matters and must be kept.
    substitutions = (
        (r'[^\x00-\x7F]+', ''),  # drop every run of non-ASCII characters
        (r'\n+', '\n'),          # squeeze consecutive newlines
        (r'\s+', ' '),           # collapse any whitespace run into one space
        (r'\*\*+', '*'),         # two or more asterisks -> one
        (r'--+', '-'),           # two or more dashes -> one
        (r'====+', '='),         # four or more equals signs -> one
        (url_pattern, ''),       # strip http(s) URLs
    )

    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text

## Leaderboard setup

In [None]:
def leaderboard_get_df():
    """Download the leaderboard JSON and return its first 10 records.

    Returns:
        A ``pandas.DataFrame`` holding at most the first 10 leaderboard
        records. When the server does not answer with HTTP 200, a message is
        printed and an empty DataFrame is returned, so callers can iterate
        the result without having to special-case ``None``.

    Raises:
        requests.RequestException: If the request itself fails (connection
            error, timeout).
    """
    # NOTE(review): the URL carries a ``token`` query parameter - confirm
    # whether it is still required/valid and whether it should live in source.
    url = 'https://raw.githubusercontent.com/singlestore-labs/llm-recommender/main/backend/leaderboard/datasets/leaderboard.json?token=GHSAT0AAAAAACLI5LF64INSSDY7POI6CXF2ZM4LNOA'

    # Without a timeout a stalled connection would block the run indefinitely.
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        print("Failed to retrieve JSON file")
        return pd.DataFrame()

    data = json.loads(response.text)
    return pd.DataFrame(data).head(10)


def leaderboard_get_models():
    """Return the leaderboard records that are not stored in the models table.

    Returns:
        A list of dicts (one per leaderboard row) whose ``repo_id`` is not
        already present in the models table. Empty when the leaderboard
        could not be loaded or contains nothing new.
    """
    # A set gives constant-time membership checks for the filter below.
    existed_model_repo_ids = {row[0] for row in get_models('repo_id', as_dict=False)}
    leaderboard_df = leaderboard_get_df()

    # A failed download yields no usable frame (None or empty); treat it as
    # "nothing new" instead of crashing on .iterrows().
    if leaderboard_df is None or leaderboard_df.empty:
        return []

    return [
        row.to_dict()
        for _, row in leaderboard_df.iterrows()
        if row['repo_id'] not in existed_model_repo_ids
    ]


def leaderboard_insert_models(models):
    """Embed leaderboard models and their READMEs and insert them.

    For every model, all fields except ``readme`` are JSON-encoded, embedded
    and stored in the models table. A non-empty README is stored in the
    READMEs table: as a single row when it has at most
    ``TOKENS_TRASHHOLD_LIMIT`` tokens, otherwise as one row per chunk
    produced by ``string_into_chunks``. Each README row is embedded from its
    cleaned text, so the stored ``clean_text`` and ``embedding`` always match.

    Args:
        models: List of leaderboard records (dicts). Each must contain
            ``repo_id``; ``readme`` is optional.
    """
    if not models:
        return

    model_rows = []
    readme_rows = []

    for model in models:
        # Everything but the README forms the model row.
        # NOTE(review): created_at is passed unchanged into FROM_UNIXTIME(),
        # so it is presumably already a Unix timestamp - confirm.
        model_fields = {key: value for key, value in model.items() if key != 'readme'}
        model_embedding = str(create_embedding(json.dumps(model_fields, cls=JSONEncoder)))
        # 'embedding' is appended last so it lines up with the final
        # placeholder of the INSERT below.
        model_rows.append({**model_fields, 'embedding': model_embedding})

        readme_text = model.get('readme')

        # Skip missing or empty READMEs, and non-string placeholders.
        if not isinstance(readme_text, str) or not readme_text:
            continue

        repo_id = model['repo_id']

        # Short READMEs are stored whole, longer ones chunk by chunk.
        if count_tokens(readme_text) <= TOKENS_TRASHHOLD_LIMIT:
            pieces = [readme_text]
        else:
            pieces = string_into_chunks(readme_text)

        for piece in pieces:
            clean_text = clean_string(piece)
            embedding = str(create_embedding(json.dumps({
                'model_repo_id': repo_id,
                'clean_text': clean_text,
            })))
            # Key order mirrors the column order of the README INSERT.
            readme_rows.append({
                'model_repo_id': repo_id,
                'text': piece,
                'created_at': time(),
                'clean_text': clean_text,
                'embedding': embedding,
            })

    with connection.cursor() as cursor:
        # NOTE(review): values are bound positionally, so this relies on the
        # key order of the leaderboard records matching the column list.
        model_values = [tuple(row.values()) for row in model_rows]
        cursor.executemany(f'''
            INSERT INTO {MODELS_TABLE_NAME} (name, author, repo_id, score, link, still_on_hub, arc, hellaswag, mmlu, truthfulqa, winogrande, gsm8k, downloads, likes, created_at, embedding)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, FROM_UNIXTIME(%s), JSON_ARRAY_PACK(%s))
        ''', model_values)

        # Nothing to insert when no model carried a README.
        if readme_rows:
            readme_values = [tuple(row.values()) for row in readme_rows]
            cursor.executemany(f'''
                INSERT INTO {MODEL_READMES_TABLE_NAME} (model_repo_id, text, created_at, clean_text, embedding)
                VALUES (%s, %s, FROM_UNIXTIME(%s), %s, JSON_ARRAY_PACK(%s))
            ''', readme_values)

## GitHub setup

In [None]:
# Module-level GitHub client authenticated with GITHUB_ACCESS_TOKEN; used by
# github_search_repos() to run repository searches.
github = Github(auth=Auth.Token(GITHUB_ACCESS_TOKEN))


def github_search_repos(keyword: str, last_repo_created_at):
    """Search GitHub for repositories mentioning ``keyword``.

    The keyword is matched as an exact phrase against repository name,
    description and README. Repositories without a readable README, or
    whose README size is reported as more than 7000, are skipped.

    Args:
        keyword: Phrase to search for.
        last_repo_created_at: Optional date string; when truthy, only
            repositories created after it are searched
            (``created:>`` qualifier).

    Returns:
        A list of dicts with the keys ``repo_id``, ``name``, ``link``,
        ``created_at`` (Unix timestamp), ``description`` and ``readme``.
    """
    query = f'"{keyword}" in:name,description,readme'

    if last_repo_created_at:
        query += f' created:>{last_repo_created_at}'

    repos = []

    for repo in github.search_repositories(query):
        try:
            readme_file = repo.get_readme()

            if readme_file.size > 7000:
                continue

            readme = readme_file.decoded_content.decode('utf-8')

            repos.append({
                'repo_id': repo.id,
                'name': repo.name,
                'link': repo.html_url,
                'created_at': repo.created_at.timestamp(),
                'description': repo.description or '',
                'readme': readme,
            })
        except Exception:
            # Best effort: a repo whose README is missing or cannot be
            # decoded is skipped. Catching Exception rather than using a
            # bare except lets KeyboardInterrupt/SystemExit propagate.
            continue

    return repos


def github_get_models_repos(existed_models):
    """Find new GitHub repositories for each stored model.

    For every model, the creation time of the most recent repo already
    stored for it is looked up, and GitHub is searched only for repos
    created after that date. The search keyword is the model name when it
    contains a digit, otherwise the model's ``repo_id``.

    Args:
        existed_models: Iterable of dicts with ``repo_id`` and ``name``.

    Returns:
        A dict mapping model ``repo_id`` to the non-empty list of repos
        found for it. Models whose processing raises are reported via
        ``print`` and skipped.
    """
    repos = {}

    for model in existed_models:
        try:
            repo_id = model['repo_id']

            with connection.cursor() as cursor:
                # repo_id is bound as a parameter rather than interpolated,
                # so quotes in it cannot break or alter the statement.
                cursor.execute(f"""
                    SELECT UNIX_TIMESTAMP(created_at) FROM {MODEL_GITHUB_REPOS_TABLE_NAME}
                    WHERE model_repo_id = %s
                    ORDER BY created_at DESC
                    LIMIT 1
                """, (repo_id,))

                row = cursor.fetchone()

            # No stored repo yet (or a NULL created_at): search without a
            # date filter.
            last_repo_created_at = None
            if row and row[0] is not None:
                last_repo_created_at = datetime.fromtimestamp(float(row[0])).strftime("%Y-%m-%d")

            keyword = model['name'] if re.search(r'\d', model['name']) else repo_id
            found_repos = github_search_repos(keyword, last_repo_created_at)

            if found_repos:
                repos[repo_id] = found_repos
        except Exception as e:
            # Keep going with the remaining models; just report the failure.
            print(e)

    return repos


def github_insert_models_repos(repos):
    """Embed and insert the GitHub repositories found for each model.

    Each repo README is cleaned with ``clean_string``. Texts of at most
    ``TOKENS_TRASHHOLD_LIMIT`` tokens produce one row; longer texts produce
    one row per chunk from ``string_into_chunks``. Every row is embedded
    from a JSON document holding the model repo id, repo name, description
    and that row's text. Rows are inserted with one ``executemany`` per model.

    Args:
        repos: Dict mapping model ``repo_id`` to a list of repo dicts with
            the keys ``repo_id``, ``name``, ``description``, ``readme``,
            ``link`` and ``created_at``.
    """
    with connection.cursor() as cur:
        for model_repo_id, model_repos in repos.items():
            if len(model_repos) == 0:
                continue

            rows = []

            for repo in model_repos:
                # Key order mirrors the column order of the INSERT below.
                record = {
                    'model_repo_id': model_repo_id,
                    'repo_id': repo['repo_id'],
                    'name': repo['name'],
                    'description': repo['description'],
                    'clean_text': clean_string(repo['readme']),
                    'link': repo['link'],
                    'created_at': repo['created_at'],
                }

                payload = {
                    'model_repo_id': model_repo_id,
                    'name': record['name'],
                    'description': record['description'],
                    'clean_text': record['clean_text']
                }

                full_text = record['clean_text']

                # A text within the limit is its own single piece.
                if count_tokens(full_text) <= TOKENS_TRASHHOLD_LIMIT:
                    pieces = [full_text]
                else:
                    pieces = string_into_chunks(full_text)

                for piece in pieces:
                    vector = str(create_embedding(json.dumps({
                        **payload,
                        'clean_text': piece
                    })))
                    rows.append({**record, 'clean_text': piece, 'embedding': vector})

            cur.executemany(f'''
                INSERT INTO {MODEL_GITHUB_REPOS_TABLE_NAME} (model_repo_id, repo_id, name, description, clean_text, link, created_at, embedding)
                VALUES (%s, %s, %s, %s, %s, %s, FROM_UNIXTIME(%s), JSON_ARRAY_PACK(%s))
            ''', [list(row.values()) for row in rows])

## Reddit setup

In [None]:
# Module-level Reddit client built from the REDDIT_* settings; used by
# reddit_search_posts() to run searches.
# https://www.reddit.com/prefs/apps
reddit = praw.Reddit(
    username=REDDIT_USERNAME,
    password=REDDIT_PASSWORD,
    client_id=REDDIT_CLIENT_ID,
    client_secret=REDDIT_CLIENT_SECRET,
    user_agent=REDDIT_USER_AGENT
)


def reddit_search_posts(keyword: str, latest_post_timestamp):
    """Search all of Reddit for posts that mention ``keyword``.

    Up to 100 results of the last year, sorted by relevance, are inspected.
    A post is kept when its title or body contains ``keyword`` verbatim, it
    is not marked over-18, and - if ``latest_post_timestamp`` is truthy - it
    was created after that timestamp.

    Args:
        keyword: Phrase to search for (sent as a quoted query).
        latest_post_timestamp: Optional UTC timestamp; falsy disables the
            creation-time filter.

    Returns:
        A list of dicts with the keys ``post_id``, ``title``, ``text``,
        ``link`` and ``created_at``.
    """
    # https://www.reddit.com/dev/api/#GET_search
    # https://praw.readthedocs.io/en/stable/code_overview/models/subreddit.html#praw.models.Subreddit.search
    search_results = reddit.subreddit('all').search(
        f'"{keyword}"', sort='relevance', time_filter='year', limit=100
    )

    matches = []

    for submission in search_results:
        mentions_keyword = keyword in submission.title or keyword in submission.selftext

        if not mentions_keyword or submission.over_18:
            continue

        # Only posts newer than the latest stored one are of interest.
        if latest_post_timestamp and not (submission.created_utc > latest_post_timestamp):
            continue

        matches.append({
            'post_id': submission.id,
            'title': submission.title,
            'text': submission.selftext,
            'link': f'https://www.reddit.com{submission.permalink}',
            'created_at': submission.created_utc,
        })

    return matches


def reddit_get_models_posts(existed_models):
    """Find new Reddit posts for each stored model.

    For every model, the creation time of the most recent post already
    stored for it is looked up, and Reddit is searched only for posts newer
    than that. The search keyword is the model name when it contains a
    digit, otherwise the model's ``repo_id``.

    Args:
        existed_models: Iterable of dicts with ``repo_id`` and ``name``.

    Returns:
        A dict mapping model ``repo_id`` to the non-empty list of posts
        found for it.
    """
    posts = {}

    for model in existed_models:
        repo_id = model['repo_id']

        with connection.cursor() as cursor:
            # repo_id is bound as a parameter rather than interpolated, so
            # quotes in it cannot break or alter the statement.
            cursor.execute(f"""
                SELECT UNIX_TIMESTAMP(created_at) FROM {MODEL_REDDIT_POSTS_TABLE_NAME}
                WHERE model_repo_id = %s
                ORDER BY created_at DESC
                LIMIT 1
            """, (repo_id,))

            row = cursor.fetchone()

        # No stored post yet (or a NULL created_at): search without a
        # creation-time filter.
        if row is not None and row[0] is not None:
            latest_post_timestamp = float(row[0])
        else:
            latest_post_timestamp = None

        keyword = model['name'] if re.search(r'\d', model['name']) else repo_id
        found_posts = reddit_search_posts(keyword, latest_post_timestamp)

        if found_posts:
            posts[repo_id] = found_posts

    return posts


def reddit_insert_models_posts(posts):
    """Embed and insert the Reddit posts found for each model.

    Each post body is cleaned with ``clean_string``. Texts of at most
    ``TOKENS_TRASHHOLD_LIMIT`` tokens produce one row; longer texts produce
    one row per chunk from ``string_into_chunks``. Every row is embedded
    from a JSON document holding the model repo id, post title and that
    row's text. Rows are inserted with one ``executemany`` per model.

    Args:
        posts: Dict mapping model ``repo_id`` to a list of post dicts with
            the keys ``post_id``, ``title``, ``text``, ``link`` and
            ``created_at``.
    """
    if len(posts) == 0:
        return

    with connection.cursor() as cur:
        for model_repo_id, model_posts in posts.items():
            if len(model_posts) == 0:
                continue

            rows = []

            for post in model_posts:
                # Key order mirrors the column order of the INSERT below.
                record = {
                    'model_repo_id': model_repo_id,
                    'post_id': post['post_id'],
                    'title': post['title'],
                    'clean_text': clean_string(post['text']),
                    'link': post['link'],
                    'created_at': post['created_at'],
                }

                payload = {
                    'model_repo_id': model_repo_id,
                    'title': record['title'],
                    'clean_text': record['clean_text']
                }

                full_text = record['clean_text']

                # A text within the limit is its own single piece.
                if count_tokens(full_text) <= TOKENS_TRASHHOLD_LIMIT:
                    pieces = [full_text]
                else:
                    pieces = string_into_chunks(full_text)

                for piece in pieces:
                    vector = str(create_embedding(json.dumps({
                        **payload,
                        'clean_text': piece
                    })))
                    rows.append({**record, 'clean_text': piece, 'embedding': vector})

            cur.executemany(f'''
                INSERT INTO {MODEL_REDDIT_POSTS_TABLE_NAME} (model_repo_id, post_id, title, clean_text, link, created_at, embedding)
                VALUES (%s, %s, %s, %s, %s, FROM_UNIXTIME(%s), JSON_ARRAY_PACK(%s))
            ''', [list(row.values()) for row in rows])

## Execute scheduled logic

In [None]:
# Make sure all tables exist before anything is read or written.
create_tables()

# Step 1: store leaderboard models that are not in the models table yet.
leaderboard_models = leaderboard_get_models()
leaderboard_insert_models(leaderboard_models)

# Re-read the stored models (repo_id and name only), highest score first;
# the GitHub and Reddit steps below both work from this list.
existed_models = get_models('repo_id, name', 'ORDER BY score DESC')

# Step 2: find and store GitHub repositories per model.
models_github_repos = github_get_models_repos(existed_models)
github_insert_models_repos(models_github_repos)

# Step 3: find and store Reddit posts per model.
models_reddit_posts = reddit_get_models_posts(existed_models)
reddit_insert_models_posts(models_reddit_posts)