In [10]:
%pip install pandas openai matplotlib

Note: you may need to restart the kernel to use updated packages.


In [11]:
import pandas as pd
from pathlib import Path

# Line prefixes that mark the start of a (sync or async) function definition.
DEF_PREFIXES = ['def ', 'async def ']
# Separator used to split file contents into lines and to re-join extracted lines.
NEWLINE = '\n'

def get_function_name(code):
    """
    Extract function name from text beginning with 'def' or 'async def'.

    Parameters
    ----------
    code : str
        Source text expected to start with one of DEF_PREFIXES.

    Returns
    -------
    str or None
        The text between the prefix and the first '(' with surrounding
        whitespace removed, or None when ``code`` does not start with a
        known prefix or contains no '('.
    """
    for prefix in DEF_PREFIXES:
        if code.startswith(prefix):
            # partition never raises; an empty separator means no '(' was found.
            name, paren, _ = code[len(prefix):].partition('(')
            # strip() handles the legal-but-unusual "def name (args):" spelling.
            return name.strip() if paren else None
    return None

def get_until_no_space(all_lines, i):
    """
    Collect the line at index ``i`` plus every following line that still
    belongs to the same definition, and return them joined by NEWLINE.

    A following line is kept when it is empty or when its first character
    is a space, a tab or ')'. The first line that fails this check ends
    the collection.
    """
    block = [all_lines[i]]
    cursor = i + 1
    while cursor < len(all_lines):
        line = all_lines[cursor]
        # A non-empty line starting at column 0 (other than ')') is outside the definition.
        if line and line[0] not in (' ', '\t', ')'):
            break
        block.append(line)
        cursor += 1
    return NEWLINE.join(block)

def get_functions(filepath):
    """
    Get all functions in a Python file.

    Only lines that start (at column 0) with one of DEF_PREFIXES are treated
    as the beginning of a function, so indented definitions are not picked up.

    Parameters
    ----------
    filepath : str
        Path of the file to scan; it is read as UTF-8.

    Yields
    ------
    dict
        One dict per detected function with the keys 'code' (the extracted
        source text), 'function_name' and 'filepath' (the argument as given).
    """
    # Read everything up front so the file handle is closed before the first
    # yield rather than staying open while the consumer processes results.
    with open(filepath, 'r', encoding = 'utf8') as file:
        all_lines = file.read().replace('\r', NEWLINE).split(NEWLINE)

    # str.startswith accepts a tuple, which replaces the per-prefix loop.
    def_prefixes = tuple(DEF_PREFIXES)
    for i, line in enumerate(all_lines):
        if line.startswith(def_prefixes):
            code = get_until_no_space(all_lines, i)
            yield {
                'code': code,
                'function_name': get_function_name(code),
                'filepath': filepath,
            }

def extract_functions_from_repo(code_root):
    """
    Extract all .py functions from the repository.

    Parameters
    ----------
    code_root : pathlib.Path
        Directory that is searched recursively for ``*.py`` files.

    Returns
    -------
    list of dict or None
        Every record yielded by get_functions for every file found, or
        None when no .py file exists under ``code_root``.

    Prints the number of files found and the number of functions extracted.
    """
    code_files = list(code_root.glob('**/*.py'))

    num_files = len(code_files)
    print(f'Total number of .py files: {num_files}')

    if num_files == 0:
        # Report the path that was actually searched instead of a fixed repo name.
        print(f'Verify the repository exists at {code_root} and code_root is set correctly.')
        return None

    all_funcs = [
        func
        for code_file in code_files
        for func in get_functions(str(code_file))
    ]

    num_funcs = len(all_funcs)
    print(f'Total number of functions extracted: {num_funcs}')

    return all_funcs

In [12]:
# Use the current user's home directory as the base for locating the repository
root_dir = Path.home()
print(root_dir)

# Assumes the 'llama-cpp-python' repository is checked out under <home>/source/repos
code_root = root_dir / 'source/repos/llama-cpp-python'
print(code_root)

# Extract all functions from the repository
all_funcs = extract_functions_from_repo(code_root)

C:\Users\shafi
C:\Users\shafi\source\repos\llama-cpp-python
Total number of .py files: 50
Total number of functions extracted: 218


In [13]:
%pip install plotly scipy




In [14]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [20]:
import os
from getpass import getpass

import openai

# Never store credentials in the notebook: read the key from the environment
# and fall back to an interactive prompt so it is not written to the file.
# NOTE(review): a key that was previously committed in this cell must be
# treated as leaked and revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

In [22]:
from openai.embeddings_utils import get_embedding

# get_embedding is called once per extracted function.
df = pd.DataFrame(all_funcs)
df['code_embedding'] = df['code'].apply(lambda x: get_embedding(x, engine='text-embedding-ada-002'))
# Store paths relative to the repository root.
df['filepath'] = df['filepath'].map(lambda x: Path(x).relative_to(code_root))

# to_csv does not create missing parent directories, so make sure "data/" exists first.
output_csv = Path("data/code_search_openai-python.csv")
output_csv.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_csv, index=False)
df.head()

Unnamed: 0,code,function_name,filepath,code_embedding
0,"def make_request(url, params=None):\n print...",make_request,docker\open_llama\hug_model.py,"[-0.0029997548554092646, 0.006617534905672073,..."
1,def check_magic_and_version(filename):\n wi...,check_magic_and_version,docker\open_llama\hug_model.py,"[0.0019175842171534896, -0.011885509826242924,..."
2,"def download_file(url, destination):\n prin...",download_file,docker\open_llama\hug_model.py,"[-0.013736424967646599, 0.0036590476520359516,..."
3,def get_user_choice(model_list):\n # Print ...,get_user_choice,docker\open_llama\hug_model.py,"[-0.009958215989172459, -0.01502623688429594, ..."
4,def main():\n # Create an argument parser\n...,main,docker\open_llama\hug_model.py,"[-0.024131689220666885, 0.026415444910526276, ..."


In [23]:
from openai.embeddings_utils import cosine_similarity

def search_functions(df, code_query, n=3, pprint=True, n_lines=7):
    """
    Rank the rows of ``df`` against ``code_query`` and return the best ``n``.

    The query is embedded with get_embedding, each row's ``code_embedding``
    is scored against it with cosine_similarity, and the scores are written
    to a 'similarities' column on ``df`` itself. When ``pprint`` is truthy,
    each returned row is printed as a header line, the first ``n_lines``
    lines of its code, and a 70-character dashed separator.

    Returns the ``n`` highest-scoring rows as a DataFrame.
    """
    query_embedding = get_embedding(code_query, engine='text-embedding-ada-002')

    def score(row_embedding):
        return cosine_similarity(row_embedding, query_embedding)

    df['similarities'] = df.code_embedding.apply(score)
    top_matches = df.sort_values('similarities', ascending=False).head(n)

    if not pprint:
        return top_matches

    for _, row in top_matches.iterrows():
        print(f"{row.filepath}:{row.function_name}  score={round(row.similarities, 3)}")
        preview = row.code.split("\n")[:n_lines]
        print("\n".join(preview))
        print('-' * 70)

    return top_matches

In [24]:
# Print the 3 best-scoring functions for this query (default preview of 7 lines each).
res = search_functions(df, 'fine-tuning input data validation logic', n=3)

examples\low_level_api\common.py:gpt_params_parse  score=0.727
def gpt_params_parse(argv = None):
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-s", "--seed", type=int, default=-1, help="RNG seed (use random seed for <= 0)",dest="seed")
    parser.add_argument("-t", "--threads", type=int, default=min(4, os.cpu_count() or 1), help="number of threads to use during computation",dest="n_threads")
    parser.add_argument("-n", "--n_predict", type=int, default=128, help="number of tokens to predict (-1 = infinity)",dest="n_predict")
    parser.add_argument("--n_parts", type=int, default=-1, help="number of model parts", dest="n_parts")
    parser.add_argument("-c", "--ctx_size", type=int, default=512, help="size of the prompt context",dest="n_ctx")
----------------------------------------------------------------------
llama_cpp\server\app.py:make_logit_bias_processor  score=0.715
def make_logit_bias_processor(
    llama:

In [25]:
# Print the 2 best-scoring functions for this query, previewing up to 10 lines of each.
res = search_functions(df, 'find common suffix', n=2, n_lines=10)

llama_cpp\llama_cpp.py:llama_token_suffix  score=0.74
def llama_token_suffix(ctx: llama_context_p) -> int:
    return _lib.llama_token_suffix(ctx)


----------------------------------------------------------------------
vendor\llama.cpp\examples\finetune\convert-finetune-checkpoint-to-gguf.py:tensor_name  score=0.727
def tensor_name(key, bid=None, suffix=".weight"):
    return gguf.TENSOR_NAMES[key].format(bid=bid) + suffix

----------------------------------------------------------------------


In [26]:
# Print only the single best-scoring function, previewing up to 20 lines of it.
res = search_functions(df, 'Command line interface for fine-tuning', n=1, n_lines=20)

examples\low_level_api\common.py:gpt_params_parse  score=0.775
def gpt_params_parse(argv = None):
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("-s", "--seed", type=int, default=-1, help="RNG seed (use random seed for <= 0)",dest="seed")
    parser.add_argument("-t", "--threads", type=int, default=min(4, os.cpu_count() or 1), help="number of threads to use during computation",dest="n_threads")
    parser.add_argument("-n", "--n_predict", type=int, default=128, help="number of tokens to predict (-1 = infinity)",dest="n_predict")
    parser.add_argument("--n_parts", type=int, default=-1, help="number of model parts", dest="n_parts")
    parser.add_argument("-c", "--ctx_size", type=int, default=512, help="size of the prompt context",dest="n_ctx")
    parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size for prompt processing",dest="n_batch")
    parser.add_argument("--keep", type=int, default=