In [2]:
import os # package for using operating system
from dotenv import load_dotenv
from snowflake.snowpark.session import Session # package for building and using Snowflake sessions
# from snowflake.cortex import Complete
# from transformers import AutoTokenizer # package to select the fitting tokenizer for a pretrained model
# from huggingface_hub import login # package for login and identifying to Huggingface
import pandas as pd # package for data manipulation
import json # package to work with .json
import math # package for mathematical operations

In [3]:
# Read the prepared repository dump. The encoding is pinned to UTF-8 so the file
# is decoded the same way on every platform; without it open() falls back to the
# locale's preferred encoding, which can fail on non-ASCII source code.
with open('../data/input_readme_data/pallets_click.json', 'r', encoding='utf-8') as json_file:
    loaded_data = json.load(json_file)

In [4]:
# Pull the three fields the rest of the notebook works with out of the loaded JSON document.
repo_owner, repo_name, input_txt = (
    loaded_data['repo_owner'],
    loaded_data['repo_name'],
    loaded_data['source_code_cleaned_comments'],
)

In [5]:
def write_summary_prompt(repo_name: str, input_txt: str) -> str:
    '''
    Build the prompt that asks a model to summarize a GitHub repository.

    Both arguments are inserted verbatim into a fixed instruction template.

    Args:
        repo_name: Name of the GitHub repository to summarize.
        input_txt: Source code of the GitHub repository as a single string.

    Return:
        The filled-in prompt text, including the template's leading newline
        and its indentation.
    '''
    return f'''
        You are acting as a software development expert for the following GitHub repository "{repo_name}".
        Your task is to summarize the given source code string "{input_txt}" in natural language, so a specialist is able to understand
        the purpose of the repository.
        Identify its purpose, key functions, main components and dependencies. Focus on the overall architecture and structure 
        rather than line-by-line details. Do not add any recommendations or improvement suggestions, but concentrate on the summary. 
        Present the summary in a clear and concise language.
        You are not allowed to add any small talk. 
    '''

In [6]:
# Assemble the summary prompt for the repository loaded from the JSON file.
summary_prompt = write_summary_prompt(repo_name=repo_name, input_txt=input_txt)

In [7]:
# size of the assembled prompt in characters
len(summary_prompt)

435098

In [11]:
# Load the variables from the .env file; override=True lets values from the file
# replace variables that are already set in the process environment.
load_dotenv(override=True)

# Connection parameters for the Snowflake session.
# Account and credentials must be present in the environment (a missing variable
# raises KeyError). Role and warehouse can be overridden with SNOWFLAKE_ROLE and
# SNOWFLAKE_WAREHOUSE, so the notebook does not have to run as ACCOUNTADMIN; when
# these variables are not set, the previous values are used unchanged.
connection_params = {
    "account": os.environ['SNOWFLAKE_ACCOUNT'],
    "user": os.environ['SNOWFLAKE_USER'],
    "password": os.environ['SNOWFLAKE_USER_PASSWORD'],
    "role": os.environ.get('SNOWFLAKE_ROLE', 'ACCOUNTADMIN'),
    "warehouse": os.environ.get('SNOWFLAKE_WAREHOUSE', 'COMPUTE_WH'),
    'paramstyle': 'qmark',  # matches the "?" placeholders used in the SQL statements
}

# Build the Snowflake session from the connection parameters.
snowflake_session = Session.builder.configs(connection_params).create()
print('Snowflake sessions is build.')
print('---------------------------------------------')

Snowflake sessions is build.
---------------------------------------------


In [12]:
# Name of the Cortex model.
model = 'llama3.1-8b'

# Options for the summary request, see
# https://docs.snowflake.com/en/sql-reference/functions/complete-snowflake-cortex
#   temperature: default is 0; web research did not turn up any other recommendation
#   top_p:       not set here (default: 0)
model_summary_params = dict(temperature=0, max_tokens=4000)


In [13]:
# SQL statement for SNOWFLAKE.CORTEX.COUNT_TOKENS. The two "?" placeholders are
# bound at execution time; nothing is interpolated into the string itself, so a
# plain (non-f) string literal is sufficient.
query = """
            SELECT SNOWFLAKE.CORTEX.COUNT_TOKENS(
                ?,
                ?
            ) AS response
        """

In [None]:
# Run the COUNT_TOKENS statement with the model name and the full summary prompt
# bound to the two placeholders.
# The reported number is lower than the number of tokens actually processed, because
# COUNT_TOKENS does not account for the tokens that are automatically added to the
# beginning of the input text:
# https://docs.snowflake.com/en/sql-reference/functions/count_tokens-snowflake-cortex
response = (
    snowflake_session
    .sql(query, params=[model, summary_prompt])
    .collect()
)

In [21]:
# show the collected result rows of the COUNT_TOKENS statement
print(response)

[Row(RESPONSE=117246)]


In [15]:
# first row of the collected result; note that this is the row object, not the
# number itself - the token count is its first field (input_tokens[0])
input_tokens = response[0]

In [19]:
# token count reported by COUNT_TOKENS (first field of the result row)
input_tokens[0]

117246

In [None]:
# estimate of the average number of characters per token for this prompt
# (prompt length in characters divided by the counted tokens)
len(summary_prompt) / input_tokens[0]

3.7109837435818704

In [12]:
# rough token estimate for the prompt, assuming 4 characters per token
len(summary_prompt) // 4

108774

In [17]:
# close the Snowflake session once the token counting is done
snowflake_session.close()