In [40]:
import os
from dotenv import load_dotenv
from snowflake.snowpark.session import Session
from snowflake.snowpark.context import get_active_session
from snowflake.cortex import Summarize, Complete, ExtractAnswer, Sentiment, Translate
import pandas as pd
import json

### Build Snowflake session

In [41]:
load_dotenv()

True

In [42]:
connection_params = {
    "account": os.environ['SNOWFLAKE_ACCOUNT'],
    "user": os.environ['SNOWFLAKE_USER'],
    "password": os.environ['SNOWFLAKE_USER_PASSWORD'],
    "role": 'ACCOUNTADMIN',
    #"database": 'SNOWFLAKE_LEARNING_DB',
    "warehouse": 'COMPUTE_WH'
    #"schema": 'PUBLIC',
    }

In [43]:
snowflake_session = Session.builder.configs(connection_params).create()

### Define model

In [44]:
# model = 'mistral-7b'
# max number of input tokens = 32,000
# max number of output tokens =  8,192
# credits per 1 million token = 0.12 ~ 8 million tokens
# model = 'llama3.1-8b'##
model = 'llama3.2-3b'



### Load input data

In [45]:
# load json
with open('../data/input_data/kaxap_arl.json', 'r')  as file:
    loaded_data = json.load(file)

In [46]:
source_code_cleaned_comments = loaded_data['source_code_cleaned_comments']
license = loaded_data['license']
requirements = loaded_data['requirements']

In [47]:
input_txt = source_code_cleaned_comments

In [48]:
repo_name = 'arl'
repo_owner = 'kaxap'
license = ''
requirements = ''

### Create summary prompt

In [49]:
prompt_summary = f'''
    You are acting like software development expert for the following GitHub repository {repo_name}.
    Your task is to summarize the given source code string "{input_txt}" in natural language so a specialist is able to understand
    the purpose of the repository.
    Identify its purpose, key functionalites, main components and dependencies. Focus on the overall architecture and structure 
    rather than line-by-line details. Do not add any recomandations or improvement suggestions but concentrate on the summary. Present the summary in a clear and concise language. 
''' 
prompt_summary = prompt_summary.replace("'", "\\'")

In [50]:
len(prompt_summary)

5362

### Count input tokens before send to Snowflake

In [51]:
res_cnt_input = snowflake_session.sql(f"SELECT SNOWFLAKE.CORTEX.COUNT_TOKENS('{model}', '{prompt_summary}') AS token_count").collect()
input_tokens = res_cnt_input[0]['TOKEN_COUNT']

In [52]:
print(input_tokens)

1344


### Execute summary query

In [53]:
df = snowflake_session.sql(f"select snowflake.cortex.complete('{model}', '{prompt_summary}') as response").to_pandas()
#query_id = snowflake_session.sql("SELECT last_query_id()").collect()[0][0]


In [54]:
#df = snowflake_session.sql(f"select snowflake.cortex.complete('{model}', '{prompt_summary}') as response").to_pandas()
# lama3.2-1b

In [56]:
summary_txt = df.iloc[0]['RESPONSE']
print(summary_txt)

**Repository Summary**

The repository is a Python script that generates README files for popular programming languages on GitHub. The script uses the GitHub API to retrieve information about repositories, including their stars, forks, issues, and last commit dates.

**Key Functionalities**

1. **Language Support**: The script supports a list of programming languages, including Verilog, VHDL, V, Erlang, Kotlin, D, Crystal, Idris, Python, Java, C, CPP, SQL, Node, CSharp, PHP, Ruby, TypeScript, Swift, ObjectiveC, VB.net, Assembly, R, Perl, MATLAB, Go, Scala, Groovy, Lua, Haskell, CoffeeScript, Clojure, Rust, JavaScript, ActionScript, Elixir, Elm, and PureScript.
2. **GitHub API Integration**: The script uses the GitHub API to retrieve repository information, including stars, forks, issues, and last commit dates.
3. **Rate Limiting**: The script implements rate limiting to prevent excessive requests to the GitHub API, ensuring that the script does not exceed the API's rate limits.
4. **Da

### Create README prompt

In [162]:
prompt_readme = f'''
    You are acting like software development expert for the following GitHub repository {repo_name} from the owner {repo_owner}. 
    Your task is to create a README file for the repository in Markdown format. 
    Use the provided summary: "{summary_txt}", the license: "{license}" and the given requirements: "{requirements}" 
    The README file should contain information about what the project does, why it is useful, how users 
    can get started, where they can get help, and how to maintain and contribute to the project.
    If you don't know the answer add a hint following this style […]. You're not allowed to create 
    made-up content to fill gaps and you're not allowed to add additional paragraphs.
    Use the following Markdown template and fill each paragraph. 

    ## Titel

    ## Installation

    ## Usage

    ## Contributing

    ## License

    Do not include any sensitive data like names or emails. Keep the markdown file clean and structured.
'''
prompt_readme = prompt_readme.replace("'", "\\'")

In [163]:
df = snowflake_session.sql(f"select snowflake.cortex.complete('{model}', '{prompt_readme}') as response").to_pandas()

In [164]:
df_readme = df.iloc[0]['RESPONSE']
readme_txt = df_readme

In [165]:
print(df_readme)

 ## Popular GitHub Repositories by Programming Language

This repository is a Python script that fetches and generates a list of most popular repositories on GitHub based on the given programming language. It uses the GitHub API to retrieve the required information and stores the access token in a local file named "token.json". The script supports multiple programming languages and can fetch up to 10 pages of results per language.

## Installation

To use this script, you need to have Python installed on your system. You can install the required dependencies using pip:

```bash
pip install requests argparse json humanize
```

## Usage

To run the script, save the provided code in a file named `github_popular_repos.py` and execute it using the following command:

```bash
python github_popular_repos.py [--language LANG1, LANG2, ...]
```

Replace `LANG1, LANG2, ...` with the desired programming languages, separated by commas. If no languages are specified, the script will fetch the popula

### Save summary and README

In [180]:
tmp_json = {
    'repo_owner': repo_owner,
    'repo_name': repo_name,
    'summary': summary_txt,
    'readme': readme_txt
}

In [167]:
tmp_json

{'repo_owner': 'kaxap',
 'repo_name': 'arl',
 'summary': ' This repository is a Python script that fetches and generates a list of most popular repositories on GitHub based on the given programming language. The script uses the GitHub API to retrieve the required information and stores the access token in a local file named "token.json". The script supports multiple programming languages and can fetch up to 10 pages of results per language.\n\nThe script defines a class `RepositoryInformationProvider` that initializes a `requests.Session` object with retries and rate limit handling. It also defines methods to get the next page of results for a given language and to get the last commit date for a given repository.\n\nThe `generate_readme` function generates a markdown file with a table of most popular repositories for a given language. It fetches the data using the `RepositoryInformationProvider` and formats the data into a markdown table.\n\nThe script uses several constants and variab

In [183]:

with open('../model/test_readme.json', 'w') as file:
    json.dump(tmp_json, file)

### Count output tokens from prompt and result

In [169]:
query_id = snowflake_session.sql("SELECT last_query_id()").collect()[0][0]

In [170]:
print(query_id)

01bc4ca8-0305-3f74-0008-537300054062


In [171]:
res_cnt_output = snowflake_session.sql(f"SELECT * FROM SNOWFLAKE.ACCOUNT_USAGE.CORTEX_FUNCTIONS_QUERY_USAGE_HISTORY WHERE query_id='{query_id}'").to_pandas()
output_tokens = res_cnt_output['TOKENS']

In [172]:
res_cnt_output

Unnamed: 0,QUERY_ID,WAREHOUSE_ID,MODEL_NAME,FUNCTION_NAME,TOKENS,TOKEN_CREDITS


In [173]:
print(input_tokens)
print(output_tokens)

1782
Series([], Name: TOKENS, dtype: int64)


In [174]:
# # count input tokens of prompt


# # Get current Snowflake session
# #session = get_active_session()

# # Define model and input text
# model_name = model #0.06 credits per 1M token

# # Execute token count function
# result = session.sql(f"SELECT SNOWFLAKE.CORTEX.COUNT_TOKENS('{model_name}', '{prompt_summary}') AS token_count").collect()

# print(f"Token count: {result[0]['TOKEN_COUNT']}") # 204 tokens (snowflake)

In [175]:
# get query id of last session
# query_id = session.sql("SELECT last_query_id()").collect()[0][0]
# print(query_id)

In [176]:
#query_id = '01bc4c11-0305-3f6c-0008-53730004c00e'

In [177]:
#session = get_active_session()
# get tokens for query id
#res = snowflake_session.sql(f"SELECT * FROM SNOWFLAKE.ACCOUNT_USAGE.CORTEX_FUNCTIONS_QUERY_USAGE_HISTORY WHERE query_id='{query_id}'").to_pandas()

In [178]:
# input_tokens = result[0]['TOKEN_COUNT']
# output_tokens = res['TOKENS']

In [57]:
snowflake_session.close()

Interessant wird das repo tensorflow models! Es hat die meiste Anzahl an Characters ;-)