In [76]:
import os
from dotenv import load_dotenv
from snowflake.snowpark.session import Session
from snowflake.snowpark.context import get_active_session
from snowflake.cortex import Summarize, Complete, ExtractAnswer, Sentiment, Translate
import pandas as pd
import json

### Build Snowflake session

In [77]:
load_dotenv()

True

In [78]:
connection_params = {
    "account": os.environ['SNOWFLAKE_ACCOUNT'],
    "user": os.environ['SNOWFLAKE_USER'],
    "password": os.environ['SNOWFLAKE_USER_PASSWORD'],
    "role": 'ACCOUNTADMIN',
    #"database": 'SNOWFLAKE_LEARNING_DB',
    "warehouse": 'COMPUTE_WH'
    #"schema": 'PUBLIC',
    }

In [79]:
snowflake_session = Session.builder.configs(connection_params).create()

### Define model

In [80]:
# model = 'mistral-7b'
# max number of input tokens = 32,000
# max number of output tokens =  8,192
# credits per 1 million token = 0.12 ~ 8 million tokens
# model = 'llama3.1-8b'
# model = 'llama3.2-3b'



### Load input data

In [81]:
# load json
with open('../../data/input_data/kaxap_arl.json', 'r')  as file:
    loaded_data = json.load(file)

In [82]:
source_code_cleaned_comments = loaded_data['source_code_cleaned_comments']
license = loaded_data['license']
requirements = loaded_data['requirements']

In [83]:
input_txt = source_code_cleaned_comments

In [84]:
repo_name = 'arl'
repo_owner = 'kaxap'
license = ''
requirements = ''

### Create summary prompt

In [85]:
prompt_summary = f'''
    You are acting like software development expert for the following GitHub repository {repo_name}.
    Your task is to summarize the given source code string "{input_txt}" in natural language so a specialist is able to understand
    the purpose of the repository.
    Identify its purpose, key functionalites, main components and dependencies. Focus on the overall architecture and structure 
    rather than line-by-line details. Do not add any recomandations or improvement suggestions but concentrate on the summary. Present the summary in a clear and concise language. 
''' 
prompt_summary = prompt_summary.replace("'", "\\'")

In [86]:
prompt_summary

'\n    You are acting like software development expert for the following GitHub repository arl.\n    Your task is to summarize the given source code string "#importjsonfromtypingimportOptionalimportrequestsimporttimeimporthumanizeimportdatetimeimportargparsefromrequests.adaptersimportHTTPAdapterfromrequests.packages.urllib3importRetryfromrequests.structuresimportCaseInsensitiveDictTABLE_DISCLAIMER="##Thisisamostpopularrepositorylistfor{lng}sortedbynumberofstars"TABLE_HEADER="|STARS|FORKS|ISSUES|LASTCOMMIT|NAME/PLACE|DESCRIPTION|"TABLE_SEPARATOR="|---|---|---|---|---|---|"TABLE_ITEM_MASK="|{n_stars}|{n_forks}|{n_issues}|{updated_at}|[{name}]({url})/{place}|{description}|"MAX_PAGE=10URL_MASK=""\\"?q=language:{lng}&sort=stars&order=desc&page={n_page}&per_page=100"LAST_COMMIT_URL_MASK="{repo_full_name}/commits"KEY_STAR_COUNT="stargazers_count"KEY_ISSUE_COUNT="open_issues"KEY_FORK_COUNT="forks"KEY_REPOSITORY_NAME="name"KEY_REPOSITORY_FULL_NAME="full_name"KEY_DESCRIPTION="description"KEY_URL

### Count input tokens before send to Snowflake

In [87]:
# res_cnt_input = snowflake_session.sql(f"SELECT SNOWFLAKE.CORTEX.COUNT_TOKENS('{model}', '{prompt_summary}') AS token_count").collect()
# input_tokens = res_cnt_input[0]['TOKEN_COUNT']

In [88]:
#print(input_tokens)

### Execute summary query

In [89]:
model = 'llama3.1-8b'
model_summary_params = {
   'temperature': 0, 
    'max_tokens': 3000
}

In [90]:
query = f"""
    SELECT SNOWFLAKE.CORTEX.COMPLETE(
        '{model}',
        [
            {{
                'role': 'user', 
                'content': '{prompt_summary}'
            }}
        ],
        {{
            'temperature': {model_summary_params['temperature']},
            'max_tokens':  {model_summary_params['max_tokens']}
        }} 
    ) AS response
"""

In [91]:
res = snowflake_session.sql(query).collect()

In [92]:
#df = snowflake_session.sql(f"select snowflake.cortex.complete('{model}', '{prompt_summary}') as response").to_pandas()
#query_id = snowflake_session.sql("SELECT last_query_id()").collect()[0][0]


In [93]:
# df = snowflake_session.sql(f"select snowflake.cortex.complete('{model}', '{prompt_summary}') as response").to_pandas()
# lama3.2-1b

In [94]:
res

[Row(RESPONSE='{\n  "choices": [\n    {\n      "messages": "**Repository Purpose and Overview**\\n\\nThis repository is a GitHub repository list generator that fetches and displays the most popular repositories for a given programming language. The repository provides a simple command-line interface to generate a README file in markdown format, listing the top repositories for a specified language.\\n\\n**Key Functionalities**\\n\\n1. **Repository Information Provider**: The repository uses a `RepositoryInformationProvider` class to interact with the GitHub API, fetching repository information for a given language.\\n2. **GitHub API Interaction**: The provider uses the `requests` library to make HTTP requests to the GitHub API, handling rate limiting and retrying failed requests.\\n3. **Repository Data Processing**: The provider extracts relevant information from the API responses, including repository names, descriptions, star counts, fork counts, issue counts, and last commit dates.\

In [101]:
import ast

In [103]:
summary_txt = res[0]['RESPONSE']
print(summary_txt)

{
  "choices": [
    {
      "messages": "**Repository Purpose and Overview**\n\nThis repository is a GitHub repository list generator that fetches and displays the most popular repositories for a given programming language. The repository provides a simple command-line interface to generate a README file in markdown format, listing the top repositories for a specified language.\n\n**Key Functionalities**\n\n1. **Repository Information Provider**: The repository uses a `RepositoryInformationProvider` class to interact with the GitHub API, fetching repository information for a given language.\n2. **GitHub API Interaction**: The provider uses the `requests` library to make HTTP requests to the GitHub API, handling rate limiting and retrying failed requests.\n3. **Repository Data Processing**: The provider extracts relevant information from the API responses, including repository names, descriptions, star counts, fork counts, issue counts, and last commit dates.\n4. **README Generation**:

In [105]:
test = res[0]['RESPONSE']
test_str = json.loads(test)

In [112]:
print(test_str['usage']['total_tokens'])

1777


In [109]:
print(test_str['choices'][0]['messages'])

**Repository Purpose and Overview**

This repository is a GitHub repository list generator that fetches and displays the most popular repositories for a given programming language. The repository provides a simple command-line interface to generate a README file in markdown format, listing the top repositories for a specified language.

**Key Functionalities**

1. **Repository Information Provider**: The repository uses a `RepositoryInformationProvider` class to interact with the GitHub API, fetching repository information for a given language.
2. **GitHub API Interaction**: The provider uses the `requests` library to make HTTP requests to the GitHub API, handling rate limiting and retrying failed requests.
3. **Repository Data Processing**: The provider extracts relevant information from the API responses, including repository names, descriptions, star counts, fork counts, issue counts, and last commit dates.
4. **README Generation**: The `generate_readme` function takes the repositor

In [95]:
snowflake_session.close()

### Create README prompt

In [162]:
prompt_readme = f'''
    You are acting like software development expert for the following GitHub repository {repo_name} from the owner {repo_owner}. 
    Your task is to create a README file for the repository in Markdown format. 
    Use the provided summary: "{summary_txt}", the license: "{license}" and the given requirements: "{requirements}" 
    The README file should contain information about what the project does, why it is useful, how users 
    can get started, where they can get help, and how to maintain and contribute to the project.
    If you don't know the answer add a hint following this style […]. You're not allowed to create 
    made-up content to fill gaps and you're not allowed to add additional paragraphs.
    Use the following Markdown template and fill each paragraph. 

    ## Titel

    ## Installation

    ## Usage

    ## Contributing

    ## License

    Do not include any sensitive data like names or emails. Keep the markdown file clean and structured.
'''
prompt_readme = prompt_readme.replace("'", "\\'")

In [163]:
df = snowflake_session.sql(f"select snowflake.cortex.complete('{model}', '{prompt_readme}') as response").to_pandas()

In [164]:
df_readme = df.iloc[0]['RESPONSE']
readme_txt = df_readme

In [165]:
print(df_readme)

 ## Popular GitHub Repositories by Programming Language

This repository is a Python script that fetches and generates a list of most popular repositories on GitHub based on the given programming language. It uses the GitHub API to retrieve the required information and stores the access token in a local file named "token.json". The script supports multiple programming languages and can fetch up to 10 pages of results per language.

## Installation

To use this script, you need to have Python installed on your system. You can install the required dependencies using pip:

```bash
pip install requests argparse json humanize
```

## Usage

To run the script, save the provided code in a file named `github_popular_repos.py` and execute it using the following command:

```bash
python github_popular_repos.py [--language LANG1, LANG2, ...]
```

Replace `LANG1, LANG2, ...` with the desired programming languages, separated by commas. If no languages are specified, the script will fetch the popula

### Save summary and README

In [180]:
tmp_json = {
    'repo_owner': repo_owner,
    'repo_name': repo_name,
    'summary': summary_txt,
    'readme': readme_txt
}

In [167]:
tmp_json

{'repo_owner': 'kaxap',
 'repo_name': 'arl',
 'summary': ' This repository is a Python script that fetches and generates a list of most popular repositories on GitHub based on the given programming language. The script uses the GitHub API to retrieve the required information and stores the access token in a local file named "token.json". The script supports multiple programming languages and can fetch up to 10 pages of results per language.\n\nThe script defines a class `RepositoryInformationProvider` that initializes a `requests.Session` object with retries and rate limit handling. It also defines methods to get the next page of results for a given language and to get the last commit date for a given repository.\n\nThe `generate_readme` function generates a markdown file with a table of most popular repositories for a given language. It fetches the data using the `RepositoryInformationProvider` and formats the data into a markdown table.\n\nThe script uses several constants and variab

In [183]:

with open('../model/test_readme.json', 'w') as file:
    json.dump(tmp_json, file)

### Count output tokens from prompt and result

In [169]:
query_id = snowflake_session.sql("SELECT last_query_id()").collect()[0][0]

In [170]:
print(query_id)

01bc4ca8-0305-3f74-0008-537300054062


In [171]:
res_cnt_output = snowflake_session.sql(f"SELECT * FROM SNOWFLAKE.ACCOUNT_USAGE.CORTEX_FUNCTIONS_QUERY_USAGE_HISTORY WHERE query_id='{query_id}'").to_pandas()
output_tokens = res_cnt_output['TOKENS']

In [172]:
res_cnt_output

Unnamed: 0,QUERY_ID,WAREHOUSE_ID,MODEL_NAME,FUNCTION_NAME,TOKENS,TOKEN_CREDITS


In [173]:
print(input_tokens)
print(output_tokens)

1782
Series([], Name: TOKENS, dtype: int64)


In [174]:
# # count input tokens of prompt


# # Get current Snowflake session
# #session = get_active_session()

# # Define model and input text
# model_name = model #0.06 credits per 1M token

# # Execute token count function
# result = session.sql(f"SELECT SNOWFLAKE.CORTEX.COUNT_TOKENS('{model_name}', '{prompt_summary}') AS token_count").collect()

# print(f"Token count: {result[0]['TOKEN_COUNT']}") # 204 tokens (snowflake)

In [175]:
# get query id of last session
# query_id = session.sql("SELECT last_query_id()").collect()[0][0]
# print(query_id)

In [176]:
#query_id = '01bc4c11-0305-3f6c-0008-53730004c00e'

In [177]:
#session = get_active_session()
# get tokens for query id
#res = snowflake_session.sql(f"SELECT * FROM SNOWFLAKE.ACCOUNT_USAGE.CORTEX_FUNCTIONS_QUERY_USAGE_HISTORY WHERE query_id='{query_id}'").to_pandas()

In [178]:
# input_tokens = result[0]['TOKEN_COUNT']
# output_tokens = res['TOKENS']

In [23]:
snowflake_session.close()

Interessant wird das repo tensorflow models! Es hat die meiste Anzahl an Characters ;-)

In [4]:
with open('../../data/df_repos_counts_filtered.json','r') as file:
    loaded_data = json.load(file)

In [6]:
df = pd.DataFrame(loaded_data)

In [None]:
repos = [f"{row['repo_owner']}_{row['repo_name']}" for index, row in df.iterrows()]


In [29]:
rep = [(row.repo_owner, row.repo_name) for row in df.itertuples()]

In [31]:
rep[0][0]

'rochacbruno'

In [51]:
def write_preprocessed_repo(repo_owner, repo_name):
    tmp_tuple = (repo_owner, repo_name)
    with open('../../data/helper/repos_processed.txt', 'a') as file:
        file.write(str(tmp_tuple) + '\n')

write_preprocessed_repo(repo_owner='lisa', repo_name='nice')

In [113]:
tmp_json = {
    'summary_1': 'some text 1',
    'summary_2': 'some text 2',
    'summary_3': 'some text 3',
}

In [117]:
new_list = [value for value in tmp_json.values()]

In [119]:
test = 0

In [120]:
test =+ 1

In [121]:
test

1