In [None]:
from transformers import AutoTokenizer
from huggingface_hub import login
from getpass import getpass

In [4]:
HF_TOKEN = getpass()
login(HF_TOKEN)

In [13]:
full_prompt = '''
    This repository is a Python script that fetches and generates a list of most popular repositories on GitHub based on the given programming language. The script uses the GitHub API to retrieve the required information and stores the access token in a local file named \"token.json\". The script supports multiple programming languages and can fetch up to 10 pages of results per language.\n\nThe script defines a class `RepositoryInformationProvider` that initializes a `requests.Session` object with retries and rate limit handling. It also defines methods to get the next page of results for a given language and to get the last commit date for a given repository.\n\nThe `generate_readme` function generates a markdown file with a table of most popular repositories for a given language. It fetches the data using the `RepositoryInformationProvider` and formats the data into a markdown table.\n\nThe script uses several constants and variables to store the API URLs, headers, and other configuration options. It also defines some helper functions for formatting and humanizing dates.\n\nThe script uses the `argparse` module to parse command-line arguments and supports specifying multiple languages using a comma-separated list.\n\nThe main components of the script are:\n\n* `RepositoryInformationProvider` class for fetching and handling GitHub API responses\n* `generate_readme` function for generating the markdown file\n* Use of `requests` library for making HTTP requests\n* Use of `argparse` module for parsing command-line arguments\n* Use of `json`, `time`, `humanize`, `datetime`, and `argparse` modules for various utility functions\n\nThe dependencies of the script are:\n\n* `requests` library for making HTTP requests\n* `argparse` module for parsing command-line arguments\n* `json` module for parsing JSON responses\n* `time` module for handling time-related functionality\n* `humanize` module for formatting dates\n* `datetime` module for parsing and manipulating dates\n\nThe overall architecture of the script is simple and modular, with clear separation of concerns between fetching data from the API and generating the markdown file. The script is well-documented with clear variable and function names, making it easy to understand and maintain.
'''

In [None]:

def split_prompt(prompt, max_tokens):
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
    tokenized_prompt = tokenizer.encode(prompt)
    
    chunks = [tokenized_prompt[i:i+max_tokens] for i in range(0, len(tokenized_prompt), max_tokens)] # (generated with Microsoft Copilot)
    return [tokenizer.decode(chunk) for chunk in chunks] # (generated with Microsoft Copilot)


sub_prompts = split_prompt(full_prompt, 20)


In [20]:
sub_prompts

['<|begin_of_text|>\n    This repository is a Python script that fetches and generates a list of most popular repositories',
 ' on GitHub based on the given programming language. The script uses the GitHub API to retrieve the required information',
 ' and stores the access token in a local file named "token.json". The script supports multiple programming languages',
 ' and can fetch up to 10 pages of results per language.\n\nThe script defines a class `Repository',
 'InformationProvider` that initializes a `requests.Session` object with retries and rate limit handling. It also',
 ' defines methods to get the next page of results for a given language and to get the last commit date',
 ' for a given repository.\n\nThe `generate_readme` function generates a markdown file with a table of',
 ' most popular repositories for a given language. It fetches the data using the `RepositoryInformationProvider`',
 ' and formats the data into a markdown table.\n\nThe script uses several constants and 