In [None]:
# Load IPython's autoreload extension and set it to mode 2, so that edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import fnmatch
import glob
import os
import tempfile

import git

# Clone the repository into a fresh temporary directory. clone_from() returns
# the Repo object for the new working tree, so the directory does not have to
# be reopened with git.Repo() afterwards.
local_dir = tempfile.mkdtemp()
repo_url = "https://github.com/urschrei/pyzotero.git"
repo = git.Repo.clone_from(repo_url, local_dir)

# Read the ignore patterns from .gitignore. Blank lines and "#" comments are
# not patterns, and a repository without a .gitignore yields an empty list
# instead of raising FileNotFoundError.
ignore_list = []
gitignore_path = os.path.join(local_dir, ".gitignore")
if os.path.isfile(gitignore_path):
    with open(gitignore_path, "r") as f:
        for raw_line in f:
            pattern = raw_line.strip()
            if pattern and not pattern.startswith("#"):
                ignore_list.append(pattern)


def _is_ignored(file_path, root, patterns):
    """Return True if ``file_path`` matches any of the ignore ``patterns``.

    This is an fnmatch-based approximation of .gitignore matching, not a full
    implementation: negation ("!") and "**" are not given special meaning.

    Each pattern is tried against:
      * the full path exactly as given (the original behaviour of this cell),
      * the path relative to ``root``, with leading/trailing "/" stripped from
        the pattern, both as-is and as a directory prefix,
      * every individual path component, when the pattern contains no "/"
        (so patterns such as ``__pycache__`` or ``*.pyc`` match at any depth).
    """
    rel_path = os.path.relpath(file_path, root).replace(os.sep, "/")
    components = rel_path.split("/")
    for pattern in patterns:
        if fnmatch.fnmatch(file_path, pattern):
            return True
        normalized = pattern.strip("/")
        if not normalized:
            continue
        if fnmatch.fnmatch(rel_path, normalized):
            return True
        # A pattern naming a directory ignores everything underneath it.
        if fnmatch.fnmatch(rel_path, normalized + "/*"):
            return True
        if "/" not in normalized and any(
            fnmatch.fnmatch(component, normalized) for component in components
        ):
            return True
    return False


# Define the file extensions to include
include_extensions = [".py"]

# Walk the working tree recursively and keep every regular file with one of the
# wanted extensions that is not matched by an ignore pattern.
all_files = []
for include_extension in include_extensions:
    search_pattern = os.path.join(local_dir, "**", f"*{include_extension}")
    for file_path in glob.glob(search_pattern, recursive=True):
        if os.path.isfile(file_path) and not _is_ignored(
            file_path, local_dir, ignore_list
        ):
            all_files.append(file_path)

# Print the list of all files in the repository with the specified extensions
print(all_files)

In [None]:
import tiktoken

# Tokenizer that tiktoken associates with the "gpt-4" model name.
enc = tiktoken.encoding_for_model("gpt-4")

# Maps each file path to the result of enc.encode() on that file's full text.
encodings = {}
# Paths whose contents could not be decoded, recorded so that skipped files are
# visible instead of being dropped silently.
skipped_files = []
for file in all_files:
    # Open read-only: "r+" also requests write access, which is never used here
    # and makes the open fail on read-only files. UTF-8 is specified explicitly
    # so decoding does not depend on the platform's locale default.
    with open(file, "r", encoding="utf-8") as f:
        try:
            text = f.read()
        except UnicodeDecodeError:
            skipped_files.append(file)
            continue
    encodings[file] = enc.encode(text)

In [None]:
from collections import Counter

# Tally how many files share each token count; the counts are sorted first so
# the Counter's keys are inserted in ascending order.
token_counts = sorted(len(tokens) for tokens in encodings.values())
Counter(token_counts)

In [None]:
from llamabot import QueryBot

# Build a QueryBot over the collected repository files, passing the
# instruction text as its first argument.
repo_bot_instructions = "You are a bot that answers questions about a git repository."
bot = QueryBot(repo_bot_instructions, doc_paths=all_files)

In [None]:
# Ask the repository bot how to query items by a search string; the reply is
# displayed because the call is the cell's last expression.
bot(
    "How do I configure zotero.Zotero().items() to return a list of items based on a query string that I provide it?"
)

In [None]:
# Ask the repository bot what a new function should look like and which module
# it belongs in.
# NOTE(review): this question is about FASTQ files and pandas, which looks
# unrelated to the pyzotero repository cloned above - confirm it is intended.
bot(
    "I want to make a new function that loads FASTQ files into a pandas DataFrame and cleans it up. What should I write? And which module should it go into?"
)

In [None]:
# SCRATCH
from llamabot import SimpleBot

# Bot used further down to summarise a pull request diff.
pr_bot_instructions = "You are a bot that answers questions about pull requests."
pr_bot = SimpleBot(pr_bot_instructions)

In [None]:
# Unified diff touching janitor/functions/conditional_join.py, pasted verbatim
# as a string so it can be interpolated into the pull-request prompt.
diff = '''
diff --git a/janitor/functions/conditional_join.py b/janitor/functions/conditional_join.py
index ef1100a2c..957ea0846 100644
--- a/janitor/functions/conditional_join.py
+++ b/janitor/functions/conditional_join.py
@@ -5,7 +5,6 @@
 import pandas as pd
 import pandas_flavor as pf
 from pandas.core.dtypes.common import (
-    is_categorical_dtype,
     is_datetime64_dtype,
     is_dtype_equal,
     is_extension_array_dtype,
@@ -305,29 +304,27 @@ def _conditional_join_type_check(
     Raise error if column type is not any of numeric or datetime or string.
     """
 
-    permitted_types = {
-        is_datetime64_dtype,
-        is_numeric_dtype,
-        is_string_dtype,
-    }
-    for func in permitted_types:
-        # change is based on this PR
-        # https://github.com/pandas-dev/pandas/pull/52527/files
-        if isinstance(left_column.dtype, pd.CategoricalDtype) or func(
-            left_column
-        ):
-            break
-    else:
-        raise ValueError(
-            "conditional_join only supports "
-            "string, category, numeric, or date dtypes (without timezone) - "
-            f"'{left_column.name} is of type {left_column.dtype}."
-        )
+    is_categorical_dtype = isinstance(left_column.dtype, pd.CategoricalDtype)
 
-    lk_is_cat = is_categorical_dtype(left_column)
-    rk_is_cat = is_categorical_dtype(right_column)
+    if not is_categorical_dtype:
+        permitted_types = {
+            is_datetime64_dtype,
+            is_numeric_dtype,
+            is_string_dtype,
+        }
+        for func in permitted_types:
+            if func(left_column.dtype):
+                break
+        else:
+            raise ValueError(
+                "conditional_join only supports "
+                "string, category, numeric, or "
+                "date dtypes (without timezone) - "
+                f"'{left_column.name} is of type "
+                f"{left_column.dtype}."
+            )
 
-    if lk_is_cat & rk_is_cat:
+    if is_categorical_dtype:
         if not left_column.array._categories_match_up_to_permutation(
             right_column.array
         ):
'''

In [None]:
# Build the pull-request prompt by interpolating the pasted diff text; it asks
# for a plain-English summary plus an assessment of the change's benefits.
prompt = f"Given the following pull request diff: {diff}, summarize the pull request changes for me in plain English. Also postulate what the benefits of these changes are. If the changes are not likely to be beneficial, state your reasons for why."

In [None]:
# Send the pull-request prompt to pr_bot; the reply is displayed because the
# call is the cell's last expression.
pr_bot(prompt)

In [None]:
# Bot used below to rewrite the draft blog post.
writing_bot_instructions = "You are an expert writer of technical blog posts."
writing_bot = SimpleBot(writing_bot_instructions)

In [None]:
# Draft blog post text (including an embedded code sample and two example bot
# outputs) that the writing bot is asked to rewrite.
blog_post = """
How to write better pull request summaries using GPT4.

Today, I stumbled upon a really cool use case of GPT4:
writing better pull request messages!

Here's how I did it.

Firstly, I used `llamabot`, a Python package that I created.

Secondly, I obtained the diff of a pull request.
The easiest way to do this is by appending `.diff` to the end of a PR url, for example:

https://github.com/pyjanitor-devs/pyjanitor/pull/1256 -> https://github.com/pyjanitor-devs/pyjanitor/pull/1256.diff

Then, here's the code for generating the summary message:

```python
diff = '''
<diff gets pasted here...>
'''

prompt = f"Given the following pull request diff: {diff}, summarize the pull request changes for me in plain English. Also postulate what the benefits of these changes are. If the changes are not likely to be beneficial, state your reasons for why."

from llamabot import SimpleBot
prbot = SimpleBot("You are a bot that answers questions about pull requests diffs.")

prbot(prompt)
```

For [PR#1262](https://github.com/pyjanitor-devs/pyjanitor/pull/1262):

```text
This pull request introduces several changes to the `conditional_join` function in the `janitor/functions/conditional_join.py` file and updates the corresponding tests in the `tests/functions/test_conditional_join.py` file. The main changes are as follows:

1. Adds support for "outer" join type in the `how` parameter, allowing users to perform outer joins in addition to inner, left, and right joins.
2. Deprecates the `sort_by_appearance` parameter, issuing a warning when it is used.
3. Updates the documentation and comments to reflect the changes made.

The benefits of these changes include:

1. Increased functionality: Users can now perform outer joins using the `conditional_join` function, providing more flexibility in their data manipulation tasks.
2. Improved performance: By deprecating the `sort_by_appearance` parameter, the function may offer better performance in certain cases, as it no longer needs to maintain the original order of the data.
3. Clearer documentation: The updated documentation and comments make it easier for users to understand the function's behavior and the available options.

Overall, these changes are likely to be beneficial, as they enhance the functionality of the `conditional_join` function and improve its performance in certain scenarios.
```

And for [PR#1261](https://github.com/pyjanitor-devs/pyjanitor/pull/1261):

```text
The pull request modifies the `_conditional_join_type_check` function in the `conditional_join.py` file. The main change is the removal of the `is_categorical_dtype` function from the `permitted_types` set and the addition of a separate check for categorical data types using `isinstance(left_column.dtype, pd.CategoricalDtype)`.

In plain English, the changes can be summarized as follows:
1. The code now checks if the `left_column.dtype` is a categorical data type separately, instead of including it in the `permitted_types` set.
2. If the `left_column.dtype` is not a categorical data type, the code proceeds to check if it is one of the other permitted types (datetime64, numeric, or string).
3. If the `left_column.dtype` is a categorical data type, the code checks if the categories match up to a permutation between the left and right columns.

The benefits of these changes are:
1. Improved readability and clarity of the code by separating the check for categorical data types from the other permitted types.
2. Ensuring that the correct checks are performed for categorical data types, which may help prevent potential issues or bugs in the future.

Overall, these changes are likely to be beneficial as they improve the code's readability and maintainability.
```

This was pretty cool to see!
Previously, I would dread having to (1) write a summary of my PRs, and (2) review PRs with very long diffs.
This prompt is a great way to solve both problems!
"""

# Build the rewrite request by interpolating the draft post.
# NOTE: this rebinds `prompt`, which an earlier cell used for the pull-request
# prompt, so re-running that earlier pr_bot call after this cell would send the
# blog prompt instead.
prompt = f"Given the following blog post: {blog_post}, help me rewrite it such that it is clearer to read. Correct any grammatical errors"

In [None]:
# Ask the writing bot for the rewritten post; later cells read the reply's
# `.content` attribute.
improved_blog = writing_bot(prompt)

In [None]:
# Bot that produces a short first-person summary and a list of tags for a blog post.
summary_bot_instructions = "You write summaries of blog posts. The blog post contents will be provided to you. Provide a summary of under 100 words, and 10 blog tags that can be associated with the blog post, all in lowercase, one line per tag, no numbered or bullet lists. Start the summary with 'In this blog post, I...'. Write in first person tone."
summary_bot = SimpleBot(summary_bot_instructions)

In [None]:
# Summarise the rewritten blog post; the reply is displayed because the call is
# the cell's last expression.
summary_bot(improved_blog.content)

In [None]:
# Bot that turns a blog post into a Patreon post.
patreon_bot_instructions = "You are a bot that helps craft expertly messaged Patreon posts. You will be provided with a blog post, and you will write a Patreon post that summarizes the blog post, keeps Patrons engaged, and encourages them to share the post with others. Here is the blog post."
patreon_bot = SimpleBot(patreon_bot_instructions)

In [None]:
# Generate the Patreon post from the rewritten blog text; its `.content` is
# read by the Patreon summary cell.
patreon_post = patreon_bot(improved_blog.content)

In [None]:
# Bot that condenses a Patreon post into a short teaser.
patreon_summary_instructions = "You are a bot that provides a 140 character (or less) summary of a Patreon post. The summary should be engaging and encourage people to read the full post. Write the summary in first person tone. Here is the patreon post."
patreon_summary_bot = SimpleBot(patreon_summary_instructions)

In [None]:
# Summarise the generated Patreon post text.
patreon_summary = patreon_summary_bot(patreon_post.content)

In [None]:
# Bot that turns a blog post into a LinkedIn post.
linkedin_bot_instructions = "You are a bot that crafts LinkedIn posts. You will be provided with a blog post, and you will write a LinkedIn post that summarizes the blog post, keeps people engaged, and encourages them to share the post with others. Use emojis where appropriate! Include hashtags (all lowercase). Include a placeholder link to the original blog post. Here is the blog post."
linkedin_bot = SimpleBot(linkedin_bot_instructions)

In [None]:
# Generate the LinkedIn post from the rewritten blog text.
linkedin_post = linkedin_bot(improved_blog.content)

In [None]:
# Bot that turns a blog post into a Mastodon post. The instructions previously
# asked for "a LinkedIn post" (left over from the LinkedIn bot's text), which
# contradicted the bot's stated role; they now consistently ask for Mastodon.
mastodon_bot = SimpleBot(
    "You are a bot that crafts Mastodon posts. You will be provided with a blog post, and you will write a Mastodon post that summarizes the blog post, keeps people engaged, and encourages them to share the post with others. Use emojis where appropriate! Include hashtags (all lowercase). Include a placeholder link to the original blog post. Write in first-person tone. Here is the blog post."
)

In [None]:
# Generate the Mastodon post from the rewritten blog text; the reply is not
# stored, only displayed as the cell's last expression.
mastodon_bot(improved_blog.content)

In [None]:
# Bot that turns a blog post into a Twitter post of at most 280 characters.
twitter_bot_instructions = "You are a bot that crafts Twitter posts. You will be provided with a blog post, and you will write a Twitter post that summarizes the blog post, keeps people engaged, and encourages them to share the post with others. Use emojis where appropriate! Include hashtags (all lowercase). Include a placeholder link to the original blog post. The Twitter post should be 280 characters or less. Write in first person tone. Here is the blog post."
twitter_bot = SimpleBot(twitter_bot_instructions)

In [None]:
# Generate the Twitter post from the rewritten blog text.
twitter_post = twitter_bot(improved_blog.content)