# Load cleaned data

In [1]:
import glob
import json
import string
import math
import pandas as pd
import numpy as np
import re
import sklearn
from sklearn.model_selection import train_test_split
import random
random.seed(42)

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

In [2]:
# ========================= Columns =========================
# Type-specific field names, one list per kind of source record. The order of
# each list is the column order of the DataFrame built from that record type.
commit_columns = [
    'Type', 'URL', 'Author', 'RepoName', 'RepoLanguage',
    'Sha', 'Message',
]
code_file_columns = [
    'Type', 'URL', 'RepoName',
]
repo_columns = [
    'Type', 'URL', 'RepoName', 'RepoLanguage',
]
issue_columns = [
    'Type', 'URL', 'Author', 'RepoName', 'RepoLanguage',
    'Number', 'Title', 'Body',
    'AuthorAt', 'ClosedAt', 'UpdatedAt', 'State',
]
pull_request_columns = [
    'Type', 'URL', 'Author', 'RepoName', 'RepoLanguage',
    'Number', 'Title', 'Body',
    'CreatedAt', 'ClosedAt', 'MergedAt', 'UpdatedAt', 'State',
    'Additions', 'Deletions', 'ChangedFiles', 'CommitsTotalCount', 'CommitSha',
]
hacker_news_columns = [
    'Type', 'ID', 'URL', 'AttachedURL', 'Title', 'CreatedAt',
]
discussion_columns = [
    'Type', 'URL', 'Author', 'RepoName', 'RepoLanguage',
    'Number', 'Title', 'Body',
    'AuthorAt', 'ClosedAt', 'UpdatedAt', 'Closed', 'UpvoteCount',
]

# Fields appended after the type-specific ones for every record type:
# first the sharing fields, then the mention fields.
mention_columns = [
    'MentionedURL', 'MentionedProperty', 'MentionedAuthor', 'MentionedText',
    'MentionedPath', 'MentionedAnswer', 'MentionedUpvoteCount',
]
gpt_sharing_columns = [
    'SharingURL', 'Status', 'DateOfConversation', 'DateOfAccess',
    'NumberOfPrompts', 'TokensOfPrompts', 'TokensOfAnswers',
    'Model', 'Conversations',
]

# ========================= Processing functions =========================
def process_commit_json(commit):
    """Flatten one commit record into a list of field values.

    Values are looked up in ``commit`` in this order: ``commit_columns``,
    then ``gpt_sharing_columns``, then ``mention_columns``. Every listed key
    must be present in the record (a dict raises ``KeyError`` otherwise).
    """
    ordered_keys = commit_columns + gpt_sharing_columns + mention_columns
    return [commit[key] for key in ordered_keys]

def process_code_files_json(code_file):
    """Flatten one code-file record into a list of field values.

    Values are looked up in ``code_file`` in this order: ``code_file_columns``,
    then ``gpt_sharing_columns``, then ``mention_columns``. Every listed key
    must be present in the record (a dict raises ``KeyError`` otherwise).
    """
    ordered_keys = code_file_columns + gpt_sharing_columns + mention_columns
    return [code_file[key] for key in ordered_keys]

def process_repo_json(repo):
    """Flatten one repository record into a list of field values.

    Values are looked up in ``repo`` in this order: ``repo_columns``, then
    ``gpt_sharing_columns``, then ``mention_columns``. Every listed key must
    be present in the record (a dict raises ``KeyError`` otherwise).
    """
    ordered_keys = repo_columns + gpt_sharing_columns + mention_columns
    return [repo[key] for key in ordered_keys]

def process_issue_json(issue):
    """Flatten one issue record into a list of field values.

    Values are looked up in ``issue`` in this order: ``issue_columns``, then
    ``gpt_sharing_columns``, then ``mention_columns``. Every listed key must
    be present in the record (a dict raises ``KeyError`` otherwise).
    """
    ordered_keys = issue_columns + gpt_sharing_columns + mention_columns
    return [issue[key] for key in ordered_keys]

def process_pull_request_json(pull_request):
    """Flatten one pull-request record into a list of field values.

    Values are looked up in ``pull_request`` in this order:
    ``pull_request_columns``, then ``gpt_sharing_columns``, then
    ``mention_columns``. Every listed key must be present in the record
    (a dict raises ``KeyError`` otherwise).
    """
    ordered_keys = pull_request_columns + gpt_sharing_columns + mention_columns
    return [pull_request[key] for key in ordered_keys]

def process_hacker_news_json(hacker_news):
    """Flatten one Hacker News record into a list of field values.

    Values are looked up in ``hacker_news`` in this order:
    ``hacker_news_columns``, then ``gpt_sharing_columns``, then
    ``mention_columns``. Every listed key must be present in the record
    (a dict raises ``KeyError`` otherwise).
    """
    ordered_keys = hacker_news_columns + gpt_sharing_columns + mention_columns
    return [hacker_news[key] for key in ordered_keys]

def process_discussion_json(discussion):
    """Flatten one discussion record into a list of field values.

    Values are looked up in ``discussion`` in this order:
    ``discussion_columns``, then ``gpt_sharing_columns``, then
    ``mention_columns``. Every listed key must be present in the record
    (a dict raises ``KeyError`` otherwise).
    """
    ordered_keys = discussion_columns + gpt_sharing_columns + mention_columns
    return [discussion[key] for key in ordered_keys]

In [3]:
def read_json_data_from_files_to_dataframe(json_filepath):
    """Load one JSON file of source records into a DataFrame.

    The file must contain a JSON array of records. Each record's ``'Type'``
    selects the processing function and the type-specific column list; all
    records in one file must share the same type, because the resulting
    DataFrame has a single column layout.

    Parameters
    ----------
    json_filepath : str or path-like
        Path of the JSON file to read (decoded as UTF-8).

    Returns
    -------
    pandas.DataFrame
        One row per record. Columns are the type-specific columns followed by
        ``gpt_sharing_columns`` and ``mention_columns``. A file holding an
        empty array yields an empty DataFrame without columns.

    Raises
    ------
    ValueError
        If a record has an unknown ``'Type'`` or the file mixes record types.
    """
    # Supported record types -> (row-building function, type-specific columns).
    handlers_by_type = {
        'commit': (process_commit_json, commit_columns),
        'code file': (process_code_files_json, code_file_columns),
        'repository': (process_repo_json, repo_columns),
        'issue': (process_issue_json, issue_columns),
        'pull request': (process_pull_request_json, pull_request_columns),
        'hacker news': (process_hacker_news_json, hacker_news_columns),
        'discussion': (process_discussion_json, discussion_columns),
    }

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(json_filepath, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    rows = []
    file_type = None
    for source in json_data:
        source_type = source['Type']
        # Non-string types cannot be a key of the table (and may be unhashable).
        handler = handlers_by_type.get(source_type) if isinstance(source_type, str) else None
        if handler is None:
            raise ValueError(f"Unexpected type of the source: '{source_type}'")
        if file_type is None:
            file_type = source_type
        elif source_type != file_type:
            # One column layout per DataFrame: mixing types would mislabel values.
            raise ValueError(
                f"File '{json_filepath}' mixes source types "
                f"'{file_type}' and '{source_type}'; expected a single type per file"
            )
        process_source, _ = handler
        rows.append(process_source(source))

    if file_type is None:
        # Empty array: there is no record to tell which column layout applies.
        return pd.DataFrame()

    _, type_columns = handlers_by_type[file_type]
    return pd.DataFrame(rows, columns=(type_columns + gpt_sharing_columns + mention_columns))

In [4]:
# Directory holding the cleaned dumps and the logical name of each dump.
# The order of dataframe_names fixes the order of dataframes_cleaned.
cleaned_dir_name = "cleaned_datasets"
dataframe_names = [
    "commits",
    "issues",
    "discussions",
    "pull_requests",
    "code_files",
    "repository",
    "hacker_news",
]
cleaned_dataframe_file_names = [
    f"{cleaned_dir_name}/cleaned_{df_name}.json" for df_name in dataframe_names
]

# One DataFrame per cleaned file, in the same order as the file names.
dataframes_cleaned = [
    read_json_data_from_files_to_dataframe(cleaned_path)
    for cleaned_path in cleaned_dataframe_file_names
]

### Investigate what long conversations contain

In [5]:
# A conversation is "too long" from this many entries on.
long_conv_cutoff = 30
# Per-DataFrame prompt threshold, compared against len(prompt); the list is
# indexed by the DataFrame's position in dataframes_cleaned.
prompt_lengts_cutoff = [900, 1600, 1200, 1300, 2000, 2000, 400]

long_conv_filename = "./too_long_conversations.json"
long_prompts_filename = "./too_long_prompts.json"

# Both dicts map a row's URL to the list of all prompts of that row's
# conversation. A later row with the same URL replaces the earlier entry.
too_long_conversations = {}
too_long_prompts = {}

for df_idx, cleaned_df in enumerate(dataframes_cleaned):
    for _, df_row in cleaned_df.iterrows():
        conversations = df_row.Conversations

        # Criterion 1: the conversation has many entries.
        if len(conversations) >= long_conv_cutoff:
            too_long_conversations[df_row.URL] = [turn["Prompt"] for turn in conversations]

        # Criterion 2: at least one prompt reaches this DataFrame's threshold
        # (any() stops at the first match).
        has_long_prompt = any(
            len(turn["Prompt"]) >= prompt_lengts_cutoff[df_idx]
            for turn in conversations
        )
        if has_long_prompt:
            too_long_prompts[df_row.URL] = [turn["Prompt"] for turn in conversations]

# Persist both collections as JSON, one file each.
for out_path, collected in (
    (long_conv_filename, too_long_conversations),
    (long_prompts_filename, too_long_prompts),
):
    with open(out_path, 'w') as file:
        json.dump(collected, file)