# Import tokens from env folders

In [None]:
from tqdm import tqdm
import pandas as pd
import requests
import json
import sys
import time
import datetime
import random
import re
import numpy as np
class QueryFailError(Exception):
    """Raised when a GraphQL request fails or its response cannot be decoded."""
# Load the API tokens, one per line, from the local env folder.
# Blank lines are skipped so an empty string is never used as a bearer token,
# and surrounding whitespace is stripped from each token.
with open('./env/tokens.txt', encoding='utf-8') as f:
    lines = [line.strip() for line in f if line.strip()]
# 'reset' is filled in by search_limit with the time at which the token's
# rate limit resets; None means the token has not been checked yet.
tokens = [{'token': token, 'reset': None} for token in lines]
# Index into `tokens` of the token currently used to build request headers.
token_index = 0
def get_header():
    """Return the HTTP headers that authorize a request with the active token."""
    active_token = tokens[token_index]['token']
    return {"Authorization": f"Bearer {active_token}"}
import logging

# Root logger used by the query helpers; only ERROR and above are emitted.
logger = logging.getLogger()
logger.setLevel(level=logging.ERROR)

# Set up the query helpers and the function that handles the API rate limit

In [None]:
def send_query(query, headers):
    """Post a GraphQL query to ``URL`` and return the decoded payload.

    Args:
        query: GraphQL query text, sent as the ``query`` field of the JSON body.
        headers: HTTP headers for the request (e.g. the result of ``get_header``).

    Returns:
        The ``data`` member of the decoded response when it has one,
        otherwise the whole decoded response.

    Raises:
        QueryFailError: If the status code is not 200 or the body is not
            valid JSON.
    """
    request = requests.post(URL, json={"query": query}, headers=headers)
    if request.status_code != 200:
        logger.error(f"{request.text}")
        raise QueryFailError(
            "Query failed to run by returning code of {}. {}".format(
                request.status_code, query
            )
        )
    try:
        # Round-trip through json so the decoded mapping has its keys sorted.
        response = json.loads(json.dumps(request.json(), sort_keys=True))
    except ValueError as err:
        # requests reports an undecodable body with a ValueError subclass.
        logger.error(f"{request.text}")
        raise QueryFailError(
            "Query failed to run by returning code of {}. {}".format(
                request.status_code, URL
            )
        ) from err
    try:
        return response['data']
    except KeyError:
        return response
def search_limit(headers):
    """Check the active token's rate limit and rotate tokens when it runs low.

    The reset time reported by the API is stored on the active token. If more
    than 200 requests remain, nothing else happens. Otherwise ``token_index``
    advances to the next token (wrapping around); if that token has a stored
    reset time that is still in the future, the call sleeps until then.

    Args:
        headers: HTTP headers used for the rate-limit query itself.

    Returns:
        None. If the query fails or the response lacks the expected keys, the
        error is logged and the call sleeps for one hour before returning.
    """
    global token_index
    query = """
                    query {
                      viewer {
                        login
                      }
                      rateLimit {
                        limit
                        cost
                        remaining
                        resetAt
                      }
                }
                """
    try:
        resp = send_query(query, headers)
        tokens[token_index]['reset'] = pd.to_datetime(resp['rateLimit']['resetAt'])
        if resp['rateLimit']['remaining'] > 200:
            return
        token_index = (token_index + 1) % len(tokens)
        reset_at = tokens[token_index]['reset']
        if reset_at is None:
            return
        # Read the clock once: computing the delay from a second reading could
        # yield a negative value, which time.sleep rejects with ValueError.
        wait_seconds = (reset_at - datetime.datetime.now(datetime.timezone.utc)).total_seconds()
        if wait_seconds < 0:
            return
        time.sleep(wait_seconds)
        print(f"Token {token_index} is ready to use again.")
    except (QueryFailError,KeyError) as e:
        logger.error('sleep in ' + str(datetime.datetime.now()) + str(e))
        time.sleep(3600)

# GraphQL endpoint that the query helpers post to.
URL = "https://api.github.com/graphql"
    
def send_query_with_retries(query, retries, headers):
    """Post a GraphQL query to ``URL``, retrying on non-200 responses.

    Args:
        query: GraphQL query text, sent as the ``query`` field of the JSON body.
        retries: Maximum number of additional attempts after the first request.
        headers: HTTP headers for the request (e.g. the result of ``get_header``).

    Returns:
        The ``data`` member of the decoded response when it has one,
        otherwise the whole decoded response.

    Raises:
        QueryFailError: If every attempt returns a non-200 status (including
            the case ``retries <= 0``), or the body is not valid JSON.
    """
    request = requests.post(URL, json={"query": query}, headers=headers)
    attempt = 0
    while request.status_code != 200 and attempt < retries:
        # Pause for a random 1-3 seconds before each retry.
        time.sleep(random.randint(1, 3))
        request = requests.post(URL, json={"query": query}, headers=headers)
        attempt += 1
        print("retrying in ", attempt)

    # Checked after the loop so a failure is also reported when no retry was
    # allowed, instead of returning the error body as if it were data.
    if request.status_code != 200:
        logger.error(f"{request.text}")
        raise QueryFailError(
            "Query failed to run by returning code of {}. {}".format(
                request.status_code, query
            )
        )

    try:
        # Round-trip through json so the decoded mapping has its keys sorted.
        response = json.loads(json.dumps(request.json(), sort_keys=True))
    except ValueError as err:
        # requests reports an undecodable body with a ValueError subclass.
        logger.error(f"{request.text}")
        raise QueryFailError(
            "Query failed to run by returning code of {}. {}".format(
                request.status_code, URL
            )
        ) from err

    try:
        return response['data']
    except KeyError:
        return response


# Section 3.1 PRs Generated by Copilot for PRs - collection process

In [None]:
# Queries that still failed after retries, kept per query type.
failed_edits_queries = []
failed_experiences_queries = []
failed_comments_queries = []
failed_prs_queries = []

# One row per edit of a PR description.
edit_contents = pd.DataFrame(
    columns=[
        'repoName',
        'number',
        'editor',
        'diff',
        'editedAt',
        'id',
        'afterCommitSha',
        'afterNumberOfCommits',
        'afterPeriod',
    ]
)

# One row per collected PR.
prs_df = pd.DataFrame(
    columns=[
        'repoName',
        'number',
        'title',
        'body',
        'repoLanguage',
        'repoCreatedAt',
        'forkCount',
        'stargazerCount',
        'repoAge',
        'createdAt',
        'mergedAt',
        'url',
        'state',
        'lastEditedAt',
        'firstEditedAtBycopilot4prs',
        'closedAt',
        'updatedAt',
        'deletions',
        'additions',
        'changedFiles',
        'commentsTotalCount',
        'author',
        'commitsTotalCount',
        'prExperience',
        'isMember',
    ]
)

# One row per PR comment.
comments_df = pd.DataFrame(
    columns=[
        'repoName',
        'number',
        'author',
        'comment',
        'id',
    ]
)

# Overall creation-date range to scan; the temp_* pair is the current window.
start_dt = datetime.datetime(2019, 2, 14)
end_dt = datetime.datetime(2023, 9, 1)
temp_start_dt = start_dt
temp_end_dt = end_dt
# Collect public PRs whose body matches "Generated by Copilot", walking the
# creation-date range from start_dt to end_dt in adaptive windows:
#   * a window with no matches is skipped;
#   * a window with at most 1000 matches is fetched page by page (50 PRs each);
#   * a larger window is halved and re-counted
#     (presumably because the search API caps retrievable results - confirm).
# After a window is processed, or one of its search queries fails, the next
# window starts where it ended and again extends to end_dt.
while temp_start_dt != end_dt:
    search_limit(get_header())
    # Count-only query (first: 0) used to decide how to handle the window.
    query = '''
    {
      search(
        query: """ "Generated by Copilot" is:public is:pr in:body created:temp_start_dt..temp_end_dt"""
        type: ISSUE
        first: 0
      ) {
        issueCount
      }
    }'''.replace('temp_start_dt',temp_start_dt.strftime('%Y-%m-%dT%H:%M:%SZ')).replace('temp_end_dt',temp_end_dt.strftime('%Y-%m-%dT%H:%M:%SZ'))
    try:
        results = send_query_with_retries(query,3,get_header())
    except QueryFailError as e:
        # Record the failed query and move on to the next window.
        logger.error('Obtaining pr counts with ' + str(e))
        failed_prs_queries.append(query)
        temp_start_dt = temp_end_dt
        temp_end_dt = end_dt
        continue
    issueCount = results['search']['issueCount']
    print(temp_start_dt, temp_end_dt, issueCount)
    if issueCount == 0:
        temp_start_dt = temp_end_dt
        temp_end_dt = end_dt
    elif issueCount <= 1000:
        search_limit(get_header())
        # First page of the window.
        # NOTE(review): this query requests totalCommentsCount, which is never
        # read below and is absent from the follow-up page query.
        query = '''
             query{
                search(
                    query: """ "Generated by Copilot" is:public is:pr in:body created:temp_start_dt..temp_end_dt"""
                    type: ISSUE
                    first: 50
                    ) {
                    edges {
                      node {
                        ... on PullRequest {
                          authorAssociation
                          number
                          title
                          body
                          repository {
                            nameWithOwner
                            primaryLanguage {
                              name
                            }
                            createdAt
                            forkCount
                            stargazerCount
                          }
                          createdAt
                          mergedAt
                          url
                          state
                          lastEditedAt
                          userContentEdits(first: 100) {
                            edges {
                              node {
                                editor {
                                  login
                                }
                                diff
                                editedAt
                                id
                              }
                            }
                          }
                          closedAt
                          updatedAt
                          deletions
                          additions
                          changedFiles
                          totalCommentsCount
                          comments(first: 50) {
                            totalCount
                            edges {
                              node {
                                body
                                author {
                                  login
                                }
                                id
                              }
                            }
                            pageInfo {
                              endCursor
                              hasNextPage
                              hasPreviousPage
                              startCursor
                            }
                          }
                          author {
                            login
                          }
                          commits(first: 100) {
                              edges {
                              node {
                                commit {
                                  oid
                                  committedDate
                                }
                              }
                            }
                            totalCount
                          }
                        }
                      }
                    }
                    pageInfo {
                      endCursor
                      hasNextPage
                      hasPreviousPage
                      startCursor
                    }
                  }
                }'''.replace('temp_start_dt',temp_start_dt.strftime('%Y-%m-%dT%H:%M:%SZ')).replace('temp_end_dt',temp_end_dt.strftime('%Y-%m-%dT%H:%M:%SZ'))
        try:
            results = send_query_with_retries(query,3,get_header())
        except QueryFailError as e:
            logger.error('Obtaining pr with ' + str(e))
            failed_prs_queries.append(query)
            temp_start_dt = temp_end_dt
            temp_end_dt = end_dt
            continue
        for pr in results['search']['edges']:
            # Editors with no account information are recorded as 'ghost'.
            editor_names = [editing['node']['editor']['login'] if editing['node']['editor'] is not None else 'ghost' for editing in pr['node']['userContentEdits']['edges']]
            # NOTE: the copilot4prs bot account has since become 'ghost'; when re-running, change 'copilot4prs' to 'ghost' (see https://gist.github.com/idan/325676d192b32f169b032fde2d866c2c#github-next--technical-preview-sunsets)
            # Skip PRs never edited by copilot4prs and PRs already recorded in prs_df.
            if ('copilot4prs' not in editor_names) or (pr['node']['number'] in list(prs_df.loc[prs_df['repoName'] == pr['node']['repository']['nameWithOwner']]['number'])): continue
            committed_dates = [pd.to_datetime(commit['node']['commit']['committedDate']) for commit in pr['node']['commits']['edges']]
            committed_shas = [commit['node']['commit']['oid'] for commit in pr['node']['commits']['edges']]
            firstEditedAtBycopilot4prs = None
            editings = pr['node']['userContentEdits']['edges']
            # Reversed in place; presumably the API returns newest-first, so
            # this makes the list oldest-first - confirm.
            editings.reverse()
            for editing in editings:
                editor_name = editing['node']['editor']['login'] if editing['node']['editor'] is not None else 'ghost'
                after_commit_sha = None
                after_number_of_commits = 0
                after_period = None
                # Keep the last commit (in list order) that precedes this edit;
                # after_period is the gap between that commit and the edit, in hours.
                for index, commited_date in enumerate(committed_dates):
                    if pd.to_datetime(editing['node']['editedAt']) > commited_date:
                        after_commit_sha = committed_shas[index]
                        after_number_of_commits = index + 1
                        after_period = (pd.to_datetime(editing['node']['editedAt']) - commited_date).total_seconds()/3600
                edit_contents = pd.concat([edit_contents,pd.DataFrame([
                    [pr['node']['repository']['nameWithOwner'],pr['node']['number'], editor_name, editing['node']['diff'], editing['node']['editedAt'], editing['node']['id'], after_commit_sha, after_number_of_commits, after_period]],
                    columns=edit_contents.columns)], ignore_index=True)
                if editor_name == 'copilot4prs' and firstEditedAtBycopilot4prs is None:
                    firstEditedAtBycopilot4prs = editing['node']['editedAt']
            pr_author = pr['node']['author']['login'] if pr['node']['author'] is not None else 'ghost'
            author_association = pr['node']['authorAssociation']
            is_member = author_association in ['MEMBER','OWNER']
            # Author experience: number of PRs by the same author in the same
            # repository created before this PR.
            search_limit(get_header())
            query = '''
                query{
              search(
                query: """
                repo:repo_name author:pr_author is:pr created:<pr_created
                """
                type: ISSUE
                first: 0
              ) {
                issueCount
              }
            }
            '''.replace('repo_name', pr['node']['repository']['nameWithOwner']).replace('pr_author', pr_author).replace('pr_created', pr['node']['createdAt'])
            try:
                pr_experiences_results = send_query_with_retries(query,3,get_header())
            except QueryFailError as e:
                # NOTE(review): the PR is skipped here, but its edits were
                # already appended to edit_contents above.
                logger.error('Obtaining author experience with ' + str(e))
                failed_experiences_queries.append(query)
                continue
            preliminary_lang = pr['node']['repository']['primaryLanguage']['name'] if pr['node']['repository']['primaryLanguage'] is not None else None
            # Repository age at PR creation, in days.
            repo_age = (pd.to_datetime(pr['node']['createdAt']) - pd.to_datetime(pr['node']['repository']['createdAt'])).total_seconds()/3600/24
            prs_df = pd.concat([prs_df,pd.DataFrame([
                [pr['node']['repository']['nameWithOwner'],pr['node']['number'], pr['node']['title'], pr['node']['body'],
                 preliminary_lang, pr['node']['repository']['createdAt'], pr['node']['repository']['forkCount'], pr['node']['repository']['stargazerCount'], repo_age, pr['node']['createdAt'], pr['node']['mergedAt'],
                 pr['node']['url'], pr['node']['state'], pr['node']['lastEditedAt'], firstEditedAtBycopilot4prs, pr['node']['closedAt'], pr['node']['updatedAt'],
                 pr['node']['deletions'], pr['node']['additions'], pr['node']['changedFiles'], pr['node']['comments']['totalCount'],
                  pr_author, pr['node']['commits']['totalCount'], pr_experiences_results['search']['issueCount'], is_member]],
                columns=prs_df.columns)], ignore_index=True)
            # Comments returned inline with the PR (first 50).
            for comment in pr['node']['comments']['edges']:
                comment_author = comment['node']['author']['login'] if comment['node']['author'] is not None else 'ghost'
                comments_df = pd.concat([comments_df,pd.DataFrame([
                [pr['node']['repository']['nameWithOwner'], pr['node']['number'], comment_author, comment['node']['body'], comment['node']['id']]],
                columns=comments_df.columns)], ignore_index=True)
            pr_comments = pr['node']['comments']
            # Fetch the remaining comment pages, 100 at a time.
            while pr_comments['pageInfo']['hasNextPage']:
                comment_endcursor = pr_comments['pageInfo']['endCursor']
                search_limit(get_header())
                query = '''query{
                  repository(name: "repo_name", owner: "repo_owner") {
                    pullRequest(number: pr_number) {
                      comments(first: 100, after: "comment_endcursor") {
                        edges {
                          node {
                            author {
                              login
                            }
                            body
                            id
                          }
                        }
                        pageInfo {
                          endCursor
                          hasNextPage
                          hasPreviousPage
                          startCursor
                        }
                      }
                    }
                  }
                }
                '''.replace('repo_name',pr['node']['repository']['nameWithOwner'].split('/')[1]).replace(
                    'repo_owner',pr['node']['repository']['nameWithOwner'].split('/')[0]).replace(
                    'pr_number', str(pr['node']['number'])).replace('comment_endcursor',comment_endcursor)
                try:
                    comments_results = send_query_with_retries(query,3,get_header())
                except QueryFailError as e:
                    # Give up on the remaining comment pages of this PR.
                    logger.error('Obtaining comment with ' + str(e))
                    failed_comments_queries.append(query)
                    break
                pr_comments = comments_results['repository']['pullRequest']['comments']
                for comment in pr_comments['edges']:
                    comment_author = comment['node']['author']['login'] if comment['node']['author'] is not None else 'ghost'
                    comments_df = pd.concat([comments_df,pd.DataFrame([
                    [pr['node']['repository']['nameWithOwner'], pr['node']['number'], comment_author, comment['node']['body'], comment['node']['id']]],
                    columns=comments_df.columns)], ignore_index=True)
        # Remaining result pages of the window; each page is processed with the
        # same steps as the first page above.
        while results['search']['pageInfo']['hasNextPage']:
            endcursor = results['search']['pageInfo']['endCursor']
            search_limit(get_header())
            query = '''
             query{
                search(
                    query: """ "Generated by Copilot" is:public is:pr in:body created:temp_start_dt..temp_end_dt"""
                    type: ISSUE
                    first: 50
                    after:"endcursor"
                    ) {
                    edges {
                      node {
                        ... on PullRequest {
                          authorAssociation
                          number
                          title
                          body
                          repository {
                            nameWithOwner
                            primaryLanguage {
                              name
                            }
                            createdAt
                            forkCount
                            stargazerCount
                          }
                          createdAt
                          mergedAt
                          url
                          state
                          lastEditedAt
                          userContentEdits(first: 100) {
                            edges {
                              node {
                                editor {
                                  login
                                }
                                diff
                                editedAt
                                id
                              }
                            }
                          }
                          closedAt
                          updatedAt
                          deletions
                          additions
                          changedFiles
                          comments(first: 50) {
                            totalCount
                            edges {
                              node {
                                body
                                author {
                                  login
                                }
                                id
                              }
                            }
                            pageInfo {
                              endCursor
                              hasNextPage
                              hasPreviousPage
                              startCursor
                            }
                          }
                          author {
                            login
                          }
                          commits(first: 100) {
                              edges {
                              node {
                                commit {
                                  oid
                                  committedDate
                                }
                              }
                            }
                            totalCount
                          }
                        }
                      }
                    }
                    pageInfo {
                      endCursor
                      hasNextPage
                      hasPreviousPage
                      startCursor
                    }
                  }
                }'''.replace('endcursor',endcursor).replace('temp_start_dt',temp_start_dt.strftime('%Y-%m-%dT%H:%M:%SZ')).replace('temp_end_dt',temp_end_dt.strftime('%Y-%m-%dT%H:%M:%SZ'))
            try:
                results = send_query_with_retries(query,3,get_header())
            except QueryFailError as e:
                # Stop paging this window; the failed query is kept for inspection.
                logger.error('Obtaining pr with ' + str(e))
                failed_prs_queries.append(query)
                break
            for pr in results['search']['edges']:
                # Editors with no account information are recorded as 'ghost'.
                editor_names = [editing['node']['editor']['login'] if editing['node']['editor'] is not None else 'ghost' for editing in pr['node']['userContentEdits']['edges']]
                # NOTE: the copilot4prs bot account has since become 'ghost'; when re-running, change 'copilot4prs' to 'ghost' (see https://gist.github.com/idan/325676d192b32f169b032fde2d866c2c#github-next--technical-preview-sunsets)
                # Skip PRs never edited by copilot4prs and PRs already recorded in prs_df.
                if ('copilot4prs' not in editor_names) or (pr['node']['number'] in list(prs_df.loc[prs_df['repoName'] == pr['node']['repository']['nameWithOwner']]['number'])): continue
                committed_dates = [pd.to_datetime(commit['node']['commit']['committedDate']) for commit in pr['node']['commits']['edges']]
                committed_shas = [commit['node']['commit']['oid'] for commit in pr['node']['commits']['edges']]
                firstEditedAtBycopilot4prs = None
                editings = pr['node']['userContentEdits']['edges']
                # Reversed in place; presumably the API returns newest-first, so
                # this makes the list oldest-first - confirm.
                editings.reverse()
                for editing in editings:
                    editor_name = editing['node']['editor']['login'] if editing['node']['editor'] is not None else 'ghost'
                    after_commit_sha = None
                    after_number_of_commits = 0
                    after_period = None
                    # Keep the last commit (in list order) that precedes this edit;
                    # after_period is the gap between that commit and the edit, in hours.
                    for index, commited_date in enumerate(committed_dates):
                        if pd.to_datetime(editing['node']['editedAt']) > commited_date:
                            after_commit_sha = committed_shas[index]
                            after_number_of_commits = index + 1
                            after_period = (pd.to_datetime(editing['node']['editedAt']) - commited_date).total_seconds()/3600
                    edit_contents = pd.concat([edit_contents,pd.DataFrame([
                        [pr['node']['repository']['nameWithOwner'],pr['node']['number'], editor_name, editing['node']['diff'], editing['node']['editedAt'], editing['node']['id'], after_commit_sha, after_number_of_commits, after_period]],
                        columns=edit_contents.columns)], ignore_index=True)
                    if editor_name == 'copilot4prs' and firstEditedAtBycopilot4prs is None:
                        firstEditedAtBycopilot4prs = editing['node']['editedAt']
                pr_author = pr['node']['author']['login'] if pr['node']['author'] is not None else 'ghost'
                author_association = pr['node']['authorAssociation']
                is_member = author_association in ['MEMBER','OWNER']
                # Author experience: number of PRs by the same author in the same
                # repository created before this PR.
                search_limit(get_header())
                query = '''
                    query{
                  search(
                    query: """
                    repo:repo_name author:pr_author is:pr created:<pr_created
                    """
                    type: ISSUE
                    first: 0
                  ) {
                    issueCount
                  }
                }
                '''.replace('repo_name', pr['node']['repository']['nameWithOwner']).replace('pr_author', pr_author).replace('pr_created', pr['node']['createdAt'])
                try:
                    pr_experiences_results = send_query_with_retries(query,3,get_header())
                except QueryFailError as e:
                    # NOTE(review): the PR is skipped here, but its edits were
                    # already appended to edit_contents above.
                    logger.error('Obtaining author experience with ' + str(e))
                    failed_experiences_queries.append(query)
                    continue
                preliminary_lang = pr['node']['repository']['primaryLanguage']['name'] if pr['node']['repository']['primaryLanguage'] is not None else None
                # Repository age at PR creation, in days.
                repo_age = (pd.to_datetime(pr['node']['createdAt']) - pd.to_datetime(pr['node']['repository']['createdAt'])).total_seconds()/3600/24
                prs_df = pd.concat([prs_df,pd.DataFrame([
                    [pr['node']['repository']['nameWithOwner'],pr['node']['number'], pr['node']['title'], pr['node']['body'],
                     preliminary_lang, pr['node']['repository']['createdAt'], pr['node']['repository']['forkCount'], pr['node']['repository']['stargazerCount'], repo_age, pr['node']['createdAt'], pr['node']['mergedAt'],
                     pr['node']['url'], pr['node']['state'], pr['node']['lastEditedAt'], firstEditedAtBycopilot4prs, pr['node']['closedAt'], pr['node']['updatedAt'],
                     pr['node']['deletions'], pr['node']['additions'], pr['node']['changedFiles'], pr['node']['comments']['totalCount'],
                      pr_author, pr['node']['commits']['totalCount'], pr_experiences_results['search']['issueCount'], is_member]],
                    columns=prs_df.columns)], ignore_index=True)
                # Comments returned inline with the PR (first 50).
                for comment in pr['node']['comments']['edges']:
                    comment_author = comment['node']['author']['login'] if comment['node']['author'] is not None else 'ghost'
                    comments_df = pd.concat([comments_df,pd.DataFrame([
                    [pr['node']['repository']['nameWithOwner'], pr['node']['number'], comment_author, comment['node']['body'], comment['node']['id']]],
                    columns=comments_df.columns)], ignore_index=True)
                pr_comments = pr['node']['comments']
                # Fetch the remaining comment pages, 100 at a time.
                while pr_comments['pageInfo']['hasNextPage']:
                    comment_endcursor = pr_comments['pageInfo']['endCursor']
                    search_limit(get_header())
                    query = '''query{
                      repository(name: "repo_name", owner: "repo_owner") {
                        pullRequest(number: pr_number) {
                          comments(first: 100, after: "comment_endcursor") {
                            edges {
                              node {
                                author {
                                  login
                                }
                                body
                                id
                              }
                            }
                            pageInfo {
                              endCursor
                              hasNextPage
                              hasPreviousPage
                              startCursor
                            }
                          }
                        }
                      }
                    }
                    '''.replace('repo_name',pr['node']['repository']['nameWithOwner'].split('/')[1]).replace(
                        'repo_owner',pr['node']['repository']['nameWithOwner'].split('/')[0]).replace(
                        'pr_number', str(pr['node']['number'])).replace('comment_endcursor',comment_endcursor)
                    try:
                        comments_results = send_query_with_retries(query,3,get_header())
                    except QueryFailError as e:
                        # Give up on the remaining comment pages of this PR.
                        logger.error('Obtaining comment with ' + str(e))
                        failed_comments_queries.append(query)
                        break
                    pr_comments = comments_results['repository']['pullRequest']['comments']
                    for comment in pr_comments['edges']:
                        comment_author = comment['node']['author']['login'] if comment['node']['author'] is not None else 'ghost'
                        comments_df = pd.concat([comments_df,pd.DataFrame([
                        [pr['node']['repository']['nameWithOwner'], pr['node']['number'], comment_author, comment['node']['body'], comment['node']['id']]],
                        columns=comments_df.columns)], ignore_index=True)
        # Window done: start the next one where this one ended.
        temp_start_dt = temp_end_dt
        temp_end_dt = end_dt
    else:
        # More than 1000 matches: halve the window and count again.
        temp_end_dt = temp_start_dt + (temp_end_dt - temp_start_dt)/2
        
        
    

In [None]:
prs_df

# Section 3.1 PRs Generated by Copilot for PRs - Identifying Obsolete Uses of Copilot for PRs

In [None]:
# Earliest copilot4prs edit recorded across all collected PRs.
firstEditedAtBycopilot4prs = pd.to_datetime(
    prs_df['firstEditedAtBycopilot4prs'].min()
)

In [None]:
str(firstEditedAtBycopilot4prs)

In [None]:
# Split the PRs by whether a close timestamp is present; copies are taken so
# that columns can be added to each part independently.
opend_prs_df = prs_df.loc[prs_df['closedAt'].isna()].copy()
closed_prs_df = prs_df.loc[prs_df['closedAt'].notna()].copy()

In [None]:
# An open PR is obsolete when it was created before the earliest copilot4prs
# edit in the data set.
opend_prs_df.loc[:, 'isObsolete'] = opend_prs_df.apply(
    lambda row: (pd.to_datetime(row['createdAt']) < firstEditedAtBycopilot4prs),
    axis=1,
)
# A closed PR is additionally obsolete when it was closed before its own
# first copilot4prs edit.
closed_prs_df.loc[:, 'isObsolete'] = closed_prs_df.apply(
    lambda row: (
        (pd.to_datetime(row['closedAt']) < pd.to_datetime(row['firstEditedAtBycopilot4prs']))
        | (pd.to_datetime(row['createdAt']) < firstEditedAtBycopilot4prs)
    ),
    axis=1,
)

In [None]:
# Recombine the two parts (open PRs first) under a fresh 0..n-1 index.
prs_df = pd.concat(
    (opend_prs_df, closed_prs_df),
    ignore_index=True,
)

In [None]:
prs_df[prs_df['isObsolete'] == False]

# Section 3.1 PRs Generated by Copilot for PRs - Excluding PRs Submitted by Bots

In [None]:
# Ground-truth account list; the cells below read its 'type' and 'account' columns.
bots = pd.read_csv(filepath_or_buffer='../data/groundtruthbots.csv')

In [None]:
# Drop comments whose author login ends with 'bot' or is listed as a "Bot"
# account in the ground-truth file.
cleaned_comments_df = comments_df[
    ~(
        comments_df['author'].str.endswith('bot')
        | comments_df['author'].isin(list(bots.loc[bots['type'] == "Bot"]['account']))
    )
]

In [None]:
cleaned_comments_df

# Section 4.2 Causal Inference - Calculate the PR metadata: comments, type, size, review time

In [None]:
# Per-PR metrics, accumulated row by row and attached to prs_df afterwards.
total_comments = []
author_comments = []
reviewers_comments = []
reviewers_counts = []
desc_lens = []
pr_sizes = []
review_times = []
is_generated_by_bots = []
# Built once: the set of ground-truth bot accounts does not change per row.
bot_accounts = set(bots.loc[bots['type'] == "Bot"]['account'])
for index, row in prs_df.iterrows():
    # A PR counts as bot-generated when its author login ends with 'bot' or
    # is a ground-truth bot account.
    is_generated_by_bots.append(row['author'].endswith('bot') or row['author'] in bot_accounts)
    # Bot-free comments belonging to this PR.
    comments = cleaned_comments_df.loc[(cleaned_comments_df['repoName'] == row['repoName']) & (cleaned_comments_df['number'] == row['number'])]
    reviewer_rows = comments.loc[comments['author'] != row['author']]
    author_comments.append(comments.loc[comments['author'] == row['author']].shape[0])
    reviewers_comments.append(reviewer_rows.shape[0])
    total_comments.append(comments.shape[0])
    reviewers_counts.append(len(set(reviewer_rows['author'])))
    # Length of the body after collapsing each run of characters other than
    # letters, digits and apostrophes into a single space.
    desc_lens.append(len(re.sub(r'[^A-Za-z0-9\']+',' ', row['body'])))
    pr_sizes.append(row['deletions'] + row['additions'])
    # pd.notna also covers NaN/NaT, not only None, for PRs that are still open.
    if pd.notna(row['closedAt']):
        # Hours between creation and close.
        review_times.append((pd.to_datetime(row['closedAt']) - pd.to_datetime(row['createdAt'])).total_seconds()/3600)
    else:
        review_times.append(None)
# commentsTotalCount is overwritten with the bot-free comment count.
prs_df['commentsTotalCount'] = total_comments
prs_df['authorComments'] = author_comments
prs_df['reviewersComments'] = reviewers_comments
prs_df['reviewersTotalCount'] = reviewers_counts
prs_df['bodyLength'] = desc_lens
prs_df['prSize'] = pr_sizes
prs_df['reviewTime'] = review_times
prs_df['isGeneratedByBots'] = is_generated_by_bots

        

In [None]:
prs_df

In [None]:
prs_df['repoName'].value_counts()

In [None]:
prs_df.loc[(prs_df['isObsolete'] == False) & ((prs_df['isGeneratedByBots'] == False))]

In [None]:
prs_df.loc[(prs_df['isObsolete'] == False) & ((prs_df['isGeneratedByBots'] == False))]['repoName'].value_counts()

In [None]:
prs_df.loc[(prs_df['isObsolete'] == False) & ((prs_df['isGeneratedByBots'] == False)) & (prs_df['state'].isin(['MERGED', 'CLOSED']))]

# Section 3.3 Revisions of PR Descriptions Generated by Copilot for PRs - Collection of Edits on PR Descriptions

In [None]:
# The 'diff' column is referred to as 'content' from here on.
edit_contents = edit_contents.rename({'diff': 'content'}, axis='columns')

In [None]:
edit_contents

# Section 3.3 Revisions of PR Descriptions Generated by Copilot for PRs - Exclusion of PRs Without Post-Copilot Edits

In [None]:
# Keep the edit history of every PR that is not obsolete, not bot-generated,
# merged or closed, and whose description was edited by someone other than
# copilot4prs after copilot4prs first edited it.
edit_contents_developers = pd.DataFrame(columns = edit_contents.columns)
urls = []
number_pr = 0      # eligible PRs (before the post-Copilot-edit check)
number_edits = 0   # edits belonging to those PRs
kept_groups = []
for name, group in edit_contents.groupby(['repoName','number']):
    pr = prs_df.loc[(prs_df['repoName'] == group.iloc[0]['repoName']) & (prs_df['number'] == group.iloc[0]['number'])]
    # Edits can exist for a PR that has no row in prs_df; without a row there
    # is nothing to test, so the group is skipped instead of taking the truth
    # value of an empty array.
    if pr.empty:
        continue
    pr_row = pr.iloc[0]
    if pr_row['isObsolete'] or pr_row['isGeneratedByBots'] or pr_row['state'] not in ('MERGED', 'CLOSED'):
        continue
    number_pr += 1
    number_edits += group.shape[0]
    copilot_edits = group.loc[group['editor'] == 'copilot4prs', 'editedAt']
    developer_edits = group.loc[group['editor'] != 'copilot4prs', 'editedAt']
    # min()/max() of an empty selection is NaN, which cannot be compared with
    # a timestamp string; such PRs have no post-Copilot edit to keep.
    if copilot_edits.empty or developer_edits.empty:
        continue
    first_edit_by_copilot = copilot_edits.min()
    last_edit_by_developer = developer_edits.max()
    if first_edit_by_copilot < last_edit_by_developer:
        kept_groups.append(group)
        urls.extend([pr_row['url']] * group.shape[0])
# Concatenate once; rows keep the order in which the groups were selected.
edit_contents_developers = pd.concat([edit_contents_developers] + kept_groups, ignore_index=True)
print(number_pr, number_edits)

In [None]:
# Attach the PR URL to every kept edit; `urls` holds one entry per row, in row order.
edit_contents_developers['url'] = urls

In [None]:
import numpy as np

# Label every PR with a coarse purpose derived from keywords in its body.
# The match is case-insensitive and a missing body counts as "no match".
body_text = prs_df['body']
mentions_documentation = body_text.str.contains('doc|copyright|license', case=False, na=False)
mentions_bug = body_text.str.contains('bug|fix|defect', case=False, na=False)

# np.select uses the first condition that holds, so a body matching both
# keyword sets is labelled 'Document'; a body matching neither is 'Feature'.
conditions = [mentions_documentation, mentions_bug]
choices = ['Document', 'Bug']

prs_df['purpose'] = np.select(
    conditions,
    choices,
    default='Feature',
)

In [None]:
# Persist the full PR table, without the positional index column.
prs_df.to_csv('../data/LLMPRs.csv', index=False)

In [None]:
# Export the metrics of the finished, non-obsolete, human-authored PRs.
# Identifier, free-text and timestamp columns are dropped so that only the
# remaining (metric) columns are written.
(
    prs_df
    .loc[
        (prs_df['isObsolete'] == False)
        & prs_df['state'].isin(['MERGED','CLOSED'])
        & (prs_df['isGeneratedByBots'] == False)
    ]
    .drop(columns=[
        'repoName', 'number', 'title', 'body',
        'repoCreatedAt', 'createdAt', 'mergedAt', 'url',
        'lastEditedAt', 'firstEditedAtBycopilot4prs', 'closedAt', 'updatedAt',
        'author', 'isObsolete', 'isGeneratedByBots',
    ])
    .to_csv('../data/treatment_metrics.csv', index=False)
)

In [None]:
# Persist the raw PR comments, without the positional index column.
comments_df.to_csv('../data/LLMPRsComments.csv', index=False)

In [None]:
# Persist the cleaned PR comments, without the positional index column.
cleaned_comments_df.to_csv('../data/cleanedLLMPRsComments.csv', index=False)

In [None]:
# Persist every collected description edit, without the positional index column.
edit_contents.to_csv('../data/edit_contents.csv', index=False)

In [None]:
# Persist the edits of PRs that developers revised after Copilot's edit,
# without the positional index column.
edit_contents_developers.to_csv('../data/edit_contents_developers.csv', index=False)

In [None]:
# PRs flagged as valid and not authored by a bot; their repositories drive
# the control-group collection below.
valid_prs_df = prs_df[
    (prs_df['isValid'] == True)
    & (prs_df['isGeneratedByBots'] == False)
]

In [None]:
# Display the valid, human-authored PRs selected above.
valid_prs_df

# Section 3.2 PRs Not generated by Copilot for PRs

In [None]:
# Control group: for every repository in valid_prs_df, collect the PRs that
# were created between start_dt and end_dt and do not match
# "Generated by Copilot", together with their comments and the author's prior
# PR count in that repository.
#
# Outputs (module-level names used by later cells):
#   control_prs_df              -- one row per collected PR
#   control_comments_df         -- one row per collected PR comment
#   failed_prs_queries          -- PR count/page queries that failed
#   failed_experiences_queries  -- author-experience queries that failed
#   failed_comments_queries     -- comment-page queries that failed
failed_comments_queries = []
failed_prs_queries = []
failed_experiences_queries = []

_CONTROL_PR_COLUMNS = [
    'repoName', 'number', 'title', 'body', 'repoLanguage', 'repoCreatedAt',
    'forkCount', 'stargazerCount', 'repoAge', 'createdAt',
    'mergedAt', 'url', 'state', 'lastEditedAt', 'closedAt',
    'updatedAt', 'deletions', 'additions', 'changedFiles', 'commentsTotalCount',
    'author', 'commitsTotalCount', 'prExperience', 'isMember',
]
_CONTROL_COMMENT_COLUMNS = ['repoName', 'number', 'author', 'comment', 'id']
_GITHUB_TIMESTAMP_FORMAT = '%Y-%m-%dT%H:%M:%SZ'
# A window is only paged through when it holds at most this many PRs (GitHub
# search exposes no more than 1000 results per query); larger windows are
# halved first.
_MAX_SEARCH_RESULTS = 1000

# Placeholder: the search string.
_PR_COUNT_QUERY = '''
        {
          search(
            query: """%s"""
            type: ISSUE
            first: 0
          ) {
            issueCount
          }
        }'''

# Placeholders: the search string, then an optional `after:"cursor"` clause.
_PR_PAGE_QUERY = '''
                 query{
                    search(
                        query: """%s"""
                        type: ISSUE
                        first: 50
                        %s
                        ) {
                        edges {
                          node {
                            ... on PullRequest {
                              authorAssociation
                              number
                              title
                              body
                              createdAt
                              mergedAt
                              url
                              state
                              lastEditedAt
                              closedAt
                              updatedAt
                              deletions
                              additions
                              changedFiles
                              comments(first: 50) {
                                totalCount
                                edges {
                                  node {
                                    body
                                    author {
                                      login
                                    }
                                    id
                                  }
                                }
                                pageInfo {
                                  endCursor
                                  hasNextPage
                                  hasPreviousPage
                                  startCursor
                                }
                              }

                              author {
                                login
                              }
                              commits {
                                totalCount
                              }
                            }
                          }
                        }
                        pageInfo {
                          endCursor
                          hasNextPage
                          hasPreviousPage
                          startCursor
                        }
                      }
                    }'''

# Placeholders: "owner/repo", author login, PR creation timestamp.
_AUTHOR_EXPERIENCE_QUERY = '''
                        query{
                      search(
                        query: """
                        repo:%s author:%s is:pr created:<%s
                        """
                        type: ISSUE
                        first: 0
                      ) {
                        issueCount
                      }
                    }
                    '''

# Placeholders: repository name, repository owner, PR number, comment cursor.
_COMMENT_PAGE_QUERY = '''query{
                          repository(name: "%s", owner: "%s") {
                            pullRequest(number: %s) {
                              comments(first: 100, after: "%s") {
                                edges {
                                  node {
                                    author {
                                      login
                                    }
                                    body
                                    id
                                  }
                                }
                                pageInfo {
                                  endCursor
                                  hasNextPage
                                  hasPreviousPage
                                  startCursor
                                }
                              }
                            }
                          }
                        }
                        '''


def _actor_login(actor):
    """Return ``actor['login']``, or ``'ghost'`` when the API returned no actor."""
    return actor['login'] if actor is not None else 'ghost'


def _control_search_filter(repo_name, window_start, window_end):
    """Build the search string for the control PRs of one repo and time window."""
    return ' NOT "Generated by Copilot" repo:{} is:pr created:{}..{}'.format(
        repo_name,
        window_start.strftime(_GITHUB_TIMESTAMP_FORMAT),
        window_end.strftime(_GITHUB_TIMESTAMP_FORMAT),
    )


def _collect_pr_comments(repo_name, pr_node, comment_rows):
    """Append all comments of ``pr_node`` to ``comment_rows``, following pagination.

    The first page comes embedded in ``pr_node``; further pages are fetched
    one by one. A failed page query is logged, recorded in
    ``failed_comments_queries`` and ends the collection for this PR.
    """
    pr_number = pr_node['number']
    name_parts = repo_name.split('/')
    comments_page = pr_node['comments']
    while True:
        for edge in comments_page['edges']:
            comment = edge['node']
            comment_rows.append([
                repo_name, pr_number, _actor_login(comment['author']),
                comment['body'], comment['id'],
            ])
        if not comments_page['pageInfo']['hasNextPage']:
            return
        search_limit(get_header())
        query = _COMMENT_PAGE_QUERY % (
            name_parts[1], name_parts[0], pr_number,
            comments_page['pageInfo']['endCursor'],
        )
        try:
            comments_results = send_query_with_retries(query, 3, get_header())
        except QueryFailError as e:
            logger.error('Obtaining comment with ' + str(e))
            failed_comments_queries.append(query)
            return
        comments_page = comments_results['repository']['pullRequest']['comments']


def _collect_pr_page(repo_name, repo_info, edges, known_numbers, pr_rows, comment_rows):
    """Turn one page of search results into PR rows and comment rows.

    PRs whose number is in ``known_numbers`` (already in the treatment data or
    already collected) are skipped. A PR whose author-experience query fails
    is logged, recorded in ``failed_experiences_queries`` and not collected.
    """
    for edge in edges:
        pr_node = edge['node']
        pr_number = pr_node['number']
        if pr_number in known_numbers:
            continue
        pr_author = _actor_login(pr_node['author'])
        is_member = pr_node['authorAssociation'] in ('MEMBER', 'OWNER')

        # Author experience: PRs the author opened in this repo before this one.
        search_limit(get_header())
        query = _AUTHOR_EXPERIENCE_QUERY % (repo_name, pr_author, pr_node['createdAt'])
        try:
            pr_experiences_results = send_query_with_retries(query, 3, get_header())
        except QueryFailError as e:
            logger.error('Obtaining author experience with ' + str(e))
            failed_experiences_queries.append(query)
            continue

        # Repository age, in days, at the time the PR was created.
        repo_age = (
            pd.to_datetime(pr_node['createdAt']) - pd.to_datetime(repo_info['created'])
        ).total_seconds()/3600/24
        pr_rows.append([
            repo_name, pr_number, pr_node['title'], pr_node['body'],
            repo_info['language'], repo_info['created'], repo_info['forks'],
            repo_info['stars'], repo_age, pr_node['createdAt'], pr_node['mergedAt'],
            pr_node['url'], pr_node['state'], pr_node['lastEditedAt'],
            pr_node['closedAt'], pr_node['updatedAt'],
            pr_node['deletions'], pr_node['additions'], pr_node['changedFiles'],
            pr_node['comments']['totalCount'],
            pr_author, pr_node['commits']['totalCount'],
            pr_experiences_results['search']['issueCount'], is_member,
        ])
        known_numbers.add(pr_number)
        _collect_pr_comments(repo_name, pr_node, comment_rows)


def _collect_window(repo_name, repo_info, search_filter, known_numbers, pr_rows, comment_rows):
    """Page through every PR matching ``search_filter`` and collect it.

    A failed page query is logged, recorded in ``failed_prs_queries`` and ends
    the collection for this window.
    """
    after_cursor = None
    while True:
        search_limit(get_header())
        after_clause = '' if after_cursor is None else 'after:"%s"' % after_cursor
        query = _PR_PAGE_QUERY % (search_filter, after_clause)
        try:
            results = send_query_with_retries(query, 3, get_header())
        except QueryFailError as e:
            logger.error('Obtaining pr with ' + str(e))
            failed_prs_queries.append(query)
            return
        search_results = results['search']
        _collect_pr_page(repo_name, repo_info, search_results['edges'],
                         known_numbers, pr_rows, comment_rows)
        if not search_results['pageInfo']['hasNextPage']:
            return
        after_cursor = search_results['pageInfo']['endCursor']


# Rows are accumulated in plain lists and turned into DataFrames once, which
# avoids re-copying the whole frame for every appended row.
control_pr_rows = []
control_comment_rows = []
start_dt = firstEditedAtBycopilot4prs
end_dt = pd.to_datetime('2023-09-01T00:00:00Z')
try:
    for name, group in valid_prs_df.groupby('repoName'):
        first_pr = group.iloc[0]
        repo_info = {
            'language': first_pr['repoLanguage'],
            'forks': first_pr['forkCount'],
            'stars': first_pr['stargazerCount'],
            'created': first_pr['repoCreatedAt'],
        }
        # PR numbers of this repo that must not be collected (again): the
        # treatment PRs, plus every control PR added while walking the windows.
        known_numbers = set(prs_df.loc[prs_df['repoName'] == name, 'number'])

        # Walk [start_dt, end_dt] in windows; a window holding too many PRs is
        # halved until it can be paged through.
        temp_start_dt = start_dt
        temp_end_dt = end_dt
        while temp_start_dt != end_dt:
            search_filter = _control_search_filter(name, temp_start_dt, temp_end_dt)
            search_limit(get_header())
            query = _PR_COUNT_QUERY % search_filter
            try:
                results = send_query_with_retries(query, 3, get_header())
            except QueryFailError as e:
                logger.error('Obtaining pr counts with ' + str(e))
                failed_prs_queries.append(query)
                temp_start_dt, temp_end_dt = temp_end_dt, end_dt
                continue
            issueCount = results['search']['issueCount']
            print(name, temp_start_dt, temp_end_dt, issueCount)

            if issueCount > _MAX_SEARCH_RESULTS:
                window_span = temp_end_dt - temp_start_dt
                if window_span > pd.Timedelta(seconds=1):
                    temp_end_dt = temp_start_dt + window_span/2
                    continue
                # The query has one-second resolution, so this window cannot be
                # narrowed any further; collect what the search returns instead
                # of halving forever.
                logger.error('Obtaining pr with a window that cannot be split further: ' + search_filter)
            if issueCount > 0:
                _collect_window(name, repo_info, search_filter, known_numbers,
                                control_pr_rows, control_comment_rows)
            temp_start_dt, temp_end_dt = temp_end_dt, end_dt
finally:
    # Build the frames even if the collection is interrupted, so that the rows
    # gathered so far stay available.
    control_prs_df = pd.DataFrame(control_pr_rows, columns=_CONTROL_PR_COLUMNS)
    control_comments_df = pd.DataFrame(control_comment_rows, columns=_CONTROL_COMMENT_COLUMNS)
            


In [None]:
# Display the collected control-group PRs.
control_prs_df

In [None]:
# Drop comments written by bots: authors whose login ends with 'bot' or who
# are listed in `bots` with type "Bot".
cleaned_control_comments_df = control_comments_df[
    ~control_comments_df['author'].str.endswith('bot')
    & ~control_comments_df['author'].isin(
        bots.loc[bots['type'] == "Bot", 'account'].tolist()
    )
]

In [None]:
# Display the control-group comments that remain after removing bot authors.
cleaned_control_comments_df

In [None]:

import re
from collections import defaultdict

# Derive the per-PR metrics of the control group and attach them as columns:
#   commentsTotalCount  -- non-bot comments on the PR (overwrites the API count)
#   authorComments      -- of those, comments written by the PR author
#   reviewersComments   -- of those, comments written by anyone else
#   reviewersTotalCount -- distinct commenters other than the PR author
#   bodyLength          -- length of the body after collapsing every run of
#                          characters outside [A-Za-z0-9'] into one space
#   prSize              -- additions + deletions
#   reviewTime          -- hours between createdAt and closedAt (missing if open)
#   isGeneratedByBots   -- author login ends with 'bot' or is a listed Bot account

# Loop-invariant lookups, built once instead of once per PR.
bot_accounts = set(bots.loc[bots['type'] == "Bot", 'account'])
non_word_run = re.compile(r'[^A-Za-z0-9\']+')

# (repoName, number) -> authors of the PR's non-bot comments, so the comment
# table is scanned once rather than once per PR.
comment_authors_by_pr = defaultdict(list)
for comment_repo, comment_pr_number, comment_author in zip(
        cleaned_control_comments_df['repoName'],
        cleaned_control_comments_df['number'],
        cleaned_control_comments_df['author']):
    comment_authors_by_pr[(comment_repo, comment_pr_number)].append(comment_author)

control_total_comments = []
control_author_comments = []
control_reviewers_comments = []
control_reviewers_counts = []
control_desc_lens = []
control_pr_sizes = []
control_review_times = []
control_is_generated_by_bots = []
for index, row in control_prs_df.iterrows():
    pr_author = row['author']
    control_is_generated_by_bots.append(pr_author.endswith('bot') or pr_author in bot_accounts)

    comment_authors = comment_authors_by_pr.get((row['repoName'], row['number']), [])
    reviewer_authors = [author for author in comment_authors if author != pr_author]
    control_author_comments.append(len(comment_authors) - len(reviewer_authors))
    control_reviewers_comments.append(len(reviewer_authors))
    control_total_comments.append(len(comment_authors))
    control_reviewers_counts.append(len(set(reviewer_authors)))

    # A missing body is measured as an empty description instead of raising.
    body = row['body'] if isinstance(row['body'], str) else ''
    control_desc_lens.append(len(non_word_run.sub(' ', body)))
    control_pr_sizes.append(row['deletions'] + row['additions'])

    # pd.notna covers both None and NaN, so an unset closedAt is never parsed.
    if pd.notna(row['closedAt']):
        control_review_times.append((pd.to_datetime(row['closedAt']) - pd.to_datetime(row['createdAt'])).total_seconds()/3600)
    else:
        control_review_times.append(None)
control_prs_df['commentsTotalCount'] = control_total_comments
control_prs_df['authorComments'] = control_author_comments
control_prs_df['reviewersComments'] = control_reviewers_comments
control_prs_df['reviewersTotalCount'] = control_reviewers_counts
control_prs_df['bodyLength'] = control_desc_lens
control_prs_df['prSize'] = control_pr_sizes
control_prs_df['reviewTime'] = control_review_times
control_prs_df['isGeneratedByBots'] = control_is_generated_by_bots

        

In [None]:
# Control PRs that were not authored by a bot account.
control_prs_df.loc[control_prs_df['isGeneratedByBots'] == False]

In [None]:
# Per-repository count of the control PRs not authored by a bot account.
(
    control_prs_df
    .loc[control_prs_df['isGeneratedByBots'] == False, 'repoName']
    .value_counts()
)

In [None]:
# Label every control PR with a coarse purpose derived from keywords in its
# body. The match is case-insensitive and a missing body counts as "no match".
control_body_text = control_prs_df['body']
control_mentions_documentation = control_body_text.str.contains('doc|copyright|license', case=False, na=False)
control_mentions_bug = control_body_text.str.contains('bug|fix|defect', case=False, na=False)

# np.select uses the first condition that holds, so a body matching both
# keyword sets is labelled 'Document'; a body matching neither is 'Feature'.
conditions = [control_mentions_documentation, control_mentions_bug]
choices = ['Document', 'Bug']

control_prs_df['purpose'] = np.select(
    conditions,
    choices,
    default='Feature',
)

In [None]:
# Persist the full control PR table, without the positional index column.
control_prs_df.to_csv('../data/control_prs_df.csv', index=False)

In [None]:
# List the columns of the control PR table, e.g. to pick the ones to drop
# before exporting.
control_prs_df.columns

In [None]:
# Export the metrics of the finished, human-authored control PRs.
# Identifier, free-text and timestamp columns are dropped so that only the
# remaining (metric) columns are written.
(
    control_prs_df
    .loc[
        control_prs_df['state'].isin(['MERGED','CLOSED'])
        & (control_prs_df['isGeneratedByBots'] == False)
    ]
    .drop(columns=[
        'repoName', 'number', 'title', 'body',
        'repoCreatedAt', 'createdAt', 'mergedAt', 'url',
        'lastEditedAt', 'closedAt', 'updatedAt',
        'author', 'isGeneratedByBots',
    ])
    .to_csv('../data/control_metrics.csv', index=False)
)

In [None]:
# Persist the raw control-group comments, without the positional index column.
control_comments_df.to_csv('../data/control_comments_df.csv', index=False)


In [None]:
# Persist the bot-free control-group comments, without the positional index column.
cleaned_control_comments_df.to_csv('../data/cleaned_control_comments_df.csv', index=False)


In [None]:
# Number of valid, human-authored PRs per state.
valid_prs_df['state'].value_counts()

In [None]:
# Number of control PRs not authored by a bot account, per state.
(
    control_prs_df
    .loc[control_prs_df['isGeneratedByBots'] == False, 'state']
    .value_counts()
)