In [None]:
import pandas as pd
import difflib
import numpy as np
import re
import os
import requests
import json
import time
import datetime
import random
import logging
logger = logging.getLogger()
logger.setLevel(logging.ERROR)
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
from sentence_transformers import SentenceTransformer
import subprocess
# Load the PR-description edit history. Later cells read the repoName, number,
# content, editor and editedAt columns of this frame.
edit_contents_developers = pd.read_csv('../data/edit_contents_developers.csv')

In [None]:
# Display the loaded frame for a quick visual check.
edit_contents_developers

In [None]:
# Number of rows for each (repoName, number) pair.
edit_contents_developers.groupby(by=["repoName", "number"]).size()

In [None]:
# Replace missing 'content' values with empty strings so the string methods
# applied to this column in later cells never receive NaN.
edit_contents_developers['content'] = edit_contents_developers['content'].fillna('')

In [None]:
def parse_diff(lines):
    """Collect the added and deleted lines of a unified diff, keyed by line number.

    Parameters
    ----------
    lines : iterable of str
        Hunk lines of a unified diff (hunk headers starting with ``@@`` plus
        their body lines). File-level header lines must already be removed.

    Returns
    -------
    dict
        ``{'added': {new_line_no: text}, 'deleted': {old_line_no: text}}``.
        Line numbers are 1-based. The text has the leading ``+``/``-`` marker
        removed and trailing whitespace stripped.
    """
    modified_lines = {'added': {}, 'deleted': {}}
    # Number of the last line consumed in the old / new file respectively.
    count_deletions = 0
    count_additions = 0
    for line in lines:
        line = line.rstrip()
        if line.startswith('\\'):
            # "\ No newline at end of file" annotates the previous diff line;
            # it is not file content and must not advance either counter.
            continue
        if line.startswith('@@'):
            # A hunk header resets both counters to just before the hunk start.
            count_deletions, count_additions = get_line_numbers(line)
        elif line.startswith('-'):
            count_deletions += 1
            modified_lines['deleted'][count_deletions] = line[1:]
        elif line.startswith('+'):
            count_additions += 1
            modified_lines['added'][count_additions] = line[1:]
        else:
            # Context line: present in both versions of the file.
            count_deletions += 1
            count_additions += 1

    return modified_lines
def get_line_numbers(line):
    """Read the starting line numbers from a unified-diff hunk header.

    For a header such as ``@@ -10,2 +12,3 @@`` this returns ``(9, 11)``: each
    start number minus one, so that a caller can increment before use.

    Returns
    -------
    tuple of int
        ``(old_file_start - 1, new_file_start - 1)``.
    """
    fields = line.split(" ")
    # fields[1] is "-<start>[,<count>]" and fields[2] is "+<start>[,<count>]".
    old_start, new_start = (fields[position].split(",")[0] for position in (1, 2))
    return int(old_start.replace("-", "")) - 1, int(new_start) - 1



# Section 3.3 Revisions of PR Descriptions Generated by Copilot for PRs - Identification of PRs with Post-Copilot Edits.

In [None]:
def _git_diff_lines(old_text, new_text):
    """Return the hunk lines of ``git diff`` between two description versions.

    Both texts are written to the git object store through stdin, so their
    content never passes through a shell: backslashes and quotes are diffed
    verbatim, and the size is not bounded by the command-line length limit.
    A trailing newline is appended to each text, as the shell ``echo`` used
    previously did. Must be run inside a git repository.
    """
    blob_ids = []
    for text in (old_text, new_text):
        hashed = subprocess.run(
            ['git', 'hash-object', '-w', '--stdin'],
            input=(text + '\n').encode('utf-8'),
            stdout=subprocess.PIPE,
            check=True,
        )
        blob_ids.append(hashed.stdout.decode('utf-8').strip())
    diffed = subprocess.run(['git', 'diff', blob_ids[0], blob_ids[1]], stdout=subprocess.PIPE)
    # The first four output lines are the "diff --git", "index", "---" and "+++" headers.
    return diffed.stdout.decode('utf-8', errors='replace').split('\n')[4:]


# Per-row result lists; one entry is appended for every row that is visited.
# NOTE(review): entries are appended in groupby order (sorted by repoName, number)
# but are later assigned to the frame positionally. This assumes the CSV rows are
# already ordered that way -- confirm.
added = []
deleted = []
ngroup = []
summary_commands = []
all_commands = []
walkthrough_commands = []
poem_commands = []
is_edited_by_otherses = []
is_rerun_commands = []
is_edited = []
changes = []
i = 0
for name, group in edit_contents_developers.groupby(['repoName', 'number']):
    i += 1
    edits = group.reset_index(drop=True)
    # Lines currently attributed to Copilot and their line numbers (parallel lists).
    contents_by_copilot = []
    line_numbers = []
    is_edited_by_others = False
    is_rerun_command = False
    is_second_copilot_command = False
    summary_command = 0
    all_command = 0
    walkthrough_command = 0
    poem_command = 0
    for index, row in edits.iterrows():

        # Diff this version of the description against the previous one
        # (against an empty text for the first row of the PR).
        if index == 0: first = ''
        else: first = edits.iloc[index - 1]['content'].replace('\r','').strip()
        second = row['content'].replace('\r','').strip()
        if second.count('\n') == 0: second += '\n'
        diffs = _git_diff_lines(first, second)
        modified_lines = parse_diff(diffs)
        if row['editor'] == 'copilot4prs':
            # Checkbox lines in an edit made by copilot4prs are dropped from the change set.
            for number, line in list(modified_lines['added'].items()):
                if line.startswith('- [x]') or line.startswith('- [ ]'):
                    del modified_lines['added'][number]
            for number, line in list(modified_lines['deleted'].items()):
                if line.startswith('- [x]') or line.startswith('- [ ]'):
                    del modified_lines['deleted'][number]
        # Remove marker tags wrapped in HTML comments before counting commands.
        second = second.replace("""<!--\ncopilot:all\n-->""",'').replace("""<!--\ncopilot:summary\n-->""",'').replace("""<!--\ncopilot:walkthrough\n-->""",'').replace("""<!--\ncopilot:poem\n-->""",'')
        if row['editor'] != 'copilot4prs' and list(edits['editor']).count('copilot4prs') > 1 :
            # Each distinct marker is counted at most once per version of the description.
            command = list(set(re.findall(r"\bcopilot:(all|summary|walkthrough|poem)\b", second)))
            if 'summary' in command: summary_command += 1
            if 'all' in command: all_command += 1
            if 'walkthrough' in command: walkthrough_command += 1
            if 'poem' in command: poem_command += 1
        changes.append(modified_lines)
        ngroup.append(i)
        summary_commands.append(summary_command)
        all_commands.append(all_command)
        walkthrough_commands.append(walkthrough_command)
        poem_commands.append(poem_command)
        added_line_numbers = list(modified_lines['added'].keys())
        deleted_line_numbers = list(modified_lines['deleted'].keys())

        if len(contents_by_copilot) == 0:
            # Nothing is tracked yet: only an edit by copilot4prs starts the tracking.
            if row['editor'] == 'copilot4prs':
                contents_by_copilot.extend(modified_lines['added'].values())
                line_numbers.extend(added_line_numbers)
            is_edited.append(is_edited_by_others)
            continue
        # Replay the additions and deletions in ascending line order, shifting the
        # tracked line numbers as lines are inserted or removed.
        while (not (len(added_line_numbers) == 0 and len(deleted_line_numbers) == 0)):
            if len(added_line_numbers) == 0:
                # pop the smallest deleted line and shift the remaining deleted numbers by -1
                deleted_line = deleted_line_numbers.pop(0)
                deleted_line_numbers = [n - 1 for n in deleted_line_numbers]
                if len(contents_by_copilot) == 0: continue
                if deleted_line in line_numbers:
                    if row['editor'] != 'copilot4prs':
                        is_edited_by_others = True
                    deleted_index = line_numbers.index(deleted_line)
                    del(contents_by_copilot[deleted_index])
                    del(line_numbers[deleted_index])
                line_numbers = [n - 1 if n > deleted_line else n for n in line_numbers ]
                continue
            if len(deleted_line_numbers) == 0:
                # pop the smallest added line and shift tracked lines at or below it by +1
                added_line = added_line_numbers.pop(0)
                if len(contents_by_copilot) == 0: continue
                if row['editor'] == 'copilot4prs':
                    line_numbers = [n + 1 if n >= added_line else n for n in line_numbers ]
                    # Find the insertion position that keeps line_numbers in ascending order.
                    added_index = 0
                    for number in line_numbers:
                        if added_line < number:
                            break
                        else: added_index += 1
                    contents_by_copilot.insert(added_index, modified_lines['added'][added_line])
                    line_numbers.insert(added_index, added_line)
                else:
                    line_numbers = [n + 1 if n >= added_line else n for n in line_numbers ]
                continue
            # in each loop, pop the smallest line number among the added and deleted ones
            if added_line_numbers[0] < deleted_line_numbers[0]:
                added_line = added_line_numbers.pop(0)
                deleted_line_numbers = [n + 1 for n in deleted_line_numbers]
                if len(contents_by_copilot) == 0: continue
                if row['editor'] == 'copilot4prs':
                    line_numbers = [n + 1 if n >= added_line else n for n in line_numbers ]
                    added_index = 0
                    for number in line_numbers:
                        if added_line < number:
                            break
                        else: added_index += 1
                    contents_by_copilot.insert(added_index , modified_lines['added'][added_line])
                    line_numbers.insert(added_index , added_line)
                else:
                    line_numbers = [n + 1 if n >= added_line else n for n in line_numbers ]
            # equal added and deleted line numbers mean the line content was replaced
            elif added_line_numbers[0] == deleted_line_numbers[0]:
                added_line = added_line_numbers.pop(0)
                deleted_line = deleted_line_numbers.pop(0)
                if len(line_numbers) == 0: continue
                if added_line in line_numbers:
                    added_index = line_numbers.index(added_line)
                    if row['editor'] != 'copilot4prs':
                        # Someone else rewrote a tracked line: stop tracking it.
                        is_edited_by_others = True
                        del(contents_by_copilot[added_index])
                        del(line_numbers[added_index])
                    else:
                        contents_by_copilot[added_index] = modified_lines['added'][added_line]
            else:
                deleted_line = deleted_line_numbers.pop(0)
                deleted_line_numbers = [n - 1 for n in deleted_line_numbers]
                if len(contents_by_copilot) == 0: continue
                if deleted_line in line_numbers:
                    if row['editor'] != 'copilot4prs':
                        is_edited_by_others = True
                    deleted_index = line_numbers.index(deleted_line)
                    del(contents_by_copilot[deleted_index])
                    del(line_numbers[deleted_index])
                line_numbers = [n - 1 if n > deleted_line else n for n in line_numbers ]
        is_edited.append(is_edited_by_others)

    # PR-level flags are repeated for every row of the PR.
    is_edited_by_otherses.extend([is_edited_by_others] * edits.shape[0])
    if summary_command > 1 or all_command > 1 or walkthrough_command > 1 or poem_command > 1:
        is_rerun_command = True
    is_rerun_commands.extend([is_rerun_command] * edits.shape[0])

    


In [None]:
# Attach the per-row lists built by the tracking loop as new columns.
edit_contents_developers['group'] = ngroup
edit_contents_developers['allCommands'] = all_commands
edit_contents_developers['walkthroughCommands'] = walkthrough_commands
edit_contents_developers['poemCommands'] = poem_commands
# Use the per-row list: the scalar `summary_command` only holds the counter of
# the last group processed and would be broadcast to every row.
edit_contents_developers['summaryCommand'] = summary_commands
edit_contents_developers['isRerunCommands'] = is_rerun_commands
edit_contents_developers['isEditedByOtherses'] = is_edited_by_otherses
edit_contents_developers['isThisDditingCopilot'] = is_edited
edit_contents_developers['Changes'] = changes



# Section 3.3 Revisions of PR Descriptions Generated by Copilot for PRs - Filtering PRs that Reapply Marker Tags.

In [None]:
# Row counts per group among the rows flagged isRerunCommands.
edit_contents_developers[(edit_contents_developers['isRerunCommands'] == True)]['group'].value_counts()

In [None]:
# Display the rows flagged isRerunCommands.
edit_contents_developers[(edit_contents_developers['isRerunCommands'] == True)]

In [None]:
# Group ids of the rows flagged isEditedByOtherses.
edit_contents_developers[(edit_contents_developers['isEditedByOtherses'] == True)]['group']

In [None]:
# Save the annotated edit history. The frame to write is edit_contents_developers:
# `df` is only created by the sampling cell at the end of the notebook, so
# referencing it here fails on a fresh kernel.
edit_contents_developers.to_csv('../data/edit_contents_developers_with_diff.csv', index=None)

In [None]:
# Keep rows of PRs that did not re-run a command and whose tracked content was
# edited by someone other than copilot4prs. An explicit copy is taken because a
# column is added to coding_df later, which should not act on a slice of
# edit_contents_developers.
coding_df = edit_contents_developers[(edit_contents_developers['isRerunCommands'] == False) & (edit_contents_developers['isEditedByOtherses'] == True)].copy()

In [None]:
# Row counts per group in the selected subset.
coding_df['group'].value_counts()

In [None]:
# Display the selected subset.
coding_df

In [None]:
# Row counts per repository in the selected subset.
coding_df['repoName'].value_counts()

In [None]:


# Read one API token per line. Blank lines (for example a trailing empty line)
# are skipped so that no empty bearer token enters the rotation.
with open('./env/tokens.txt') as f:
    lines = [line.strip() for line in f if line.strip()]
if not lines:
    raise ValueError("No API tokens found in ./env/tokens.txt")
class QueryFailError(Exception):
    """Raised when an API request does not come back with HTTP status 200."""
    pass
# One record per token; 'reset' is filled in by the rate-limit helpers with the
# time at which that token's quota resets.
tokens = [{'token': token, 'reset': None} for token in lines]
# Index into `tokens` of the token currently in use.
token_index = 0
def get_graphql_header():
    """Build the request headers for the GraphQL API using the active token."""
    active_token = tokens[token_index]['token']
    return {"Authorization": f"Bearer {active_token}"}
def get_rest_header():
    """Build the request headers for the REST API using the active token."""
    active_token = tokens[token_index]['token']
    rest_headers = {"Accept": "application/vnd.github+json"}
    rest_headers["Authorization"] = f"Bearer {active_token}"
    rest_headers["X-GitHub-Api-Version"] = "2022-11-28"
    return rest_headers
def send_rest_query(url, headers):
    """GET `url` once and return the decoded JSON body.

    The body is re-serialised with sorted keys before decoding, so nested
    objects come back with their keys in sorted order.

    Raises
    ------
    QueryFailError
        If the response status is not 200 (the response text is logged first).
    """
    reply = requests.get(url, headers=headers)
    if reply.status_code == 200:
        return json.loads(json.dumps(reply.json(), sort_keys=True))
    logger.error(f"{reply.text}")
    raise QueryFailError(
        "Query failed to run by returning code of {}. {}".format(reply.status_code, url)
    )

def search_rest_limit(headers):
    """Check the REST search quota of the active token and rotate tokens if needed.

    Records the reset time of the active token. When at most one search
    request remains, switches to the next token; if that token's recorded
    reset time is still in the future, sleeps until it has passed.

    If the rate-limit query fails or its payload lacks an expected key, the
    error is logged and the function sleeps for one hour.
    """
    global tokens
    global token_index
    try:
        resp = send_rest_query("https://api.github.com/rate_limit", headers)
        search_limit = resp['resources']['search']
        # The REST API reports `reset` in UTC epoch seconds. Without unit='s' the
        # integer would be read as nanoseconds, and without utc=True the result
        # would be tz-naive and not comparable with the aware "now" below.
        tokens[token_index]['reset'] = pd.to_datetime(search_limit['reset'], unit='s', utc=True)
        if search_limit['remaining'] > 1:
            return
        token_index = (token_index + 1) % len(tokens)
        next_reset = tokens[token_index]['reset']
        if next_reset is None:
            # The next token has not been checked yet.
            return
        now = datetime.datetime.now(datetime.timezone.utc)
        if next_reset < now:
            return
        time.sleep((next_reset - now).total_seconds())
        logger.info(f"Token {token_index} is ready to use again.")
    except (QueryFailError,KeyError) as e:
        logger.error('sleep in ' + str(datetime.datetime.now()) + str(e))
        time.sleep(3600)

def send_rest_query_with_retries(url, retries, headers):
    """GET `url`, retrying up to `retries` times after a non-200 response.

    Each retry is preceded by a random pause of 1-3 seconds. The JSON body of
    the first successful response is returned, re-serialised with sorted keys.

    Raises
    ------
    QueryFailError
        If no attempt returns status 200. This now also covers `retries <= 0`,
        where a failed request was previously parsed as if it had succeeded.
    """
    for attempt in range(max(retries, 0) + 1):
        if attempt > 0:
            time.sleep(random.randint(1, 3))
        request = requests.get(url, headers=headers)
        if attempt > 0:
            logger.info("retrying in " + str(attempt))
        if request.status_code == 200:
            break
    else:
        # Every attempt failed; report the last response.
        logger.error(f"{request.text}")
        raise QueryFailError(
            "Query failed to run by returning code of {}. {}".format(
                request.status_code, url
            )
        )
    response = json.JSONDecoder().decode(json.dumps(request.json(), sort_keys=True))
    return response
        
        
graphql_url = "https://api.github.com/graphql"
def send_graphql_query(query, headers):
    """POST a GraphQL query once and return its result.

    Returns the value under the 'data' key of the decoded response, or the
    whole decoded response when it has no 'data' key.

    Raises
    ------
    QueryFailError
        If the response status is not 200 (the response text is logged first).
    """
    reply = requests.post(graphql_url, json={"query": query}, headers=headers)
    if reply.status_code != 200:
        logger.error(f"{reply.text}")
        raise QueryFailError(
            "Query failed to run by returning code of {}. {}".format(reply.status_code, query)
        )
    decoded = json.loads(json.dumps(reply.json(), sort_keys=True))
    if 'data' in decoded:
        return decoded['data']
    return decoded
    
def search_graphql_limit(headers):
    """Check the GraphQL quota of the active token and rotate tokens if needed.

    Records the reset time of the active token. When 200 or fewer points
    remain, switches to the next token; if that token's recorded reset time is
    still in the future, sleeps until it has passed.

    If the query fails or its result lacks an expected key, the error is
    logged and the function sleeps for one hour.
    """
    global tokens
    global token_index
    query = """
                    query {
                      viewer {
                        login
                      }
                      rateLimit {
                        limit
                        cost
                        remaining
                        resetAt
                      }
                }
                """
    try:
        rate_limit = send_graphql_query(query, headers)['rateLimit']
        tokens[token_index]['reset'] = pd.to_datetime(rate_limit['resetAt'])
        if rate_limit['remaining'] > 200:
            return
        # Quota nearly used up: move on to the next token in the rotation.
        token_index = (token_index + 1) % len(tokens)
        next_reset = tokens[token_index]['reset']
        if next_reset is None:
            return
        if next_reset < datetime.datetime.now(datetime.timezone.utc):
            return
        time.sleep((next_reset - datetime.datetime.now(datetime.timezone.utc)).total_seconds())
        print(f"Token {token_index} is ready to use again.")
    except (QueryFailError,KeyError) as e:
        logger.error('sleep in ' + str(datetime.datetime.now()) + str(e))
        time.sleep(3600)
        
def send_graphql_query_with_retries(query, retries, headers):
    """POST a GraphQL query, retrying up to `retries` times after a non-200 response.

    Each retry is preceded by a random pause of 1-3 seconds. Returns the value
    under the 'data' key of the decoded response, or the whole decoded response
    when it has no 'data' key.

    Raises
    ------
    QueryFailError
        If no attempt returns status 200. This now also covers `retries <= 0`,
        where a failed request was previously parsed as if it had succeeded.
    """
    for attempt in range(max(retries, 0) + 1):
        if attempt > 0:
            time.sleep(random.randint(1, 3))
        request = requests.post(graphql_url, json={"query": query}, headers=headers)
        if attempt > 0:
            print("retrying in ", attempt)
        if request.status_code == 200:
            break
    else:
        # Every attempt failed; report the last response.
        logger.error(f"{request.text}")
        raise QueryFailError(
            "Query failed to run by returning code of {}. {}".format(
                request.status_code, query
            )
        )

    response = json.JSONDecoder().decode(json.dumps(request.json(), sort_keys=True))

    try:
        return response['data']
    except KeyError:
        return response
def clean_string(text):
    """Normalise text to lower-case words separated by single spaces.

    Whitespace runs are collapsed, every run of characters other than ASCII
    letters, digits, '.' and "'" becomes one space, and the result is
    lower-cased and stripped.
    """
    collapsed = re.sub(r'\s+',' ', text)
    filtered = re.sub(r'[^A-Za-z0-9.\']+',' ', collapsed)
    return filtered.lower().strip()

In [None]:
# Load the sentence-embedding model; a later cell encodes cleaned template and
# PR-description texts with it.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Clone repositories for tracking the correct PR template 

In [None]:
# Clone every repository of the selected subset into a local directory named
# "<owner>_<repo>". Arguments are passed as a list so repository names are never
# interpreted by a shell, and an existing clone is left untouched on re-runs.
for repo in coding_df['repoName'].unique():
    repo_name = repo.replace('/','_')
    if os.path.isdir(repo_name):
        continue
    subprocess.run(['git', 'clone', f"https://github.com/{repo}", repo_name])

# Section 3.3 Revisions of PR Descriptions Generated by Copilot for PRs - Identifying the PR Template.

In [None]:
# For every repository, ask the GraphQL API for its PR template file names and
# then use REST code search to resolve each name to its path(s) in the repository.
pr_templates = pd.DataFrame()
repos = []
file_paths = []
# Number of code-search requests sent since the last pause.
request = 0
for repo in coding_df['repoName'].unique():
    search_graphql_limit(get_graphql_header())
    query = '''
         query{
          repository(name: "repo_name", owner: "repo_owner") {
            pullRequestTemplates {
              filename
            }
          }
        }'''.replace('repo_name',repo.split('/')[1]).replace('repo_owner',repo.split('/')[0])
    try:
        template_results = send_graphql_query_with_retries(query,3,get_graphql_header())
    except QueryFailError as e:
        # Skip this repository instead of reusing the previous repository's results.
        print(e)
        continue
    # `repository` is missing or null when the query returned errors or the
    # repository could not be resolved.
    repository = (template_results or {}).get('repository') or {}
    # Each distinct file name is searched once, in a stable order.
    templates = sorted({template['filename'] for template in repository.get('pullRequestTemplates') or []})
    for filename in templates:
        search_rest_limit(get_rest_header())
        url = f"https://api.github.com/search/code?q=repo:{repo}+filename:{filename}"
        request += 1
        if request > 8:
            # Pause after every eight search requests.
            time.sleep(61)
            request = 0
        try:
            search_results = send_rest_query(url, get_rest_header())
        except QueryFailError as e:
            # Skip this template instead of re-reading stale search results.
            print(e)
            continue
        for file in search_results.get('items', []):
            repos.append(repo)
            file_paths.append(file['path'])



In [None]:
# Fill the template table from the two parallel lists built by the search loop:
# one row per (repository, template file path) pair.
pr_templates['repo'] = repos
pr_templates['templateFilePath'] = file_paths

In [None]:
# Rows of repositories that appear more than once, i.e. have several template paths.
pr_templates[pr_templates.duplicated(['repo'], keep=False)]

In [None]:
# For every PR, find the version of each template file that was committed before
# the PR's first recorded edit, and keep the one template that applies:
# the only candidate, or the candidate most similar to the PR description.
pr_template_contents = []
# Chosen template per row label of coding_df, so the final list can be emitted in
# row order even if the rows are not sorted by (repoName, number).
template_by_row = {}
for name, group in coding_df.groupby(['repoName','number']):
    repo_name = group.iloc[0]['repoName']
    # Sorted for a reproducible candidate order.
    templates_file_paths = sorted(set(pr_templates[pr_templates['repo'] == repo_name]['templateFilePath']))
    first_edit_time = pd.to_datetime(group.iloc[0]['editedAt'])
    pr_des = group.iloc[0]['content']
    contents = []
    temp_repo_name = repo_name.replace('/','_')
    if not os.path.isdir(temp_repo_name):
        # No local clone: no template can be looked up for this repository.
        templates_file_paths = []
    for file_path in templates_file_paths:
        command = [
            'git', 'log', '--pretty=format:%H|%cd|', '--date=format:%Y-%m-%dT%H:%M:%S%z', '--name-only', '--follow', '--', file_path
        ]
        log_result = subprocess.run(command, cwd=temp_repo_name, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # Entries are separated by blank lines and read "<sha>|<date>|" followed by
        # the file name at that commit on the next line.
        commits = log_result.stdout.decode('utf-8').strip().split('\n\n')
        sha = filepath = None
        for commit in commits:
            log = commit.replace('\n','').split('|')
            if len(log) < 3 or not log[2]:
                # Empty or malformed entry, e.g. when the path has no history.
                continue
            # Take the first listed commit made before the PR's first edit.
            if pd.to_datetime(log[1]) < first_edit_time:
                sha, filepath = log[0], log[2]
                break
        if sha is None:
            continue
        show_result = subprocess.run(
            ['git', 'show', f'{sha}:{filepath}'],
            cwd=temp_repo_name, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        if show_result.returncode == 0:
            contents.append(show_result.stdout.decode('utf-8'))
    if len(contents) == 1:
        chosen_template = contents[0]
    elif len(contents) == 0:
        chosen_template = None
    else:
        # Several candidates: embed them together with the PR description and
        # keep the candidate with the highest cosine similarity to it.
        contents.append(pr_des)
        cleaned_template = [clean_string(content) for content in contents]
        embeddings = model.encode(cleaned_template)
        csim = cosine_similarity(embeddings)
        chosen_template = contents[int(np.argmax(csim[-1][:-1]))]
    for row_label in group.index:
        template_by_row[row_label] = chosen_template
pr_template_contents = [template_by_row[row_label] for row_label in coding_df.index]

In [None]:
# Add the template column through `assign`, which returns a new frame, instead
# of writing a column into a frame obtained by filtering another frame.
coding_df = coding_df.assign(prTemplate=pr_template_contents)

# 4.3 Qualitative Analysis - Preparing sample

In [None]:
# Shuffle the PRs (groups) with a fixed seed while keeping the rows of each
# group together, then stack them into one frame with a fresh 0..n-1 index.
random.seed(0)
groups = [group_rows for _, group_rows in coding_df.groupby('group')]
random.shuffle(groups)

df = pd.concat(groups, ignore_index=True)

In [None]:
# Save the shuffled sample frame.
df.to_csv('../data/coded_sample.csv',index=None)