# Consolidate Pull Requests data
Process commits data to summarize values from each pull request.

1. Run PRCommitMiner to mine PR commits and store the data in the database.
2. saveCommitsResults(repo_name, pr_number, merge_commit_sha, commits)
    - Process the commit information
3. GitHub web scraping to complement the data

### ~1. Consolidating data from individual commits on pull requests~

- ~__[Step Removed]__ First, look to merge commits.~
    - ~Lastly, get individual commits.~
        - ~If the number of mined commits equals the number of commits in the pull request: consolidate them.~
        - ~If not, flag the PR to the next phase (incompletecommits = True)~
    - ~Scrape GitHub pages for each incomplete pull request.~

In [None]:
# Driver cell: for every selected project, consolidate the mined commit data
# of each of its pull requests (or flag the pull request as incomplete).
projects = getProjects()

for repo in projects:
    # getProjects() returns one-column rows; unpack the repository name once so
    # the helpers below receive a plain string instead of the whole row tuple.
    repo_name = repo[0]
    print('\n\n----------- {} - INITIALIZING {} -----------\n\n'.format(datetime.now().strftime("%H:%M:%S"), repo_name))

    # Collect the pull requests of this repository.
    PRs = getPRs(repo_name)
    for PR in PRs:
        # PR is a (pr_number, merge_commit_sha, commits) row. Check the mined
        # commits and save the processed results for this pull request.
        saveCommitsResults(repo_name, PR[0], PR[1], PR[2])

    print('\n\n----------- {}'.format(datetime.now().strftime("%H:%M:%S")))

### ~GitHub Web scraping to complement pull requests data~
### GitHub API requests to complement pull requests data

* ~Pull requests that were mined neither by merge commit nor by list of commits~
* This scraping process was replaced by a new API endpoint:
   * https://api.github.com/repos/{owner}/{repo}/pulls/{pull_number}/files
  
  
##### We also compute the author frequency:
* Author_frequency = Ncommits / Nauthors
* Where:
   * Ncommits: Number of commits in a pull request.
   * Nauthors: Number of distinct authors in a pull request.

In [None]:
# Driver cell: for every pull request returned by getPRsToMine(), fetch the
# changed-lines figures from the GitHub API and compute the author frequency.
PRs = getPRsToMine()
total = len(PRs)
for i, PR in enumerate(PRs, start=1):
    project = PR[0]
    pull = PR[1]

    print(f"-- {i}/{total}: Project {project} - PR {pull}")
    set_changed_lines(project, pull)
    set_author_frequency(project, pull)


# %d/%m/%Y is day/month/year. The previous "%D/%M/%Y" put the minutes where the
# month belongs (%M is minutes) and relied on the non-portable %D directive.
print('\n\n----------- {} - PROCESS FINISHED -----------\n\n'.format(datetime.now().strftime("%d/%m/%Y %H:%M:%S")))

In [None]:
def set_author_frequency(repo, pull_number):
    """Compute commits-per-author for one pull request and store it.

    Reads the (pr_number, commits, distinct authors) row for the pull request,
    divides the commit count by the author count and writes both the ratio and
    the author count back through updateAuthorFrequency.

    Args:
        repo: Project name of the pull request.
        pull_number: Number of the pull request inside that project.
    """
    row = getPRAuthorFrequency(repo, pull_number)

    # A missing row or a zero count falls back to 1, which also keeps the
    # division below away from zero.
    commits = row[1] if row is not None and row[1] != 0 else 1
    authors = row[2] if row is not None and row[2] != 0 else 1

    author_frequency = commits / authors

    print(f"\t Commits:{commits}, Authors: {authors}, Author Frequency: {author_frequency}")
    # Persist the result on the pull request.
    updateAuthorFrequency(repo, pull_number, author_frequency, authors)

In [None]:
# Ad-hoc check: fetch the author-frequency row of one specific pull request.
#commits = getPRCommits('anastr/SpeedView', 81)
#len(commits)
authors = getPRAuthorFrequency('apache/shardingsphere-elasticjob', 1344)
# Bare expression so the notebook displays the fetched row.
authors

In [None]:
def set_changed_lines(repo, pull_number):
    """Fetch the files changed by a pull request and store the size figures.

    Requests the pull request's file list from the GitHub API, sums the
    additions, deletions and changes over the returned files, counts the test
    files (per isTestFile) and the lines changed in them, and writes the
    figures to the database through updatePRApi.

    Args:
        repo: Project name in "owner/repo" form, interpolated into the API URL.
        pull_number: Number of the pull request.

    Returns:
        None. When requestAPI yields no data the database is left untouched.
    """
    # Request the list of files changed by this pull request.
    # NOTE(review): this endpoint is paginated; only the single response
    # returned by requestAPI is counted here -- confirm large PRs are covered.
    url = f"https://api.github.com/repos/{repo}/pulls/{pull_number}/files"
    response = requestAPI(url)

    if response is None:
        return None

    commit_size = added = removed = test_files = test_lines = 0
    files = len(response)

    # Walk the changed files and accumulate the added and removed lines.
    for file in response:
        file_added = file['additions']
        file_removed = file['deletions']

        added += file_added
        removed += file_removed
        commit_size += file['changes']

        if isTestFile(file['filename']):
            test_files += 1
            # Count only this file's lines. Adding the running totals here
            # would also count every file seen before it, test or not.
            test_lines += file_added + file_removed

    print(f"\t Size:{commit_size}, Added: {added}, removed: {removed}, files: {files}, test files: {test_files}, test lines: {test_lines}")
    # Persist the figures on the pull request.
    updatePRApi(repo, pull_number, removed, added, files, commit_size, test_lines, test_files)

In [None]:
# Driver cell: download the "files" page of every pull request that still lacks
# size data and hand the parsed HTML to scrapPR.
PRs = getPRsToScrap()
for PR in PRs:
    repo_name, pr_number = PR[0], PR[1]
    url = 'https://github.com/' + repo_name + '/pull/' + str(pr_number) + '/files'
    print(url)

    response = requestPage(url)
    if response is not None:
        scrapPR(BeautifulSoup(response.text, 'html.parser'), repo_name, pr_number)

    # Short pause between consecutive page requests.
    time.sleep(.2)

### Functions

In [None]:
import psycopg2
import os
import requests 
import time
import pytz    
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pandas as pd
from scipy import stats
import numpy as np
np.seterr(divide='ignore', invalid='ignore')
from bs4 import BeautifulSoup

In [None]:
def getPRsToScrap():
    """Return the pull requests that still need their size data scraped.

    Selects (project_name, pr_number) for pull requests whose commit_size is
    null or that are flagged incompletecommits, limited to releases created
    inside the analysis window of projects with releases, issues and PRs under
    analysis. Rows are ordered by project name and PR number.

    Returns:
        The list of rows produced by cursor.fetchall().
    """
    query = """SELECT project_name, pr_number from PULLREQUESTS
                WHERE (commit_size is null or incompletecommits is true) and release_id IN (
                    SELECT R.node_id FROM PROJECTS P
                        INNER JOIN PROJECT_RELEASES R ON P.repo_name = R.repo_name
                    WHERE P.analysis_releases > 0 AND P.analysis_issues > 0 AND P.analysis_prs > 0
                        AND R.created_at between P.analysis_init AND P.analysis_finish
                    )
                    order by project_name, pr_number"""

    connection = connectDB()
    try:
        cursor = connection.cursor()
        cursor.execute(query)
        return cursor.fetchall()
    finally:
        # Release the connection even when the query raises.
        connection.close()

In [None]:
def getPRsToMine():
    """Return the pull requests whose release has a row in metrics_releases.

    Returns:
        The list of (project_name, pr_number) rows produced by
        cursor.fetchall(), ordered by project name and PR number.
    """
    query = """SELECT project_name, pr_number from metrics_releases m 
                inner join pullrequests p on m.release_id = p.release_id
                order by project_name, pr_number"""

    connection = connectDB()
    try:
        cursor = connection.cursor()
        cursor.execute(query)
        return cursor.fetchall()
    finally:
        # Release the connection even when the query raises.
        connection.close()

In [None]:
def scrapPR(html_soup,repo_name,pr_number):
    """Extract size figures from a scraped pull-request page and store them.

    Walks every element matching ``.file`` in the parsed page, accumulates the
    total changes, additions and deletions read from its diffstat, and counts
    test files (per isTestFile) and the lines changed in them. The prettified
    HTML is saved to disk, then the figures are written through updatePRscrap.

    Scraping errors on one file are printed and skipped so that one odd entry
    does not abort the whole pull request.

    Args:
        html_soup: Parsed page offering ``select`` and ``prettify``.
        repo_name: Project name; ``/`` is replaced by ``_`` in the file name.
        pr_number: Number of the pull request.
    """
    path = '/home/eliezio/Doutorado/Causalidade/PR_scraping/'+repo_name.replace('/','_')+'--'+str(pr_number)

    commit_size = added = removed = files = test_files = test_lines = 0

    for file_s in html_soup.select('.file'):
        files += 1

        try:
            commit_size += getCommitSize(file_s)
            added += getAddedLines(file_s)
            removed += getRemovedLines(file_s)
        except Exception as e:
            print('--- Exception: {}'.format(e))
            print('--- Error when scraping changes. Keep walking.')

        try:
            file_name = file_s.select('.Link--primary')[0]['title']
            if isTestFile(file_name):
                test_files += 1
                test_lines += getAddedLines(file_s) + getRemovedLines(file_s)
        except Exception as e:
            print('--- Exception: {}'.format(e))
            print('--- Error when scraping file name and test info. Keep walking.')

    # Save the page for future use. The context manager closes the handle even
    # if the write fails, and the explicit encoding keeps non-ASCII page
    # content from depending on the machine's locale.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(html_soup.prettify())

    # Persist the figures on the pull request.
    updatePRscrap(repo_name,pr_number,removed,added,files,commit_size,test_lines,test_files)

    print('Project: {}; PR number: {}; Total Commit size: {}; Added: {}; Removed: {}; Files: {}; test files: {}; test_lines: {}'.format(repo_name,pr_number,commit_size,added,removed, files,test_files,test_lines))

In [None]:
def getCommitSize(file_s):
    """Return the first number of a file's diffstat label as an int.

    Reads the ``aria-label`` of the first ``.diffstat`` element, splits it on
    single spaces and parses token 0 after stripping thousands separators.
    Presumably the label reads like "N changes: A additions & D deletions";
    confirm against the scraped markup.
    """
    label = str(file_s.select('.diffstat')[0]['aria-label'])
    tokens = label.split(' ')
    return int(tokens[0].replace(',', ''))

def getAddedLines(file_s):
    """Return the third token of a file's diffstat label as an int.

    Reads the ``aria-label`` of the first ``.diffstat`` element, splits it on
    single spaces and parses token 2 after stripping thousands separators.
    Presumably that token is the additions count of a label like
    "N changes: A additions & D deletions"; confirm against the markup.
    """
    label = str(file_s.select('.diffstat')[0]['aria-label'])
    tokens = label.split(' ')
    return int(tokens[2].replace(',', ''))

def getRemovedLines(file_s):
    """Return the sixth token of a file's diffstat label as an int.

    Reads the ``aria-label`` of the first ``.diffstat`` element, splits it on
    single spaces and parses token 5 after stripping thousands separators.
    Presumably that token is the deletions count of a label like
    "N changes: A additions & D deletions"; confirm against the markup.
    """
    label = str(file_s.select('.diffstat')[0]['aria-label'])
    tokens = label.split(' ')
    return int(tokens[5].replace(',', ''))

In [3]:
# Ad-hoc check of isTestFile on a path that sits under a "test/" directory but
# has no "_test"/"test_" style marker in its name.
test = 'test/e2e/gridObjectTestUtils.spec.js'
print(isTestFile(test))

False


In [1]:
def isTestFile(file):
    """Tell whether a file path looks like a test file.

    A path counts as a test file when it contains one of the underscore
    markers ``_test``, ``test_``, ``_Test``, ``Test_``, ``_TEST`` or ``TEST_``
    and does not contain ``latest`` / ``LATEST`` (which would otherwise match
    through e.g. ``latest_``).

    Args:
        file: File path or name to inspect.

    Returns:
        True when the path matches the heuristic, False otherwise.
    """
    # 'TEST_' completes the symmetric set; the original list checked '_TEST'
    # twice and so never matched an upper-case prefix.
    markers = ('_test', 'test_', 'Test_', '_Test', '_TEST', 'TEST_')
    if not any(marker in file for marker in markers):
        return False
    return 'latest' not in file and 'LATEST' not in file

In [None]:
def connectDB():
    """Open a new PostgreSQL connection to the analysis database.

    The password is the first line of ``/home/psql_pwd.txt``; every other
    connection setting is fixed in the call below.

    Returns:
        The connection object returned by psycopg2.connect. Closing it is the
        caller's responsibility.
    """
    # The context manager closes the password file; it was previously left open
    # on every call.
    with open('/home/psql_pwd.txt', "r") as f:
        pwd = f.readline().replace('\n','')

    return psycopg2.connect(user = "ci_quality",
                              password = pwd,
                              host = "127.0.0.1",
                              port = "5432",
                              database = "Causal_CI_Quality_v4")

In [None]:
def saveCommitsResults(repo_name,pr_number,merge_commit_sha, commits):
    """Consolidate the mined commits of one pull request, or flag it incomplete.

    When the number of mined commits found for the pull request is non-zero and
    equals ``commits``, their figures are summed column-wise and stored through
    updatePR. Otherwise the pull request is flagged through updatePRIncomplete.

    Args:
        repo_name: Project name of the pull request.
        pr_number: Number of the pull request.
        merge_commit_sha: Kept for interface compatibility. The merge-commit
            step was disabled, so this value is no longer read and the unused
            merge-commit query that ran on every call has been dropped.
        commits: Number of commits the pull request is expected to have.
    """
    print('Searching for the set of commits.')
    commits_list = getPRCommits(repo_name,pr_number)

    # Consolidate only when there is a set of commits and it is complete.
    if len(commits_list) > 0 and len(commits_list) == commits:
        data = sumMatrix(list(commits_list))
        updatePR(repo_name,pr_number,data, False)
    else:
        # Otherwise flag this pull request as incomplete.
        updatePRIncomplete(repo_name,pr_number)


In [None]:
def getPRs(project_name):
    """Return the pull requests recorded for a project.

    Args:
        project_name: Value matched against ``project_name`` with LIKE.
            NOTE(review): LIKE treats ``_`` and ``%`` as wildcards, so a name
            containing them may match other projects -- confirm ``=`` is not
            what is intended.

    Returns:
        The list of (pr_number, merge_commit_sha, commits) rows produced by
        cursor.fetchall().
    """
    query = """select pr_number,merge_commit_sha,commits from PULLREQUESTS 
    where project_name like %s;"""# and commit_size is null;"""

    connection = connectDB()
    try:
        cursor = connection.cursor()
        cursor.execute(query, [project_name])
        return cursor.fetchall()
    finally:
        # Release the connection even when the query raises.
        connection.close()

In [None]:
def getPRCommits(repo_name,pr_number):
    """Return the mined figures of every commit linked to a pull request.

    Only commits whose ``commit_size`` is not NULL are returned.

    Args:
        repo_name: Project name, matched with LIKE.
        pr_number: Number of the pull request.

    Returns:
        The list of (deletions, insertions, files, commit_size, test_volume,
        test_files) rows produced by cursor.fetchall().
    """
    query = """SELECT deletions, insertions, files, commit_size, test_volume, test_files 
    FROM COMMIT_PR CPR INNER JOIN COMMITS C ON C.commit_sha = CPR.commit_sha
    WHERE commit_size is not NULL AND project_name like %s AND pr_number = %s"""

    connection = connectDB()
    try:
        cursor = connection.cursor()
        cursor.execute(query, [repo_name,pr_number])
        return cursor.fetchall()
    finally:
        # Release the connection even when the query raises.
        connection.close()

In [None]:
def getPRAuthorFrequency(repo_name,pr_number):
    """Return the commit count and distinct-author count of a pull request.

    Args:
        repo_name: Project name, matched with LIKE.
        pr_number: Number of the pull request.

    Returns:
        The single row (pr_number, commits, count of distinct author_id)
        produced by cursor.fetchone(), or None when the query yields no row.
    """
    query = """select p.pr_number,p.commits,count(distinct author_id) from pullrequests p
                INNER JOIN commit_pr cp ON p.pr_number = cp.pr_number and p.project_name = cp.project_name
                INNER JOIN commits c ON cp.COMMIT_SHA = c.COMMIT_SHA
                where cp.pr_number = %s and cp.project_name like %s
                group by p.pr_number,p.commits;"""

    connection = connectDB()
    try:
        cursor = connection.cursor()
        cursor.execute(query, [pr_number,repo_name])
        return cursor.fetchone()
    finally:
        # Release the connection even when the query raises.
        connection.close()

In [None]:
def getMergeCommit(sha):
    """Return the mined figures stored for a merge commit.

    Only rows whose ``commit_size`` is not NULL are returned.

    Args:
        sha: Commit SHA, matched against ``commit_sha`` with LIKE.

    Returns:
        The list of (deletions, insertions, files, commit_size, test_volume,
        test_files) rows produced by cursor.fetchall().
    """
    query = """select deletions, insertions, files, commit_size, test_volume, test_files 
    FROM MERGE_COMMITS WHERE commit_size is not NULL AND commit_sha like %s"""

    connection = connectDB()
    try:
        cursor = connection.cursor()
        cursor.execute(query, [sha])
        return cursor.fetchall()
    finally:
        # Release the connection even when the query raises.
        connection.close()

In [None]:
def getProjects():
    """Return the repositories selected for commit consolidation.

    A project is selected when ``commits_mined`` is true and it has releases,
    issues and pull requests under analysis.

    Returns:
        The list of one-column (repo_name,) rows produced by cursor.fetchall(),
        in descending repo_name order.
    """
    query = """SELECT DISTINCT repo_name From PROJECTS 
    WHERE commits_mined IS TRUE and analysis_releases > 0 and analysis_issues > 0 and analysis_prs > 0
    order by repo_name desc;"""

    connection = connectDB()
    try:
        cursor = connection.cursor()
        cursor.execute(query)
        return cursor.fetchall()
    finally:
        # Release the connection even when the query raises.
        connection.close()

In [None]:
def updatePR(repo_name,pr_number,data,isMerge):
    """Store consolidated commit figures on a pull request.

    Also clears the ``incompletecommits`` flag. Errors are printed instead of
    raised so that a failing pull request does not stop the calling loop.

    Args:
        repo_name: Project name, matched with LIKE.
        pr_number: Number of the pull request.
        data: Sequence holding, in order, deletions, insertions, files,
            commit_size, test_volume and test_files.
        isMerge: Value stored in the "dataFromMergeCommit" column.
    """
    connection = None
    try:
        query = """UPDATE  PULLREQUESTS 
            set deletions = %s,
            insertions = %s,
            files = %s,
            commit_size = %s,
            test_volume = %s,
            test_files = %s,
            "dataFromMergeCommit" = %s,
            incompletecommits = False
            WHERE project_name like %s and pr_number = %s;"""

        print("""....Updating pullrequest: UPDATE  PULLREQUESTS set deletions = {},insertions = {},files = {},commit_size = {},test_volume = {},test_files = {}, dataFromMergeCommit = {} WHERE project_name like {} and pr_number = {};""".format(data[0],data[1],data[2],data[3],data[4],data[5],isMerge,repo_name,pr_number))

        connection = connectDB()
        cursor = connection.cursor()
        cursor.execute(query, [data[0],data[1],data[2],data[3],data[4],data[5],isMerge,repo_name,pr_number])
        connection.commit()
    except psycopg2.IntegrityError as e:
        print ("==============================================================")
        print ("Error while updating into PostgreSQL. updatePR >>> Exception: {}".format(e))
        print('Project: {}    PR - {} '.format(repo_name, pr_number))
    except Exception as e:
        print ("==============================================================")
        print ("Error while processing updatePR >>> Exception: {}".format(e))
        print('Project: {}    PR - {}  '.format(repo_name, pr_number))
    finally:
        # Close on every path. The generic error path used to leave the
        # connection open.
        if connection is not None:
            connection.close()

In [None]:
def updatePRscrap(repo_name,pr_number,removed,added,files,commit_size,test_lines,test_files):
    """Store scraped size figures on a pull request.

    Marks the row as ``datafromscrap`` and clears the "dataFromMergeCommit" and
    ``incompletecommits`` flags. Errors are printed instead of raised.

    Args:
        repo_name: Project name, matched with LIKE.
        pr_number: Number of the pull request.
        removed: Value for ``deletions``.
        added: Value for ``insertions``.
        files: Value for ``files``.
        commit_size: Value for ``commit_size``.
        test_lines: Value for ``test_volume``.
        test_files: Value for ``test_files``.
    """
    connection = None
    try:
        query = """UPDATE  PULLREQUESTS 
            set deletions = %s,
            insertions = %s,
            files = %s,
            commit_size = %s,
            test_volume = %s,
            test_files = %s,
            datafromscrap = True,
            "dataFromMergeCommit" = False,
            incompletecommits = False
            WHERE project_name like %s and pr_number = %s;"""

        connection = connectDB()
        cursor = connection.cursor()
        cursor.execute(query, [removed,added,files,commit_size,test_lines,test_files,repo_name,pr_number])
        connection.commit()
    except Exception as e:
        print ("==============================================================")
        print ("Error while updatePRscrap >>> Exception: {}".format(e))
        print('Project: {}    PR - {}  '.format(repo_name, pr_number))
    finally:
        # The connection was never closed before, leaking one per call.
        if connection is not None:
            connection.close()

In [None]:
def updatePRApi(repo_name,pr_number,removed,added,files,commit_size,test_lines,test_files):
    """Store size figures obtained from the GitHub API on a pull request.

    Marks the row as ``datafromapi`` and clears the ``datafromscrap``,
    "dataFromMergeCommit" and ``incompletecommits`` flags. Errors are printed
    instead of raised.

    Args:
        repo_name: Project name, matched with LIKE.
        pr_number: Number of the pull request.
        removed: Value for ``deletions``.
        added: Value for ``insertions``.
        files: Value for ``files``.
        commit_size: Value for ``commit_size``.
        test_lines: Value for ``test_volume``.
        test_files: Value for ``test_files``.
    """
    connection = None
    try:
        query = """UPDATE  PULLREQUESTS 
            set deletions = %s,
            insertions = %s,
            files = %s,
            commit_size = %s,
            test_volume = %s,
            test_files = %s,
            datafromscrap = False,
            datafromapi = True,
            "dataFromMergeCommit" = False,
            incompletecommits = False
            WHERE project_name like %s and pr_number = %s;"""

        connection = connectDB()
        cursor = connection.cursor()
        cursor.execute(query, [removed,added,files,commit_size,test_lines,test_files,repo_name,pr_number])
        connection.commit()
    except Exception as e:
        print ("==============================================================")
        print ("Error while updatePRApi >>> Exception: {}".format(e))
        print('Project: {}    PR - {}  '.format(repo_name, pr_number))
    finally:
        # The connection was never closed before, leaking one per call.
        if connection is not None:
            connection.close()

In [None]:
def updateAuthorFrequency(repo_name,pr_number,author_frequency,authors):
    """Store the author frequency and author count on a pull request.

    Errors are printed instead of raised.

    Args:
        repo_name: Project name, matched with LIKE.
        pr_number: Number of the pull request.
        author_frequency: Value for ``author_frequency``.
        authors: Value for ``nauthors``.
    """
    connection = None
    try:
        query = """UPDATE  PULLREQUESTS 
            set author_frequency = %s,
            nauthors = %s
            WHERE project_name like %s and pr_number = %s;"""

        connection = connectDB()
        cursor = connection.cursor()
        cursor.execute(query, [author_frequency,authors,repo_name,pr_number])
        connection.commit()
    except Exception as e:
        print ("==============================================================")
        print ("Error while updateAuthorFrequency >>> Exception: {}".format(e))
        print('Project: {}    PR - {}  '.format(repo_name, pr_number))
    finally:
        # The connection was never closed before, leaking one per call.
        if connection is not None:
            connection.close()

In [None]:
def updatePRIncomplete(repo_name,pr_number):
    """Flag a pull request as having an incomplete set of mined commits.

    Sets ``incompletecommits`` to True and "dataFromMergeCommit" to False.
    Errors are printed instead of raised.

    Args:
        repo_name: Project name, matched with LIKE.
        pr_number: Number of the pull request.
    """
    connection = None
    try:
        query = """UPDATE  PULLREQUESTS 
            set "dataFromMergeCommit" = False,
            incompletecommits = True
            WHERE project_name like %s and pr_number = %s;"""

        print("""....Updating INCOMPLETE pullrequest: UPDATE  PULLREQUESTS set"dataFromMergeCommit" = False, incompleteCommits = True WHERE project_name like {} and pr_number = {};""".format(repo_name,pr_number))

        connection = connectDB()
        cursor = connection.cursor()
        cursor.execute(query, [repo_name,pr_number])
        connection.commit()
    except psycopg2.IntegrityError as e:
        print ("==============================================================")
        print ("Error while updating into PostgreSQL. updatePRIncomplete >>> Exception: {}".format(e))
        print('Project: {}    PR - {} '.format(repo_name, pr_number))
    except Exception as e:
        print ("==============================================================")
        print ("Error while processing updatePRIncomplete >>> Exception: {}".format(e))
        print('Project: {}    PR - {}  '.format(repo_name, pr_number))
    finally:
        # Close on every path. The generic error path used to leave the
        # connection open.
        if connection is not None:
            connection.close()

In [None]:
def sumMatrix(matrix):
    """Return the column-wise sums of a matrix as a list of ints.

    The number of columns is taken from the first row, and every cell is
    converted with ``int`` before it is added.

    Args:
        matrix: Non-empty sequence of rows, each indexable by column.

    Returns:
        A list holding one integer total per column of the first row.
    """
    width = len(matrix[0])
    return [sum(int(row[col]) for row in matrix) for col in range(width)]

In [None]:
def requestPage(URL, timeout=30):
    """Download a web page, retrying until it succeeds or is reported missing.

    Any exception raised by the request and any status other than 200 or 404
    causes a 10 second pause followed by another attempt. Retries run in a
    loop, so a long outage cannot exhaust the recursion limit.

    Args:
        URL: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            raises and is retried instead of hanging forever.

    Returns:
        The response object when the status code is 200, or None when the
        server answers 404.
    """
    headers={
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36"    
    }

    while True:
        try:
            r = requests.get(url=URL, headers=headers, timeout=timeout)
        except Exception as e:
            print('\n Erro no request get: {}'.format(e))
            time.sleep(10)
            continue

        if r.status_code == 200:
            return r

        print('{} -- {} -- Code: {}'.format(datetime.now().strftime("%H:%M:%S"),URL,r.status_code))
        # NOTE(review): the original tested "!= 404" twice; a second status
        # code may have been intended -- confirm which one.
        if r.status_code == 404:
            return None
        time.sleep(10)

In [None]:
def loadTokens():
    """Read the GitHub tokens stored one per line in ``/home/gh_tokens.txt``.

    Reading stops at the first empty line or at the end of the file, so tokens
    placed after a blank line are ignored.

    Returns:
        The list of token strings, in file order, without trailing newlines.
    """
    tokens = []
    # The context manager closes the file even if reading raises.
    with open('/home/gh_tokens.txt', "r") as f:
        for line in f:
            tk = line.replace('\n','')
            if tk == '':
                break
            tokens.append(tk)
    return tokens

In [None]:
# Module-level token list read by requestAPI; loaded once when this cell runs.
tokens = loadTokens()
# Index into `tokens`; requestAPI updates it through `global i_token`.
i_token =0


def requestAPI(URL):
    """Call a GitHub API URL with a rotating token and return the decoded JSON.

    Each attempt moves to the next entry of the module-level ``tokens`` list,
    wrapping around at the end. When the API reports a rate-limit error the
    function sleeps for 600 seconds and tries again. The retried result is now
    returned; before, it was discarded and the error payload was returned.

    Args:
        URL: API endpoint to request.

    Returns:
        The decoded JSON payload, or None when the payload is empty, carries
        any other API ``message``, or cannot be inspected.
    """
    global i_token

    while True:
        # Rotate to the next token, wrapping around at the end of the list.
        if (len(tokens)-1) > i_token:
            i_token += 1
        else:
            i_token = 0

        # NOTE(review): the token is sent as the raw Authorization value;
        # presumably the tokens file already holds the scheme prefix -- confirm.
        r = requests.get(url = URL, headers={'Authorization': tokens[i_token],'Accept':'application/vnd.github.cloak-preview'}).json()
        if len(r) == 0:
            return None

        try:
            if isinstance(r, dict) and 'message' in r:
                print('{} -- {}'.format(datetime.now().strftime("%H:%M:%S"),r['message']))
                if 'API rate limit exceeded' in r['message']:
                    time.sleep(600)
                    continue
                return None

            return r
        except Exception as e:
            print('\n Erro no request get: {}'.format(e))
            print(r)
            return None