### Mining commit metadata and communication
- Flagging the PRs under analysis
- Mining Commit Metadata and Communication
- Mining merge commits


In [None]:
# Driver cell: for every project still pending, walk its releases inside the
# analysis window, mine each not-yet-mined pull request (comments + commits),
# and finally persist the per-project "mined" flags.
projects = getProjects()

for repo in projects:
    # Row layout follows the SELECT in getProjects():
    # (repo_name, repo_url, analysis_init, analysis_finish).
    repo_name = repo[0]
    repo_url = repo[1]
    init_date = repo[2]
    end_date = repo[3]
    # Stays True only if every PR of this project is mined successfully.
    mined = True
    
    print('\n\n----------- {} - INITIALIZING {} -----------\n\n'.format(datetime.now().strftime("%H:%M:%S"),repo_name))
    
    releases = getReleasesInPeriod(repo_name,init_date,end_date)
    for r in releases:
        # Each release row holds a single column: node_id.
        release_id = r[0]
        
        
        # PR row layout follows the SELECT in getPRsNotMined():
        # (pr_number, comments_url, commits_url, review_comments_url, merge_commit_sha).
        prs = getPRsNotMined(release_id)
        for PR in prs:    
            print('\t\t PR: {}'.format(PR[0]))
            
            # Collect the comments / review comments and store their counts
            # on the pullrequests table.
            communication_mined = setCommunication(repo_name, PR[0], PR[1],PR[3])      

            # Collect the commits of this pull request.
            commits_mined = getCommits(PR[2],repo_name, PR[0])


            # Flag the PR as mined only when both steps succeeded; otherwise
            # remember that the project as a whole is incomplete.
            if communication_mined and commits_mined:
                checkPRMined(repo_name, PR[0])
            else:
                mined = False

            # If merge_commit_sha is not null, record it as a merge commit.
            # This runs regardless of whether the mining steps above succeeded.
            if PR[4] is not None:
                recordMergeCommit(PR[4])


    # Persist the 'mined' flags on table projects.
    checkProjectMined(repo_name, mined)

    print('----------- {}'.format(datetime.now().strftime("%H:%M:%S")))

In [None]:
def getReleasesInPeriod(project,init,finish):
    """Return the releases of a project created inside a time window.

    Args:
        project: Pattern matched against ``project_releases.repo_name`` with
            SQL ``LIKE`` (``%`` and ``_`` therefore act as wildcards).
        init: Lower bound for ``created_at`` (SQL ``BETWEEN``, inclusive).
        finish: Upper bound for ``created_at`` (SQL ``BETWEEN``, inclusive).

    Returns:
        The rows produced by ``cursor.fetchall()``; each row holds a single
        column, ``node_id``.

    Raises:
        Any exception raised by ``connectDB`` or by the query is propagated;
        the connection is closed in every case.
    """
    query = """select node_id from project_releases 
        where repo_name like %s
        and created_at between %s and %s"""

    connection = connectDB()
    try:
        with connection.cursor() as cursor:
            cursor.execute(query, [project, init, finish])
            return cursor.fetchall()
    finally:
        # Always release the connection, even when the query fails.
        connection.close()

In [None]:
def getPRsNotMined(release_id):
    """Return the pull requests of a release that are not flagged as mined.

    Args:
        release_id: Pattern matched against ``PULLREQUESTS.release_id`` with
            SQL ``LIKE``.

    Returns:
        The rows produced by ``cursor.fetchall()``. Column order is
        ``(pr_number, comments_url, commits_url, review_comments_url,
        merge_commit_sha)``. Rows whose ``mined1`` is NULL or false are
        included (``mined1 is not True``).

    Raises:
        Any exception raised by ``connectDB`` or by the query is propagated;
        the connection is closed in every case.
    """
    query = """select pr_number,COMMENTS_URL, commits_url, review_comments_url,merge_commit_sha
        from PULLREQUESTS 
        where release_id like %s and mined1 is not True;"""

    connection = connectDB()
    try:
        with connection.cursor() as cursor:
            cursor.execute(query, [release_id])
            return cursor.fetchall()
    finally:
        # Always release the connection, even when the query fails.
        connection.close()

## Aux Functions

In [None]:
def getCommits(commits_url,repo_name,pr_number):
    """Mine the commits of one pull request and store their count.

    The commits are fetched with ``requestAPI``; for each commit the author
    (when present) is stored with ``storeAuthor`` and the commit itself with
    ``saveCommit``. Afterwards ``PULLREQUESTS.commits`` is updated with the
    number of commits returned by the API (0 when the API returned nothing).

    Args:
        commits_url: API URL listing the commits of the pull request.
        repo_name: Project name, matched with SQL ``LIKE`` against
            ``PULLREQUESTS.project_name``.
        pr_number: Pull request number inside the project.

    Returns:
        True when both the mining and the count update succeeded, False when
        any step raised (the error is printed, not propagated).
    """
    try:
        commits = requestAPI(commits_url)
        qty_commits = 0
        if commits is not None:
            qty_commits = len(commits)
            for position, commit in enumerate(commits, start=1):
                author = commit['author']
                # 'author' is None/empty when the commit has no linked account.
                if author is not None and len(author) > 0:
                    storeAuthor(author)
                saveCommit(commit,repo_name,pr_number)
                print('Commit {}/{} - {}'.format(position,qty_commits,commits_url))
    except psycopg2.IntegrityError as e:
        print ("==============================================================")
        print ("Error while inserting into PostgreSQL. getCommits >>> Exception: {}".format(e)) 
        print('Project: {}    PR - {} '.format(repo_name, pr_number))
        return False
    except Exception as e:
        print ("==============================================================")
        print ("Error while processing getCommits >>> Exception: {}".format(e)) 
        print('Project: {}    PR - {}  '.format(repo_name, pr_number))
        return False

    # Bound before the try so the finally clause is safe even if connectDB()
    # itself fails.
    connection = None
    try:
        query = """UPDATE  PULLREQUESTS 
            set commits = %s
            WHERE project_name like %s and pr_number = %s;"""
        
        print('....Updating pullrequest: UPDATE  PULLREQUESTS set commits = {} WHERE project_name like {} and id = {};'.format(qty_commits,repo_name,pr_number))

        connection = connectDB()
        cursor = connection.cursor()
        cursor.execute(query, [qty_commits,repo_name,pr_number])
        connection.commit()
    except psycopg2.IntegrityError as e:
        print ("\n==============================================================")
        print ("Error while inserting into PostgreSQL. getCommits - PullRequests >>> Exception: {}".format(e)) 
        print('Project: {}    PR - {} '.format(repo_name, pr_number))
        return False
    except Exception as e:
        print ("\n==============================================================")
        print ("Error while processing getCommits - PullRequests >>> Exception: {}".format(e)) 
        print('Project: {}    PR - {}  '.format(repo_name, pr_number))
        return False
    finally:
        # Single place where the connection is released, on success and on
        # every failure path (previously the error paths leaked it).
        if connection is not None:
            connection.close()
    
    return True


In [None]:
def saveCommit(commit,repo_name,pr_number):
    """Store a commit (if new) and link it to a pull request.

    A row is inserted into ``commits`` only when no row with the same
    ``commit_sha`` exists yet; a row is always inserted into ``commit_PR``.

    Args:
        commit: Commit payload as a mapping. The keys read here are ``sha``,
            ``author`` (may be None/empty) and
            ``commit.committer.date`` / ``commit.message``.
        repo_name: Project name stored in ``commit_PR.project_name``.
        pr_number: Pull request number; stored as its string form.

    Raises:
        Any database error (for example a ``psycopg2.IntegrityError`` on a
        duplicate ``commit_PR`` row) is propagated to the caller; the
        connection is closed in every case.
    """
    connection = connectDB()
    try:
        cursor = connection.cursor()
        cursor.execute("""SELECT * FROM commits WHERE commit_sha like %s;""",
                       [commit['sha']])
        already_stored = cursor.fetchone() is not None
        connection.commit()

        if not already_stored:
            author = commit['author']
            if author is not None and len(author) > 0:
                author_id = author['id']
            else:
                author_id = None

            insert_commit = """INSERT INTO commits (commit_sha,commit_date,msg,author_id) 
        VALUES(%s,%s,%s,%s);"""
            cursor.execute(insert_commit, [commit['sha'],
                                           commit['commit']['committer']['date'],
                                           commit['commit']['message'],
                                           author_id])
            connection.commit()

        insert_link = """INSERT INTO commit_PR (pr_number,project_name,commit_sha) 
    VALUES(%s,%s,%s);"""
        cursor.execute(insert_link, [str(pr_number), repo_name, commit['sha']])
        connection.commit()
    finally:
        # Release the connection even when one of the statements fails.
        connection.close()

In [None]:
import psycopg2
import os
import requests 
import time
import pytz    
import calendar
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pandas as pd
from scipy import stats
import numpy as np
np.seterr(divide='ignore', invalid='ignore')
from bs4 import BeautifulSoup

In [None]:
def connectDB():
    """Open a new PostgreSQL connection for the mining database.

    The password is the first line of ``/home/psql_pwd.txt`` with newline
    characters removed.

    Returns:
        A new ``psycopg2`` connection; the caller is responsible for closing
        it.

    Raises:
        OSError: If the password file cannot be opened.
        Any exception raised by ``psycopg2.connect`` is propagated.
    """
    # The context manager closes the password file, which was previously
    # left open on every call.
    with open('/home/psql_pwd.txt', "r") as pwd_file:
        pwd = pwd_file.readline().replace('\n','')
    
    return psycopg2.connect(user = "ci_quality",
                              password = pwd,
                              host = "127.0.0.1",
                              port = "5432",
                              database = "Causal_CI_Quality_v4")

#### Step 1 - Select projects not mined yet.

In [None]:
def getProjects():
    """Return the projects whose commits or comments are not mined yet.

    Only projects with at least one analysed release, issue, PR and bug, and
    with ``analysis_point_days`` set, are selected (see the query).

    Returns:
        The rows produced by ``cursor.fetchall()``, ordered by ``repo_name``.
        Column order is ``(repo_name, repo_url, analysis_init,
        analysis_finish)``.

    Raises:
        Any exception raised by ``connectDB`` or by the query is propagated;
        the connection is closed in every case.
    """
    query = """select repo_name, repo_url,analysis_init,analysis_finish from projects P
        where (P.commits_mined is not true or P.comments_mined is not true)
            AND P.analysis_releases > 0 AND P.analysis_issues > 0 AND P.analysis_prs > 0
            and P.qty_bugs > 0 and P.analysis_point_days is not NULL
            order by repo_name;"""

    connection = connectDB()
    try:
        with connection.cursor() as cursor:
            cursor.execute(query)
            return cursor.fetchall()
    finally:
        # Always release the connection, even when the query fails.
        connection.close()

In [None]:
def getProjectsMergeCommits():
    """Return every project whose ``commits_mined2`` flag is true.

    Returns:
        The rows produced by ``cursor.fetchall()`` for ``SELECT *`` on
        ``PROJECTS``, ordered by ``repo_name``.

    Raises:
        Any exception raised by ``connectDB`` or by the query is propagated;
        the connection is closed in every case.
    """
    query = """SELECT * FROM PROJECTS WHERE commits_mined2 is True order by repo_name;"""

    connection = connectDB()
    try:
        with connection.cursor() as cursor:
            cursor.execute(query)
            return cursor.fetchall()
    finally:
        # Always release the connection, even when the query fails.
        connection.close()

In [None]:
def getPRsMerge(project_name):
    """Return the merged pull requests of a project.

    Args:
        project_name: Pattern matched against ``pullrequests.project_name``
            with SQL ``LIKE``.

    Returns:
        The rows produced by ``cursor.fetchall()``; column order is
        ``(merge_commit_sha, id_user)``. Only pull requests whose
        ``merge_commit_sha`` is not null are returned.

    Raises:
        Any exception raised by ``connectDB`` or by the query is propagated;
        the connection is closed in every case.
    """
    query = """select merge_commit_sha,id_user from pullrequests where merge_commit_sha is not null
    and project_name like %s"""

    connection = connectDB()
    try:
        with connection.cursor() as cursor:
            cursor.execute(query, [project_name])
            return cursor.fetchall()
    finally:
        # Always release the connection, even when the query fails.
        connection.close()

In [None]:
def recordMergeCommit(commit_sha):
    """Insert a commit SHA into the ``merge_commits`` table (best effort).

    Args:
        commit_sha: SHA of the merge commit to record.

    Returns:
        None. Any error is printed together with the SHA and swallowed, so a
        failed insert never interrupts the caller.
    """
    query = """insert into merge_commits (commit_sha) 
            values(%s)"""

    # Bound before the try so the finally clause is safe even if connectDB()
    # itself fails.
    connection = None
    try:
        connection = connectDB()
        cursor = connection.cursor()
        cursor.execute(query, [commit_sha])
        connection.commit()
    except Exception as e:
        print ("==============================================================")
        print ("Error while recordMergeCommit >>> Exception: {}".format(e)) 
        print('Commit: {}'.format(commit_sha))
    finally:
        # Previously the connection leaked whenever execute/commit raised.
        if connection is not None:
            connection.close()

In [None]:
def checkProjectMined(repo_name, mined):
    """Persist the mining outcome of a project (best effort).

    Sets both ``commits_mined`` and ``comments_mined`` on ``PROJECTS`` to
    ``mined`` for every row whose ``repo_name`` matches (SQL ``LIKE``).

    Args:
        repo_name: Project name pattern.
        mined: Value written to both flags.

    Returns:
        None. Any error is printed and swallowed.
    """
    query = """UPDATE  PROJECTS 
            set commits_mined = %s,
            comments_mined = %s
            WHERE repo_name like %s"""

    # Bound before the try so the finally clause is safe even if connectDB()
    # itself fails.
    connection = None
    try:
        connection = connectDB()
        cursor = connection.cursor()
        cursor.execute(query, [mined,mined,repo_name])
        connection.commit()
    except Exception as e:
        print ("==============================================================")
        print ("Error while checkProjectMined >>> Exception: {}".format(e)) 
        # Only the project is known here: the previous message also formatted
        # an undefined ``pr_number``, which made the handler itself raise.
        print('Project: {}'.format(repo_name))
    finally:
        if connection is not None:
            connection.close()

In [None]:
def checkPRMined(project_name,pr_number):
    """Flag a pull request as mined (best effort).

    Sets ``mined1 = True`` on ``PULLREQUESTS`` for the rows whose
    ``project_name`` matches (SQL ``LIKE``) and whose ``pr_number`` equals
    the given number.

    Args:
        project_name: Project name pattern.
        pr_number: Pull request number inside the project.

    Returns:
        None. Any error is printed and swallowed.
    """
    query = """UPDATE  PULLREQUESTS 
            set mined1 = True
            WHERE project_name like %s and pr_number = %s;"""

    # Bound before the try so the finally clause is safe even if connectDB()
    # itself fails.
    connection = None
    try:
        connection = connectDB()
        cursor = connection.cursor()
        # Use the parameter, not a module-level ``repo_name`` that merely
        # happens to exist while the driver loop is running.
        cursor.execute(query, [project_name,pr_number])
        connection.commit()
    except Exception as e:
        print ("==============================================================")
        print ("Error while checkPRMined >>> Exception: {}".format(e)) 
        print('Project: {}    PR - {}  '.format(project_name, pr_number))
    finally:
        if connection is not None:
            connection.close()

In [None]:
def storeAuthor(author):
    """Insert a user into ``gh_User`` unless a row with the same id exists.

    Args:
        author: User payload as a mapping. The keys read here are ``id``,
            ``events_url``, ``followers_url``, ``login``,
            ``organizations_url``, ``repos_url``, ``starred_url``,
            ``subscriptions_url`` and ``type``.

    Returns:
        None.

    Raises:
        Any database error, or a ``KeyError`` for a missing key, is
        propagated to the caller; the connection is closed in every case.
    """
    connection = connectDB()
    try:
        cursor = connection.cursor()
        # Parameterised lookup: the id comes from an external API payload and
        # must not be spliced into the SQL text.
        cursor.execute("""SELECT * FROM gh_User WHERE id = %s""", [author['id']])
        already_stored = cursor.fetchone() is not None
        connection.commit()
        if already_stored:
            # The finally clause below closes the connection; the previous
            # early return left it open.
            return

        insert_user = """INSERT INTO gh_User (id,events_url, followers_url, login, organizations_url, repos_url,starred_url, subscriptions_url,user_type) 
    VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
        cursor.execute(insert_user, [author['id'],
                                     author['events_url'],
                                     author['followers_url'],
                                     author['login'],
                                     author['organizations_url'],
                                     author['repos_url'],
                                     author['starred_url'],
                                     author['subscriptions_url'],
                                     author['type']])
        connection.commit()
    finally:
        connection.close()


In [None]:
def storeMessage(message,repo_name, pr_number):
    """Insert one PR comment into the ``messages`` table (best effort).

    Args:
        message: Comment payload as a mapping. The keys read here are
            ``user`` (may be None/empty), ``url``, ``html_url``, ``id``,
            ``node_id``, ``body`` and ``created_at``.
        repo_name: Project name stored in ``messages.project_name``.
        pr_number: Pull request number; stored as its string form.

    Returns:
        None. Any error is printed and swallowed so that one bad message
        does not stop the mining of the remaining ones.
    """
    query = """INSERT INTO messages (pr_number,project_name,url,html_url,id_message,node_id,body,user_id, created_at) 
        VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s)"""

    # Bound before the try so the finally clause is safe even if connectDB()
    # itself fails.
    connection = None
    try:
        user = message['user']
        if user is not None and len(user) > 0:
            user_id = user['id']
        else:
            user_id = None

        connection = connectDB()
        cursor_insert = connection.cursor()
        cursor_insert.execute(query, [str(pr_number), repo_name, message['url'],message['html_url'],message['id'],message['node_id'],message['body'],user_id,str(message['created_at'])])
        connection.commit()
    except Exception as e:
        # The error report must never raise itself, so the message id is
        # looked up defensively (the payload may be the thing that is broken).
        message_id = message.get('id') if isinstance(message, dict) else None
        print ("==============================================================")
        print ("Error while storeMessage >>> Exception: {}".format(e)) 
        print('Project: {}    PR - {}  Message: {} '.format(repo_name, pr_number,message_id))
    finally:
        # Previously the connection leaked whenever execute/commit raised.
        if connection is not None:
            connection.close()

In [None]:
def _mineMessages(url, label, repo_name, pr_number):
    """Fetch the messages behind ``url``, store them, and return their count.

    Returns the number of messages returned by the API (0 when it returned
    nothing), or None when an unexpected error occurred. An
    ``IntegrityError`` is reported but tolerated: the count obtained so far
    is still returned.
    """
    count = 0
    try:
        messages = requestAPI(url)
        if messages is not None:
            count = len(messages)
            print('....{}: {}'.format(label, count))

            for m in messages:
                if m['user'] is not None and len(m['user']) > 0:
                    storeAuthor(m['user'])
                storeMessage(m,repo_name, pr_number)
    except psycopg2.IntegrityError as e:
        # Deliberately not fatal: duplicates should not invalidate the PR.
        print ("==============================================================")
        print ("Error while inserting into PostgreSQL. setCommunication >>> Exception: {}".format(e)) 
        print('Project: {}    PR - {} '.format(repo_name, pr_number))
    except Exception as e:
        print ("==============================================================")
        print ("Error while processing setCommunication >>> Exception: {}".format(e)) 
        print('Project: {}    PR - {}  '.format(repo_name, pr_number))
        return None
    return count


def setCommunication(repo_name, pr_number, comments_url, review_comments_url):
    """Mine the comments and review comments of one pull request.

    Every message is stored with ``storeMessage`` (and its author with
    ``storeAuthor``); afterwards ``PULLREQUESTS."Comments"`` and
    ``PULLREQUESTS.review_comments`` are updated with the two counts.

    Args:
        repo_name: Project name, matched with SQL ``LIKE`` against
            ``PULLREQUESTS.project_name``.
        pr_number: Pull request number inside the project.
        comments_url: API URL listing the PR comments.
        review_comments_url: API URL listing the PR review comments.

    Returns:
        True when mining and the count update succeeded, False otherwise
        (errors are printed, not propagated).
    """
    print('+ Project: {}    PR - {}'.format(repo_name, pr_number))

    qtycomments = _mineMessages(comments_url, 'Comments', repo_name, pr_number)
    if qtycomments is None:
        return False

    qtyrevcomments = _mineMessages(review_comments_url, 'Review Comments', repo_name, pr_number)
    if qtyrevcomments is None:
        return False

    # Bound before the try so the finally clause is safe even if connectDB()
    # itself fails.
    connection = None
    try:
        query = """UPDATE  PULLREQUESTS 
            set "Comments" = %s,
            review_comments = %s
            WHERE project_name like %s and pr_number = %s;"""
        
        print('....Updating pullrequest: UPDATE  PULLREQUESTS set "Comments" = {}, review_comments = {} WHERE project_name like {} and id = {};'.format(qtycomments, qtyrevcomments,repo_name,pr_number))

        connection = connectDB()
        cursor = connection.cursor()
        cursor.execute(query, [qtycomments, qtyrevcomments,repo_name,pr_number])
        connection.commit()
    except psycopg2.IntegrityError as e:
        print ("==============================================================")
        print ("Error while inserting into PostgreSQL. setCommunication >>> Exception: {}".format(e)) 
        print('Project: {}    PR - {} '.format(repo_name, pr_number))
        return False
    except Exception as e:
        print ("==============================================================")
        print ("Error while processing setCommunication >>> Exception: {}".format(e)) 
        print('Project: {}    PR - {}  '.format(repo_name, pr_number))
        return False
    finally:
        # Single place where the connection is released; the error paths
        # previously leaked it (one of them even opened a second one).
        if connection is not None:
            connection.close()
    
    return True
        

In [None]:
def loadTokens(path='/home/gh_tokens.txt'):
    """Read API tokens from a text file, one token per line.

    Reading stops at the first empty line or at end of file, whichever comes
    first, so tokens placed after a blank line are ignored. Only the newline
    character is stripped from each line.

    Args:
        path: Location of the token file. Defaults to the path the notebook
            has always used, so existing ``loadTokens()`` calls are unchanged.

    Returns:
        A list with the tokens in file order (empty if the file is empty or
        starts with a blank line).

    Raises:
        OSError: If the file cannot be opened.
    """
    tokens = []
    # The context manager guarantees the file is closed even if reading fails.
    with open(path, "r") as token_file:
        for line in token_file:
            token = line.replace('\n','')
            if token == '':
                break
            tokens.append(token)
    return tokens

In [None]:
# Tokens are loaded once, when this cell runs; each entry is sent verbatim as
# the value of the ``Authorization`` header.
tokens = loadTokens()
# Index into ``tokens`` of the token used by the most recent request.
i_token = 0


def requestAPI(URL):
    """GET a JSON document from the API, rotating through the loaded tokens.

    Every attempt advances ``i_token`` to the next token (wrapping around).
    When the response is an error object whose message contains
    ``'API rate limit exceeded'`` the function sleeps for 600 seconds and
    retries until a different answer is received.

    Args:
        URL: Endpoint to request.

    Returns:
        The decoded JSON payload, or None when the payload is empty or is an
        error object (a mapping with a ``message`` key) other than the
        rate-limit error.

    Raises:
        Exceptions raised by ``requests.get`` or by JSON decoding are
        propagated; an ``IndexError`` is raised if no tokens were loaded.
    """
    global i_token

    # A loop replaces the previous recursive retry, whose result was dropped
    # so the rate-limit error object itself ended up being returned.
    while True:
        # Move to the next token, wrapping back to the first one.
        if (len(tokens)-1) > i_token:
            i_token += 1
        else:
            i_token = 0

        response = requests.get(url = URL,
                                headers={'Authorization': tokens[i_token],
                                         'Accept':'application/vnd.github.cloak-preview'})
        payload = response.json()

        if not payload:
            return None

        if isinstance(payload, dict) and 'message' in payload:
            message = str(payload['message'])
            print('{} -- {}'.format(datetime.now().strftime("%H:%M:%S"),message))
            if 'API rate limit exceeded' in message:
                time.sleep(600)
                continue
            return None

        return payload