In [1]:
#Install packages
!pip install pydriller



In [2]:
#Import Repository
from pydriller import Repository
from datetime import datetime

In [3]:
# Read the repository URLs from repo.txt, one URL per line.
# Surrounding whitespace is stripped and blank lines are skipped, so a trailing
# empty line in the file does not produce an empty URL downstream.
with open('repo.txt') as f:
    url = [line.strip() for line in f if line.strip()]

In [4]:
#Extract project name from project url
def get_project_name(repo_url):
    """Return the project name encoded in a repository URL.

    The name is the last path component of the URL, without a trailing
    ``.git`` suffix. URLs that do not end in ``.git`` (or that end with a
    slash) are handled as well, instead of blindly dropping the last four
    characters.

    Args:
        repo_url: Repository URL, e.g. ``https://host/owner/project.git``.

    Returns:
        The project name as a string, e.g. ``"project"``.
    """
    # Drop surrounding whitespace and trailing slashes before inspecting the suffix.
    trimmed = repo_url.strip().rstrip("/")
    if trimmed.endswith(".git"):
        trimmed = trimmed[:-4]
    return os.path.basename(trimmed)

In [5]:
import subprocess
import os
import shutil

def extract_commits(repo_url, clean_up = True):
    """Collect the lines added to Java files by a repository's commits since 2021-01-01.

    The repository is cloned with ``git clone`` into a directory named after
    the project (relative to the current working directory) unless that
    directory already exists. The local clone is then traversed with PyDriller.

    Args:
        repo_url: Clone URL of the repository.
        clean_up: When True (default), delete the local project directory once
            the analysis ends, including when the analysis raises.

    Returns:
        A dict mapping each commit hash (str) to a dict that maps the file name
        of every modified ``.java`` file to its ``diff_parsed['added']`` value
        as produced by PyDriller. Files without added lines and commits without
        any such file are omitted.

    Raises:
        subprocess.CalledProcessError: if ``git clone`` exits with a non-zero status.
    """
    start_time = datetime.now()
    project_info = dict()
    project_name = get_project_name(repo_url)
    #Clone repo if needed
    print("Cloning repository {}".format(project_name))
    if not os.path.isdir(project_name):
        # Clone explicitly into project_name so the directory that is checked,
        # analysed and removed is always the same one. check=True makes a failed
        # clone raise here instead of being silently ignored.
        subprocess.run(["git", "clone", repo_url, project_name], check=True)

    try:
        print("Creating repository object {}".format(project_name))
        # Analyse the local clone: handing PyDriller the remote URL would make it
        # clone the repository a second time into a temporary directory.
        # The file-type filter expects a list of extensions, not a bare string.
        repo = Repository(project_name,
                          only_modifications_with_file_types = [".java"],
                          since=datetime(2021, 1, 1))

        print("Start commit analysis")
        for commit in repo.traverse_commits():
            commit_diff = dict()
            for modified_file in commit.modified_files:
                if not modified_file.filename.endswith(".java"):
                    continue
                # Read diff_parsed once; it was previously evaluated twice per file.
                added_lines = modified_file.diff_parsed['added']
                #No added modifications
                if not added_lines:
                    continue

                #Store file modifications
                # NOTE(review): keyed by bare file name, so two files with the same
                # name in different directories of one commit overwrite each other.
                commit_diff[modified_file.filename] = added_lines
            #Store commit modifications
            if commit_diff:
                project_info[str(commit.hash)] = commit_diff
    finally:
        # Remove the clone even when the analysis fails part-way through.
        if clean_up:
            shutil.rmtree(project_name)
    print('Time elapsed (hh:mm:ss.ms) {}'.format(datetime.now() - start_time))
    return project_info


        

In [6]:
def extract_projects(url):
    """Analyse every repository in ``url`` and group the results by project name.

    Args:
        url: Iterable of repository URLs.

    Returns:
        A dict mapping each project name (from ``get_project_name``) to the
        result of ``extract_commits`` for that repository.
    """
    results_by_project = {}
    for clone_url in url:
        name = get_project_name(clone_url)
        results_by_project[name] = extract_commits(clone_url)
        # Progress message printed once a repository has been fully processed.
        print("{} analysis complete".format(name))
    return results_by_project

In [7]:
import json
# Analyse every repository listed in `url` and write the result to data.json.
p = extract_projects(url)
file = 'data.json'
# Opening with mode "w" truncates an existing file, so no explicit os.remove is
# needed; the previous unconditional os.remove raised FileNotFoundError when
# data.json did not exist yet (e.g. on the first run).
with open(file, "w") as f:
    json.dump(p, f, indent = 6)


Cloning repository Java


Cloning into 'Java'...


Creating repository object Java
Start commit analysis
Time elapsed (hh:mm:ss.ms) 0:00:02.313963
Java analysis complete
Cloning repository elasticsearch


Cloning into 'elasticsearch'...
Updating files: 100% (22750/22750), done.


Creating repository object elasticsearch
Start commit analysis
Time elapsed (hh:mm:ss.ms) 0:08:36.028335
elasticsearch analysis complete


# Pipeline 3.1 du paper
## Obtenir les repos
* Sortir une liste des repos Git

## PyDriller
* Cloner
* Extraire un certain nombre de commits
* On veut la structure suivante: Repo -> commit -> file
* Utiliser TestFileDetector pour valider que le fichier est un test
* Associer le fichier de test au fichier de production avec TestFileMapping
* Conserver les changements dans un JSON