# Data Collection

## GitHub credentials

A private access token is necessary to make use of less restrictive API limits.

In [None]:
from github import RateLimitExceededException, Github

import os

# Providing access token: either paste it below, or leave the placeholder
# untouched and export it as the GITHUB_TOKEN environment variable so the
# secret is never stored inside the notebook.
access_token = "< YOUR PRIVATE ACCESS TOKEN >"
if access_token.startswith("<"):
    # Placeholder still in place: fall back to the environment, keeping the
    # placeholder itself when the variable is not set.
    access_token = os.environ.get("GITHUB_TOKEN", access_token)
g = Github(login_or_token=access_token)

# Confirm your login is successful
user = g.get_user()
print(f"Authenticated as: {user.login}")

## Data files

Path to output files.

In [None]:
import os
# Directory holding every intermediate and final output file of the pipeline.
DATA_DIR = "data"
# Create it up front so that opening any of the files below for appending or
# writing cannot fail because the directory is missing.
os.makedirs(DATA_DIR, exist_ok=True)

# Step 1: clone URLs of every repository whose language is HCL
STEP1_HCLREPOS = os.path.join(DATA_DIR, "step1-hcl-repositories.txt")
# Step 2: repositories containing Terraform files / repositories that failed
STEP2_TFREPOS = os.path.join(DATA_DIR, "step2-tf-repositories.txt")
STEP2_404REPOS = os.path.join(DATA_DIR, "step2-404-repositories.txt")
# Step 3: commits matching the keywords / repositories that raised errors
STEP3_KWCOMMITS = os.path.join(DATA_DIR, "step3-keyword-commits.json")
STEP3_ERRORREPOS = os.path.join(DATA_DIR, "step3-error-repositories.txt")
# Step 4: keyword commits that modify Terraform files
STEP4_TFCOMMITS = os.path.join(DATA_DIR, "step4-tf-commits.json")

## Step 1 - Recover GitHub repositories containing HCL IaC

For each day from 2014 through 2022, query the GitHub search API for repositories that use HCL as their language.
Dates that do not exist (e.g., February 31) are skipped, and any exception raised by a query is caught to avoid interruptions.

Every repository is saved in '`data/step1-hcl-repositories.txt`' so no progress is lost in case of interruptions.

In [None]:
import calendar
import time

# Seconds to wait before retrying a query that hit the API rate limit.
RATE_LIMIT_WAIT_SECONDS = 60
# Maximum number of attempts per query before giving up on it.
MAX_QUERY_ATTEMPTS = 5

script_urls = []
# URLs already written during this run; prevents duplicates when a day has
# to be queried again after a rate-limit interruption.
saved_urls = set()
for year in range(2014, 2023):
    for month in range(1, 13):
        print(f"Scraping month {month} of year {year}")
        # Only iterate over days that exist in this month, so no request is
        # spent on impossible dates such as 2022-02-31.
        days_in_month = calendar.monthrange(year, month)[1]
        for day in range(1, days_in_month + 1):
            # Formatting compatible with search parameters
            date = f"{year}-{month:02d}-{day:02d}"
            for attempt in range(1, MAX_QUERY_ATTEMPTS + 1):
                try:
                    time.sleep(2)  # sleep to reset API search limit
                    repos = g.search_repositories(query=f"created:{date} language:HCL")
                    for repo in repos:
                        time.sleep(0.2)  # sleep to reset API core limit
                        clone_url = repo.clone_url
                        if clone_url in saved_urls:
                            continue
                        # URLs are added to a txt file to avoid data loss
                        with open(STEP1_HCLREPOS, "a") as file:
                            file.write(f"{clone_url}\n")
                        saved_urls.add(clone_url)
                        script_urls.append(clone_url)
                    break  # this date was fully processed
                except RateLimitExceededException:
                    print("Rate Limit Exception reached!")
                    if attempt < MAX_QUERY_ATTEMPTS:
                        # Wait and query the same date again instead of
                        # silently losing its repositories.
                        time.sleep(RATE_LIMIT_WAIT_SECONDS)
                    else:
                        print(f"Skipping: {date}")
                except Exception as e:
                    # Any other failure is reported and the date is skipped
                    # so that the scan keeps going.
                    print(e)
                    print(f"Skipping: {date}")
                    break

In [None]:
# Report how many HCL repository URLs were collected in Step 1
hcl_repo_total = len(script_urls)
print(hcl_repo_total)

## Step 2 - Filter repositories with Terraform files

Read the repositories from the previous step.

In [None]:
# Read the clone URLs saved by Step 1, one per line. The context manager
# closes the file as soon as reading is done.
with open(STEP1_HCLREPOS, "r") as gitUrls_file:
    # Strip the trailing '\n' and drop blank lines, which are not URLs.
    repo_links = [line.strip() for line in gitUrls_file if line.strip()]

Scan the content of each repository looking for files with extension '`.tf`' and '`.tf.json`' (i.e., Terraform artifact files).

Suitable repositories are saved in '`data/step2-tf-repositories.txt`'.

Repositories that are not reachable for any reason are saved in '`data/step2-404-repositories.txt`'.

In [None]:
from collections import deque

# Seconds to wait before retrying a repository that hit the API rate limit.
RATE_LIMIT_WAIT_SECONDS = 60
# Maximum number of attempts per repository before giving up on it.
MAX_QUERY_ATTEMPTS = 5

counter = 0
terraform_keywords = ['.tf', '.tf.json']
# str.endswith() accepts a tuple: a file qualifies only when its name really
# ends with one of the extensions ('.tfvars' or '.tfstate' do not).
terraform_suffixes = tuple(terraform_keywords)
terraform_relevant_repos = []
# enumerate() advances the progress counter for every repository, including
# those that end up in the error file.
for counter, repo_url in enumerate(repo_links):
    if counter % 100 == 0:
        print(f'Got to {counter}')
    for attempt in range(1, MAX_QUERY_ATTEMPTS + 1):
        try:
            time.sleep(2)  # sleep for API search limit
            # "https://github.com/<owner>/<name>.git" -> "<owner>/<name>"
            split_list = repo_url.split("/")
            full_name = split_list[3] + '/' + split_list[4]
            # Remove only a trailing '.git' so names that contain '.git'
            # elsewhere (e.g. 'user.github.io') are left intact.
            if full_name.endswith('.git'):
                full_name = full_name[:-len('.git')]
            repo = g.get_repo(full_name)
            # Breadth-first walk over the repository tree; deque gives O(1)
            # removal from the front of the queue.
            contents = deque(repo.get_contents(''))
            while contents:
                time.sleep(0.2)  # sleep for API core limit
                file_content = contents.popleft()
                if file_content.type == "dir":
                    contents.extend(repo.get_contents(file_content.path))
                elif (file_content.name is not None
                        and file_content.name.lower().endswith(terraform_suffixes)):
                    terraform_relevant_repos.append(repo_url)
                    with open(STEP2_TFREPOS, "a") as file:
                        file.write(f"{repo_url}\n")
                    # One Terraform file is enough to keep the repository.
                    break
            break  # repository fully processed
        except RateLimitExceededException:
            print("Rate Limit Exception reached!")
            if attempt < MAX_QUERY_ATTEMPTS:
                # Wait and scan the same repository again instead of
                # dropping it without any record.
                time.sleep(RATE_LIMIT_WAIT_SECONDS)
            else:
                # Still rate limited: record the repository as unreachable.
                with open(STEP2_404REPOS, "a") as file:
                    file.write(f"{repo_url}\n")
        except Exception as e:
            print(f"{e}\n{repo_url}")
            with open(STEP2_404REPOS, "a") as file:
                file.write(f"{repo_url}\n")
            break

## Step 3 - Extract commits with cost-related keywords

Read the previously filtered Terraform repositories. 
Then define the keyword list used to filter commit messages.

In [None]:
# Read the Terraform repository URLs saved by Step 2, one per line. The
# context manager closes the file as soon as reading is done.
with open(STEP2_TFREPOS, "r") as all_repos:
    # Strip the trailing '\n' and drop blank lines, which are not URLs.
    repo_links = [line.strip() for line in all_repos if line.strip()]
# Lower-case cost-related keywords and word stems
cost_keywords = ["cheap", "expens", "cost", "efficient", "bill", "pay"]

Using PyDriller, traverse the commits of each repository.
For every commit containing one or more keywords in its message, extract **commit id**, **message**, **date** and **list of modified files**.

The final list of extracted commits is saved as JSON in '`data/step3-keyword-commits.json`'.

If an error occurs while trying to access a commit, the repository URL is saved in '`data/step3-error-repositories.txt`'.

In [None]:
import json
from pydriller import Repository

relevant_repos = []
count = 0
for repo in repo_links:
    # Progress report every 100 repositories
    if count % 100 == 0:
        print("Got to {}".format(count))

    commits = []
    try:
        # Walk the full history of the repository
        for commit in Repository(repo).traverse_commits():
            message = commit.msg
            if message is None:
                continue
            # Keep the commit only if its message mentions a cost keyword
            lowered = message.lower()
            if not any(keyword in lowered for keyword in cost_keywords):
                continue
            # Names of the files touched by the commit
            touched_files = [changed.filename for changed in commit.modified_files]
            commits.append({
                "id": commit.hash,
                "msg": message,
                "date": str(commit.author_date),
                "modified_files": touched_files,
            })

        # A repository is relevant when at least one commit matched
        if commits:
            relevant_repos.append({"name": repo, "commits": commits})
    except Exception as e:
        # Record the failing repository so the errors met while accessing
        # commits are documented
        print(f"{e}\n{repo}")
        with open(STEP3_ERRORREPOS, "a") as file:
            file.write(f"{repo}\n")
    count += 1

# Persist every relevant repository together with its matching commits
output = {"no_of_repos": len(relevant_repos), "repositories": relevant_repos}
with open(STEP3_KWCOMMITS, "w") as outfile:
    json.dump(output, outfile)

## Step 4 - Filter commits that modify Terraform files 

Refines the previous JSON file so that only commits that modify '`.tf`' and '`.tf.json`' files are taken into consideration.

The final list of filtered commits is saved as JSON in '`data/step4-tf-commits.json`'.

In [None]:
# Load the keyword-matching commits produced by Step 3; the context manager
# closes the file once parsing is done.
with open(STEP3_KWCOMMITS) as terraform_output:
    selected_repos = json.load(terraform_output)

filtered_repos = []
terraform_keywords = ['.tf', '.tf.json']
# str.endswith() accepts a tuple: a file qualifies only when its name really
# ends with one of the extensions ('.tfvars' or '.tfstate' do not).
terraform_suffixes = tuple(terraform_keywords)
print(len(selected_repos["repositories"]))

for repo in selected_repos["repositories"]:
    # Keep the commits that modify at least one Terraform file. File names
    # are lower-cased so the check is case-insensitive.
    relevant_commits = [
        commit
        for commit in repo["commits"]
        if any(
            mod_file is not None and mod_file.lower().endswith(terraform_suffixes)
            for mod_file in commit["modified_files"]
        )
    ]

    # Repositories left without any commit are dropped from the output
    if relevant_commits:
        repo["commits"] = relevant_commits
        filtered_repos.append(repo)

print(f"Identified {len(filtered_repos)}")

output = {"no_of_repos": len(filtered_repos), "repositories": filtered_repos}
with open(STEP4_TFCOMMITS, "w") as outfile:
    json.dump(output, outfile)