In [1]:
import requests
import json
import csv
import pandas as pd

from global_paths import *

# Read both API tokens from the files named by global_paths; the raw file
# contents are whitespace-trimmed after each file has been closed.
with open(GH_ACCESS_TOKEN, mode="r") as file:
    gh_access_token = file.read()
gh_access_token = gh_access_token.strip()

with open(SG_ACCESS_TOKEN, mode="r") as file:
    sg_access_token = file.read()
sg_access_token = sg_access_token.strip()

# Table of repositories that the search loops below iterate over.
df = pd.read_csv(DATETIME_REPOS_PATH)

In [2]:
# GraphQL document sent to the GitHub API by search_issues.
# Variables: $q (search string, required) and $cursor (pagination cursor, optional).
# It requests the rateLimit counters (remaining, cost, used) and one search page
# of up to 100 results with pageInfo (hasNextPage, endCursor); for Issue nodes it
# selects title, bodyHTML, url, activeLockReason and the names of up to 100 labels.
gh_query = """
query($q: String!, $cursor: String) {
  rateLimit {
    remaining
    cost
    used
  }
  search(query:$q, type: ISSUE, first: 100, after:$cursor) {
		pageInfo {
      hasNextPage
      endCursor
    }
    nodes {
      ... on Issue {
        
        title
        bodyHTML
        url
        activeLockReason
        
        
        labels (first:100) {
          nodes {
            name
          }
        }
        
      }
    }
  }
}
"""

# GraphQL document sent to the Sourcegraph API by search_diffs.
# Variable: $q (search string), evaluated with patternType keyword.
# It requests the overall matchCount and, for CommitSearchResult entries,
# the url, the label text and the diffPreview value.
sg_query = """
query ($q: String) {
  search(patternType:keyword, query: $q) {
    results {
      matchCount
      results {
        ... on CommitSearchResult {
          url
          label {
            text
          }
          diffPreview {
            value
          }
        }
      }
    }
  }
}
"""


In [7]:
# (Re)create the issues CSV with only its header row. Mode "w" truncates the
# file, so running this cell discards any rows collected earlier.
with open(ISSUES_PATH, mode="w") as file:
  row = [
    "repoName",
    "title",
    "bodyHtml",
    "url",
    "lockReason",
    "labels",
  ]
  writer = csv.writer(file, lineterminator="\n")
  writer.writerow(row)

In [16]:
import time

# GitHub GraphQL endpoint and bearer-token auth header for the issue search.
# Both names are assigned again further down with the Sourcegraph values.
headers = dict(Authorization=f"Bearer {gh_access_token}")
url = "https://api.github.com/graphql"

def search_issues(nameWithOwner):
  """Append the closed issues of one repository matching "datetime" AND "bug" to ISSUES_PATH.

  Pages through GitHub's GraphQL search (up to 100 results per request),
  following `endCursor` until `hasNextPage` is false. Each issue becomes one
  CSV row: repo name, title, a placeholder instead of the HTML body, URL,
  lock reason and the list of label names. One progress line is printed per
  request.

  Args:
    nameWithOwner: Repository in "owner/name" form; interpolated into the
      `repo:` qualifier of the search string.

  Raises:
    RuntimeError: If GitHub reports errors other than RATE_LIMITED, or the
      response carries no "data" object.
  """
  # Bind endpoint and credentials locally: the notebook-level `url` and
  # `headers` names are rebound for Sourcegraph in a later cell, so reading
  # them here would make the target API depend on cell execution order.
  endpoint = "https://api.github.com/graphql"
  auth_headers = {"Authorization": f"Bearer {gh_access_token}"}

  q = f"repo:{nameWithOwner} is:issue is:closed \"datetime\" AND \"bug\""
  cursor = None
  count = 0
  while True:
    # Named `payload` so the imported `json` module is not shadowed.
    payload = {"query": gh_query, "variables": {"q": q, "cursor": cursor}}
    response = requests.post(endpoint, json=payload, headers=auth_headers).json()

    errors = response.get("errors")
    if errors:
      # Error entries are not guaranteed to carry a "type" key, hence .get().
      if any(error.get("type") == "RATE_LIMITED" for error in errors):
        time.sleep(120)  # back off, then retry the same page
        continue
      # Fail loudly with the server's message instead of exit(0), which
      # would report success and throw the error details away.
      raise RuntimeError(f"GitHub GraphQL query failed for {nameWithOwner}: {errors}")

    data = response.get("data")
    if not data:
      raise RuntimeError(f"GitHub returned no data for {nameWithOwner}: {response}")

    rateLimit = data["rateLimit"]
    pageInfo = data["search"]["pageInfo"]

    # newline="" is what the csv module expects of its file objects, and
    # UTF-8 keeps arbitrary issue titles writable on any platform.
    with open(ISSUES_PATH, "a", newline="", encoding="utf-8") as file:
      writer = csv.writer(file, lineterminator="\n")

      for issue in data["search"]["nodes"]:
        if not issue:
          # Nodes that did not match the `... on Issue` fragment carry no fields.
          continue
        labels = [label["name"] for label in issue["labels"]["nodes"]]
        writer.writerow([
          nameWithOwner,
          issue["title"],
          "<html redacted>",  # placeholder for issue["bodyHTML"]
          issue["url"],
          issue["activeLockReason"],
          labels,
        ])

    print(f"Requests: {count}, endCursor: {cursor}, remaining: {rateLimit['remaining']}")
    count += 1
    cursor = pageInfo["endCursor"]
    if not pageInfo["hasNextPage"]:
      break

# Run the GitHub issue search once per row of `df`, building the
# "owner/name" identifier from the row's "owner" and "name" columns.
for index, row in df.iterrows():
  nameWithOwner = "/".join((row["owner"], row["name"]))
  search_issues(nameWithOwner)


RATE_LIMITED
{'errors': [{'type': 'RATE_LIMITED', 'message': 'API rate limit exceeded for user ID 92131019.'}]}


KeyError: 'rateLimit'

In [10]:
# Spot-check: run the issue search for a single repository.
search_issues("yeti-platform/yeti")

Requests: 0, endCursor: None, remaining: 154


In [5]:
# Rebinds `df` to the contents of the issues CSV.
# NOTE(review): the loops in this notebook iterate over `df`; the header written
# for ISSUES_PATH has a "repoName" column, while the diff-search loop reads
# row["nameWithOwner"] — confirm which table that loop is meant to walk.
df = pd.read_csv(ISSUES_PATH)

In [44]:
# (Re)create the diffs CSV with only its header row. Mode "w" truncates the
# file, so running this cell discards any rows collected earlier.
with open(DIFFS_PATH, mode="w") as file:
  row = [
    "repoName",
    "label",
    "url",
    "diffPreview",
  ]
  writer = csv.writer(file, lineterminator="\n")
  writer.writerow(row)

In [45]:
# Sourcegraph GraphQL endpoint and token auth header for the diff search.
# These reuse the `url` / `headers` names assigned earlier for GitHub.
headers = dict(Authorization=f"token {sg_access_token}")
url = "https://sourcegraph.com/.api/graphql"

def search_diffs(nameWithOwner):
  """Append Sourcegraph diff-search hits for one GitHub repository to DIFFS_PATH.

  Sends a single `type:diff message:bug datetime` search restricted to the
  given repository and appends one CSV row per returned result: repo name,
  label text, URL and a placeholder instead of the diff preview.

  Args:
    nameWithOwner: Repository in "owner/name" form; embedded in the regex
      `repo:` filter of the search string.

  Raises:
    RuntimeError: If the response has no usable "data"/"search" object.
  """
  # Bind endpoint and credentials locally so the function does not depend on
  # which cell last assigned the notebook-level `url` / `headers` names.
  endpoint = "https://sourcegraph.com/.api/graphql"
  auth_headers = {"Authorization": f"token {sg_access_token}"}

  # The repo: filter is a regular expression; escape dots in the repository
  # name the same way the query already escapes the one in "github.com".
  repo_pattern = nameWithOwner.replace(".", "\\.")
  q = f"context:global repo:^github\\.com/{repo_pattern}$ type:diff message:bug datetime"

  # Named `payload` so the imported `json` module is not shadowed.
  payload = {"query": sg_query, "variables": {"q": q}}
  response = requests.post(endpoint, json=payload, headers=auth_headers).json()

  data = response.get("data")
  search = data.get("search") if data else None
  if not search:
    # Surface the server's explanation instead of an opaque KeyError/TypeError.
    raise RuntimeError(
      f"Sourcegraph search failed for {nameWithOwner}: {response.get('errors', response)}"
    )

  diffs = search["results"]["results"]
  # newline="" is what the csv module expects of its file objects, and UTF-8
  # keeps arbitrary label text writable on any platform.
  with open(DIFFS_PATH, "a", newline="", encoding="utf-8") as file:
    writer = csv.writer(file, lineterminator="\n")
    for diff in diffs:
      if not diff:
        # Results that did not match `... on CommitSearchResult` carry no fields.
        continue
      writer.writerow([
        nameWithOwner,
        diff["label"]["text"],
        diff["url"],
        "<diff redacted>",  # placeholder for diff["diffPreview"]["value"]
      ])

# Run the Sourcegraph diff search once per row of `df`, taking the repository
# identifier from the row's "nameWithOwner" column.
# NOTE(review): `df` is whatever table was loaded last — confirm it actually
# has a "nameWithOwner" column before running this cell.
for index, row in df.iterrows():
  nameWithOwner = row.loc["nameWithOwner"]
  search_diffs(nameWithOwner)
