<a href="https://colab.research.google.com/github/atlantiquesun/Stock_ML/blob/main/History.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import requests
import time
import datetime

In [8]:

def generateForkQuery(owner, name, endCursor=None):
  '''
  Build the GraphQL query for one page (up to 100 entries) of a repository's forks.

  owner: login of the account that owns the repository
  name: name of the repository
  endCursor: `pageInfo.endCursor` of the previous page; None requests the first page

  Returns the query text. Each returned node carries `createdAt`.
  '''
  # The first and the following pages differ only in the optional `after`
  # argument, so it is built once instead of keeping two query copies in sync.
  after = "" if endCursor is None else ', after:"%s"' % endCursor
  return """
    {
      repository(owner:"%s", name:"%s") {
        forks(first: 100%s) {
          pageInfo {
            endCursor
            hasNextPage
          }
          nodes {
            createdAt
          }
        }
      }
    }
  """ % (owner, name, after)


def generateStarQuery(owner, name, endCursor=None):
  '''
  Build the GraphQL query for one page (up to 100 entries) of a repository's stargazers.

  owner: login of the account that owns the repository
  name: name of the repository
  endCursor: `pageInfo.endCursor` of the previous page; None requests the first page

  Returns the query text. Results come back as `edges`, each carrying `starredAt`.
  '''
  # Only the optional `after` argument differs between the first and later
  # pages, so a single template serves both.
  after = "" if endCursor is None else ', after:"%s"' % endCursor
  return """
    {
      repository(owner:"%s", name:"%s") {
        stargazers(first: 100%s) {
          pageInfo {
            endCursor
            hasNextPage
          }
          edges {
            starredAt
          }
        }
      }
    }
  """ % (owner, name, after)


def generateIssueQuery(owner, name, endCursor=None):
  '''
  Build the GraphQL query for one page (up to 100 entries) of a repository's issues.

  owner: login of the account that owns the repository
  name: name of the repository
  endCursor: `pageInfo.endCursor` of the previous page; None requests the first page

  Returns the query text. Each returned node carries `closedAt` and `createdAt`.
  '''
  # Only the optional `after` argument differs between the first and later
  # pages, so a single template serves both.
  after = "" if endCursor is None else ', after:"%s"' % endCursor
  return """
    {
      repository(owner: "%s", name: "%s") {
        issues(first: 100%s) {
          pageInfo {
            endCursor
            hasNextPage
          }
          nodes {
            closedAt
            createdAt
          }
        }
      }
    }
  """ % (owner, name, after)


def getOID(owner, name, headers):
  '''
  Look up the object ID of the commit at the tip of a repository's default branch.

  owner: login of the account that owns the repository
  name: name of the repository
  headers: HTTP headers sent with the request

  Returns the `oid` string; it is the starting point for paging through commit history.
  '''
  gql = """
    {
      repository(owner: "%s", name: "%s") {
            defaultBranchRef {
              target {
                ... on Commit {
                  oid
                  committedDate
                  history {
                    totalCount
                  }
                }
              }
            }
          }
      }
  """%(owner, name)
  reply = requests.post('https://api.github.com/graphql', json={'query': gql}, headers=headers)
  # Walk down to the commit object the default branch points at.
  tipCommit = reply.json()['data']['repository']['defaultBranchRef']['target']
  return tipCommit['oid']


def generateCommitQuery(owner, name, oid, endCursor=None):
  '''
  Build the GraphQL query for one page (up to 100 entries) of commit history.

  owner: login of the account that owns the repository
  name: name of the repository
  oid: object ID of the commit whose history is walked
  endCursor: `pageInfo.endCursor` of the previous page; None requests the first page

  Returns the query text. Each returned node carries `committedDate`.
  '''
  # Only the optional `after` argument differs between the first and later
  # pages, so a single template serves both.
  after = "" if endCursor is None else ', after: "%s"' % endCursor
  return """
    {
      repository(owner: "%s", name: "%s") {
        object(oid: "%s") {
          ... on Commit {
            history(first: 100%s) {
              totalCount
              pageInfo {
                endCursor
                hasNextPage
              }
              nodes {
                committedDate
              }
            }
          }
        }
      }
    }
  """ % (owner, name, oid, after)

def generatePRQuery(owner, name, endCursor=None):
  '''
  Build the GraphQL query for one page (up to 100 entries) of a repository's pull requests.

  owner: login of the account that owns the repository
  name: name of the repository
  endCursor: `pageInfo.endCursor` of the previous page; None requests the first page

  Returns the query text. Each returned node carries `createdAt`, `mergedAt` and `closedAt`.
  '''
  # Only the optional `after` argument differs between the first and later
  # pages, so a single template serves both.
  after = "" if endCursor is None else ', after:"%s"' % endCursor
  return """
    {
      repository(owner:"%s", name:"%s") {
        pullRequests(first: 100%s){
          pageInfo{
            hasNextPage
            endCursor
          }
          nodes{
            createdAt
            mergedAt
            closedAt
          }
        }
      }
    }
  """ % (owner, name, after)

In [9]:
def processNode(x, category, closed=False, merged=False):
  '''
  Pick the timestamp of a record that matches the request and return its
  date part (first 10 characters, i.e. year-month-day).

  x: one record of the API response (a mapping of timestamp fields)
  category: "star", "fork", "commit", "issue" or "pullRequest"
  closed: for issues / pull requests, read the closing date instead of the creation date
  merged: for pull requests, read the merge date (takes precedence over `closed`)

  Returns None when the requested closing / merge timestamp is not set, and
  also (after printing a notice) when the category is unknown.
  '''
  # Step 1: decide which timestamp field the caller is asking for.
  if category == "star":
    field = "starredAt"
  elif category == "fork":
    field = "createdAt"
  elif category == "commit":
    field = "committedDate"
  elif category == "issue":
    field = "closedAt" if closed else "createdAt"
  elif category == "pullRequest":
    if merged:
      field = "mergedAt"
    elif closed:
      field = "closedAt"
    else:
      field = "createdAt"
  else:
    print("Invalid category")
    return

  # Step 2: read it; only closing / merge timestamps are allowed to be missing.
  stamp = x[field]
  if stamp is None and field in ("closedAt", "mergedAt"):
    return None
  return stamp[:10]

def categoryExtension(category):
    '''
    Map a category name to the field name it is stored under in the API
    response: "star" becomes "stargazers", anything else gets an "s" appended.
    '''
    return "stargazers" if category == "star" else category+'s'

In [10]:
class History():
  '''
  Collects per-day GitHub activity counts for the repositories of one company.

  self.historyDict holds one DataFrame per tracked series ("star", "fork",
  "commit", "issue", "issueClosed", "pullRequest", "pullRequestClosed",
  "pullRequestMerged"); "issue" and "pullRequest" count creations. Every frame
  is indexed by calendar day from 1999-01-01 to 2021-09-01, has one column per
  repository and starts out filled with zeros.
  '''

  def __init__(self, company, normalizedName, token=None):
    '''
    company: GitHub account (login) that owns the repositories
    normalizedName: column of repoChart.csv that lists this company's repositories
    token: GitHub API token; when omitted, the GITHUB_TOKEN environment variable is used
    '''
    import os  # local import: only needed here to look up the token

    # NOTE(review): the literal fallback is a credential committed to source
    # control. It is kept only so existing notebooks keep running; rotate it
    # and pass `token` (or set GITHUB_TOKEN) instead.
    token = token or os.environ.get("GITHUB_TOKEN") or "4c09bf6175cdf500a222a1e39963b0936256f7bb"
    self.headers = {"Authorization": "token " + token}
    self.company = company
    self.normalizedName = normalizedName
    self.categories = ("star", "fork", "commit", "issue", "pullRequest")

    self.repos = None
    self.repo_list() #read the list of repos into self.repos
    self.anomalyTracker = None

    days = pd.date_range(start="1/01/1999", end='9/01/2021')
    #"issue" = "issueCreated", "pullRequest" = "pullRequestCreated"
    tables = ("star", "fork", "commit", "issueClosed", "issue",
              "pullRequest", "pullRequestClosed", "pullRequestMerged")
    # Build the frames already filled with 0. Filling afterwards with
    # `fillna(0)` returns a new frame, so discarding its result would leave
    # NaN in place and every later `+= 1` would stay NaN.
    self.historyDict = {key: pd.DataFrame(0, index=days, columns=self.repos) for key in tables}

  def repo_list(self):
    '''
    read the list of repos that the company owns
    '''
    df = pd.read_csv("/content/drive/MyDrive/StockML /Data/companyInfo/repoChart.csv", sep=",")
    df = df[self.normalizedName]
    df = df.dropna(axis=0)
    self.repos = list(df)

  def sleepMode(self, query):
    '''
    sleep for one hour, a notice each 10 minutes, then send the query again

    query: the last query that received a "api limit exceeded" notice (i.e. did not get a proper response)

    Returns the response of the repeated request.
    '''
    for i in range(1, 7):
      print("sleeping 10min:", i)
      time.sleep(600)

    response = requests.post('https://api.github.com/graphql', json={'query': query}, headers=self.headers)
    return response

  def generateQuery(self, category, repo, endCursor=None, oid=None):
    '''
    Build the paging query for one category of one repository.

    category: "star," "fork," "commit," "issue", "pullRequest"
    repo: repository name (owned by self.company)
    endCursor: cursor of the previous page, None for the first page
    oid: commit object ID, only used by the "commit" category

    Returns the query text, or None (after printing a notice) for an unknown category.
    '''
    if (category == "fork"):
      return generateForkQuery(owner=self.company, name=repo, endCursor=endCursor)
    elif (category == "star"):
      return generateStarQuery(owner=self.company, name=repo, endCursor=endCursor)
    elif (category == "commit"):
      return generateCommitQuery(owner=self.company, name=repo, endCursor=endCursor, oid=oid)
    elif (category == "issue"):
      return generateIssueQuery(owner=self.company, name=repo, endCursor=endCursor)
    elif (category == "pullRequest"):
      return generatePRQuery(owner=self.company, name=repo, endCursor=endCursor)
    else:
      print("Invalid query category")
      return

  def exportData(self, category=None):
    '''
    Write the tables of a category to '<series>History/<company>.csv'.

    category: one of self.categories; "issue" and "pullRequest" also export
              their "Closed" table, and "pullRequest" its "Merged" table.
              When omitted, every category is exported.
    '''
    if category is None:
      for each in self.categories:
        self.exportData(each)
      return

    tables = [category]
    if category == "issue" or category == "pullRequest":
      tables.append(category+"Closed")
    if category == "pullRequest":
      tables.append("pullRequestMerged")

    for key in tables:
      address = '/content/drive/MyDrive/StockML /Data/'+key+'History/'+self.company+'.csv'
      self.historyDict[key].to_csv(address)

  def addData(self):
    '''
    Collect every category for every repository of the company.
    '''
    for repo in self.repos:
      for category in self.categories:
        self.addCategory(category, repo)

  def addCategory(self, category, repo):
    '''
    Download all records of one category for one repository and add them to
    the daily counts in self.historyDict.
    '''
    print("Processing "+category+" data of "+self.company+"/"+repo)

    for x in self._fetchNodes(category, repo):
      self._increment(category, repo, processNode(x=x, category=category, closed=False))

      if category == "issue" or category == "pullRequest": #need to add closedAt data
        self._increment(category+"Closed", repo, processNode(x=x, category=category, closed=True))

      if category == "pullRequest": #need to add mergedAt data
        self._increment("pullRequestMerged", repo, processNode(x=x, category=category, merged=True))

  def _fetchNodes(self, category, repo):
    '''
    Page through the API and return the list of all records of a category.
    '''
    # Only the commit query needs the branch tip, so skip the extra request otherwise.
    oid = getOID(owner=self.company, name=repo, headers=self.headers) if category == "commit" else None
    nodes = []
    endCursor = None
    hasNextPage = True

    count = 1
    with requests.Session() as s:
      while hasNextPage:
        print(count)
        count += 1
        query = self.generateQuery(category=category, repo=repo, endCursor=endCursor, oid=oid)
        response = s.post('https://api.github.com/graphql', json={'query': query}, headers=self.headers)

        self.anomalyTracker = response #to print out the content when the program accidentally stops
        payload = response.json()
        while payload is None or 'data' not in payload: #not a proper response (e.g. api limit exceeded)
          response = self.sleepMode(query)
          self.anomalyTracker = response
          payload = response.json()

        repository = payload['data']['repository']
        if category == "commit":
          data = repository['object']['history']
        else:
          data = repository[categoryExtension(category)]

        endCursor = data['pageInfo']['endCursor']
        hasNextPage = data['pageInfo']['hasNextPage']
        # The stargazer query returns `edges`, every other query returns `nodes`.
        nodes.extend(data['edges'] if category == "star" else data['nodes'])
    return nodes

  def _increment(self, key, repo, date):
    '''
    Add one event on `date` (year-month-day text, or None) to table `key`.
    A missing date and a date outside the tracked day range are ignored.
    '''
    if date is None:
      return
    frame = self.historyDict[key]
    day = pd.Timestamp(date)
    if day in frame.index:
      frame.at[day, repo] += 1

  

In [13]:
# Trial run on a single repository: pull-request history of facebook/hermes.
h = History("facebook", "facebook")
h.addCategory("pullRequest", "hermes")

Processing pullRequest data of facebook/hermes
1
2
3


In [None]:
# Show the JSON body of the response that addCategory stored in anomalyTracker.
h.anomalyTracker.json()

In [None]:
# Collect and export the full history of every company listed in repoChart.csv.
df = pd.read_csv("/content/drive/MyDrive/StockML /Data/companyInfo/repoChart.csv", sep=",")
companies = list(df.keys())[1:] #the first column is index
for company in companies:
  normalizedName = company #companyname, key in repoChart
  # The GitHub user is the part before the first "_"; split() returns the
  # whole name when there is no "_", so no separate branch is needed.
  githubUser = company.split('_')[0]
  h = History(company=githubUser, normalizedName = normalizedName)
  h.addData()
  # exportData takes the category to write, so export each one in turn.
  for category in h.categories:
    h.exportData(category)

In [61]:

  
  def addForkData(self):
    '''
    Collect the fork dates of every repository, then write the per-date
    counts of self.forkHistory, sorted by date, to the company's CSV file.
    '''
    for repoName in self.repos:
      self.forkData(repoName)

    history = pd.DataFrame(list(self.forkHistory.items()), columns = ['date', 'count'])
    target = '/content/drive/MyDrive/StockML /Data/forkHistory/'+self.normalizedName+'.csv'
    history.sort_values(by="date").to_csv(target)
  
  def forkData(self, repo):
    '''
    Page through all forks of one repository and add one count per fork to
    self.forkHistory under the fork's creation date (year-month-day).
    '''
    print("Processing fork data of "+self.company+"/"+repo)
    collected = []
    cursor = None

    page = 1
    with requests.Session() as session:
      while True:
        print(page)
        page += 1
        query = generateForkQuery(owner=self.company, name=repo, endCursor=cursor)
        response = session.post('https://api.github.com/graphql', json={'query': query}, headers=self.headers)
        self.anomalyTracker = response
        # No 'data' key means the request was not answered properly: show the
        # reply, wait, and send the same query again.
        while (response.json() is None or 'data' not in response.json().keys()): #error examination
          print(response.json())
          response = self.sleepMode(query)
        forks = response.json()['data']['repository']['forks']
        cursor = forks['pageInfo']['endCursor']
        more = forks['pageInfo']['hasNextPage']
        collected.extend(forks['nodes'])
        if not more:
          break

    for node in collected:
      day = node["createdAt"][:10]
      self.forkHistory[day] = self.forkHistory.get(day, 0) + 1

  def addStarData(self):
    '''
    Resume star collection: repositories listed before "react-native" are
    skipped, "react" is always skipped, everything else is passed to starData.
    The per-date counts of self.starHistory are then written, sorted by date,
    to the company's '-correct-addition' CSV file.
    '''
    resumed = False
    for repoName in self.repos:
      if repoName == "react":
        continue
      # Collection starts at "react-native" and stays on from there.
      resumed = resumed or repoName == "react-native"
      if not resumed:
        continue
      self.starData(repoName)

    #store star data in csv
    history = pd.DataFrame(list(self.starHistory.items()), columns = ['date', 'count'])
    target = '/content/drive/MyDrive/StockML /Data/starHistory/'+self.company+'-correct-addition.csv'
    history.sort_values(by="date").to_csv(target)

  def starData(self, repo):
    '''
    Page through all stargazers of one repository and add one count per star
    to self.starHistory under the date it was given (year-month-day).

    A reply without a 'data' key is printed and the query is sent again
    through self.sleepMode, the same way forkData handles it.
    '''
    print("Processing Star Data of "+self.company+"/"+repo)
    starNodes = []
    endCursor = None
    hasNextPage = True

    count = 1
    with requests.Session() as s: #stream requests
      while(hasNextPage):
        print(count)
        count+=1
        query = generateStarQuery(owner=self.company, name=repo, endCursor=endCursor)
        response = s.post('https://api.github.com/graphql', json={'query': query}, headers=self.headers)
        self.anomalyTracker = response
        payload = response.json()
        # Previously the reply was printed and then indexed anyway, which
        # raised KeyError; wait and retry instead.
        while (payload is None or 'data' not in payload):
          print(payload)
          response = self.sleepMode(query)
          payload = response.json()
        data = payload['data']['repository']['stargazers']
        endCursor = data['pageInfo']['endCursor']
        hasNextPage = data['pageInfo']['hasNextPage']
        # Resume hack for react-native: earlier pages are fetched (to advance
        # the cursor) but not counted again. `count` was already incremented
        # above, so counting restarts with the 387th page fetched.
        # NOTE(review): confirm 387 vs 388 against the run that stopped.
        if(repo != "react-native" or count >= 388):
          starNodes.extend(data['edges'])

    for x in starNodes:
      date = x["starredAt"][:10]
      self.starHistory[date] = self.starHistory.get(date, 0) + 1

IndentationError: ignored

In [1]:
# Experiment: request the dependency-graph manifests of facebook/react.
import requests
# The Accept header names the "hawkgirl" preview media type.
# NOTE(review): the Authorization token is a credential committed to source; rotate it and load it from outside the notebook.
headers = {"Accept": "application/vnd.github.hawkgirl-preview+json", "Authorization": "token 4c09bf6175cdf500a222a1e39963b0936256f7bb"}
# Asks for the first 10 manifests (filename, blobPath) and, per manifest, the
# package name and repository of each dependency; dependenciesAfter is set to the cursor "NQ".
query = """
      {
  repository(owner:"facebook", name:"react") {
    dependencyGraphManifests(first:10, dependenciesAfter:"NQ") { 
      totalCount
      pageInfo {
        hasNextPage
        endCursor
      }
      nodes {
        filename
        blobPath
        dependencies{
          pageInfo{
            hasNextPage
            endCursor
          }
          nodes{
            packageName
            repository{
              nameWithOwner
            }
        }
      }   
      }
    }
  }
}
    """
# The response is kept in the notebook-global `response` for the following cells.
response = requests.post('https://api.github.com/graphql', json={'query':query}, headers=headers)

In [None]:
# Show the JSON body of the last `response`.
response.json()

In [None]:
# Experiment: request only the paging information of each manifest's dependency list.
import requests
# NOTE(review): the Authorization token is a credential committed to source; rotate it and load it from outside the notebook.
headers = {"Accept": "application/vnd.github.hawkgirl-preview+json", "Authorization": "token 4c09bf6175cdf500a222a1e39963b0936256f7bb"}
# NOTE(review): dependencyGraphManifests is queried without a first/last argument here — confirm the API accepts that.
# This query selects `edges { node { ... } }`, not `nodes`.
query = """
      {
  repository(owner:"facebook", name:"react") {
    dependencyGraphManifests {
      totalCount
      pageInfo {
        hasNextPage
        endCursor
      }
      edges {
        node {
          dependencies(first:100){
            pageInfo {
              hasNextPage
              endCursor
            }
          }
        }
      }
    }
  }
}
    """
# The response is kept in the notebook-global `response` for the following cells.
response = requests.post('https://api.github.com/graphql', json={'query':query}, headers=headers)

In [None]:
# Reduce the last response to its list of manifest nodes.
# NOTE(review): this reads the 'nodes' key, so it presumably expects the response of a query that selects `nodes` (not `edges`) — confirm which query cell was run last.
data = response.json()
data = data['data']['repository']['dependencyGraphManifests']['nodes']

In [None]:
# Print the manifest at index 5 of `data`.
dep = data[5]
print(dep)

{'filename': 'fixtures/art/yarn.lock', 'blobPath': '/facebook/react/blob/master/fixtures/art/yarn.lock', 'dependencies': {'nodes': [{'packageName': 'abbrev', 'repository': {'nameWithOwner': 'isaacs/abbrev-js'}, 'requirements': '= 1.1.0'}, {'packageName': 'acorn', 'repository': {'nameWithOwner': 'acornjs/acorn'}, 'requirements': '= 3.3.0'}, {'packageName': 'ajv', 'repository': {'nameWithOwner': 'ajv-validator/ajv'}, 'requirements': '= 4.11.5'}, {'packageName': 'align-text', 'repository': {'nameWithOwner': 'jonschlinkert/align-text'}, 'requirements': '= 0.1.4'}, {'packageName': 'amdefine', 'repository': {'nameWithOwner': 'jrburke/amdefine'}, 'requirements': '= 1.0.1'}, {'packageName': 'ansi-regex', 'repository': {'nameWithOwner': 'chalk/ansi-regex'}, 'requirements': '= 2.1.1'}, {'packageName': 'ansi-styles', 'repository': {'nameWithOwner': 'chalk/ansi-styles'}, 'requirements': '= 2.2.1'}, {'packageName': 'anymatch', 'repository': {'nameWithOwner': 'micromatch/anymatch'}, 'requirements': 

In [None]:
# Dump the per-date counts held in h.forkHistory, sorted by date, to a CSV file.
# NOTE(review): fork counts are written into the starHistory folder under the name "microsoft" — confirm this is intended.
df = pd.DataFrame(list(h.forkHistory.items()), columns = ['date', 'count']).sort_values(by="date")
df.to_csv('/content/drive/MyDrive/StockML /Data/starHistory/microsoft-1.csv')

In [None]:
#facebook stopped at facebook/react-native page 388