<a href="https://colab.research.google.com/github/atlantiquesun/Stock_ML/blob/main/History.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import requests
import time
import datetime

In [2]:
def generateForkQuery(owner, name, endCursor=None):
  '''
  Build the GraphQL query that lists fork creation dates of a repository.

  owner: login of the user/organization that owns the repository
  name: repository name
  endCursor: pagination cursor returned by the previous page, or None to
             request the first page

  Returns the query text; each page asks for up to 100 forks together with
  the pageInfo (endCursor, hasNextPage) needed to request the next page.
  '''
  # The first and the following pages differ only in the "after" argument,
  # so a single template is kept and the argument is spliced in when needed.
  after = "" if endCursor is None else ', after:"%s"' % endCursor
  query = """
    {
      repository(owner:"%s", name:"%s") {
        forks(first: 100%s) {
          pageInfo {
            endCursor
            hasNextPage
          }
          nodes {
            createdAt
          }
        }
      }
    }
  """ % (owner, name, after)
  return query


def generateStarQuery(owner, name, endCursor=None):
  '''
  Build the GraphQL query that lists the times at which a repository was starred.

  owner: login of the user/organization that owns the repository
  name: repository name
  endCursor: pagination cursor returned by the previous page, or None to
             request the first page

  Returns the query text; each page asks for up to 100 stargazer edges
  (starredAt) together with the pageInfo needed to request the next page.
  '''
  # The first and the following pages differ only in the "after" argument,
  # so a single template is kept and the argument is spliced in when needed.
  after = "" if endCursor is None else ', after:"%s"' % endCursor
  query = """
    {
      repository(owner:"%s", name:"%s") {
        stargazers(first: 100%s) {
          pageInfo {
            endCursor
            hasNextPage
          }
          edges {
            starredAt
          }
        }
      }
    }
  """ % (owner, name, after)
  return query


def generateIssueQuery(owner, name, endCursor=None):
  '''
  Build the GraphQL query that lists creation and closing dates of a repository's issues.

  owner: login of the user/organization that owns the repository
  name: repository name
  endCursor: pagination cursor returned by the previous page, or None to
             request the first page

  Returns the query text; each page asks for up to 100 issues
  (closedAt, createdAt) together with the pageInfo needed to request the next page.
  '''
  # The first and the following pages differ only in the "after" argument,
  # so a single template is kept and the argument is spliced in when needed.
  after = "" if endCursor is None else ', after:"%s"' % endCursor
  query = """
    {
      repository(owner: "%s", name: "%s") {
        issues(first: 100%s) {
          pageInfo {
            endCursor
            hasNextPage
          }
          nodes {
            closedAt
            createdAt
          }
        }
      }
    }
  """ % (owner, name, after)
  return query


def getOID(owner, name, headers):
  '''
  Return the object ID (oid) of the commit at the tip of the repository's
  default branch; it is the starting point for fetching commit history.

  owner: login of the user/organization that owns the repository
  name: repository name
  headers: HTTP headers sent with the request (carries the Authorization token)

  Returns None when the response contains no repository or the repository
  has no default branch.
  Raises KeyError when the response has no 'data' entry at all.
  '''
  # Only the oid is read below, so nothing else is requested.
  query = """
    {
      repository(owner: "%s", name: "%s") {
        defaultBranchRef {
          target {
            ... on Commit {
              oid
            }
          }
        }
      }
    }
  """%(owner, name)
  response = requests.post('https://api.github.com/graphql', json={'query': query}, headers=headers)
  payload = response.json()  # decode the body once and reuse it
  repository = payload['data']['repository']
  if repository is None or repository['defaultBranchRef'] is None:
    return None
  return repository['defaultBranchRef']['target']['oid']


def generateCommitQuery(owner, name, oid, endCursor=None):
  '''
  Build the GraphQL query that lists commit dates reachable from a given commit.

  owner: login of the user/organization that owns the repository
  name: repository name
  oid: object ID of the commit whose history is walked (see getOID)
  endCursor: pagination cursor returned by the previous page, or None to
             request the first page

  Returns the query text; each page asks for up to 100 commits
  (committedDate), the total commit count and the pageInfo needed to
  request the next page.
  '''
  # The first and the following pages differ only in the "after" argument,
  # so a single template is kept and the argument is spliced in when needed.
  after = "" if endCursor is None else ', after: "%s"' % endCursor
  query = """
    {
      repository(owner: "%s", name: "%s") {
        object(oid: "%s") {
          ... on Commit {
            history(first: 100%s) {
              totalCount
              pageInfo {
                endCursor
                hasNextPage
              }
              nodes {
                committedDate
              }
            }
          }
        }
      }
    }
  """ % (owner, name, oid, after)
  return query

def generatePRQuery(owner, name, endCursor=None):
  '''
  Build the GraphQL query that lists creation, merge and closing dates of a
  repository's pull requests.

  owner: login of the user/organization that owns the repository
  name: repository name
  endCursor: pagination cursor returned by the previous page, or None to
             request the first page

  Returns the query text; each page asks for up to 100 pull requests
  (createdAt, mergedAt, closedAt) together with the pageInfo needed to
  request the next page.
  '''
  # The first and the following pages differ only in the "after" argument,
  # so a single template is kept and the argument is spliced in when needed.
  after = "" if endCursor is None else ', after:"%s"' % endCursor
  query = """
    {
      repository(owner:"%s", name:"%s") {
        pullRequests(first: 100%s){
          pageInfo{
            hasNextPage
            endCursor
          }
          nodes{
            createdAt
            mergedAt
            closedAt
          }
        }
      }
    }
  """ % (owner, name, after)
  return query

In [3]:
def processNode(x, category, closed=False, merged=False):
  '''
  Extract the day ("YYYY-MM-DD", the first 10 characters of the timestamp)
  from one record returned by the API.

  x: the record (a dict holding the timestamp fields of its category)
  category: "star", "fork", "commit", "issue" or "pullRequest"
  closed: for issues / pull requests, read closedAt instead of createdAt
  merged: for pull requests, read mergedAt (takes precedence over closed)

  Returns None when the requested closedAt / mergedAt is null, and also
  (after printing a message) when the category is not recognised.
  '''
  # Categories that carry exactly one timestamp field.
  singleField = {"star": "starredAt", "fork": "createdAt", "commit": "committedDate"}
  if category in singleField:
    return x[singleField[category]][:10]

  # Issues and pull requests: pick the field selected by the flags.
  if category == "issue":
    field = "closedAt" if closed else "createdAt"
  elif category == "pullRequest":
    if merged:
      field = "mergedAt"
    elif closed:
      field = "closedAt"
    else:
      field = "createdAt"
  else:
    print("Invalid category")
    return

  stamp = x[field]
  # Only closedAt / mergedAt are treated as optional (still open / not merged).
  if field != "createdAt" and stamp is None:
    return None
  return stamp[:10]

def categoryExtension(category):
    '''
    Map a category name to the key under which its data appears in the
    repository part of the response: "star" becomes "stargazers", every
    other category simply gets an "s" appended.
    '''
    return "stargazers" if category == "star" else category + 's'

In [4]:
class History():
  '''
  Collects per-day activity counts for every repository of one GitHub
  user/organization and exports them as CSV files.

  Attributes:
    headers: HTTP headers (Authorization token) sent with API requests
    categories: the activity kinds that are collected
    company: GitHub login whose repositories are processed
    normalizedName: the company's name as used in the company tables
    repos: list of (repository name, oid of the default-branch tip)
    anomalyTracker: the most recent GraphQL response, kept for inspection
                    when a run stops unexpectedly
    historyDict: one DataFrame per key (rows: days, columns: repositories);
                 "issue" / "pullRequest" hold creation counts, the
                 "...Closed" / "...Merged" keys hold closing / merge counts
  '''

  def __init__(self, company, normalizedName, token=None):
    '''
    company: GitHub login whose repositories are processed
    normalizedName: the company's name as used in the company tables
    token: GitHub access token; when omitted it is read from the
           GITHUB_TOKEN environment variable

    Raises ValueError when no token is available.
    '''
    import os  # local import so the class does not depend on the notebook's import cell

    # The token must never be stored in the notebook itself.
    if token is None:
      token = os.environ.get("GITHUB_TOKEN")
    if not token:
      raise ValueError("A GitHub token is required: pass token=... or set the GITHUB_TOKEN environment variable")

    self.headers = {"Authorization": "token " + token}
    self.categories = ("star", "fork", "commit", "issue", "pullRequest")

    self.company = None
    self.normalizedName = None

    self.repos = None
    self.anomalyTracker = None

    self.historyDict = None
    self.set_company(company, normalizedName)

  def set_company(self, company, normalizedName):
    '''
    Switch to another company: fetch its repository list and reset all
    history tables to zero.
    '''
    self.company = company
    self.normalizedName = normalizedName

    self.repos = None
    self.repo_list()
    self.anomalyTracker = None

    dates = pd.date_range(start="1/01/1999", end='9/01/2021')
    repos = [x[0] for x in self.repos]
    # "issue" = "issueCreated", "pullRequest" = "pullRequestCreated"
    keys = ("star", "fork", "commit", "issueClosed", "issue", "pullRequest",
            "pullRequestClosed", "pullRequestMerged")
    # Zero-filled from the start, so counts can simply be incremented.
    self.historyDict = {key: pd.DataFrame(0, index=dates, columns=repos) for key in keys}

  def repo_list(self):
    '''
    read the list of repos that the company owns

    Fills self.repos with (name, oid) pairs; repositories for which getOID
    returns None (e.g. empty repositories) are left out.
    Raises requests.HTTPError when the listing request fails.
    '''
    self.repos = []
    nextPage = "https://api.github.com/users/"+self.company+"/repos?per_page=100"
    with requests.Session() as s:
      while nextPage is not None:
        print(nextPage)
        # Send the token here as well, so the listing is an authenticated request.
        response = s.get(nextPage, headers=self.headers)
        # Fail loudly instead of iterating over an error document.
        response.raise_for_status()
        nextPage = response.links.get('next', {}).get('url')
        for repo in response.json():
          oid = getOID(owner=self.company, name=repo['name'], headers=self.headers)
          if oid is not None: #check that the repository is not empty
            self.repos.append((repo['name'], oid))

  def sleepMode(self, query):
    '''
    sleep for one hour, a notice each 10 minutes

    query: the last query that received a "api limit exceeded" notice (i.e. did not get a proper response)

    Returns the response obtained by sending the query again after the pause.
    '''
    for i in range(1, 7):
      print("sleeping 10min:", i)
      time.sleep(600)

    response = requests.post('https://api.github.com/graphql', json={'query': query}, headers=self.headers)
    return response

  def generateQuery(self, category, repo, endCursor=None, oid=None):
    '''
    category: "star," "fork," "commit," "issue", "pullRequest"
    repo: repository name
    endCursor: pagination cursor of the previous page (None for the first page)
    oid: commit object ID, only used for the "commit" category

    Returns the query text, or None (after printing a message) for an
    unknown category.
    '''
    if category == "fork":
      return generateForkQuery(owner=self.company, name=repo, endCursor=endCursor)
    elif category == "star":
      return generateStarQuery(owner=self.company, name=repo, endCursor=endCursor)
    elif category == "commit":
      return generateCommitQuery(owner=self.company, name=repo, endCursor=endCursor, oid=oid)
    elif category == "issue":
      return generateIssueQuery(owner=self.company, name=repo, endCursor=endCursor)
    elif category == "pullRequest":
      return generatePRQuery(owner=self.company, name=repo, endCursor=endCursor)
    else:
      print("Invalid query category")
      return

  def exportData(self):
    '''Export the tables of every category (see exportCategory).'''
    for category in self.categories:
      self.exportCategory(category)

  def exportCategory(self, category):
    '''
    Write the table(s) of one category to
    <Data>/<key>History/<company>.csv, where key is the category itself,
    plus category+"Closed" for issues and pull requests, plus
    "pullRequestMerged" for pull requests.
    '''
    suffixes = [""]
    if category == "issue" or category == "pullRequest":
      suffixes.append("Closed")
    if category == "pullRequest":
      suffixes.append("Merged")

    for suffix in suffixes:
      key = category + suffix
      address = '/content/drive/MyDrive/StockML /Data/'+key+'History/'+self.company+'.csv'
      self.historyDict[key].to_csv(address)

  def addData(self):
    '''
    Collect every category for every repository of the current company;
    each category is exported as soon as all repositories are processed.
    '''
    for category in self.categories:
      for (repo, oid) in self.repos:
        self.addCategory(category, repo, oid)
      self.exportCategory(category) #export the category data for one user

  def addCategory(self, category, repo, oid):
    '''
    Download all records of one category for one repository (100 per page)
    and add them to the matching history tables.

    category: one of self.categories
    repo: repository name
    oid: commit object ID of the repository (used for the "commit" category)
    '''
    print("Processing "+category+" data of "+self.company+"/"+repo)

    nodes = []
    endCursor = None
    hasNextPage = True

    count = 1
    with requests.Session() as s:
      while hasNextPage:
        print(count)
        count += 1
        query = self.generateQuery(category=category, repo=repo, endCursor=endCursor, oid=oid)
        response = s.post('https://api.github.com/graphql', json={'query': query}, headers=self.headers)

        self.anomalyTracker = response #to print out the content when the program accidentally stops
        payload = response.json()
        while payload is None or 'data' not in payload: #not a proper response (e.g. api limit exceeded)
          response = self.sleepMode(query)
          self.anomalyTracker = response
          payload = response.json()

        if category == "commit":
          data = payload['data']['repository']['object']['history']
        else:
          data = payload['data']['repository'][categoryExtension(category)]

        endCursor = data['pageInfo']['endCursor']
        hasNextPage = data['pageInfo']['hasNextPage']
        # Stars are returned as edges, everything else as nodes.
        nodes.extend(data['edges'] if category == "star" else data['nodes'])

    for x in nodes:
      self._countEvent(category, processNode(x=x, category=category, closed=False), repo)
      if category == "issue" or category == "pullRequest": #need to add closedAt data
        self._countEvent(category+"Closed", processNode(x=x, category=category, closed=True), repo)
      if category == "pullRequest": #need to add mergedAt data
        self._countEvent(category+"Merged", processNode(x=x, category=category, merged=True), repo)

  def _countEvent(self, key, date, repo):
    '''
    Add one event to historyDict[key] for `repo` on `date`.

    date: "YYYY-MM-DD", or None when the event did not happen (nothing is counted)

    Days before the first tracked day are counted on the first day; days
    after the last tracked day are outside the table and are skipped
    instead of raising KeyError.
    '''
    if date is None:
      return
    frame = self.historyDict[key]
    day = pd.Timestamp(date)
    if day < frame.index[0]:
      day = frame.index[0]
    elif day > frame.index[-1]:
      return
    frame.at[day, repo] += 1

  

In [5]:
# Load the company table; the processing loop below reads its 'githubUser' and "shortName" columns.
df = pd.read_csv("/content/drive/MyDrive/StockML /Data/companyInfo/companies_final.csv")

In [7]:
# Process one batch of companies: the 20th to the 29th row of the company table.
# A single History object is reused; set_company() switches it to the next company.
count = 0
h = None
for count, i in enumerate(range(df.shape[0]), start=1):
  if count < 20:
    continue  # rows before the batch
  if count == 30:
    break     # first row after the batch
  company = df.at[i, 'githubUser']
  normalizedName = df.at[i, "shortName"]
  if h is not None:
    h.set_company(company, normalizedName)
  else:
    h = History(company, normalizedName)
  h.addData()
  


https://api.github.com/users/groupon/repos?per_page=100
Processing star data of groupon/ansible-silo
1
2
Processing star data of groupon/api-build-resources
1
Processing star data of groupon/api-parent-pom
1
Processing star data of groupon/artemisia
1
Processing star data of groupon/assertive
1
Processing star data of groupon/assertive-as-promised
1
Processing star data of groupon/backbeat
1
Processing star data of groupon/backbeat_ruby
1
Processing star data of groupon/baryon
1
Processing star data of groupon/codeburner
1
Processing star data of groupon/coffeelint-config-groupon
1
Processing star data of groupon/cson-parser
1
2
Processing star data of groupon/dependency-injection-checks
1
Processing star data of groupon/DotCi
1
2
3
4
5
6
Processing star data of groupon/DotCi-Plugins-Starter-Pack
1
Processing star data of groupon/FeatureAdapter
1
2
Processing star data of groupon/git-workflow
1
Processing star data of groupon/gleemail
1
2
3
4
Processing star data of groupon/gofer
1
Pro

KeyboardInterrupt: ignored

In [7]:
# Display the star table (rows: days, columns: repositories) of the company currently loaded in `h`.
h.historyDict['star']
# Run stopped at wix (not completely processed)

Unnamed: 0,aasm,active_attr,active_merchant,acts-as-taggable-on,acts_as_textcaptcha,afterbuild-webpack-plugin,axios-inherit,backstage,bootstrap,broadcaster,brubeck,chef-prometheus-exporters,chef_andco_artifact_cookbook,chef_andco_aws,chef_andco_cookbook_service_factory,chef_andco_cookbook_unix_bin,chef_andco_database,chef_andco_monit,chef_andco_mysql,chef_andco_mysql2_chef_gem,chef_andco_wordpress_cookbook,chef_andco_wp_cli,cloud-files-asset-sync,cloudkey,cruisecontrol.rb,dangerfile.js,db-charmer,deadlock_retry,dependabot-core,devflow,dont_look_up_package,doorkeeper,drag_n_drop_package,ds_online_features,elastic_searchable,eslint-config-fiverr,event-custodian,failed-to-load,failtail,fiverr.github.io,...,octopus,omnicontacts,page-timing,passable,paypal_adaptive,perimeterx-axios-interceptor,prerender,pricing_simulator,published,pyresolve,rack-contrib,rails-math-captcha,react-benchmark,react-fetch-mock-provider,readiness-manager,RedisTags,resque,resque-async-method,resque_mailer,rollout_dashboard,rollout_service,rpm,rpm_contrib,ruby-kafka,ruby-style-guide,sass-lint-config-fiverr,seed_dump,sentry-ruby,sitemap_generator,snyk-patch-test,solr_mongo_importer,statistician,sunspot,switch_board,synthetics,talking-capistrano,to_csv,uploader_package,webpagetest,woothee-ruby
1999-01-01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1999-01-02,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1999-01-03,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1999-01-04,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1999-01-05,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-08-28,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2021-08-29,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2021-08-30,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2021-08-31,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
# Total number of recorded star events of the current company, summed over all days and repositories.
df = h.historyDict["star"].fillna(0)
l = df.sum()  # one total per repository
sum(l)

1010160

In [None]:
# Write the tables of every category of the current company to CSV (paths are built in History.exportCategory).
h.exportData()

In [None]:
# Inspect the JSON body of the response stored by History.addCategory.
# anomalyTracker is None until addCategory has run, in which case this raises AttributeError.
h.anomalyTracker.json()

AttributeError: ignored

In [None]:
# Collect and export the history of every company listed as a column of repoChart.csv.
df = pd.read_csv("/content/drive/MyDrive/StockML /Data/companyInfo/repoChart.csv", sep=",")
companies = list(df.keys())[1:] #the first column is index
for company in companies:
  # The column name is the key in repoChart; the GitHub login is the part before the first "_".
  normalizedName = company
  githubUser = company.partition('_')[0]
  h = History(company=githubUser, normalizedName=normalizedName)
  h.addData()
  h.exportData()

In [None]:
import os
import requests

# Request the first 10 dependency-graph manifests of facebook/react together with
# the packages each manifest depends on.
# The access token is read from the GITHUB_TOKEN environment variable so that no
# credential is stored in the notebook (KeyError if the variable is not set).
headers = {
  "Accept": "application/vnd.github.hawkgirl-preview+json",
  "Authorization": "token " + os.environ["GITHUB_TOKEN"],
}
query = """
      {
  repository(owner:"facebook", name:"react") {
    dependencyGraphManifests(first:10, dependenciesAfter:"NQ") {
      totalCount
      pageInfo {
        hasNextPage
        endCursor
      }
      nodes {
        filename
        blobPath
        dependencies{
          pageInfo{
            hasNextPage
            endCursor
          }
          nodes{
            packageName
            repository{
              nameWithOwner
            }
        }
      }
      }
    }
  }
}
    """
response = requests.post('https://api.github.com/graphql', json={'query':query}, headers=headers)

In [None]:
# Show the decoded JSON body of the last `response`.
response.json()

In [None]:
import os
import requests

# Request, for every dependency-graph manifest of facebook/react, only the
# pagination info of its first 100 dependencies.
# The access token is read from the GITHUB_TOKEN environment variable so that no
# credential is stored in the notebook (KeyError if the variable is not set).
headers = {
  "Accept": "application/vnd.github.hawkgirl-preview+json",
  "Authorization": "token " + os.environ["GITHUB_TOKEN"],
}
query = """
      {
  repository(owner:"facebook", name:"react") {
    dependencyGraphManifests {
      totalCount
      pageInfo {
        hasNextPage
        endCursor
      }
      edges {
        node {
          dependencies(first:100){
            pageInfo {
              hasNextPage
              endCursor
            }
          }
        }
      }
    }
  }
}
    """
response = requests.post('https://api.github.com/graphql', json={'query':query}, headers=headers)

In [None]:
# Keep only the list of manifest records from the dependency-graph response.
# NOTE(review): the query in the cell directly above requests `edges { node ... }`, not `nodes`;
# this lookup presumably expects the response of the earlier `nodes` query - confirm which cell was run.
data = response.json()
data = data['data']['repository']['dependencyGraphManifests']['nodes']

In [None]:
# Print the manifest record at index 5 of `data`.
dep = data[5]
print(dep)

{'filename': 'fixtures/art/yarn.lock', 'blobPath': '/facebook/react/blob/master/fixtures/art/yarn.lock', 'dependencies': {'nodes': [{'packageName': 'abbrev', 'repository': {'nameWithOwner': 'isaacs/abbrev-js'}, 'requirements': '= 1.1.0'}, {'packageName': 'acorn', 'repository': {'nameWithOwner': 'acornjs/acorn'}, 'requirements': '= 3.3.0'}, {'packageName': 'ajv', 'repository': {'nameWithOwner': 'ajv-validator/ajv'}, 'requirements': '= 4.11.5'}, {'packageName': 'align-text', 'repository': {'nameWithOwner': 'jonschlinkert/align-text'}, 'requirements': '= 0.1.4'}, {'packageName': 'amdefine', 'repository': {'nameWithOwner': 'jrburke/amdefine'}, 'requirements': '= 1.0.1'}, {'packageName': 'ansi-regex', 'repository': {'nameWithOwner': 'chalk/ansi-regex'}, 'requirements': '= 2.1.1'}, {'packageName': 'ansi-styles', 'repository': {'nameWithOwner': 'chalk/ansi-styles'}, 'requirements': '= 2.2.1'}, {'packageName': 'anymatch', 'repository': {'nameWithOwner': 'micromatch/anymatch'}, 'requirements': 

In [None]:
# Turn a date -> count mapping into a two-column table, sort it by date and write it to CSV.
# NOTE(review): the History class in this notebook defines no `forkHistory` attribute, so this
# presumably targets an older version of the class - confirm before running.
# NOTE(review): fork data is written into the starHistory folder under a hard-coded "microsoft" name - verify the intended path.
df = pd.DataFrame([(k, v) for k, v in h.forkHistory.items()], columns = ['date', 'count'])
df = df.sort_values(by="date")
df.to_csv('/content/drive/MyDrive/StockML /Data/starHistory/'+"microsoft"+'-1.csv')

In [None]:
#facebook stopped at facebook/react-native page 388