In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

# Render matplotlib figures inline, directly below the cell that creates them
%matplotlib inline
# Ask the inline backend for high-resolution ("retina") figure output
%config InlineBackend.figure_format = 'retina'

# Set sensible defaults: start from seaborn's default theme, then layer the
# "ticks" axes style and the "talk" plotting context on top of it
sns.set()
sns.set_style("ticks")
sns.set_context('talk')

In [2]:
import requests
import json
from datetime import datetime, timedelta
import os
from dotenv import load_dotenv 
load_dotenv()

# Read the GitHub personal access token from the environment (a local .env file
# is loaded by load_dotenv() above). You can generate a token from GitHub
# Developer Settings.
access_token = os.getenv("GITHUB_ACCESS_TOKEN")
if not access_token:
    # Without this guard the header construction below fails with an opaque
    # TypeError (str + None) that hides the real cause.
    raise RuntimeError(
        "GITHUB_ACCESS_TOKEN is not set; export it or add it to your .env file"
    )

headers = {'Authorization': "Bearer " + access_token, 'Content-Type': 'application/json'}

# Reference timestamps: one week ago and four weeks (roughly one month) ago
date_one_week_ago = datetime.now() - timedelta(weeks=1)
date_one_week_ago_iso = date_one_week_ago.isoformat()

date_one_month_ago = datetime.now() - timedelta(weeks=4)
date_one_month_ago_iso = date_one_month_ago.isoformat()

# Create the GraphQL query.
# Note: GitHub's GraphQL API only allows fetching the first 100 records in one
# request, so when repositoryCount exceeds 100 the remaining repositories are
# not retrieved by this cell (that would require cursor-based pagination).
# Literal braces are doubled because the template goes through str.format().
query = """
{{
  repos: search(query: "stars:>200 created:>{date_one_month_ago} archived:false sort:stars-desc", type: REPOSITORY, first: 100) {{
    repositoryCount
    edges {{
      node {{
        ... on Repository {{
          name
          description
          url
          stargazers {{
            totalCount
          }}
          createdAt
          updatedAt
          diskUsage
          forkCount
          isArchived
          isFork
          isMirror
          isPrivate
          languages(first:3) {{
            nodes {{
              name
            }}
          }}
          licenseInfo {{
            name
          }}
          owner {{
            login
          }}
          primaryLanguage {{
            name
          }}
          pullRequests {{
            totalCount
          }}
          issues {{
            totalCount
          }}
          watchers {{
            totalCount
          }}
        }}
      }}
    }}
  }}
}}
""".format(date_one_month_ago=date_one_month_ago_iso)

# Send the POST request to the GitHub GraphQL API. The timeout keeps the
# notebook from hanging indefinitely if the API is unreachable.
response = requests.post(
    'https://api.github.com/graphql',
    headers=headers,
    json={'query': query},
    timeout=30,
)

# Fail loudly on HTTP errors: merely printing would leave `data` undefined (or
# stale from a previous run), and later cells would then misbehave.
if response.status_code != 200:
    raise RuntimeError(f"Error {response.status_code} {response.text}")

data = response.json()

# GraphQL reports query-level problems in an "errors" list, even with HTTP 200.
graphql_errors = data.get('errors')
if data.get('data') is None:
    raise RuntimeError(f"GitHub GraphQL query returned no data: {graphql_errors}")
if graphql_errors:
    print('Warning: GitHub GraphQL reported errors:', json.dumps(graphql_errors, indent=4))

print(json.dumps(data, indent=4))


{
    "data": {
        "repos": {
            "repositoryCount": 132,
            "edges": [
                {
                    "node": {
                        "name": "llama2.c",
                        "description": "Inference Llama 2 in one file of pure C",
                        "url": "https://github.com/karpathy/llama2.c",
                        "stargazers": {
                            "totalCount": 8452
                        },
                        "createdAt": "2023-07-23T05:15:06Z",
                        "updatedAt": "2023-07-27T10:05:45Z",
                        "diskUsage": 614,
                        "forkCount": 610,
                        "isArchived": false,
                        "isFork": false,
                        "isMirror": false,
                        "isPrivate": false,
                        "languages": {
                            "nodes": [
                                {
                                    "name": "Python"
   

In [3]:
# Get the list of repository edges returned by the search
repos_list = data['data']['repos']['edges']
# Each edge wraps the repository under its 'node' key; unwrap into a plain list of dicts
node_list = [repo['node'] for repo in repos_list]
if not node_list:
    # An empty result would otherwise surface as an opaque KeyError on the
    # numeric columns below.
    raise ValueError("GitHub search returned no repositories; nothing to tabulate")

# Flatten nested objects into dotted column names (e.g. 'stargazers.totalCount').
# pandas.json_normalize is the supported replacement for the deprecated
# pandas.io.json.json_normalize. No record_path is used here, so meta= and
# errors= are not passed: pandas only consults them when a record_path is given.
df = pd.json_normalize(node_list)

# Convert numeric columns (values that cannot be parsed become NaN)
numeric_cols = ['stargazers.totalCount', 'diskUsage', 'forkCount',
                'pullRequests.totalCount', 'issues.totalCount', 'watchers.totalCount']
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Print a compact preview of the most informative columns
print(df[['name', 'primaryLanguage.name', 'stargazers.totalCount', 'createdAt', 'description']])


                        name primaryLanguage.name  stargazers.totalCount  \
0                   llama2.c               Python                   8452   
1             python-mastery               Python                   6868   
2                    MetaGPT               Python                   6437   
3        gpt-prompt-engineer     Jupyter Notebook                   4128   
4                   InternLM               Python                   2278   
..                       ...                  ...                    ...   
95               RedCloud-OS                Shell                    254   
96          LLM-Reading-List                  NaN                    254   
97            prompts-royale                  Vue                    253   
98                  LLMDrift     Jupyter Notebook                    253   
99  flipper-zero-evil-portal                 HTML                    245   

               createdAt                                        description  
0   2023-

In [4]:
# Persist the flattened repository table to repo_data.csv (relative to the
# current working directory), without writing the DataFrame index as a column
df.to_csv('repo_data.csv', index=False)

In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# arXiv API endpoint and search settings
base_url = "http://export.arxiv.org/api/query?"
search_query = "all:electron"  # replace with your query
start = 0  # start at the beginning
total_results = 2000  # maximum results is 2000

# Let requests build and URL-encode the query string so that search terms with
# spaces or other special characters are transmitted correctly. `start` is now
# actually sent instead of being defined and ignored.
query_params = {
    "search_query": search_query,
    "start": start,
    "max_results": total_results,
    "sortBy": "submittedDate",
    "sortOrder": "descending",
}

# Time out rather than hang, and stop on HTTP errors instead of parsing an
# error page as if it were a result feed.
response = requests.get(base_url, params=query_params, timeout=60)
response.raise_for_status()
url = response.url  # the final request URL, kept for reference/debugging

# Parse the Atom feed using BeautifulSoup's XML parser
soup = BeautifulSoup(response.content, 'xml')

# Extract relevant information for each entry (modify this as needed)
entries = []
for entry in soup.find_all('entry'):
    entries.append({
        'title': entry.title.text,
        'summary': entry.summary.text,
        # The entry <id> holds the paper's abs URL; it is read straight into
        # the record so the request `url` above is not overwritten.
        'url': entry.id.text,
        'published': entry.published.text,
        # author.find('name') is required because `.name` on a BeautifulSoup
        # tag is the tag's own name, not its <name> child.
        'authors': [author.find('name').text for author in entry.find_all('author')],
    })

# Convert list of entries to pandas DataFrame
papers_df = pd.DataFrame(entries)
papers_df.head()


Unnamed: 0,title,summary,url,published,authors
0,Generation and Life Cycle of Solar Spicules,Physical mechanism for the creation of solar...,http://arxiv.org/abs/2307.14328v1,2023-07-26T17:46:56Z,"[Hamid Saleem, Zain H. Saleem]"
1,A new Low Gain Avalanche Diode concept: the do...,This paper describes the new concept of the ...,http://arxiv.org/abs/2307.14320v1,2023-07-26T17:37:24Z,"[F. Carnesecchi, S. Strazzi, A. Alici, R. Arci..."
2,Non-chiral one-dimensional sates propagating i...,Boundaries between structural twins of bilay...,http://arxiv.org/abs/2307.14293v1,2023-07-26T16:49:44Z,"[V. V. Enaldiev, C. Moulsdale, A. K. Geim, V. ..."
3,High-speed plasma measurements with a plasma i...,Plasma impedance probes (PIPs) are a type of...,http://arxiv.org/abs/2307.14263v1,2023-07-26T15:50:47Z,"[John W. Brooks, Erik M. Tejero, Matthew C. Pa..."
4,Floquet engineering of the Lifshitz phase tran...,Within the Floquet theory of periodically dr...,http://arxiv.org/abs/2307.14258v1,2023-07-26T15:42:26Z,"[I. V. Iorsh, D. D. Sedov, S. A. Kolodny, R. E..."


In [6]:
# Summarize the columns, their non-null counts and dtypes to check that every
# field was populated for each fetched paper
papers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      2000 non-null   object
 1   summary    2000 non-null   object
 2   url        2000 non-null   object
 3   published  2000 non-null   object
 4   authors    2000 non-null   object
dtypes: object(5)
memory usage: 78.2+ KB


In [7]:
# Show the last five rows; results are requested in descending submission
# order, so these are the oldest papers in the fetched set
papers_df.tail(5)

Unnamed: 0,title,summary,url,published,authors
1995,On the capillary discharge in the high repetit...,We investigate the main physical processes t...,http://arxiv.org/abs/2305.19679v1,2023-05-31T09:20:31Z,"[P. Sasorov, G. Bagdasarov, N. Bobrova, G. Gri..."
1996,Implementation of the SCAN Exchange-Correlatio...,Kohn-Sham density functional theory (DFT) is...,http://arxiv.org/abs/2305.19662v1,2023-05-31T08:58:10Z,"[Renxi Liu, Daye Zheng, Xinyuan Liang, Xinguo ..."
1997,The Ferris ferromagnetic resonance technique: ...,Measurements of ferromagnetic resonance (FMR...,http://arxiv.org/abs/2306.01783v1,2023-05-31T08:48:29Z,"[Amit Rothschild, Benjamin Assouline, Nadav Am..."
1998,Novel slow dynamics of phase transition in the...,DyRu2Si2 is a frustrated magnet to exhibit m...,http://arxiv.org/abs/2305.19656v1,2023-05-31T08:41:03Z,"[Subaru Yoshimoto, Yoshikazu Tabata, Takeshi W..."
1999,Equivalence between State-Space Stability Anal...,Stability of power electronic converters con...,http://arxiv.org/abs/2305.19655v1,2023-05-31T08:40:47Z,"[Pablo Rodriguez-Ortega, Javier Roldan-Perez, ..."


In [8]:
# Show the first five rows (the most recently submitted papers in the fetched set)
papers_df.head(5)

Unnamed: 0,title,summary,url,published,authors
0,Generation and Life Cycle of Solar Spicules,Physical mechanism for the creation of solar...,http://arxiv.org/abs/2307.14328v1,2023-07-26T17:46:56Z,"[Hamid Saleem, Zain H. Saleem]"
1,A new Low Gain Avalanche Diode concept: the do...,This paper describes the new concept of the ...,http://arxiv.org/abs/2307.14320v1,2023-07-26T17:37:24Z,"[F. Carnesecchi, S. Strazzi, A. Alici, R. Arci..."
2,Non-chiral one-dimensional sates propagating i...,Boundaries between structural twins of bilay...,http://arxiv.org/abs/2307.14293v1,2023-07-26T16:49:44Z,"[V. V. Enaldiev, C. Moulsdale, A. K. Geim, V. ..."
3,High-speed plasma measurements with a plasma i...,Plasma impedance probes (PIPs) are a type of...,http://arxiv.org/abs/2307.14263v1,2023-07-26T15:50:47Z,"[John W. Brooks, Erik M. Tejero, Matthew C. Pa..."
4,Floquet engineering of the Lifshitz phase tran...,Within the Floquet theory of periodically dr...,http://arxiv.org/abs/2307.14258v1,2023-07-26T15:42:26Z,"[I. V. Iorsh, D. D. Sedov, S. A. Kolodny, R. E..."


In [9]:
# Persist the papers table to papers_data.csv (relative to the current working
# directory), without writing the DataFrame index as a column
papers_df.to_csv("papers_data.csv", index=False)