In [None]:
import os
import logging
import pymongo
import multiprocessing
import pandas as pd
from pprint import pprint
from collections import Counter, defaultdict
from queue import Queue
from typing import List, Set
import json
from github import Github

# MongoDB connection string used by the selection helpers in this cell.
MONGO_URL = "mongodb://127.0.0.1:27017"


def select_projects_from_libraries_io() -> pd.DataFrame:
    """Select a project dataframe as our research subject.

    Reads every document of ``libraries.repositories`` whose host type is
    GitHub, whose ``Fork`` field equals the string ``"false"``, whose language
    is Python and whose star count is greater than 10.  Bookkeeping columns
    are dropped, a short preview is printed, and the remaining records are
    inserted into ``migration_helper_py.lioProject``.

    Returns:
        The trimmed project DataFrame.  When the query matches nothing an
        empty DataFrame is returned and nothing is inserted.
    """
    selection = {
        "Host Type": "GitHub",
        # The filter compares against the *string* "false", not a boolean.
        "Fork": "false",
        "Language": "Python",
        "Stars Count": {"$gt": 10},
    }
    unused_columns = [
        '_id', 'Description', 'Issues enabled', 'Wiki enabled',
        'Pages enabled', 'Forks Count', 'Mirror URL', 'Default branch',
        'Watchers Count', 'UUID', 'Fork Source Name with Owner', 'SCM type',
        'Pull requests enabled', 'Logo URL',
    ]

    # Use the client as a context manager so its connections are released
    # when the function returns (the original never closed the client).
    with pymongo.MongoClient(MONGO_URL) as client:
        records = list(client.libraries.repositories.find(selection))
        if not records:
            # An empty frame has no columns, so drop() would raise KeyError
            # and insert_many() rejects an empty document list.
            logging.debug(
                "0 non-fork GitHub Python projects with stars > 10")
            return pd.DataFrame()

        projects = pd.DataFrame(records).drop(columns=unused_columns)
        print(projects.head(5))
        print(list(projects))
        client.migration_helper_py.lioProject.insert_many(
            projects.to_dict(orient='records'))

    logging.debug(
        f"{len(projects)} non-fork GitHub Python projects with stars > 10")
    return projects


def select_libraries_from_libraries_io() -> pd.DataFrame:
    """Select a library dataframe as our research subject.

    Reads every document of ``libraries.projects`` whose platform is ``Pypi``
    and whose dependent-repository count is greater than 10.  Bookkeeping
    columns are dropped, a short preview is printed, and the remaining
    records are inserted into ``migration_helper_py.lioRepository``.

    Returns:
        The trimmed library DataFrame.  When the query matches nothing an
        empty DataFrame is returned and nothing is inserted.
    """
    selection = {
        "Platform": "Pypi",
        "Dependent Repositories Count": {"$gt": 10},
    }
    unused_columns = [
        "_id", "Description", "Keywords", "Dependent Projects Count",
        "Last synced Timestamp", "Homepage URL", "Repository URL", "Status",
        "Package Manager ID",
    ]

    # Use the client as a context manager so its connections are released
    # when the function returns (the original never closed the client).
    with pymongo.MongoClient(MONGO_URL) as client:
        records = list(client.libraries.projects.find(selection))
        if not records:
            # An empty frame has no columns, so drop() would raise KeyError
            # and insert_many() rejects an empty document list.
            logging.debug("0 libraries with dependent repository count > 10")
            return pd.DataFrame()

        libraries = pd.DataFrame(records).drop(columns=unused_columns)
        print(list(libraries))
        print(libraries.head(5))
        # NOTE(review): libraries are stored in a collection named
        # "lioRepository" while projects go to "lioProject" — confirm the
        # naming is intentional.
        client.migration_helper_py.lioRepository.insert_many(
            libraries.to_dict(orient='records'))

    logging.debug(
        f"{len(libraries)} libraries with dependent repository count > 10")
    return libraries



if __name__ == "__main__":
    # The selection helpers above are intentionally not run here.  To run
    # them manually, enable logging.basicConfig(level=logging.DEBUG) and call
    # select_libraries_from_libraries_io() / select_projects_from_libraries_io().

    # Smoke test of the GitHub API: fetch the README of PyGithub/PyGithub.
    # The token is read from the GITHUB_TOKEN environment variable instead of
    # being hardcoded; when it is unset the client is created without
    # credentials.  SECURITY: the token that used to be committed on this
    # line must be treated as leaked and revoked.
    g = Github(os.environ.get("GITHUB_TOKEN"))
    repo = g.get_repo("PyGithub/PyGithub")
    contents = repo.get_contents("README.md")


In [None]:
# List the collections of the ``libraries`` database.
# list_collection_names() is a Database method; the original called it on the
# ``libraries.projects`` Collection, where the attribute lookup resolves to a
# sub-collection and calling it raises TypeError.
db = pymongo.MongoClient(MONGO_URL).libraries
print(db.list_collection_names())


In [None]:
from github import Github
import github
import pymongo
import re

import os

# GitHub API token, read from the GITHUB_TOKEN environment variable instead of
# being hardcoded (None when unset).  SECURITY: the token that used to be
# committed on this line must be treated as leaked and revoked.
access_token = os.environ.get("GITHUB_TOKEN")

# First version-specifier operator in a requirement ("===" is listed before
# "==" so the longer operator wins).
_SPECIFIER_OPERATOR = re.compile(r"===|==|~=|!=|<=|>=|<|>")
# A "#" at the start of the line or preceded by whitespace starts a comment.
_INLINE_COMMENT = re.compile(r"(^|\s)#.*$")


def _parse_requirement_line(line):
    """Parse one requirements.txt line into ``{'name', 'version'}`` or None.

    Returns None for blank lines, comment lines, option lines (starting with
    "-", e.g. ``-r other.txt``) and entries that are skipped on purpose.
    """
    requirement = _INLINE_COMMENT.sub("", line)
    # Drop the environment marker first so a "<" or ">" inside it is not
    # mistaken for a version specifier.
    requirement = requirement.split(';', 1)[0].strip()
    if not requirement or requirement.startswith('-'):
        return None

    # Split at the *first* operator, whichever it is, so that "foo>=1,<2"
    # yields the name "foo" rather than "foo>=1,".
    operator = _SPECIFIER_OPERATOR.search(requirement)
    if operator is None:
        name, version = requirement, None
    else:
        name = requirement[:operator.start()].strip()
        version = requirement[operator.start():].replace("'", "").strip()
        if operator.group() == '==':
            # Exact pins are stored without the operator, every other
            # specifier keeps it (same convention as the original code).
            version = version[2:].strip()

    # NOTE(review): an entry literally named 'Deprecated' was skipped by the
    # original code; kept as is — confirm the intent.
    if name == '' or name == 'Deprecated':
        return None
    return {'name': name, 'version': version}


def getRequirements(namewithowner) -> list:
    """Fetch and parse ``requirements.txt`` of a GitHub repository.

    Args:
        namewithowner: repository identifier passed to ``Github.get_repo``
            (the callers pass the "Name with Owner" field, e.g. "owner/repo").

    Returns:
        A list of ``{'name': ..., 'version': ...}`` dicts, one per dependency;
        ``version`` is None when the line has no specifier.  An empty list
        when the repository has no ``requirements.txt``.  None when the
        repository itself cannot be fetched.
    """
    g = Github(access_token)
    try:
        repo = g.get_repo(namewithowner)
    except github.GithubException:
        # The repository cannot be fetched: report "unknown" instead of
        # aborting the whole crawl.
        return None

    try:
        contents_byte = repo.get_contents("requirements.txt").decoded_content
    except github.GithubException:
        return []

    # errors='replace' keeps one badly encoded byte from aborting the crawl;
    # splitlines() also handles Windows line endings.
    contents_str = contents_byte.decode('utf8', errors='replace')
    dependencies = []
    for line in contents_str.splitlines():
        parsed = _parse_requirement_line(line)
        if parsed is not None:
            dependencies.append(parsed)
    return dependencies

# Fetch the requirements of every project in lioProject (most-starred first)
# and store them in the project's 'Dependencies' field.
MONGO_URL = "mongodb://127.0.0.1:27017"
db = pymongo.MongoClient(MONGO_URL).migration_helper_py
# sort expects (key, direction) tuples.  The original passed a set literal
# {"Stars Count", DESCENDING}, whose iteration order is arbitrary, so key and
# direction could be swapped.
projects = list(db.lioProject.find({}, sort=[("Stars Count", pymongo.DESCENDING)]))
print(len(projects))
for i, project in enumerate(projects, start=1):
    name_with_owner = project['Name with Owner']
    dependencies = getRequirements(name_with_owner)
    db.lioProject.update_one({'Name with Owner': name_with_owner},{'$set':{'Dependencies': dependencies}})
    # Progress report every 100 projects.
    if i % 100 == 0:
        print(i)

In [21]:
import re
# Helpers for processing dependency version information.

# Compiled once at cell execution; appears to be the canonical-form version
# regex published with PEP 440 — TODO confirm.
_CANONICAL_VERSION = re.compile(r'^([1-9][0-9]*!)?(0|[1-9][0-9]*)(\.(0|[1-9][0-9]*))*((a|b|rc)(0|[1-9][0-9]*))?(\.post(0|[1-9][0-9]*))?(\.dev(0|[1-9][0-9]*))?$')


def is_canonical(version):
    """Return True when ``version`` matches the canonical version pattern.

    The accepted shape is: optional ``N!`` epoch, dotted release numbers
    without leading zeros, optional ``aN``/``bN``/``rcN`` pre-release,
    optional ``.postN`` and optional ``.devN``.
    """
    matched = _CANONICAL_VERSION.match(version)
    return bool(matched)


DEPENDENCY_PATTERN = r"""
    ^\s*
    (?P<package_name>\w+)
    
    \s*
    (?P<specifier>~=|==|!=|>=|<=|<|>|===)
    \s*
    v?
    (
        (?:(?:[0-9]+)!)?                           # epoch
        (?:[0-9]+(?:\.[0-9]+)*)                  # release segment
        (?:                                          # pre-release
            [-_\.]?
            (?:a|b|c|rc|alpha|beta|pre|preview)
            [-_\.]?
            (?:[0-9]+)?
        )?
        (?:                                         # post release
            (?:-(?:[0-9]+))
            |
            (?:
                [-_\.]?
                (?:post|rev|r)
                [-_\.]?
                (?:[0-9]+)?
            )
        )?
        (?:                                          # dev release
            [-_\.]?
            (?:dev)
            [-_\.]?
            (?:[0-9]+)?
        )?
    )
    (?:\+(?:[a-z0-9]+(?:[-_\.][a-z0-9]+)*))?       # local version
    (?:
        \s*,\s*
        (~=|==|!=|>=|<=|<|>|===)
        \s*
        v?
        (
            (?:(?:[0-9]+)!)?                           # epoch
            (?:[0-9]+(?:\.[0-9]+)*)                  # release segment
            (?:                                          # pre-release
                [-_\.]?
                (?:a|b|c|rc|alpha|beta|pre|preview)
                [-_\.]?
                (?:[0-9]+)?
            )?
            (?:                                         # post release
                (?:-(?:[0-9]+))
                |
                (?:
                    [-_\.]?
                    (?:post|rev|r)
                    [-_\.]?
                    (?:[0-9]+)?
                )
            )?
            (?:                                          # dev release
                [-_\.]?
                (?:dev)
                [-_\.]?
                (?:[0-9]+)?
            )?
        )
        (?:\+(?:[a-z0-9]+(?:[-_\.][a-z0-9]+)*))?       # local version
    )?
    (?:
        \s*,\s*
        (~=|==|!=|>=|<=|<|>|===)
        \s*
        v?
        (
            (?:(?:[0-9]+)!)?                           # epoch
            (?:[0-9]+(?:\.[0-9]+)*)                  # release segment
            (?:                                          # pre-release
                [-_\.]?
                (?:a|b|c|rc|alpha|beta|pre|preview)
                [-_\.]?
                (?:[0-9]+)?
            )?
            (?:                                         # post release
                (?:-(?:[0-9]+))
                |
                (?:
                    [-_\.]?
                    (?:post|rev|r)
                    [-_\.]?
                    (?:[0-9]+)?
                )
            )?
            (?:                                          # dev release
                [-_\.]?
                (?:dev)
                [-_\.]?
                (?:[0-9]+)?
            )?
        )
        (?:\+(?:[a-z0-9]+(?:[-_\.][a-z0-9]+)*))?       # local version
    )
    
"""
_regex = re.compile(
    r"^\s*" + DEPENDENCY_PATTERN + r"\s*;.*$",
    re.VERBOSE | re.IGNORECASE,
)
print(_regex.search("project >= 909!1.2.dev1 , <2.0.dev1 , !=1.0;ply").groups())


('project', '>=', '909!1.2.dev1', '<', '2.0.dev1', '!=', '1.0')


In [23]:
import pymongo
# Count how many projects have no fetched dependency data (None), an empty
# dependency list, or at least one dependency; projects of the last kind are
# copied into the ProjectwithRequirements collection.
MONGO_URL = "mongodb://127.0.0.1:27017"
db = pymongo.MongoClient(MONGO_URL).migration_helper_py
# sort expects (key, direction) tuples.  The original passed a set literal
# {"Stars Count", DESCENDING}, whose iteration order is arbitrary, so key and
# direction could be swapped.
projects = list(db.lioProject.find({}, sort=[("Stars Count", pymongo.DESCENDING)]))
total = len(projects)
no_github_repo = 0
no_requirements = 0
with_requirements = 0
for i, project in enumerate(projects, start=1):
    dependencies = project['Dependencies']
    if dependencies is None:
        no_github_repo += 1
    elif len(dependencies) == 0:
        no_requirements += 1
    else:
        with_requirements += 1
        # Upsert by _id so re-running the cell does not fail with a
        # duplicate-key error (insert_one did on the second run).
        db.ProjectwithRequirements.replace_one({'_id': project['_id']}, project, upsert=True)
    # Progress report every 10000 projects.
    if i % 10000 == 0:
        print(i)
print(f"total: {total}")
print(f'no_github_repo: {no_github_repo}')
print(f'no_requirements: {no_requirements}')
# This counter was computed but never reported.
print(f'with_requirements: {with_requirements}')

10000
20000
30000
40000
50000
60000
70000
80000
total: 82547
no_github_repo: 27131
no_requirements: 41137


In [24]:
from github import Github
import pkg_resources
import github
import pymongo
import re
from dependency import Dependency
import os

# GitHub API token, read from the GITHUB_TOKEN environment variable instead of
# being hardcoded (None when unset).  SECURITY: the token that used to be
# committed on this line must be treated as leaked and revoked.
access_token = os.environ.get("GITHUB_TOKEN")


def getRequirements(namewithowner) -> list:
    """Fetch ``requirements.txt`` of a GitHub repository and parse it.

    Parsing is delegated to ``Dependency.parse_requirements``; every parsed
    item is converted with ``to_dict()``.

    Args:
        namewithowner: repository identifier passed to ``Github.get_repo``
            (the callers pass the "Name with Owner" field, e.g. "owner/repo").

    Returns:
        A list of dependency dicts; an empty list when the repository has no
        ``requirements.txt``; None when the repository cannot be fetched (its
        name is printed) or when the file cannot be decoded or parsed.

    NOTE(review): network failures such as connection timeouts are not caught
    here and abort the calling loop.
    """
    g = Github(access_token)
    try:
        repo = g.get_repo(namewithowner)
    except github.GithubException:
        print(namewithowner)
        return None

    try:
        contents_byte = repo.get_contents("requirements.txt").decoded_content
    except github.GithubException:
        # No requirements.txt (or it cannot be read through the API).
        return []

    try:
        contents_str = contents_byte.decode('utf8')
        return [x.to_dict() for x in Dependency.parse_requirements(contents_str)]
    except (UnicodeDecodeError, pkg_resources.packaging.requirements.InvalidRequirement):
        # An undecodable file is treated like an unparsable one instead of
        # crashing the crawl.
        return None

# Re-fetch the requirements of projects already known to have some and store
# the parsed result in the 'Dependencies' field of lioProject.
MONGO_URL = "mongodb://127.0.0.1:27017"
db = pymongo.MongoClient(MONGO_URL).migration_helper_py
# Number of projects to process.  1 reproduces the single-project trial run
# that the original obtained with a stray `break` (which also made the
# progress report below unreachable); set to None to process every project.
PROJECT_LIMIT = 1
# sort expects (key, direction) tuples.  The original passed a set literal
# {"Stars Count", DESCENDING}, whose iteration order is arbitrary, so key and
# direction could be swapped.
projects = list(db.ProjectwithRequirements.find({}, sort=[("Stars Count", pymongo.DESCENDING)]))
# Alternative source: every project, not only those with requirements.
# projects = list(db.lioProject.find({}, sort=[("Stars Count", pymongo.DESCENDING)]))
print(len(projects))
for i, project in enumerate(projects[:PROJECT_LIMIT], start=1):
    name_with_owner = project['Name with Owner']
    dependencies = getRequirements(name_with_owner)
    # NOTE(review): projects are read from ProjectwithRequirements but the
    # update is written to lioProject — confirm this is intended.
    db.lioProject.update_one({'Name with Owner': name_with_owner},{'$set':{'Dependencies': dependencies}})
    # Progress report every 100 projects.
    if i % 100 == 0:
        print(i)

14279


ConnectTimeout: HTTPSConnectionPool(host='api.github.com', port=443): Max retries exceeded with url: /repos/vinta/awesome-python (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7fb44bf6de80>, 'Connection to api.github.com timed out. (connect timeout=15)'))