In [None]:
import os
import logging
import pymongo
import multiprocessing
import pandas as pd
from pprint import pprint
from collections import Counter, defaultdict
from queue import Queue
from typing import List, Set
import json
from github import Github

MONGO_URL = "mongodb://127.0.0.1:27017"


def select_projects_from_libraries_io() -> pd.DataFrame:
    """Select a project dataframe as our research subject"""
    db = pymongo.MongoClient(MONGO_URL)

    projects = pd.DataFrame(list(db.libraries.repositories.find({
        "Host Type": "GitHub",
        "Fork": "false",
        "Language": "Python",
        "Stars Count": {"$gt": 10},
    })))

    projects = projects.drop(columns=['_id', 'Description', 'Issues enabled', 'Wiki enabled', 'Pages enabled', 'Forks Count', 'Mirror URL', 'Default branch', 'Watchers Count', 'UUID', 'Fork Source Name with Owner', 'SCM type', 'Pull requests enabled', 'Logo URL'])
    print(projects.head(5))
    print(list(projects))
    db.migration_helper_py.lioProject.insert_many(projects.to_dict(orient='records'))
    logging.debug(
        f"{len(projects)} non-fork GitHub Python projects with stars > 10")
    return projects


def select_libraries_from_libraries_io() -> pd.DataFrame:
    """Select a library dataframe as our research subject"""
    db = pymongo.MongoClient(MONGO_URL)
    libraries = pd.DataFrame(list(db.libraries.projects.find({
        "Platform": "Pypi",
        "Dependent Repositories Count": {"$gt": 10}
    })))

#     print(libraries.head(5))
    libraries = libraries.drop(columns=["_id", "Description", "Keywords", "Dependent Projects Count", "Last synced Timestamp", "Homepage URL", "Repository URL", "Status", "Package Manager ID"])
    print(list(libraries))
    print(libraries.head(5))
    db.migration_helper_py.lioRepository.insert_many(libraries.to_dict(orient='records'))
    logging.debug(
        f"{len(libraries)} libraries with dependent repository count > 10")
    return libraries



if __name__ == "__main__":
#     logging.basicConfig(level=logging.DEBUG)
#     select_libraries_from_libraries_io()
#     select_projects_from_libraries_io()
#     print(select_libraries_from_libraries_io())
#     print(select_projects_from_libraries_io())
    g = Github("06bf70084bea39f1c23cf2f0a9f89045f5c27d72")
    repo = g.get_repo("PyGithub/PyGithub")
    contents = repo.get_contents("README.md")


In [None]:
db = pymongo.MongoClient(MONGO_URL).libraries.projects
print(db.list_collection_names())


In [None]:
from github import Github
import github
import pymongo
import re

access_token = "06bf70084bea39f1c23cf2f0a9f89045f5c27d72"
def getRequirements(namewithowner)->str:
    g = Github(access_token)
    repo = g.get_repo(namewithowner)
    dependencies = []
    try:
        contents_byte = repo.get_contents("requirements.txt").decoded_content
        contents_str = str(contents_byte, 'utf8')
        raw_dependencies = contents_str.split('\n')
        for lib in raw_dependencies:
            name, version = None, None
            if lib.find('<') != -1:
                mid = lib.find('<')
                name, version = lib[:mid].split(';')[0], lib[mid:].replace("'", "")
            elif lib.find('>') != -1:
                mid = lib.find('>')
                name, version = lib[:mid].split(';')[0], lib[mid:].replace("'", "")
            elif lib.find('==') != -1:
                mid = lib.find('==')
                name, version = lib[:mid].split(';')[0], lib[mid + 2:].replace("'", "")
            else:
                name = lib
                version = None
            if name == '' or name == 'Deprecated':
                continue
            dependencies.append({'name': name, 'version': version})            
    except github.GithubException:
        return dependencies
    return  dependencies

MONGO_URL = "mongodb://127.0.0.1:27017"
db = pymongo.MongoClient(MONGO_URL).migration_helper_py
projects = list(db.lioProject.find({}, sort=[{"Stars Count", pymongo.DESCENDING}]))
print(len(projects))
i = 0
for project in projects:
    name_with_owner = project['Name with Owner']
    dependencies = getRequirements(name_with_owner)
    db.lioProject.update_one({'Name with Owner': name_with_owner},{'$set':{'Dependencies': dependencies}})
    i += 1
    if i % 100 == 0:
        print(i)

In [21]:
import re
'''
处理依赖版本信息
'''
def is_canonical(version):
    return re.match(r'^([1-9][0-9]*!)?(0|[1-9][0-9]*)(\.(0|[1-9][0-9]*))*((a|b|rc)(0|[1-9][0-9]*))?(\.post(0|[1-9][0-9]*))?(\.dev(0|[1-9][0-9]*))?$', version) is not None


DEPENDENCY_PATTERN = r"""
    ^\s*
    (?P<package_name>\w+)
    
    \s*
    (?P<specifier>~=|==|!=|>=|<=|<|>|===)
    \s*
    v?
    (
        (?:(?:[0-9]+)!)?                           # epoch
        (?:[0-9]+(?:\.[0-9]+)*)                  # release segment
        (?:                                          # pre-release
            [-_\.]?
            (?:a|b|c|rc|alpha|beta|pre|preview)
            [-_\.]?
            (?:[0-9]+)?
        )?
        (?:                                         # post release
            (?:-(?:[0-9]+))
            |
            (?:
                [-_\.]?
                (?:post|rev|r)
                [-_\.]?
                (?:[0-9]+)?
            )
        )?
        (?:                                          # dev release
            [-_\.]?
            (?:dev)
            [-_\.]?
            (?:[0-9]+)?
        )?
    )
    (?:\+(?:[a-z0-9]+(?:[-_\.][a-z0-9]+)*))?       # local version
    (?:
        \s*,\s*
        (~=|==|!=|>=|<=|<|>|===)
        \s*
        v?
        (
            (?:(?:[0-9]+)!)?                           # epoch
            (?:[0-9]+(?:\.[0-9]+)*)                  # release segment
            (?:                                          # pre-release
                [-_\.]?
                (?:a|b|c|rc|alpha|beta|pre|preview)
                [-_\.]?
                (?:[0-9]+)?
            )?
            (?:                                         # post release
                (?:-(?:[0-9]+))
                |
                (?:
                    [-_\.]?
                    (?:post|rev|r)
                    [-_\.]?
                    (?:[0-9]+)?
                )
            )?
            (?:                                          # dev release
                [-_\.]?
                (?:dev)
                [-_\.]?
                (?:[0-9]+)?
            )?
        )
        (?:\+(?:[a-z0-9]+(?:[-_\.][a-z0-9]+)*))?       # local version
    )?
    (?:
        \s*,\s*
        (~=|==|!=|>=|<=|<|>|===)
        \s*
        v?
        (
            (?:(?:[0-9]+)!)?                           # epoch
            (?:[0-9]+(?:\.[0-9]+)*)                  # release segment
            (?:                                          # pre-release
                [-_\.]?
                (?:a|b|c|rc|alpha|beta|pre|preview)
                [-_\.]?
                (?:[0-9]+)?
            )?
            (?:                                         # post release
                (?:-(?:[0-9]+))
                |
                (?:
                    [-_\.]?
                    (?:post|rev|r)
                    [-_\.]?
                    (?:[0-9]+)?
                )
            )?
            (?:                                          # dev release
                [-_\.]?
                (?:dev)
                [-_\.]?
                (?:[0-9]+)?
            )?
        )
        (?:\+(?:[a-z0-9]+(?:[-_\.][a-z0-9]+)*))?       # local version
    )
    
"""
_regex = re.compile(
    r"^\s*" + DEPENDENCY_PATTERN + r"\s*;.*$",
    re.VERBOSE | re.IGNORECASE,
)
print(_regex.search("project >= 909!1.2.dev1 , <2.0.dev1 , !=1.0;ply").groups())


('project', '>=', '909!1.2.dev1', '<', '2.0.dev1', '!=', '1.0')


In [11]:
import pymongo
MONGO_URL = "mongodb://127.0.0.1:27017"
db = pymongo.MongoClient(MONGO_URL).migration_helper_py
projects = list(db.lioProject.find({}, sort=[{"Stars Count", pymongo.DESCENDING}]))
total = len(projects)
no_github_repo = 0
no_requirements = 0
with_requirements = 0
i = 0
for project in projects:
    i += 1
    dependencies = project['Dependencies']
    if dependencies is None:
        no_github_repo += 1
    elif len(dependencies) == 0:
        no_requirements += 1
    else:
        with_requirements += 1
        db.ProjectwithRequirements.insert_one(project)
    if i % 10000 == 0:
        print(i)
print(f"total: {total}")
print(f'no_github_repo: {no_github_repo}')
print(f'no_requirements: {no_requirements}')

TypeError: first item in each key pair must be a string

In [13]:
from github import Github
import pkg_resources
import github
import pymongo
import numpy as np
import re
import pandas as pd
from dependency import Dependency
access_token = "06bf70084bea39f1c23cf2f0a9f89045f5c27d72"
def getRequirements(namewithowner)->str:
    g = Github(access_token)
    try:
        repo = g.get_repo(namewithowner)
    except github.GithubException:
        print(namewithowner)
        return None
    dependencies = []
    try:
        contents_byte = repo.get_contents("requirements.txt").decoded_content
        contents_str = str(contents_byte, 'utf8')
        dependencies = [x.to_dict() for x in Dependency.parse_requirements(contents_str)]
    except github.GithubException:
        return dependencies
    except pkg_resources.packaging.requirements.InvalidRequirement:
        return None
    except:
        print(namewithowner)
        raise IndexError
    return  dependencies

MONGO_URL = "mongodb://127.0.0.1:27017"
db = pymongo.MongoClient(MONGO_URL).migration_helper_py
projects = list(db.ProjectwithRequirements.find({}, sort=[{"Stars Count", pymongo.DESCENDING}]))
# projects = list(db.lioProject.find({}, sort=[{"Stars Count", pymongo.DESCENDING}]))
print(len(projects))
i = 0
pName = pd.DataFrame(columns=['Name with Owner'])
for project in projects:
    pName = pName.append({'Name with Owner':project['Name with Owner']}, ignore_index=True)
    
#     dependencies = getRequirements(name_with_owner)
#     db.lioProject.update_one({'Name with Owner': name_with_owner},{'$set':{'Dependencies': dependencies}})
#     i += 1
#     if i % 100 == 0:
#         print(i)
pName.to_csv('projects.csv', index=False)
print(pName)

14279
                           Name with Owner
0                     vinta/awesome-python
1                     TheAlgorithms/Python
2                             nvbn/thefuck
3                         keras-team/keras
4                          ansible/ansible
...                                    ...
14274           Crisimple/InterfaceTesting
14275       Deerhound579/mcgill-course-map
14276                    kirarpit/connect4
14277  KimJeongSun/SpecAugment_numpy_scipy
14278              Remi-Gau/COBIDAS_chckls

[14279 rows x 1 columns]


In [1]:
import os
from pathos.pools import ProcessPool
import pymongo
from tqdm import tqdm
import time
MONGO_URL = "mongodb://127.0.0.1:27017"
db = pymongo.MongoClient(MONGO_URL).migration_helper_py
projects = list(db.ProjectwithRequirements.find({}, sort=[{"Stars Count", pymongo.DESCENDING}]))
print(len(projects))
i = 0

def clone_repo(project):
    name_with_owner = project['Name with Owner']
    cmd = "cd repos && git clone https://github.com/{}.git".format(name_with_owner)
    print("Starting to clone {}".format(name_with_owner))
    print("Running command '{}'".format(cmd))
    os.system(cmd)
    print("Finshed cloning {}".format(name_with_owner))
    print("#####################################")
    print("")

def parallel(func, *args):
    pool = ProcessPool(96)
    try:
        start = time.time()
        # imap方法
        with tqdm(total=len(args[0]), desc="计算进度") as t:  # 进度条设置
            r = pd.DataFrame()
            for i in pool.imap(func, *args):
                r = r.append(i, ignore_index=True)
                t.set_postfix({'并行函数': func.__name__, "计算花销": "%ds" % (time.time() - start)})
                t.update()
        return r
    except Exception as e:
        print(e)
    finally:
        # 关闭池
        pool.close()  # close the pool to any new jobs
        pool.join()  # cleanup the closed worker processes
        pool.clear()  # Remove server with matching state


# parallel(clone_repo, projects[0:5])
for project in projects:
    name_with_owner = project['Name with Owner']
    clone_repo(project)
    i += 1
    if i == 5:
        break
        


14279


TypeError: string indices must be integers

In [15]:
import os
from pathos.pools import ProcessPool
from tqdm import tqdm
import time
import re
import subprocess
import datetime
from dependency import Dependency
import pandas as pd
import warnings
import numpy as np

# warnings.filterwarnings("ignore")
cope_repos = set()

def parallel(func, *args):
    pool = ProcessPool(96)
    try:
        start = time.time()
        # imap方法
        with tqdm(total=len(args[0]), desc="计算进度") as t:  # 进度条设置
            r = pd.DataFrame()
            for i in pool.imap(func, *args):
                r = r.append(i, ignore_index=True)
                t.set_postfix({'并行函数': func.__name__, "计算花销": "%ds" % (time.time() - start)})
                t.update()
        return r
    except Exception as e:
        print(e)
    finally:
        # 关闭池
        pool.close()  # close the pool to any new jobs
        pool.join()  # cleanup the closed worker processes
        pool.clear()  # Remove server with matching state



class Diff:
    def __init__(self, repo, text):
        self.repo = repo
        self.text = text
        self.date = None
        self.commit = ''
        self.adds = []
        self.rems = []
    
    def analyse(self):
        for line in self.text:
            if line.find('commit') == 0:
                self.commit = line[7:]

            if line.find('Date:') == 0:
                time_str = line[5:].strip()
                self.date = time_str
#                 self.date = datetime.datetime.strptime(time_str, '%a %b %d %H:%M:%S %Y %z')

            if line.find('--- ') == 0:
                continue

            if line.find('+++ ') == 0:
                continue

            if line.find('+') == 0:
                if line[1:].find('-r') == 0:
                    cope_repos.add(self.repo)
                    continue
                add = Dependency.parse_requirements(line[1:])
                if add:
                    self.adds.append(add[0])

            if line.find('-') == 0:
                if line[1:].find('-r') == 0:
                    cope_repos.add(self.repo)
                    continue
                rem = Dependency.parse_requirements(line[1:])
                if rem:
                    self.rems.append(rem[0])


            
class Log:
    def __init__(self, repo, log):
        self.repo = repo
        self.log = log.decode().split('\n')
        self.diffs = []
        self.results = []

    def split_log(self):
        start = -1
        for i in range(0, len(self.log)):
            value = self.log[i].find('commit')
            if value == 0:
                if start != -1:
                    new_start = i
                    self.diffs.append(self.log[start:new_start])
                    start = new_start
                else:
                    start = i
        new_start = len(self.log)
        self.diffs.append(self.log[start:new_start])

    def analyse(self):
        self.split_log()
        for diff in self.diffs:
            d = Diff(self.repo, diff)
            d.analyse()
            self.results.append(d)
        return self.results



def get_repos(path):
    for root, dirs, files in os.walk(path):
        array = dirs
        if array:
            return array

def get_requirements_log(project:str):
    df = pd.DataFrame(columns = ['repoName', 'commit', 'date', 'type', 'l1', 'v1', 'l2', 'v2'])
    cmd = 'cd repos/{} && git log -p requirements.txt  '.format(project)
    p = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE)  
    out,err = p.communicate() 
    l = Log(project, out)
    for r in l.analyse():
        common_l = set([x.project_name for x in r.adds]) & set([x.project_name for x in r.rems])
        for l in common_l:
            data = {
                'repoName': project,
                'commit': r.commit,
                'date': r.date,
                'type': 'verchange', 
                'l1': l, 
                'v1': [';'.join(['' + spec[0] + spec[1] for spec in x._specs]) for x in r.adds if x.project_name == l][0],
                'l2': l, 
                'v2': [';'.join(['' + spec[0] + spec[1] for spec in x._specs]) for x in r.rems if x.project_name == l][0]
            }
            df = df.append(data, ignore_index=True)
        for add in r.adds:
            if add.project_name in common_l or add.project_name is None:
                continue
            data = {
                'repoName': project,
                'commit': r.commit,
                'date': r.date,
                'type': 'add', 
                'l1': add.project_name, 
                'v1': ';'.join(['' + spec[0] + spec[1] for spec in add._specs]),
                'l2': np.nan, 
                'v2': np.nan
            }
            df = df.append(data, ignore_index=True)
        for rem in r.rems:
            if rem.project_name in common_l or rem.project_name is None:
                continue
            data = {
                'repoName': str(project),
                'commit': r.commit,
                'date': r.date,
                'type': 'rem', 
                'l1': np.nan, 
                'v1': np.nan,
                'l2': rem.project_name, 
                'v2': ';'.join(['' + spec[0] + spec[1] for spec in rem._specs])
            }
            df = df.append(data, ignore_index=True)
    return df
i = 0
repos = get_repos('repos')
total_df = parallel(get_requirements_log, repos)
total_df.to_excel('data/migration_changes.xlsx', index=False, encoding='UTF-8')
cope_repos = np.array(list(cope_repos))
np.savetxt('data/cope_repos.csv', cope_repos)
print(total_df.head(5))

计算进度: 100%|██████████| 2126/2126 [01:18<00:00, 27.10it/s, 并行函数=get_requirements_log, 计算花销=78s]


               repoName                                    commit  \
0  ncbi-genome-download  ab00b6c2f3dfadd1d1022a428e41b773fbd1abd1   
1  ncbi-genome-download  ab00b6c2f3dfadd1d1022a428e41b773fbd1abd1   
2  ncbi-genome-download  ab00b6c2f3dfadd1d1022a428e41b773fbd1abd1   
3  ncbi-genome-download  a078d93517236acfacb0ec1a633e6728384390b5   
4  ncbi-genome-download  a078d93517236acfacb0ec1a633e6728384390b5   

                             date       type               l1        v1  \
0  Fri Nov 11 14:27:01 2016 +0100        rem              NaN       NaN   
1  Fri Nov 11 14:27:01 2016 +0100        rem              NaN       NaN   
2  Fri Nov 11 14:27:01 2016 +0100        rem              NaN       NaN   
3   Fri Sep 9 19:10:45 2016 +0200  verchange  ndg-httpsclient   >=0.4.0   
4   Fri Sep 9 19:10:45 2016 +0200  verchange        pyOpenSSL  >=16.0.0   

                l2        v2  
0         requests            
1        pyOpenSSL  >=16.0.0  
2  ndg-httpsclient   >=0.4.0  
3  ndg-htt

In [45]:
import pandas as pd
import numpy as np
from openpyxl import Workbook, load_workbook
import time

def type_count():
    counts = dict(zip(*np.unique(df['type'].values, return_counts=True)))
    print(counts)

# type_count()
ws = load_workbook('data/migration_changes_without_verchanges.xlsx', read_only=True)['Sheet1']
commit = ws.cell(2, 2).value
now_commit = ''
add_rem = []
add = []
rem = []
for s in ws.rows:
    now_commit = s[1].value
    if now_commit != commit:
        if add and rem:
            add_rem.append({'add':add, 'rem': rem})
        add = []
        rem = []
        commit = now_commit
    else:
        if s[3].value == 'add':
            add.append(s[4].value)
        elif s[3].value == 'rem':
            rem.append(s[6].value)
for m in add_rem[:50]:
    print('add: {}'.format(m['add']))
    print('rem: {}'.format(m['rem']))
    

# # df.query('type=="add" | type=="rem"').to_excel('data/migration_changes_without_verchanges.xlsx', index=False)

add: ['Flask-SocketIO']
rem: ['Sphinx', 'guzzle_sphinx_theme', 'sphinxcontrib-httpdomain', 'sphinxcontrib-fulltoc']
add: ['Flask-SQLAlchemy']
rem: ['dogpile.cache']
add: ['requests', 'six']
rem: ['elasticsearch2', 'elasticsearch5', 'elasticsearch']
add: ['dnslib', 'aiohttp', 'psycopg2', 'tldextract']
rem: ['beautifulsoup4']
add: ['pycparser', 'wsaccel']
rem: ['greenlet', 'readline']
add: ['readline']
rem: ['pycparser', 'wsaccel']
add: ['chardet', 'hyperlink', 'urllib3']
rem: ['appdirs', 'packaging', 'pyparsing']
add: ['Automat', 'packaging', 'pyparsing', 'sortedcontainers']
rem: ['enum34', 'ipaddress']
add: ['service_identity', 'bitstring']
rem: ['pyqrcode']
add: ['zope.interface', 'Twisted', 'netaddr', 'PyTrie', 'Jinja2', 'mistune', 'Pygments', 'PyYAML', 'shutilwhich', 'sdnotify', 'psutil', 'lmdb', 'u-msgpack-python', 'cbor', 'py-ubjson', 'cryptography', 'pyOpenSSL', 'pyasn1', 'pyasn1-modules', 'service_identity', 'PyNaCl', 'treq', 'setproctitle', 'watchdog', 'u-msgpack-python']
rem: 