In [54]:
import remove_empty_base as rm
import os
import glob
from itertools import chain

def get_code_files(repo_path, language="js", language_oblivious=False):
    """
    Get code files from a repository
    :param repo_path:
    :param language:
    """
    if(language_oblivious):
        languages = ["js", "py", "java", "c", "cpp", "cs", "go","ts"]
        files = []
        for language in languages:
            files.extend(rm.find_files(repo_path, "*.{}".format(language), depth=7))
        return files

    files = rm.find_files(repo_path, "*.{}".format(language), depth=7)
    return files


def get_name(repo_path, from_end=1):
    """
    Get repo name given the repo path.
    :param repo_path:
    """
    return repo_path.split("/")[-from_end]



def get_code_name(code_path):
    """
    Get name of code file
    """
    return "".join(code_path.split("/")[-1].split(".")[0:-1])




In [55]:
platform = "azure"
from code_handler import CodePreprocessor as cp
z = cp(["sss"])
base_path = "./storage/nfs/" + platform + "/"
import glob
import os

repositories = glob.glob(os.path.join(base_path, "*","*"))
configs = {
    "serverless": "serverless.yml",
    "azure": "function.json",
    "aws": "template.yml",
    "ibm": "manifest.yml",
}

# API FREQ

In [1]:
import json
import remove_empty_base as rm
import os
import glob
from itertools import chain
import api_nodes
import pandas as pd
from collections import Counter
import networkx as nx
import os
import glob
from itertools import chain
import toydataset as td

def read_api(platform="serverless"):
    with open("storage/oneplace/{}_api_dict.json".format(platform), "r") as fp:
        api_dict = json.load(fp)
    return api_dict


In [2]:
def get_api_counts(api_dict):
    """
    Given an api_dict, returns a dictionary of api counts

    :params:
        api_dict: dictionary of repository: api_string_list ("api1 api2 api3")
    :returns:
        api_counts: dictionary of api:count
    
    """
    from collections import Counter
    api_counts = Counter()
    for i in api_dict:
        res = api_dict[i].split(" ")
        for api in res:
            if(api == ""):
                continue
            api_counts[api] += 1
    return api_counts


def get_weight(api, api_counts, adjust_by=10, clip=None):
    """
    Given an api and api_counts, returns the weight of the api

    :params:
        api: api to be weighted
        api_counts: api_counts dictionary
        adjust_by: adjust the weight by this value
        clip: clip the weight to this value
    :returns:
        weight: weight of the api

    """
    res = api_counts[api]
    if(clip != None and res < clip):
        return 0
    if(res != 0):
        return (1/res) * adjust_by
        # return 1
    else:
        return 0




def get_code_files(repo_path, language="js", language_oblivious=False):
    """
    Get code files from a repository
    :param repo_path:
    :param language:
    """
    if(language_oblivious):
        languages = ["js", "py", "java", "c", "cpp", "cs", "go","ts"]
        files = []
        for language in languages:
            files.extend(rm.find_files(repo_path, "*.{}".format(language), depth=7))
        return files

    files = rm.find_files(repo_path, "*.{}".format(language), depth=7)
    return files


def get_name(repo_path, from_end=1):
    """
    Get repo name given the repo path.
    :param repo_path:
    """
    return repo_path.split("/")[-from_end]



def get_code_name(code_path):
    """
    Get name of code file
    """
    return "".join(code_path.split("/")[-1].split(".")[0:-1])



def get_api_dict(platform, repositories):
    """
    Returns api_dict i.e. a dictionary of repo_name: all_apis

    :params:
        platform: str - platform name e.g. serverless, azure, aws, ibm
        repositories: list of repositories [name1, name2, ...]

    :return:
        api_dict: dict - {repo_name: all_apis}
    """
    from code_handler import CodePreprocessor as cp
    z = cp(["sss"])
    import api_nodes
    location_mapping = api_nodes.repo_location_mapping()
    api_dict = dict()
    store_path = './storage/oneplace/{}/apis/'.format(platform)
    repositories_loc = [location_mapping[repo][0] for repo in repositories if repo in location_mapping]
    for repo in repositories_loc:
        # print(repo)
        # break
        code_files = get_code_files(repo, language_oblivious=True)[:1]
        # print(code_files)
        # break
        repo_name = get_name(repo)
        all_apis = []
        # TODO procedure for javascript files and typescript files
        # if(len(code_files) > 0 and (z.code_lang(code_files[0]) == "js" or z.code_lang(code_files[0]) == "ts")):
        #     # TODO find package.json
        #     # TODO parse package.json
        #     # TODO find all dependencies
        #     pass
        # else:    
        for i in code_files:
            try:
                name = get_code_name(i)
                # print(name)
                language = z.code_lang(path=i)
                # language = i.split(".")[-1]
                # print(language)
                code = z.load_raw_code(i)
                # print(code)
                # break
                apis = z.get_imports(language,code)
                all_apis.append(apis)
                # print(all_apis)
                # break
            except:
                # print("ERROR : {}".format(i))
                continue
        
        # print(all_apis)
        all_apis = list(chain.from_iterable(all_apis))
        # print(all_apis)
        all_apis = (set(all_apis))
        if "" in all_apis:
            all_apis.remove("")
        all_apis = " ".join(all_apis)
        # print(all_apis)
        # break
        if(repo_name not in api_dict):
            api_dict[repo_name] = all_apis
        # if(not os.path.exists(store_path + repo_name + "/")):
        #     os.makedirs(store_path + repo_name + "/")

    return api_dict

def get_api_dict_for(repository, location_mapping):
    """
    Returns api dict for repo name.
    :params:
        repository: repository name
        location_mapping: location mapping

    :returns:
        api_dict: api dict for the repo (can be used for multi repo search)
    """
    repo_platform = location_mapping[repository]
    return get_api_dict(repo_platform, [repository])


def get_api_dict_for_repositories(repositories, location_mapping):
    """
    Returns api dict for repositories of multiple platform.

    :params:
        repositories: repositories belonging to different platforms.
        location_mapping: location mapping

    :returns:
        api_dict: api dict for the repo (can be used for multi repo search)
    """
    api_dict = dict()
    for repo in repositories:
        if repo not in location_mapping:
            continue
        repo_platform = location_mapping[repo]
        api_dict.update(get_api_dict(repo_platform, [repo]))
    return api_dict


def get_repo_weight_dict(repo_list_with_weights):
    """
    Returns repository:weight dictionary from search with score = true.

    :params:
        repo_list_with_weights: result of api_nodes.search_repositories ([[0,0,0,0,0,reponame]])

    :returns:
        repo_weight_dict: dictionary of repo:weight

    """
    repo_weight_dict = {}
    for i in repo_list_with_weights:
        repo_weight_dict[i[-1]] = i[0]
    return repo_weight_dict


def calc_weight_between_repositories(repo1, repo2, counts):
    """
    Given two repositories, returns the weight of the edge between them

    :params:
        repo1: repository name
        repo2: repository name
        counts: api_counts dictionary
    :returns:
        weight: weight of the edge between the two repositories

    """
    # Base case.
    if(repo1 == repo2):
        return 0

    average = int(sum(list(counts.values()))/len(list(counts.values())))
    
    corressponding = {
        "s3": "blob",
        "S3": "blob",
        "dynamodb": "cosmos",
        "DynamoDB": "cosmos",
        "sns":"eventgrid",
        "sns":"event-grid",
        "sqs":"queue",
        "ses":"sendgrid",
        "kinesis":"eventhub",
        "kinesis":"event-hubs",
        "kinesisanalytics":"eventhub",
        "kinesisanalytics":"event-hubs",
        "lex":"botbuilder",
        "polly":"speech",
    }
    corressponding_rev = {v:k for k,v in corressponding.items()}
    corressponding.update(corressponding_rev)

    # If both repositories is known then proceed.
    if(repo1 in api_dict and repo2 in api_dict):
        api_list1 = api_dict[repo1].split(" ")
        tmp = [corressponding[i] for i in api_list1 if i in corressponding]
        api_list1.extend(tmp)
        api_list2 = api_dict[repo2].split(" ")
        # First Set, add corressponding apis.
        common_apis = set(api_list1).intersection(api_list2)
        if(len(common_apis) == 0):
            return 0
        return sum([get_weight(i, counts, clip=average) for i in common_apis])
    else:
        return 0


def get_community_energy(community, repo_weight):
    """
    Given a community and repo_weight dictionary, returns the energy of the community

    :params:
        community: community to be evaluated
        repo_weight: dictionary of repo:weight
    :returns:
        energy: energy of the community

    """
    if(len(community) < 3):
        return 0
    return sum([repo_weight[i] for i in community])/len(community)


def find_match(selected_apis,repositories, api_dict, threshold=1,skip=False):
    """
    Given a repository and api_dict, returns the number of matching apis.

    :params:
        selected_apis: set of selected apis
        repository: repository name
        api_dict: api_dict dictionary
        threshold: threshold for matching
    :returns:
        match: number of matching apis

    """
    res = []
    for repository in repositories:
        if(repository in api_dict):
            api_list = api_dict[repository].split(" ")
            if(skip):
                if(len(api_list) > 15):
                    continue
            mlen = len([i for i in api_list if i in selected_apis])
            if(mlen > threshold):
                res.append(repository)
    return res

def get_license(repo, location_mapping, license_specific:rm.LicenseSpecific):
    """
    Given location mapping and repository name, find the license of the repository.

    :params:
        repo: repository name
        location_mapping: location mapping dictionary
        license_specific: license specific object from remove_empty_sarvesh

    :returns:
        license: license of the repository
    """
    # Get platform
    platform = location_mapping[repo][1]

    # Build query
    query = "storage/oneplace/{}/apis/{}/License".format(platform,repo)

    # Check if exists.
    if(not os.path.exists(query)):
        return "NONE"

    # Get license
    license = license_specific.which_license(query)

    return license

In [5]:
# api_dict = read_api(platform="serverless")
# api_dict.update(read_api(platform="aws"))
# api_dict.update(read_api(platform="azure"))
# api_dict.update(read_api(platform="ibm"))
# api_dict = get_api_dict_for_repositories(ntmp, location_mapping)

In [3]:
# counts = get_api_counts(api_dict)
location_mapping = api_nodes.repo_location_mapping()

In [60]:
query = "slack bot to send message to slack"
ffrepos = td.get_repositories()
tmp = api_nodes.search_repositories_for(query, ffrepos, location_mapping, num=10, with_score=True, all_repos=True, thres=50,word_vec=True)

In [61]:
searched_repositories = tmp.copy()
ntmp = [i[-1] for i in tmp]

In [4]:
# 

import glob
allrepos = [i.split("/")[-1] for i in glob.glob("storage/oneplace/*/*/*")]

In [5]:
api_dict = get_api_dict_for_repositories(allrepos, location_mapping)
import pickle
with open("api_dict.pkl", "wb") as f:
    pickle.dump(api_dict, f)

In [64]:
import pickle
def load_api_dict(path="api_dict.pkl"):
    """
    Load's the api_dict.pkl file.

    :params:
        path: path to the api_dict.pkl file
    
    :returns:
        global_api_dict: global_api_dict dictionary
    """
    with open(path, "rb") as f:
        return pickle.load(f)


def get_api_dict_from_global(ntmp, global_api_dict):
    """
    Given a list of repositories, returns a dictionary of api_dict for the repositories.

    :params:
        ntmp: list of repositories
        global_api_dict: global api_dict dictionary

    :returns:
        api_dict: api_dict dictionary
    """
    api_dict = {}
    for repo in ntmp:
        if(repo in global_api_dict):
            api_dict[repo] = global_api_dict[repo]
    return api_dict

In [65]:
global_api_dict = load_api_dict()
api_dict = get_api_dict_from_global(ntmp, global_api_dict)
counts = get_api_counts(api_dict)

In [66]:
repo_weight = get_repo_weight_dict(tmp)

In [67]:
G = nx.Graph()

In [68]:
G.add_nodes_from(list(repo_weight.keys()))

In [69]:
# Making the graph starts here

In [70]:
repositories = list(repo_weight.keys())

In [71]:
calc_weight_between_repositories(repositories[0], repositories[5], counts)

0

In [72]:
for i in range(len(repositories)):
    for j in range(i+1, len(repositories)):
        # calculate weight between two repos needed.
        w = calc_weight_between_repositories(repositories[i], repositories[j], counts)
        if(w != 0):
            G.add_edge(repositories[i], repositories[j], weight=w)


In [73]:
# Find communities based on the weights.
from networkx.algorithms.community import louvain_communities
partition = louvain_communities(G, weight='weight')

In [74]:
# Now partition has communities.
# We need a function to determine energy of the community.
# Energy is defined as the sum of repo weight in community divided by the number of repository in a community?
# It will basically indicate average repository energy.



In [75]:
max_comm = set()
max_energy = 0
for i in partition:
    energy = get_community_energy(i, repo_weight)
    if(energy > max_energy):
        max_energy = energy
        max_comm = i

In [76]:
max_energy

4.266862876761951

In [77]:
fin = []
for res in max_comm:
    fin.append((res, repo_weight[res]))

In [78]:
new_comm = [i[0] for i in sorted(fin, key=lambda x: x[1], reverse=True)[:30]] # I want most common api of top 30 strongest.

In [79]:
# Next todo is to get the most frequent api's in the community.

In [80]:
mlis = []
for i in new_comm:
    tmp = api_dict[i]
    tmp = tmp.split(" ")
    mlis.extend(tmp)

mlis = Counter(mlis)

In [81]:
selected_apis = {i[0] for i in mlis.most_common(15)}

In [82]:
len(ntmp)

39

In [83]:
new_comm

['bscaspar_serverless-cognito-auth',
 'darwaishx_rekognition-sqs',
 'enr1c091_amazon-cognito-facial-recognition-auth',
 'HaifengMei_go-ceries-server',
 'Sai503_GdriveImagetoPDF',
 'angelo-munoz_image-to-text',
 'preshetin_csv-to-dynamodb',
 'ioviic_RecordMe',
 'davidpallmann_world-factbook-site']

In [84]:
selected_apis

{'Decimal',
 'DynamoDB',
 'TestCase',
 'botocore.vendored',
 'csv-parser',
 'decimal',
 'dynamodb',
 'fetch',
 'googleapis',
 'node-fetch',
 'pytest',
 's3',
 'ses',
 'unittest',
 'webpack-node-externals'}

In [85]:
# Filter results based on the selected apis.
query = "slack bot to send message to slack"
tmp = api_nodes.search_repositories_for(query,ffrepos,location_mapping, num=20,with_score=False)

In [86]:
tmp

['SiarheiMelnik_gather-bot',
 'ConsenSysMesh_Luxarity-SensuiMod',
 'AlexeyPerov_LogKeeper-Flutter-Firebase',
 'miridius_serverless-telegram',
 'revmischa_qanda',
 'keetonian_cw-logs-to-chime',
 'jayfry1077_serverless_discord_diceroll_bot',
 'JuHwon_lambda-log-shipper',
 'kdcio_serverless-html-to-pdf',
 'amuelli_serverless-slack-lunch-hunter',
 'sturman_bus_lviv_bot',
 'AnilRedshift_captions_please',
 'harshkavdikar1_s3-to-gcs-streaming',
 'david--wright_habiticabot',
 'stuartleaver_discord-reminders-azure-functions',
 'cossou_mws-orders-webhook',
 'jayfry1077_serverless_discord_LFG_bot',
 'sam-negotiator_website-change-monitor',
 'angelo-munoz_image-to-text',
 'marteinn_Cynomys']

In [87]:
selected_low = find_match(selected_apis,tmp,api_dict,threshold=1)
print(selected_low)

['revmischa_qanda', 'jayfry1077_serverless_discord_LFG_bot', 'sam-negotiator_website-change-monitor', 'angelo-munoz_image-to-text']


In [88]:
tmp

['SiarheiMelnik_gather-bot',
 'ConsenSysMesh_Luxarity-SensuiMod',
 'AlexeyPerov_LogKeeper-Flutter-Firebase',
 'miridius_serverless-telegram',
 'revmischa_qanda',
 'keetonian_cw-logs-to-chime',
 'jayfry1077_serverless_discord_diceroll_bot',
 'JuHwon_lambda-log-shipper',
 'kdcio_serverless-html-to-pdf',
 'amuelli_serverless-slack-lunch-hunter',
 'sturman_bus_lviv_bot',
 'AnilRedshift_captions_please',
 'harshkavdikar1_s3-to-gcs-streaming',
 'david--wright_habiticabot',
 'stuartleaver_discord-reminders-azure-functions',
 'cossou_mws-orders-webhook',
 'jayfry1077_serverless_discord_LFG_bot',
 'sam-negotiator_website-change-monitor',
 'angelo-munoz_image-to-text',
 'marteinn_Cynomys']

In [89]:
api_dict["keetonian_cw-logs-to-chime"]

'loghelpers gzip lambdainit test extract lambdalogging chime patch config handlers base pytest'

In [90]:
selected_apis

{'Decimal',
 'DynamoDB',
 'TestCase',
 'botocore.vendored',
 'csv-parser',
 'decimal',
 'dynamodb',
 'fetch',
 'googleapis',
 'node-fetch',
 'pytest',
 's3',
 'ses',
 'unittest',
 'webpack-node-externals'}

In [91]:
# global_api_dict = read_api(platform="serverless")
# global_api_dict.update(read_api(platform="aws"))
# global_api_dict.update(read_api(platform="azure"))
# global_api_dict.update(read_api(platform="ibm"))
# global_api_dict = get_api_dict_for_repositories(ffrepos, location_mapping)

In [93]:
selected_high = find_match(selected_apis,list(global_api_dict.keys()),global_api_dict,threshold=6,skip=True)
print(selected_high)

['yomageli_yogeo', 'angelo-munoz_image-to-text', 'enr1c091_amazon-cognito-facial-recognition-auth']


In [94]:
global_api_dict["awslabs_aws-serverless-twitter-event-source"]

'checkpoint dynamodb lambdainit ClientError botocore.exceptions unittest.mock test MagicMock Attr twitter poller ses pytest'

In [95]:
location_mapping = api_nodes.repo_location_mapping()

In [96]:
location_mapping["awslabs_aws-serverless-twitter-event-source"]

['storage/nfs/aws/13001-13201/awslabs_aws-serverless-twitter-event-source',
 'aws']

In [97]:
ls = rm.LicenseSpecific()
final_selected = set(selected_low + selected_high)

In [98]:
for i in final_selected:
    print(i, " ", get_license(i, location_mapping, ls))

enr1c091_amazon-cognito-facial-recognition-auth   ('mit', 0.9831618334892422)
jayfry1077_serverless_discord_LFG_bot   ('mit', 0.9817842129845866)
revmischa_qanda   ('lgpl3', 0.019661911041841507)
yomageli_yogeo   ('mit', 0.8598598598598599)
angelo-munoz_image-to-text   ('mit', 0.9840823970037453)
sam-negotiator_website-change-monitor   ('mit', 0.9854664791373652)


# Global api dict

In [19]:
tot = 0
count = 0
for repo in api_dict:
    res = api_dict[repo].split(" ")
    tot += len(res)
    count += 1


In [20]:
tot/count

2.7985566363351113