In [1]:
%matplotlib inline
import re
import json
import numpy as np
import matplotlib.pyplot as plt

import pydotplus
from sklearn import tree
from IPython.display import Image

from random import shuffle
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import train_test_split

from os import listdir
from os.path import isfile, join

from collections import defaultdict

from sklearn.metrics import f1_score, make_scorer
from sklearn.metrics import explained_variance_score, r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import mean_squared_log_error, median_absolute_error
from sklearn.metrics import accuracy_score, f1_score, classification_report

from scipy.stats import spearmanr, kendalltau
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from fim import apriori, fpgrowth

In [2]:
from docker_manager import *

In [3]:
path = '../datasets/'
repo_name_list = load_images(path)
repo_list_latest, last_updated = get_latest_as_repo(repo_name_list)

In [4]:
repo_list_latest, last_updated = get_latest_as_repo(repo_name_list)

In [5]:
def get_knee_point_value(values):
    y = values
    x = np.arange(0, len(y))
    
    index = 0
    max_d = -float('infinity')
    
    for i in range(0, len(x)):
        c = closest_point_on_segment(a=[x[0], y[0]], b=[x[-1], y[-1]], p=[x[i], y[i]])
        d = np.sqrt((c[0]-x[i])**2 + (c[1]-y[i])**2)
        if d > max_d:
            max_d = d
            index = i
            
    return index

def closest_point_on_segment(a, b, p):
    sx1 = a[0]
    sx2 = b[0]
    sy1 = a[1]
    sy2 = b[1]
    px = p[0]
    py = p[1]

    x_delta = sx2 - sx1
    y_delta = sy2 - sy1

    if x_delta == 0 and y_delta == 0:
        return p

    u = ((px - sx1) * x_delta + (py - sy1) * y_delta) / (x_delta * x_delta + y_delta * y_delta)
    if u < 0:
        closest_point = a
    elif u > 1:
        closest_point = b
    else:
        cp_x = sx1 + u * x_delta
        cp_y = sy1 + u * y_delta
        closest_point = [cp_x, cp_y]

    return closest_point

def get_knee_threshold(values):
    score_values = sorted(values, reverse=True)
    knee_index = get_knee_point_value(score_values)
    threshold = score_values[knee_index]
    return threshold

def dict2transaction(reposet, target=None, use_software_version=True):
    y = list()
    X = list()
    
    stars = list()
    pulls = list()
    size = list()
    complete_size = list()
    nbr_layers = list()
    nbr_softwares = list()
    
    for repo_name, repo in reposet.items():
        stars.append(repo['stars'])
        pulls.append(repo['pulls'])
        size.append(repo['size'])
        complete_size.append(repo['complete_size'])
        nbr_layers.append(repo['nbr_layers'])
        nbr_softwares.append(repo['nbr_softwares'])
    
    size_threshold = get_knee_threshold(size)
    size_lower_than = [x for x in size if x < size_threshold]
    size_hist, size_bins = np.histogram(size_lower_than, bins='auto')
    print('size_threshold', size_threshold)
    
    complete_size_threshold = get_knee_threshold(complete_size)
    complete_size_lower_than = [x for x in complete_size if x < complete_size_threshold]
    complete_size_hist, complete_size_bins = np.histogram(complete_size_lower_than, bins='auto')
    print('complete_size_threshold', complete_size_threshold)
        
    stars_threshold = get_knee_threshold(stars)
    stars_lower_than = [x for x in stars if x < stars_threshold]
    stars_hist, stars_bins = np.histogram(stars_lower_than, bins='auto')
    print('stars_threshold', stars_threshold)
    
    pulls_threshold = get_knee_threshold(pulls)
    pulls_lower_than = [x for x in pulls if x < pulls_threshold]
    pulls_hist, pulls_bins = np.histogram(pulls_lower_than, bins='auto')
    print('pulls_threshold', pulls_threshold)
    
    nbr_layers_hist, nbr_layers_bins = np.histogram(nbr_layers, bins='auto')
    nbr_softwares_hist, nbr_softwares_bins = np.histogram(nbr_softwares, bins='auto')
    
    transactions = dict()
    items_labels_size = set()
    items_labels_csize = set()
    items_labels_layers = set()
    items_labels_softwares = set()
    items_labels_softwares_names = set()
    items_labels_stars = set()
    items_labels_pulls = set()
    items_labels_distro = set()
    for repo_name, repo in reposet.items():
        
        if repo['size'] > np.max(size_bins):
            size_label = '%s_size' % (len(size_hist) + 1)
        else:
            size_label = '%s_size' % np.digitize([repo['size']], size_bins)[0]
            
        if repo['complete_size'] > np.max(complete_size_bins):
            complete_size_label = '%s_csize' % (len(complete_size_hist) + 1)
        else:
            complete_size_label = '%s_csize' % np.digitize([repo['complete_size']], complete_size_bins)[0]
            
        nbr_layers_label = '%s_layers' % np.digitize([repo['nbr_layers']], nbr_layers_bins)[0]
        nbr_softwares_label = '%s_softwares' % np.digitize([repo['nbr_softwares']], nbr_softwares_bins)[0]
        
        transaction = [size_label, complete_size_label, nbr_layers_label, nbr_softwares_label, 
                       '%s_distro' % repo['distro']]
        
        items_labels_size.add(size_label)
        items_labels_csize.add(complete_size_label)
        items_labels_layers.add(nbr_layers_label)
        items_labels_softwares.add(nbr_softwares_label)
        items_labels_distro.add(repo['distro'])
        
        softwares = list()
        for s in repo['softwares']:
            slabel = '%s%s' % (s[0], s[1]) if use_software_version else '%s' % (s[0])
            softwares.append(slabel)
            
        transaction += softwares
        items_labels_softwares_names |= set(softwares)
        
        if target is not None:
            if target == 'pulls' or 'pulls' in target:
                if repo['pulls'] > np.max(pulls_bins):
                    pulls_label = '%s_pulls' % (len(pulls_hist) + 1)
                else:
                    pulls_label = '%s_pulls' % np.digitize([repo['pulls']], pulls_bins)[0]
                transaction += [pulls_label]
                items_labels_pulls.add(pulls_label)

            if target == 'stars' or 'stars' in target:
                if repo['stars'] > np.max(stars_bins):
                    stars_label = '%s_stars' % (len(stars_hist) + 1)
                else:
                    stars_label = '%s_stars' % np.digitize([repo['stars']], stars_bins)[0]
                transaction += [stars_label]
                items_labels_stars.add(stars_label)
            
        transactions[repo_name] = transaction
        
    labels = {
        'size': size_bins,
        'csize': complete_size_bins,
        'layers': nbr_layers_bins,
        'softwares': nbr_softwares_bins,
    }
    
    items_labels = {
        'size': items_labels_size,
        'csize': items_labels_csize,
        'layers': items_labels_layers,
        'softwares': items_labels_softwares,
        'softwares_names': items_labels_softwares_names,
        'distro': items_labels_distro,
    }
    
    if target is not None:
        if target == 'pulls' or 'pulls' in target:
            labels['pulls'] = pulls_bins
            items_labels['pulls'] = items_labels_pulls

        if target == 'stars' or 'stars' in target:
            labels['stars'] = stars_bins
            items_labels['stars'] = items_labels_stars
    
    return transactions, labels, items_labels

In [6]:
def itemset2string(itemset):
    s = ''
    byte2gigabyte = 1073741824
    for item in itemset[0]:
        sitem = item

        if len(re.findall('.*_size', item)):   
            sitem = int(item.replace('_size', ''))
            if len(labels['size']) == sitem: 
                sitem = 'size > %.4f' % (labels['size'][-1]/byte2gigabyte)
            else:
                sitem = '%.4f <= size < %.4f' % (labels['size'][sitem-1]/byte2gigabyte, 
                                                 labels['size'][sitem]/byte2gigabyte)

        if len(re.findall('.*_csize', item)):   
            sitem = int(item.replace('_csize', ''))
            if len(labels['csize']) == sitem: 
                sitem = 'csize > %.4f' % (labels['csize'][-1]/byte2gigabyte)
            else:
                sitem = '%.4f <= csize < %.4f' % (labels['csize'][sitem-1]/byte2gigabyte, 
                                                  labels['csize'][sitem]/byte2gigabyte)

        if len(re.findall('.*_layers', item)):
            sitem = int(item.replace('_layers', ''))
            if len(labels['layers']) == sitem: 
                sitem = 'layers > %d' % labels['layers'][-1]
            else:
                sitem = '%d <= layers < %d' % (labels['layers'][sitem-1], labels['layers'][sitem])

        if len(re.findall('.*_softwares', item)):
            sitem = int(item.replace('_softwares', ''))
            if len(labels['softwares']) == sitem: 
                sitem = 'softwares > %d' % labels['softwares'][-1]
            else:
                sitem = '%d <= softwares < %d' % (labels['softwares'][sitem-1], labels['softwares'][sitem])

        if len(re.findall('.*_distro', item)):   
            sitem = item.replace('_distro', '')
              
        if len(re.findall('.*_stars', item)):
            sitem = int(item.replace('_stars', ''))
            if len(labels['stars']) == sitem: 
                sitem = 'stars > %d' % labels['stars'][-1]
            else:
                sitem = '%d <= stars < %d' % (labels['stars'][sitem-1], labels['stars'][sitem])
                
        if len(re.findall('.*_pulls', item)):
            sitem = int(item.replace('_pulls', ''))
            if len(labels['pulls']) == sitem: 
                sitem = 'pulls > %d' % labels['pulls'][-1]
            else:
                sitem = '%d <= pulls < %d' % (labels['pulls'][sitem-1], labels['pulls'][sitem])

        s += '%s, ' % (sitem)    
    
    s = '{%s} (%.4f)' % (s[:-2], itemset[-1])
    return s

def rule2string(rule):
    s_itemset = itemset2string([rule[1], rule[3]])
    if len(re.findall('.*_stars', rule[0])):
        sitem = int(rule[0].replace('_stars', ''))
        if len(labels['stars']) == sitem: 
            sitem = 'stars > %d' % labels['stars'][-1]
        else:
            sitem = '%d <= stars < %d' % (labels['stars'][sitem-1], labels['stars'][sitem])

    if len(re.findall('.*_pulls', rule[0])):
        sitem = int(rule[0].replace('_pulls', ''))
        if len(labels['pulls']) == sitem: 
            sitem = 'pulls > %.2f' % labels['pulls'][-1]
        else:
            sitem = '%.2f <= pulls < %.2f' % (labels['pulls'][sitem-1], labels['pulls'][sitem])
    
    s_cons = sitem
    
    s_rule = '{%s} <-- %s (%.4f, %.4f)' % (s_cons, s_itemset, rule[4], rule[5])

    return s_rule

def report_itemsets(itemsets, target=None, nbr_show=10, only_all=False):  
    max_i = 8 if target is not None else 7
    max_i = 1 if only_all else max_i
    for i in range(0, max_i):
        count = 0
        
        if i == 0:
            print('\n-- all')
        elif i == 1:
            print('\n-- size')
        elif i == 2:
            print('\n-- csize')
        elif i == 3:
            print('\n-- layers')
        elif i == 4:
            print('\n-- softwares')
        elif i == 5:
            print('\n-- distro')
        elif i == 6:
            print('\n-- or')
        elif i == 7:
            print('\n-- %s' % target)

        for itemset in sorted(itemsets, key=lambda x: x[-1], reverse=True):

            flag_size = False
            flag_csize = False
            flag_layers = False
            flag_softwares = False
            flag_distro = False
            flag_stars = False
            flag_pulls = False
            
            for item in itemset[0]:
                if len(re.findall('.*_size', item)):   
                    flag_size = True
                if len(re.findall('.*_csize', item)):   
                    flag_csize = True
                if len(re.findall('.*_layers', item)):   
                    flag_layers = True
                if len(re.findall('.*_softwares', item)):   
                    flag_softwares = True
                if len(re.findall('.*_distro', item)):   
                    flag_distro = True
                if len(re.findall('.*_stars', item)):   
                    flag_stars = True
                if len(re.findall('.*_pulls', item)):   
                    flag_pulls = True
            
            if target is None:
                flag_target = True
            else:
                flag_target = flag_stars if target == 'stars' else flag_pulls
            
            if i == 0:
                flag = True
            elif i == 1:
                flag = flag_size & flag_target
            elif i == 2:
                flag = flag_csize & flag_target
            elif i == 3:
                flag = flag_layers & flag_target
            elif i == 4:
                flag = flag_softwares & flag_target
            elif i == 5:
                flag = flag_distro & flag_target
            elif i == 6:
                flag = (flag_size | flag_csize | flag_layers | flag_softwares | flag_distro) & flag_target
            elif i == 7:
                flag = flag_target
                
            if flag:
                s = itemset2string(itemset)
                print(s)
                count += 1

            if count >= nbr_show:
                break
        
    return

## Analysis without Stars/Pulls

In [7]:
repo_list = repo_list_latest
transactions, labels, items_labels = dict2transaction(repo_list, target=None, use_software_version=False)

size_threshold 695042085
complete_size_threshold 1751071137
stars_threshold 288
pulls_threshold 16.91084


In [8]:
to_remove = ['erl', 'tar', 'bash', 'perl', 'wget', 'curl']

for repo_name in transactions:
    transaction = transactions[repo_name]
    for tor in to_remove:
        if tor in transaction:
            transaction.remove(tor)
    transactions[repo_name] = transaction

In [9]:
len(transactions), np.mean([len(t) for t in transactions]), np.std([len(t) for t in transactions])

(879, 19.054607508532424, 6.3248592046446719)

In [10]:
supp = 5
zmin = 3

itemsets = fpgrowth(list(transactions.values()), supp=supp, zmin=zmin, target='m', report='as')
print(len(itemsets))
report_itemsets(itemsets, target=None)

21

-- all
{0.0037 <= csize < 0.0993, 0.0019 <= size < 0.0419, httpd, ash, unzip} (0.1047)
{ping, git, python} (0.0956)
{pip, git, python} (0.0853)
{ping, unzip, python} (0.0751)
{npm, node, git, python} (0.0728)
{9 <= softwares < 10, Debian GNU/Linux 8 (jessie), git, python} (0.0660)
{java, Debian GNU/Linux 8 (jessie), unzip} (0.0648)
{Alpine Linux v3.4, httpd, ash, unzip} (0.0637)
{java, git, unzip, python} (0.0626)
{3 <= layers < 5, ash, unzip} (0.0569)

-- size
{0.0037 <= csize < 0.0993, 0.0019 <= size < 0.0419, httpd, ash, unzip} (0.1047)
{1 <= layers < 3, 0.0037 <= csize < 0.0993, 0.0019 <= size < 0.0419, ash, unzip} (0.0512)

-- csize
{0.0037 <= csize < 0.0993, 0.0019 <= size < 0.0419, httpd, ash, unzip} (0.1047)
{1 <= layers < 3, 0.0037 <= csize < 0.0993, 0.0019 <= size < 0.0419, ash, unzip} (0.0512)

-- layers
{3 <= layers < 5, ash, unzip} (0.0569)
{1 <= layers < 3, httpd, ash, unzip} (0.0557)
{1 <= layers < 3, 0.0037 <= csize < 0.0993, 0.0019 <= size < 0.0419, ash, unzip} (0.

In [11]:
supp = 5
zmin = 3

itemsets = fpgrowth(list(transactions.values()), supp=supp, zmin=zmin, target='c', report='as')
print(len(itemsets))
report_itemsets(itemsets, target=None)

45

-- all
{httpd, ash, unzip} (0.1695)
{0.0019 <= size < 0.0419, ash, unzip} (0.1286)
{0.0037 <= csize < 0.0993, ash, unzip} (0.1251)
{0.0037 <= csize < 0.0993, 0.0019 <= size < 0.0419, ash, unzip} (0.1229)
{git, unzip, python} (0.1229)
{Debian GNU/Linux 8 (jessie), git, python} (0.1149)
{0.0019 <= size < 0.0419, httpd, ash, unzip} (0.1104)
{0.0037 <= csize < 0.0993, httpd, ash, unzip} (0.1069)
{0.0037 <= csize < 0.0993, 0.0019 <= size < 0.0419, httpd, ash, unzip} (0.1047)
{9 <= softwares < 10, git, python} (0.1035)

-- size
{0.0019 <= size < 0.0419, ash, unzip} (0.1286)
{0.0037 <= csize < 0.0993, 0.0019 <= size < 0.0419, ash, unzip} (0.1229)
{0.0019 <= size < 0.0419, httpd, ash, unzip} (0.1104)
{0.0037 <= csize < 0.0993, 0.0019 <= size < 0.0419, httpd, ash, unzip} (0.1047)
{1 <= layers < 3, 0.0037 <= csize < 0.0993, 0.0019 <= size < 0.0419} (0.0523)
{1 <= layers < 3, 0.0019 <= size < 0.0419, ash, unzip} (0.0523)
{1 <= layers < 3, 0.0037 <= csize < 0.0993, 0.0019 <= size < 0.0419, ash

## Rule Analysis

In [25]:
supp = 1
zmin = 3
conf = 10

repo_list = repo_list_latest

transactions, labels, items_labels = dict2transaction(repo_list, target='stars', use_software_version=False)

to_remove = ['erl', 'tar', 'bash', 'perl', 'wget', 'curl']

for repo_name in transactions:
    transaction = transactions[repo_name]
    for tor in to_remove:
        if tor in transaction:
            transaction.remove(tor)
    transactions[repo_name] = transaction

appear = {k: 'cons' for k in items_labels['stars']}
appear[None] = 'ante'

srules = fpgrowth(list(transactions.values()), supp=supp, zmin=zmin, target='r', conf=conf, report='ascl',
                appear=appear)
print(len(srules))

target_count = defaultdict(int)
for r in srules:
    target_count[r[0]] += 1
    
nbr_show = 10
count = 0
top_rules_conf = list()
for rule in sorted(srules, key=lambda x: x[4], reverse=True):

    print(rule2string(rule))
    top_rules_conf.append(rule)
    count += 1

    if count >= nbr_show:
        break

print('------')

nbr_show = 10
count = 0
top_rules_lift = list()
for rule in sorted(srules, key=lambda x: x[5], reverse=True):

    print(rule2string(rule))
    top_rules_lift.append(rule)
    count += 1

    if count >= nbr_show:
        break

size_threshold 695042085
complete_size_threshold 1751071137
stars_threshold 288
pulls_threshold 16.91084
12900
{13 <= stars < 19} <-- {0.2019 <= size < 0.2419, ping, unzip} (0.0080) (0.7778, 2.3904)
{13 <= stars < 19} <-- {0.5774 <= csize < 0.6730, 10 <= layers < 12} (0.0091) (0.7273, 2.2352)
{13 <= stars < 19} <-- {0.0819 <= size < 0.1219, 5 <= softwares < 6, python} (0.0114) (0.7143, 2.1953)
{13 <= stars < 19} <-- {10 <= softwares < 11, size > 0.6419, unzip} (0.0080) (0.7000, 2.1514)
{13 <= stars < 19} <-- {0.9599 <= csize < 1.0555, ping} (0.0080) (0.7000, 2.1514)
{13 <= stars < 19} <-- {5 <= softwares < 6, pip, python} (0.0068) (0.6667, 2.0490)
{13 <= stars < 19} <-- {5 <= softwares < 6, pip} (0.0068) (0.6667, 2.0490)
{13 <= stars < 19} <-- {0.1949 <= csize < 0.2906, 0.0819 <= size < 0.1219, 5 <= softwares < 6, python} (0.0068) (0.6667, 2.0490)
{13 <= stars < 19} <-- {Ubuntu 14.04.5 LTS, php} (0.0068) (0.6667, 2.0490)
{13 <= stars < 19} <-- {Ubuntu 14.04.5 LTS, php, ping} (0.0068) (

In [14]:
def is_subrule(rules, rule):
    for i, r in enumerate(rules):
        if set(rule[1]) < set(r[1]):
            return True
    return False

def is_superrule(rules, rule):
    superrule_of = list()
    for i, r in enumerate(rules):
        if set(r[1]) < set(rule[1]):
            superrule_of.append(i)
    return superrule_of

In [15]:
target_count

defaultdict(int,
            {'10_stars': 230,
             '11_stars': 259,
             '12_stars': 163,
             '13_stars': 121,
             '14_stars': 139,
             '15_stars': 120,
             '16_stars': 53,
             '17_stars': 98,
             '18_stars': 30,
             '19_stars': 55,
             '1_stars': 136,
             '20_stars': 60,
             '21_stars': 45,
             '22_stars': 5,
             '23_stars': 39,
             '24_stars': 14,
             '25_stars': 37,
             '26_stars': 23,
             '27_stars': 2,
             '29_stars': 24,
             '2_stars': 1385,
             '30_stars': 1,
             '33_stars': 3,
             '34_stars': 17,
             '35_stars': 2,
             '36_stars': 14,
             '37_stars': 23,
             '38_stars': 1,
             '39_stars': 1,
             '3_stars': 3216,
             '40_stars': 14,
             '42_stars': 21,
             '44_stars': 687,
             '4_stars': 

In [17]:
#Regole che predicono il max
stars = '44_stars'
max_rules = list()
for r in srules:
    if r[0] == stars:
        idx = is_subrule(max_rules, r)
        if len(max_rules) == 0:
            max_rules.append(r)
        elif not is_subrule(max_rules, r):
            superrule_of = is_superrule(max_rules, r)
            max_rules = [mr for i, mr in enumerate(max_rules) if i not in superrule_of]
            max_rules.append(r)
            
nbr_show = 10
count = 0
top_rules_max = list()
for r in sorted(max_rules, key=lambda x: x[5], reverse=True):

    print(rule2string(r))
    top_rules_max.append(r)
    count += 1

    if count >= nbr_show:
        break

{stars > 283} <-- {Debian GNU/Linux 9 (stretch), java, unzip} (0.0046) (0.4444, 8.3121)
{stars > 283} <-- {Debian GNU/Linux 9 (stretch), git, python} (0.0046) (0.3636, 6.8008)
{stars > 283} <-- {0.1949 <= csize < 0.2906, 3 <= layers < 5} (0.0034) (0.3333, 6.2340)
{stars > 283} <-- {0.1949 <= csize < 0.2906, 0.0819 <= size < 0.1219, ash, unzip} (0.0034) (0.3333, 6.2340)
{stars > 283} <-- {0.2419 <= size < 0.2819, 8 <= softwares < 9, git, python} (0.0034) (0.3333, 6.2340)
{stars > 283} <-- {3 <= softwares < 4, 0.1219 <= size < 0.1619} (0.0034) (0.3333, 6.2340)
{stars > 283} <-- {0.1949 <= csize < 0.2906, 0.0819 <= size < 0.1219, httpd} (0.0034) (0.3000, 5.6106)
{stars > 283} <-- {0.0993 <= csize < 0.1949, 0.0419 <= size < 0.0819, 5 <= softwares < 6} (0.0034) (0.3000, 5.6106)
{stars > 283} <-- {3 <= softwares < 4, 0.2906 <= csize < 0.3862} (0.0034) (0.2727, 5.1006)
{stars > 283} <-- {Alpine Linux v3.7, 0.0037 <= csize < 0.0993, 0.0019 <= size < 0.0419, ash, unzip} (0.0034) (0.2727, 5.1006

Quanto sono rispettate queste regole dalle repository ufficiali?

Far vedere che le regole che predicono il successo (quelle max) sono maggiormente rispettate dalle repository ufficiali e quelle che predicono insuccesso/successo ma non al top sono maggiormente rispettate dalle altre.
Nulla da dire per quelle con lift o confidenza più alte.

In [26]:
def get_rules_satisfied(rules, x):
    rules_satisfied = defaultdict(list)
    for rule in rules:
        ant = rule[1]
        cons = rule[0]
        satisfied = True
        if set(ant) < set(x):
            rules_satisfied[cons].append(rule)
    
    return rules_satisfied

def predict(rules_satisfied, k_rules=10, agg_fun=np.mean, accuracy_type='conf'):

    best_rules = dict()
    cons_accuracy = dict()
    accuracy_index = 4 if accuracy_type == 'conf' else 5

    for cons in rules_satisfied:
        best_rules[cons] = sorted(rules_satisfied[cons], key=lambda x: x[accuracy_index], reverse=True)[:k_rules]
        accuracy_measures = [rule[accuracy_index] for rule in best_rules[cons]]
        cons_accuracy[cons] = 0.0 if len(accuracy_measures) == 0 else agg_fun(accuracy_measures)
    
    pred = max(cons_accuracy, key=cons_accuracy.get)
    return pred, best_rules

def analyze_rules(transactions, top_rules): 
    count = 0
    count_correct = 0
    count_correct_officials = 0
    coverage_of_top_rules = list()
    is_official_list = list()

    for repo_name, transaction in transactions.items():

        satisfied_rules = get_rules_satisfied(rules, transaction)
        all_satisfied_rules = list()
        for cons in satisfied_rules:
            all_satisfied_rules.extend(satisfied_rules[cons])

        count_tr = 0
        for tr in all_satisfied_rules:
            if tr in top_rules:
                count_tr += 1
        coverage_of_top_rules.append(count_tr)
        is_official_list.append(1 if repo_name in officials else 0)

        count += 1
    
    off_coverage_of_top_rules = [x for x, y in zip(coverage_of_top_rules,is_official_list) if y == 1]
    
    avg_ctp = np.mean(coverage_of_top_rules)
    std_ctp = np.std(coverage_of_top_rules)
    avg_octp = np.mean(off_coverage_of_top_rules)
    std_octp = np.std(off_coverage_of_top_rules)
    
    return avg_ctp, std_ctp, avg_octp, std_octp

In [29]:
officials = json.load(open('../officials_images.json', 'r', encoding='utf8'))['officials']

print('Nbr of officials', len(officials))

Nbr of officials 148


In [47]:
val = [list(), list(), list(), list()]
for r in range(0, 100):
    random_rules = list()
    for i in np.random.randint(0, len(rules), 10):
        random_rules.append(rules[i])
    avg_ctp, std_ctp, avg_octp, std_octp = analyze_rules(transactions, random_rules)
    val[0].append(avg_ctp)
    val[1].append(std_ctp)
    val[2].append(avg_octp)
    val[3].append(std_octp)

In [48]:
np.mean(val[0]), np.mean(val[1]), np.mean(val[2]), np.mean(val[3])

(0.18601820250284409,
 0.45994487100248976,
 0.14799999999999999,
 0.38628975628689516)

In [36]:
analyze_rules(transactions, top_rules_conf)

(0.11262798634812286, 0.41836377385162288, 0.0, 0.0)

In [37]:
analyze_rules(transactions, top_rules_lift)

(0.10238907849829351, 0.55297826677114659, 0.0, 0.0)

In [38]:
analyze_rules(transactions, top_rules_max)

(0.11149032992036405,
 0.39200477648497561,
 0.59999999999999998,
 0.79999999999999993)

In [None]:
# studio pulls

In [31]:
supp = 1
zmin = 3
conf = 10

repo_list = repo_list_latest

transactions, labels, items_labels = dict2transaction(repo_list, target='pulls', use_software_version=False)

to_remove = ['erl', 'tar', 'bash', 'perl', 'wget', 'curl']

for repo_name in transactions:
    transaction = transactions[repo_name]
    for tor in to_remove:
        if tor in transaction:
            transaction.remove(tor)
    transactions[repo_name] = transaction

appear = {k: 'cons' for k in items_labels['pulls']}
appear[None] = 'ante'

prules = fpgrowth(list(transactions.values()), supp=supp, zmin=zmin, target='r', conf=conf, report='ascl',
                appear=appear)
print(len(prules))

target_count = defaultdict(int)
for r in prules:
    target_count[r[0]] += 1
    
nbr_show = 10
count = 0
top_rules_conf = list()
for rule in sorted(prules, key=lambda x: x[4], reverse=True):

    print(rule2string(rule))
    top_rules_conf.append(rule)
    count += 1

    if count >= nbr_show:
        break

print('------')

nbr_show = 10
count = 0
top_rules_lift = list()
for rule in sorted(prules, key=lambda x: x[5], reverse=True):

    print(rule2string(rule))
    top_rules_lift.append(rule)
    count += 1

    if count >= nbr_show:
        break

size_threshold 695042085
complete_size_threshold 1751071137
stars_threshold 288
pulls_threshold 16.91084
9325
{0.00 <= pulls < 0.07} <-- {8 <= softwares < 9, ping, unzip, python} (0.0125) (1.0000, 1.9447)
{0.00 <= pulls < 0.07} <-- {10 <= softwares < 11, pip, git, unzip, python} (0.0102) (1.0000, 1.9447)
{0.00 <= pulls < 0.07} <-- {10 <= softwares < 11, pip, git, unzip} (0.0102) (1.0000, 1.9447)
{0.00 <= pulls < 0.07} <-- {10 <= softwares < 11, size > 0.6419, unzip} (0.0114) (1.0000, 1.9447)
{0.00 <= pulls < 0.07} <-- {Ubuntu 16.04.2 LTS, git, python} (0.0114) (1.0000, 1.9447)
{0.00 <= pulls < 0.07} <-- {Ubuntu 16.04.2 LTS, git} (0.0159) (1.0000, 1.9447)
{0.00 <= pulls < 0.07} <-- {size > 0.6419, java, git, unzip} (0.0182) (0.9412, 1.8303)
{0.00 <= pulls < 0.07} <-- {size > 0.6419, java, git, unzip, python} (0.0171) (0.9375, 1.8231)
{0.00 <= pulls < 0.07} <-- {size > 0.6419, pip, unzip, python} (0.0148) (0.9286, 1.8058)
{0.00 <= pulls < 0.07} <-- {size > 0.6419, pip, unzip} (0.0148) (0

In [53]:
target_count

defaultdict(int,
            {'105_pulls': 30,
             '107_pulls': 3,
             '10_pulls': 23,
             '115_pulls': 6,
             '116_pulls': 14,
             '119_pulls': 16,
             '11_pulls': 41,
             '12_pulls': 14,
             '131_pulls': 5,
             '139_pulls': 4,
             '13_pulls': 4,
             '149_pulls': 3,
             '14_pulls': 27,
             '15_pulls': 36,
             '164_pulls': 59,
             '169_pulls': 42,
             '16_pulls': 26,
             '170_pulls': 7,
             '171_pulls': 1,
             '17_pulls': 15,
             '180_pulls': 9,
             '185_pulls': 16,
             '189_pulls': 15,
             '18_pulls': 82,
             '191_pulls': 8,
             '194_pulls': 3,
             '19_pulls': 23,
             '1_pulls': 3282,
             '202_pulls': 58,
             '203_pulls': 10,
             '208_pulls': 2,
             '20_pulls': 11,
             '21_pulls': 79,
             '228

In [19]:
#Regole che predicono il max
pulls = '228_pulls'
max_rules = list()
for r in prules:
    if r[0] == pulls:
        idx = is_subrule(max_rules, r)
        if len(max_rules) == 0:
            max_rules.append(r)
        elif not is_subrule(max_rules, r):
            superrule_of = is_superrule(max_rules, r)
            max_rules = [mr for i, mr in enumerate(max_rules) if i not in superrule_of]
            max_rules.append(r)
            
nbr_show = 10
count = 0
top_rules_max = list()
for r in sorted(max_rules, key=lambda x: x[5], reverse=True):

    print(rule2string(r))
    top_rules_max.append(r)
    count += 1

    if count >= nbr_show:
        break

{pulls > 16.72} <-- {3 <= softwares < 4, 0.1219 <= size < 0.1619} (0.0046) (0.4444, 9.3016)
{pulls > 16.72} <-- {3 <= softwares < 4, 0.2906 <= csize < 0.3862} (0.0046) (0.3636, 7.6104)
{pulls > 16.72} <-- {0.1949 <= csize < 0.2906, 0.0819 <= size < 0.1219, ash, unzip} (0.0034) (0.3333, 6.9762)
{pulls > 16.72} <-- {0.2419 <= size < 0.2819, 8 <= softwares < 9, git, python} (0.0034) (0.3333, 6.9762)
{pulls > 16.72} <-- {3 <= softwares < 4, 3 <= layers < 5} (0.0034) (0.3333, 6.9762)
{pulls > 16.72} <-- {Debian GNU/Linux 9 (stretch), java, unzip} (0.0034) (0.3333, 6.9762)
{pulls > 16.72} <-- {Debian GNU/Linux 9 (stretch), git, python} (0.0034) (0.2727, 5.7078)
{pulls > 16.72} <-- {0.2906 <= csize < 0.3862, 0.1219 <= size < 0.1619, 7 <= layers < 10} (0.0034) (0.2500, 5.2321)
{pulls > 16.72} <-- {5 <= layers < 7, git, python} (0.0034) (0.2500, 5.2321)
{pulls > 16.72} <-- {3 <= softwares < 4, Debian GNU/Linux 8 (jessie)} (0.0057) (0.2500, 5.2321)


In [57]:
analyze_rules(transactions, top_rules_max)

(0.12627986348122866,
 0.44871569370938691,
 0.83999999999999997,
 1.1551623262554922)

In [58]:
val = [list(), list(), list(), list()]
for r in range(0, 100):
    random_rules = list()
    for i in np.random.randint(0, len(rules), 10):
        random_rules.append(rules[i])
    avg_ctp, std_ctp, avg_octp, std_octp = analyze_rules(transactions, random_rules)
    val[0].append(avg_ctp)
    val[1].append(std_ctp)
    val[2].append(avg_octp)
    val[3].append(std_octp)

In [59]:
np.mean(val[0]), np.mean(val[1]), np.mean(val[2]), np.mean(val[3])

(0.17518771331058022,
 0.44030164933290289,
 0.14079999999999998,
 0.37551210917674205)

In [27]:
nbr_show = 10
count = 0
top_rules_conf = list()
for rule in sorted(srules, key=lambda x: x[4], reverse=True):
    print(rule2string(rule))
    top_rules_conf.append(rule)
    count += 1

    if count >= nbr_show:
        break

{13 <= stars < 19} <-- {0.2019 <= size < 0.2419, ping, unzip} (0.0080) (0.7778, 2.3904)
{13 <= stars < 19} <-- {0.5774 <= csize < 0.6730, 10 <= layers < 12} (0.0091) (0.7273, 2.2352)
{13 <= stars < 19} <-- {0.0819 <= size < 0.1219, 5 <= softwares < 6, python} (0.0114) (0.7143, 2.1953)
{13 <= stars < 19} <-- {10 <= softwares < 11, size > 0.6419, unzip} (0.0080) (0.7000, 2.1514)
{13 <= stars < 19} <-- {0.9599 <= csize < 1.0555, ping} (0.0080) (0.7000, 2.1514)
{13 <= stars < 19} <-- {5 <= softwares < 6, pip, python} (0.0068) (0.6667, 2.0490)
{13 <= stars < 19} <-- {5 <= softwares < 6, pip} (0.0068) (0.6667, 2.0490)
{13 <= stars < 19} <-- {0.1949 <= csize < 0.2906, 0.0819 <= size < 0.1219, 5 <= softwares < 6, python} (0.0068) (0.6667, 2.0490)
{13 <= stars < 19} <-- {Ubuntu 14.04.5 LTS, php} (0.0068) (0.6667, 2.0490)
{13 <= stars < 19} <-- {Ubuntu 14.04.5 LTS, php, ping} (0.0068) (0.6667, 2.0490)


In [32]:
covered_transaction_indexes_p = set()
for rule in top_rules_conf:
    ant = rule[1]
    cons = rule[0]
    index = 0
    for repo_name, transaction in transactions.items():
        if set(ant) < set(transaction):
            covered_transaction_indexes_p.add(index)
        index += 1

In [35]:
len(covered_transaction_indexes & covered_transaction_indexes_p) / len(covered_transaction_indexes | covered_transaction_indexes_p)

0.09009009009009009