In [205]:
import networkx as nx
import matplotlib.pyplot as plt
import pickle
import numpy as np
import json
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
from collections import Counter
import re
import pickle


CUSTOM_FORMAT = {"with_labels" : False, "font_size": 4,
                "node_size": 20, "node_color": "#ff0000", "edgecolors": "#911a07",
                "width": 0.4, "edge_color": "black"}
import igraph as ig

In [3]:
### Cargo el json con la informacion de cada paquete, y el grafo

filename = '../proc_jsons.json'
with open(filename, 'r') as f:
    pkg_json = json.load(f)

### Re-formateo el la lista de diccionarios por un diccionario de diccionarios, 
### con el nombre del paquete (nodo) de llave para la informacion.
info_dict = {}
for package in pkg_json:
    info_dict[package.pop('Name')] = package

G = nx.read_gexf('../PyPi Network.gexf')
G_ig = ig.Graph.from_networkx(G)

# Analizo tendencias con la informacion

In [180]:
df = pd.DataFrame(info_dict)
df = df.T

In [181]:
@np.vectorize
def format_strings(s: str):
    """
    Aplico un formato unificado para los paquetes.
    Esta funcion es importante porque las dependencias en varios de los paquetes
    aparecen con un formato distinto al nombre de las librerias (por ej. Numpy 
    es la libreria, y en las dependencias puede aparecer como NumPy, numpy, numpy[1.02], etc)

    """
    fmt_string = s.lower() # Normalizo todo a minuscula
    fmt_string = re.sub(r'[\_\-\| .,/\\;:=~]', '-', fmt_string) # Unifico separadores
    fmt_string = re.sub(r'\(\[\{', '[', fmt_string) # Unifico parentesis
    fmt_string = re.sub(r'\)\]\}', ']', fmt_string) # IDEM
    return fmt_string

In [182]:
# Normalizo a minuscula los nombres de los paquetes.
new_ix = format_strings(np.array(df.index))
df = df.reindex(new_ix)

In [183]:
df.head()

Unnamed: 0,Dependencies,Tags,Licence,Package Version,Python Version,Classifiers
0-------------------------0,,,,,,
0-core-client,[redis],,Apache 2.0,1.1.0a8,,[]
0-orchestrator,"[Jinja2, aioredis, etcd3, netaddr, python-date...",,Apache 2.0,1.1.0a8,,[]
0-0-1,,,,,,
0-618,,,,,,


In [184]:
### Remplazo strings que indican elemento faltante por None
to_replace = ['', 'UNKNOWN', 'none', 'None', 'empty', 'nothing', np.nan, 'NaN', 'nan']
df.replace(to_replace, None, inplace=True)

In [185]:
print(f'Number of packages: {len(df)}')

for column in ['Tags', 'Licence', 'Python Version', 'Dependencies']:
    print(f'NaN count {column} = {sum(df[column].isna())}')

### para el resto de las columnas, reviso cuantas tienen listas vacias
for column in ['Dependencies', 'Package Version', 'Classifiers']:
    print(f'NaN count {column} = {sum(~df[column].astype(bool))}')

Number of packages: 398843
NaN count Tags = 287145
NaN count Licence = 184957
NaN count Python Version = 266645
NaN count Dependencies = 66172
NaN count Dependencies = 245693
NaN count Package Version = 66172
NaN count Classifiers = 145397


Descarto los paquetes que tienen 0 dependencias y dependen de 0 paquetes

In [187]:
@np.vectorize
def format_dependencies(s: str):
    """
    Formateo cosas particulares que aparecen en las dependencias.
    """

    fmt_string = re.sub(r'\[.*\]', '', s) # Le quito la parte entre corchetes
    fmt_string = fmt_string.rstrip('-') # Le quito la - del final
    
    return fmt_string

In [188]:
df['Dependencies'] = df['Dependencies'].apply(lambda x: list(format_strings(x)) if 
                                              np.logical_and(type(x) == list, bool(x)) else x)
df['Dependencies'] = df['Dependencies'].apply(lambda x: list(format_dependencies(x)) if
                                              np.logical_and(type(x) == list, bool(x)) else x)

In [189]:
dep_pakage_counter = []
for dep in df['Dependencies']:
    if np.logical_and(type(dep) == list, bool(dep)):
        dep_pakage_counter += dep 

dep_pakage_counter = Counter(dep_pakage_counter)
dep_pakage_counter = dict(sorted(dep_pakage_counter.items(), key=lambda x: x[1], reverse=True))

In [190]:
# saco los paquetes que no estan indexados en el dataframe (los guarde como 'packages_not_found.txt')
packages_not_found = set(dep_pakage_counter.keys()) - set(df.index)
dep_pakage_counter = {k: i for k, i in dep_pakage_counter.items() if k not in packages_not_found}

In [191]:
# Creo la columna de la cantidad de librerias que dependen de cada lib, y la relleno con lo que obtuvimos recien
df['Dependant Libraries Count'] = np.zeros(len(df))
df.loc[dep_pakage_counter.keys(), 'Dependant Libraries Count'] = list(dep_pakage_counter.values())

Ahora, filtro por las librerias que dependen de otras o otras librerias dependen de ella

In [193]:
df = df[np.logical_or(df['Dependant Libraries Count'] > 0, df['Dependencies'].astype(bool))]

Formateo los classifiers

Primero, observo que los classifiers pueden tener como maximo 5 elementos [grupo, subgrupo, ..., subsubsubsubgrupo]

In [195]:
def format_classifiers(classifiers, column):
    
    # Si es None o una lista vacia, devuelvo None
    if bool(classifiers) == False:
        ret = None

    else:
        x = []
        for c in classifiers:
            split_classifier = c.split(' :: ')

            if split_classifier[0] == column:
                # x = c.replace(split_classifier[0] + ' :: ',  '')
                x.append(split_classifier[1])
        
        # Si no hay coincidencias, devuelvo None
        if bool(x) == False:
            ret = None
        
        else:
            ret = ''
            x = sorted(set(x))
            for element in x:
                ret += element
                ret += ' - '
            ret = ret.rstrip(' - ')

    return ret

In [196]:
df['Programming Language'] = df['Classifiers'].apply(format_classifiers, column='Programming Language')
df['Programming Language'].value_counts()

Python                                                135234
JavaScript - Python                                      699
C++ - Python                                             482
C - Python                                               360
Cython - Python                                          255
                                                       ...  
C - Java - Python                                          1
C - Python - Rust                                          1
C - C++ - Java - JavaScript - Objective C - Python         1
Logo - Python                                              1
PHP                                                        1
Name: Programming Language, Length: 100, dtype: int64

Notese que aunque hayan varios paquetes que tienen bajo "programming language" otro lenguaje que no es python, la gran mayoria tiene al menos una version de python anotada.

Para estos ~60 casos, descartamos los nodos.

In [197]:
# descarto las librerias que no tienen a python como uno de sus languages (las que no especifican, las dejo)
df = df[np.logical_or(df['Programming Language'].isnull(), df['Programming Language'].str.contains('Python'))]

In [198]:
df[~df['Programming Language'].isnull()]

Unnamed: 0,Dependencies,Tags,Licence,Package Version,Python Version,Classifiers,Dependant Libraries Count,Programming Language
01d61084-d29e-11e9-96d1-7c5cf84ffe8e,"[click, attrs]",,,0.1.0,">=3.7,<4.0","[Programming Language :: Python :: 3, Programm...",2.0,Python
0x-contract-addresses,"[mypy-extensions, bandit, black, coverage, cov...",ethereum cryptocurrency 0x decentralized block...,Apache 2.0,3.0.0,">=3.6, <4","[Development Status :: 5 - Production/Stable, ...",6.0,Python
0x-contract-artifacts,"[mypy-extensions, bandit, black, coverage, cov...",ethereum cryptocurrency 0x decentralized block...,Apache 2.0,3.0.0,">=3.6, <4","[Development Status :: 5 - Production/Stable, ...",4.0,Python
0x-contract-wrappers,"[0x-contract-addresses, 0x-contract-artifacts,...",ethereum cryptocurrency 0x decentralized block...,Apache 2.0,2.0.0,">=3.6, <4","[Development Status :: 2 - Pre-Alpha, Intended...",3.0,Python
0x-json-schemas,"[jsonschema, mypy-extensions, stringcase, 0x-c...",ethereum cryptocurrency 0x decentralized block...,Apache 2.0,2.1.0,">=3.6, <4","[Development Status :: 2 - Pre-Alpha, Intended...",3.0,Python
...,...,...,...,...,...,...,...,...
zzdb,[],,Apache 2.0,0.1.11,">=3.7,<4.0","[License :: Other/Proprietary License, Program...",1.0,Python
zzgui,"[pyqt5, qscintilla, zzdb]",,Apache 2.0,0.1.18,">=3.7,<3.11","[License :: Other/Proprietary License, Program...",0.0,Python
zzhfun,"[pandas, numpy, xgboost, scikit-learn]",,,0.30,,"[License :: OSI Approved :: MIT License, Opera...",0.0,Python
zzsukitest,"[jinja2, pyyaml, requests]",,,1.0.6,>=3.6,"[License :: OSI Approved :: MIT License, Opera...",0.0,Python


In [199]:
df[~df['Dependencies'].astype(bool)]

Unnamed: 0,Dependencies,Tags,Licence,Package Version,Python Version,Classifiers,Dependant Libraries Count,Programming Language
1a23-telemetry,[],1A23 Studio,AGPLv3+,1.0.0,,"[Intended Audience :: Developers, License :: O...",1.0,Python
2to3,[],2to3,MIT,1.0,,[],7.0,
3to2,[],,,1.1.1,,"[Development Status :: 5 - Production/Stable, ...",1.0,Python
absql,,,,,,,1.0,
adwin,,,,,,,1.0,
...,...,...,...,...,...,...,...,...
zxcvbn-python,[],"zxcvbn,password,security",MIT,4.4.24,,"[Intended Audience :: Developers, License :: O...",1.0,Python
zxcvbn,[],"zxcvbn,password,security",MIT,4.4.28,,"[Intended Audience :: Developers, License :: O...",19.0,Python
zxingmod,[],,LGPL v3 or later,0.19,,[],1.0,
zxtouch,[],,GPL-3.0,0.0.7.post11,>=3.7,[],1.0,


# Armo el grafo

In [204]:
# with open('PyPi filtered DataFrame.df', 'wb') as f:
#     pickle.dump(df, f)

In [207]:
edges = []
nodes = [n for n in df.index]

for node in nodes:
    dependencies = df.loc[node, 'Dependencies']
    
    if np.logical_and(type(dependencies) == list, bool(dependencies)):
        node_edges = [(dep, node) for dep in df.loc[node, 'Dependencies']]
        edges += node_edges

In [208]:
G = nx.DiGraph()
G.add_nodes_from(nodes)
G.add_edges_from(edges)

In [209]:
# nx.write_gexf(G, 'PyPi Network V2.gexf')

# Comunidades

In [1]:
com_fg = G_ig.community_fastgreedy()

def clusters_to_list(clusters, G) -> dict:
    """
    Arma un diccionario con los nombres de los nodos
    como keys y el id del cluster como valor.
    """
    dic = {}
    clusters_list = []
    for i, cluster in enumerate(sorted(list(clusters), key = len, reverse = True)):
        nodes_in_cluster = []
        for node in cluster:
            nodes_in_cluster.append(G.vs[node]['_nx_name'])
        clusters_list.append(nodes_in_cluster)
    return clusters_list

clusters = com_fg.as_clustering()
communities = clusters_to_list(clusters, G_ig)
for i in range(15):
    print(len(communities[i]))

NameError: name 'G_ig' is not defined

In [None]:
nodes_to_plot = communities[10]
G_com = G.subgraph(nodes_to_plot)
nx.draw(G_com, labels={i: str(i) for i in range(len(list(G_com.nodes())))}, **CUSTOM_FORMAT)

In [None]:
fig, axs = plt.subplots(2, 2)
axs = axs.flatten()
for i, com_ix in enumerate(range(8, 12)):
    nodes_to_plot = communities[com_ix]
    G_com = G.subgraph(nodes_to_plot)
    nx.draw_kamada_kawai(G_com, ax=axs[i], **CUSTOM_FORMAT)