# Data Preprocessing

This notebook converts the data collected in the data retrieval notebook into a graph format.
It produces the following CSV files:

1. **pypi_nodes.csv** - Nodes with an index/id and the name from pypi
2. **pypi_edges.csv** - Edges between node ids - found via GitHub dependency graphs
3. **pypi_nodes_lang.csv** - Language data (e.g. tokenized README or PyPI description text) for each node - indexed by node id
4. **eval_topics.csv** - Up to the top 200 topics associated with packages that have GitHub repos
5. **node_topics.csv** - Nodes with assigned topics

In [1]:
# Input files produced by the data retrieval notebook.

# All the data from PyPI (loaded by PyPI.__init__ below)
pypi_data_path = 'retrieved_data/pypi_data.json'

# Only the packages from PyPI that link to a GitHub repo
# NOTE(review): not referenced by any other cell visible in this notebook
pypi_github_pkgs_path = 'retrieved_data/pypi_github_data.json'

# Data from the GitHub API for the GitHub-linked packages above
# (passed to GitHub(...) in PyPI.__init__ below)
github_data_path = 'retrieved_data/github_data.json'

In [2]:
import os
import ujson
import pandas as pd
from tqdm import tqdm

from github_data import GitHub
from graph_data import PyPIGraph
from tokenizer import Tokenizer

In [3]:
class PyPI:
    """Turns the retrieved PyPI / GitHub data into CSV files describing a graph.

    Each ``create_*`` method writes one CSV into the working directory:

    * ``pypi_nodes.csv``      - package names; the row index is the node id
    * ``pypi_edges.csv``      - ``src``/``dest`` node-id pairs
    * ``pypi_nodes_lang.csv`` - one tokenized document per node
    * ``eval_topics.csv``     - evaluation topics; the row index is the topic id
    * ``node_topics.csv``     - ``node_id``/``topic_id`` pairs

    ``create_nodes()`` must run before the edge, language and node-topic
    steps, and ``create_eval_topics_list()`` before
    ``create_node_topics_list()``, because those steps read the saved CSVs.
    """

    def __init__(self):
        """Set the output paths and load the retrieved data.

        Reads the module-level ``pypi_data_path`` and ``github_data_path``.
        """
        # Output files
        self.nodes_path = 'pypi_nodes.csv'
        self.edges_path = 'pypi_edges.csv'
        self.lang_path = 'pypi_nodes_lang.csv'

        self.github_data = GitHub(github_data_path)
        self.eval_topics_path = 'eval_topics.csv'
        self.node_topics_path = 'node_topics.csv'

        self.tokenizer = Tokenizer()

        with open(pypi_data_path, 'r', encoding='utf-8') as f:
            self.saved_data = ujson.load(f)

        # Package name -> full PyPI record, for O(1) description lookups
        self.saved_data_name_dict = {d['name']: d for d in self.saved_data['data']}

    @staticmethod
    def _first_index_map(values):
        """Map each value to the index of its first occurrence.

        Gives the same answer as ``list.index`` but with O(1) lookups, so the
        loops below are no longer quadratic in the number of nodes.
        """
        index_of = {}
        for position, value in enumerate(values):
            index_of.setdefault(value, position)
        return index_of

    @staticmethod
    def _longest_text(readme, desc):
        """Return the longer of the README and the PyPI description.

        A missing value (``None``) or the PyPI placeholder ``'UNKNOWN'`` counts
        as empty text. On a tie the README is returned.
        """
        readme = readme or ''
        desc = '' if desc in (None, 'UNKNOWN') else desc
        # key=len is essential: a bare max() on strings compares them
        # lexicographically rather than by length.
        return max(readme, desc, key=len)

    def create_eval_topics_list(self):
        """Save up to the top 200 GitHub topics to ``eval_topics.csv``."""
        eval_topics = self.github_data.get_evaluation_topics(n_top=200)

        eval_topics_df = pd.DataFrame(eval_topics, columns=["topics"])
        print(eval_topics_df)
        eval_topics_df.to_csv(self.eval_topics_path)
        print('Saved Evaluation Topics!')

    def create_node_topics_list(self):
        """Save ``node_id``/``topic_id`` pairs to ``node_topics.csv``.

        Only topics present in the saved evaluation topics are kept. Packages
        without a node record are skipped, mirroring ``create_edge_list``.

        Raises:
            NotImplementedError: if the nodes or evaluation topics CSV has not
                been created yet.
        """
        topic_ids = self._first_index_map(self.get_eval_topics())
        node_ids = self._first_index_map(self.get_nodes())

        node_topic = []
        for pkg, data in self.github_data.pkgs_with_topics_gen():
            node_id = node_ids.get(pkg)
            if node_id is None:
                # No node record for this package - nothing to label
                continue
            for t in data['topics']:
                topic_id = topic_ids.get(t['name'])
                if topic_id is not None:
                    node_topic.append([node_id, topic_id])

        node_topic_df = pd.DataFrame(node_topic, columns=["node_id", "topic_id"])
        print(node_topic_df)
        node_topic_df.to_csv(self.node_topics_path)
        print('Saved Node Topics List!')

    def create_node_language_list(self):
        """Tokenize one document per node and save them to ``pypi_nodes_lang.csv``.

        For each node the longer of its GitHub README and its PyPI description
        is tokenized and joined into a single space-separated line. Rows are in
        node order, so the row index is the node id.

        Raises:
            NotImplementedError: if the nodes CSV has not been created yet.
        """
        nodes = self.get_nodes()
        nodes_lang = []

        print('Creating node language list (tokenizing too)...')
        for node in tqdm(nodes):
            # Use GitHub README or PyPI Description
            readme = self.github_data.get_readme(node)
            # Tolerate a node with no PyPI record (e.g. a stale nodes CSV)
            pkg_record = self.saved_data_name_dict.get(node) or {}
            desc = pkg_record.get('description')
            # Only take longest language value - avoid repeats
            tokenized = self.tokenizer.tokenize_doc(self._longest_text(readme, desc))
            nodes_lang.append(' '.join(tokenized))  # Join so one line

        assert len(nodes_lang) == len(nodes)
        nodes_lang_df = pd.DataFrame(nodes_lang, columns=["language"])
        print(nodes_lang_df)
        nodes_lang_df.to_csv(self.lang_path)
        print('Saved PyPI Node Language!')

    def create_edge_list(self):
        """Save the dependency edges between node ids to ``pypi_edges.csv``.

        Every dependency whose two endpoints both have node records is written
        in both directions, because the graph is undirected.

        Raises:
            NotImplementedError: if the nodes CSV has not been created yet.
        """
        pkg_deps_dict = self.github_data.get_pkg_dependency_dict()
        node_ids = self._first_index_map(self.get_nodes())
        edges = []

        print("Creating edge list...")
        for pkg in tqdm(pkg_deps_dict):
            for dep in pkg_deps_dict[pkg]:
                # Ensure we have a node record
                if pkg in node_ids and dep in node_ids:
                    pkg_id, dep_id = str(node_ids[pkg]), str(node_ids[dep])
                    # Undirected!
                    # NOTE(review): when pkg == dep this writes the same
                    # self-loop twice - confirm whether that is intended.
                    edges.append([pkg_id, dep_id])
                    edges.append([dep_id, pkg_id])

        # Save edge list to a CSV
        edges_df = pd.DataFrame(edges, columns=["src", "dest"])
        print(edges_df)
        edges_df.to_csv(self.edges_path)
        print('Saved PyPI Edges!')

    def create_nodes(self):
        """Save every PyPI package name to ``pypi_nodes.csv``; the row index is the node id."""
        print("Creating Nodes...")
        nodes = [str(pkg["name"]) for pkg in self.saved_data['data']]

        # Save nodes to a CSV
        nodes_df = pd.DataFrame(nodes, columns=["nodes"])
        print(nodes_df)
        nodes_df.to_csv(self.nodes_path)
        print('Saved PyPI Nodes!')

    def get_nodes(self):
        """Return the saved node names as a list ordered by node id.

        Raises:
            NotImplementedError: if the nodes CSV does not exist.
        """
        if os.path.exists(self.nodes_path):
            # na_filter=False keeps names such as "nan" or "null" as strings
            return list(pd.read_csv(self.nodes_path, na_filter=False)["nodes"])
        raise NotImplementedError("There are no saved nodes - call create_nodes()!")

    def get_eval_topics(self):
        """Return the saved evaluation topics as a list ordered by topic id.

        Raises:
            NotImplementedError: if the evaluation topics CSV does not exist.
        """
        if os.path.exists(self.eval_topics_path):
            # na_filter=False (as in get_nodes) keeps topics such as "null"
            # or "nan" as strings instead of turning them into NaN
            return list(pd.read_csv(self.eval_topics_path, na_filter=False)["topics"])
        raise NotImplementedError(
            "There are no saved evaluation topics - call create_eval_topics_list()!"
        )

    def print_statistics(self):
        """Print node, edge and language-coverage counts for the saved graph files."""
        edges = pd.read_csv(self.edges_path).values
        nodes = self.get_nodes()
        print('\n------------------------------')
        print('     PyPI Data Statistics    ')
        print('------------------------------')

        print(f"Total Number of Nodes: {len(nodes)}")
        print(f"Total Number of Edges: {len(edges)}")

        # Use the configured paths so the graph matches the files read above
        graph = PyPIGraph(self.nodes_path, self.edges_path, self.lang_path)
        num_nodes_with_connections = graph.num_nodes_with_connections()
        print(f'Total of {num_nodes_with_connections}/{len(nodes)} has connections')

        num_node_lang = graph.num_nodes_with_features()
        print(f"Total Number of Nodes with language data: {num_node_lang}")

        num_nodes_graph_and_lang_data = graph.num_nodes_with_connections_and_features()
        print(f'Total of {num_nodes_graph_and_lang_data} have language and graph data')
        print()

In [4]:
# Load the retrieved PyPI and GitHub data (reads the input paths defined in the first cell)
data = PyPI()

In [5]:
# Create nodes - assign an index/id and name to each node
# Writes pypi_nodes.csv, which the edge, language and node-topic steps below read back
data.create_nodes()

Creating Nodes...
                   nodes
0              mediajson
1                alengen
2        chunked-scatter
3     sphinxcontrib-nvd3
4                 pyprot
..                   ...
92                 fanyi
93          localstorage
94                xy-tel
95  django-custom-mixins
96             pycleaner

[97 rows x 1 columns]
Saved PyPI Nodes!


In [6]:
# Create a src -> dest edge list between node ids (writes pypi_edges.csv)
# Undirected, so each link is stored as two rows (one per direction)
data.create_edge_list()

100%|██████████| 48/48 [00:00<00:00, 52675.72it/s]

Creating edge list...
  src dest
0  28   28
1  28   28
2  53   53
3  53   53
4  58   58
5  58   58
6  80   80
7  80   80
Saved PyPI Edges!





In [7]:
# Tokenize one document per node and index it by node id (writes pypi_nodes_lang.csv)
# May take hours when run on the full data set
data.create_node_language_list()

  0%|          | 0/97 [00:00<?, ?it/s]

Creating node language list (tokenizing too)...


100%|██████████| 97/97 [00:10<00:00,  8.93it/s]

                                             language
0   json serialiser parser python support extensio...
1   alengen generate model table sqlalchemy port p...
2   chunked_scatter tool take bed file sequence di...
3   sphinxcontrib.nvd3 sphinx chart extension nvd3...
4   package design represent maniupate amino acid ...
..                                                ...
92                                                   
93  localstorage image target image target image t...
94                                 car brand juhe api
95  django custom mixin list custom mixin project ...
96                                                   

[97 rows x 1 columns]
Saved PyPI Node Language!





In [8]:
# Get up to the top 200 topics from GitHub repos (writes eval_topics.csv)
# The row index assigned to each topic is its topic id
data.create_eval_topics_list()

Total of 60 topics to chose from
Getting top 200 topics
                     topics
0                    django
1                rd-project
2           rd-section-apmm
3                   protein
4                 alignment
5               amino-acids
6             substitutions
7              score-matrix
8                 profiling
9      structure-prediction
10           cryptocurrency
11                   crypto
12         cryptocurrencies
13                     news
14            cryptocontrol
15                      api
16               api-client
17                      zim
18                  openzim
19                  youtube
20                  scraper
21          django-packages
22                      orm
23                 database
24                  awesome
25          database-schema
26               torrent-dl
27                 peerflix
28                   pyflix
29                  torrent
30                   magnet
31                      vlc
32                  

In [9]:
# Label each node with the evaluation topics it belongs to (writes node_topics.csv)
# Needs the nodes and evaluation-topics CSVs created in the cells above
data.create_node_topics_list()

    node_id  topic_id
0         0         1
1         0         2
2         4         3
3         4         4
4         4         5
5         4         6
6         4         7
7         4         8
8         4         9
9         6        10
10        6        11
11        6        12
12        6        13
13        6        14
14        6        15
15        6        16
16        7        17
17        7        18
18        7        19
19        7        20
20       29         0
21       29        21
22       29        22
23       29        23
24       29        24
25       29        25
26       30        26
27       30        27
28       30        28
29       30        29
30       30        30
31       30        31
32       30        32
33       41        33
34       41        34
35       44        35
36       44        36
37       44        37
38       44        38
39       44        39
40       58        40
41       58        41
42       58        42
43       65        43
44       6

In [10]:
# Summarize the saved graph: node/edge counts and how many nodes have connections and language data
data.print_statistics()


------------------------------
     PyPI Data Statistics    
------------------------------
Total Number of Nodes: 97
Total Number of Edges: 8
Total of 4/97 has connections
Total Number of Nodes with language data: 84
Total of 4 have language and graph data

