# Data Preprocessing

This notebook converts the data retrieved in the data retrieval notebook into a graph format.
Running it produces the following files:

1. **pypi_nodes.csv** - Nodes with an index/id and the name from pypi
2. **pypi_edges.csv** - Edges between node ids - found via GitHub dependency graphs
3. **pypi_nodes_lang.csv** - Language data (e.g. READMEs) associated with each node - indexed by node id
4. **eval_topics.csv** - Top 200 topics associated with packages with GitHub repos
5. **node_topics.csv** - Nodes with assigned topics

In [16]:
# Path to the JSON file holding all the data retrieved from PyPI.
# Read by PyPI.__init__, which expects a top-level 'data' list of package records.
pypi_data_path = 'data_retrieval/data/pypi_data.json'

# Path to the JSON file holding the GitHub API data for packages with GitHub links.
# Passed to the GitHub helper in PyPI.__init__.
github_data_path = 'data_retrieval/data/github_data.json'

In [17]:
import os
import ujson
import pandas as pd
from tqdm import tqdm

from data_preprocess.github_data import GitHub
from data_preprocess.graph_data import PyPIGraph
from data_preprocess.tokenizer import Tokenizer

In [18]:
class PyPI:
    """Turn the retrieved PyPI / GitHub data into graph CSV files.

    The class writes (and later re-reads) five CSV files: the node list, the
    undirected edge list, the tokenized language per node, the evaluation
    topics and the node -> topic assignments. A node id is the row position of
    the package in the node CSV; a topic id is the row position of the topic
    in the evaluation-topics CSV.
    """

    def __init__(self, pypi_path=None, github_path=None):
        """Load the retrieved data and set up the helpers.

        Args:
            pypi_path: Path to the PyPI JSON dump. Defaults to the notebook
                level ``pypi_data_path``.
            github_path: Path to the GitHub JSON dump. Defaults to the
                notebook level ``github_data_path``.
        """
        # Fall back to the notebook-level configuration when no path is given.
        pypi_path = pypi_data_path if pypi_path is None else pypi_path
        github_path = github_data_path if github_path is None else github_path

        self.nodes_path = 'data_preprocess/data/pypi_nodes.csv'
        self.edges_path = 'data_preprocess/data/pypi_edges.csv'
        self.lang_path = 'data_preprocess/data/pypi_nodes_lang.csv'

        self.github_data = GitHub(github_path)
        self.eval_topics_path = 'data_preprocess/data/eval_topics.csv'
        self.node_topics_path = 'data_preprocess/data/node_topics.csv'

        self.tokenizer = Tokenizer()

        with open(pypi_path, 'r', encoding='utf-8') as f:
            self.saved_data = ujson.load(f)

        # Package name -> full PyPI record, for quick description lookups.
        self.saved_data_name_dict = {d['name']: d for d in self.saved_data['data']}

    @staticmethod
    def _first_index_map(values):
        """Map each value to the position of its first occurrence.

        Gives the same answer as ``list.index`` but with constant-time lookups.
        """
        positions = {}
        for position, value in enumerate(values):
            positions.setdefault(value, position)
        return positions

    @staticmethod
    def _write_csv(frame, path):
        """Write ``frame`` to ``path``, creating the parent directory if needed."""
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        # The default index column is kept on purpose: it carries the ids.
        frame.to_csv(path)

    def create_eval_topics_list(self, n_top=200):
        """Save the top ``n_top`` GitHub topics; the row index is the topic id.

        Args:
            n_top: Number of topics requested from the GitHub data helper.
        """
        eval_topics = self.github_data.get_evaluation_topics(n_top=n_top)

        eval_topics_df = pd.DataFrame(eval_topics, columns=["topics"])
        print(eval_topics_df)
        self._write_csv(eval_topics_df, self.eval_topics_path)
        print('Saved Evaluation Topics!')

    def create_node_topics_list(self):
        """Save (node_id, topic_id) pairs for packages tagged with an evaluation topic.

        Packages without a node record and topics outside the evaluation list
        are skipped.

        Raises:
            NotImplementedError: If the node or evaluation-topic CSV is missing.
        """
        topic_ids = self._first_index_map(self.get_eval_topics())
        node_ids = self._first_index_map(self.get_nodes())

        node_topic = []
        for pkg, data in self.github_data.pkgs_with_topics_gen():
            node_id = node_ids.get(pkg)
            if node_id is None:
                # Same policy as create_edge_list: ignore unknown packages.
                continue
            for t in data['topics']:
                topic_id = topic_ids.get(t['name'])
                if topic_id is not None:
                    node_topic.append([node_id, topic_id])

        node_topic_df = pd.DataFrame(node_topic, columns=["node_id", "topic_id"])
        print(node_topic_df)
        self._write_csv(node_topic_df, self.node_topics_path)
        print('Saved Node Topics List!')

    def create_node_language_list(self):
        """Tokenize one document per node and save it, indexed by node id.

        For each node the longer of the GitHub README and the PyPI description
        is used, so the same text is not included twice.

        Raises:
            NotImplementedError: If the node CSV is missing.
        """
        nodes = self.get_nodes()
        nodes_lang = []

        print('Creating node language list (tokenizing too)...')
        for node in tqdm(nodes):
            # Use GitHub README or PyPI Description; treat missing values as empty.
            readme = self.github_data.get_readme(node) or ''
            record = self.saved_data_name_dict.get(node) or {}
            desc = record.get('description') or ''
            if desc == 'UNKNOWN':
                desc = ''
            # Compare by length: a plain max() would compare the strings alphabetically.
            longest = max(readme, desc, key=len)
            tokenized = self.tokenizer.tokenize_doc(longest)
            nodes_lang.append(' '.join(tokenized))  # Join so one line

        assert len(nodes_lang) == len(nodes)
        nodes_lang_df = pd.DataFrame(nodes_lang, columns=["language"])
        print(nodes_lang_df)
        self._write_csv(nodes_lang_df, self.lang_path)
        print('Saved PyPI Node Language!')

    def create_edge_list(self):
        """Save the undirected edge list between node ids.

        Every dependency link whose two ends both have a node record is stored
        twice, once in each direction.

        Raises:
            NotImplementedError: If the node CSV is missing.
        """
        pkg_deps_dict = self.github_data.get_pkg_dependency_dict()
        node_ids = self._first_index_map(self.get_nodes())
        edges = []

        print("Creating edge list...")
        for pkg in tqdm(pkg_deps_dict):
            pkg_idx = node_ids.get(pkg)
            if pkg_idx is None:
                # Ensure we have a node record
                continue
            pkg_id = str(pkg_idx)
            for dep in pkg_deps_dict[pkg]:
                dep_idx = node_ids.get(dep)
                if dep_idx is None:
                    continue
                dep_id = str(dep_idx)
                # Undirected!
                edges.append([pkg_id, dep_id])
                edges.append([dep_id, pkg_id])

        # Save edge list to a CSV
        edges_df = pd.DataFrame(edges, columns=["src", "dest"])
        print(edges_df)
        self._write_csv(edges_df, self.edges_path)
        print('Saved PyPI Edges!')

    def create_nodes(self):
        """Save every PyPI package name; the row index is the node id."""
        print("Creating Nodes...")
        nodes = [str(pkg["name"]) for pkg in self.saved_data['data']]

        # Save nodes to a CSV
        nodes_df = pd.DataFrame(nodes, columns=["nodes"])
        print(nodes_df)
        self._write_csv(nodes_df, self.nodes_path)
        print('Saved PyPI Nodes!')

    def get_nodes(self):
        """Return the saved node names as a list, ordered by node id.

        Raises:
            NotImplementedError: If the node CSV has not been created yet.
        """
        if os.path.exists(self.nodes_path):
            # na_filter=False / dtype=str keep names such as "nan" or "123" as text.
            nodes_df = pd.read_csv(self.nodes_path, na_filter=False, dtype={"nodes": str})
            return list(nodes_df["nodes"])
        raise NotImplementedError("There are no saved nodes - call create_nodes()!")

    def get_eval_topics(self):
        """Return the saved evaluation topics as a list, ordered by topic id.

        Raises:
            NotImplementedError: If the evaluation-topics CSV has not been created yet.
        """
        if os.path.exists(self.eval_topics_path):
            # Read like get_nodes() so topics such as "null" are not turned into NaN.
            topics_df = pd.read_csv(self.eval_topics_path, na_filter=False, dtype={"topics": str})
            return list(topics_df["topics"])
        raise NotImplementedError("There are no saved evaluation topics - call create_eval_topics_list()!")

    def print_statistics(self):
        """Print node/edge counts and the coverage figures reported by PyPIGraph.

        The edge count is the number of rows in the edge CSV, where each
        undirected link is stored twice.
        """
        edges = pd.read_csv(self.edges_path).values
        nodes = self.get_nodes()
        print('\n------------------------------')
        print('     PyPI Data Statistics    ')
        print('------------------------------')

        print(f"Total Number of Nodes: {len(nodes)}")
        print(f"Total Number of Edges: {len(edges)}")

        graph = PyPIGraph(self.nodes_path, self.edges_path, self.lang_path)
        num_nodes_with_connections = graph.num_nodes_with_connections()
        print(f'Total of {num_nodes_with_connections}/{len(nodes)} has connections')

        num_node_lang = graph.num_nodes_with_features()
        print(f"Total Number of Nodes with language data: {num_node_lang}")

        num_nodes_graph_and_lang_data = graph.num_nodes_with_connections_and_features()
        print(f'Total of {num_nodes_graph_and_lang_data} have language and graph data')
        print()

In [19]:
# Build the preprocessing helper: loads the PyPI JSON, the GitHub data wrapper and the tokenizer.
# NOTE(review): the recorded output reports a missing 'en_core_web_sm' model -
# presumably the Tokenizer needs it installed before this cell can run; confirm.
data = PyPI()

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [5]:
# Create nodes - save every PyPI package name to the node CSV.
# The row index written alongside each name serves as the node id.
data.create_nodes()

Creating Nodes...
                    nodes
0                 sizzler
1         django-mongokit
2        gmaps-url-parser
3   django-excel-response
4                  slidoc
..                    ...
93           django-elmah
94                mambupy
95           django-jauth
96                  awsme
97               yakstack

[98 rows x 1 columns]
Saved PyPI Nodes!


In [6]:
# Create a src -> dest edge list between node ids from the GitHub dependency data.
# Only links whose two packages both have a node record are kept.
# Undirected, so each link is stored twice (once per direction).
data.create_edge_list()

100%|██████████| 44/44 [00:00<00:00, 15293.72it/s]

Creating edge list...
  src dest
0  87   87
1  87   87
Saved PyPI Edges!





In [7]:
# Tokenize one document per node (GitHub README or PyPI description) and index it by node id.
# May take hours for all data.
data.create_node_language_list()

  1%|          | 1/98 [00:00<00:10,  9.17it/s]

Creating node language list (tokenizing too)...


100%|██████████| 98/98 [00:14<00:00,  6.66it/s]

                                             language
0   sizzler vpn websocket sizzler linux tool set v...
1   django mongokit peter bengtsson 2010 2011 lice...
2   gmaps url parser |build| |downloads| |license|...
3   django excel response image target alt late ve...
4   slidoc image target image target image target ...
..                                                ...
93  djelmah djelmah catch unhandled exception prod...
94  image target alt build status mambupy python a...
95  django jauth simple oauth2 authentication clie...
96  amazon web services cloud watch metrics librar...
97  yakstack command line utility help stack yak e...

[98 rows x 1 columns]
Saved PyPI Node Language!





In [8]:
# Get up to the top 200 topics from GitHub repos.
# The row index written alongside each topic serves as its topic id.
data.create_eval_topics_list()

Total of 93 topics to chose from
Getting top 200 topics
             topics
0            django
1             excel
2               csv
3          maildrop
4           testing
..              ...
83  static-analysis
84  static-analyzer
85        simulator
86       api-client
87         requests

[88 rows x 1 columns]
Saved Evaluation Topics!


In [9]:
# Label each node with the evaluation topics it belongs to.
# Saved as (node_id, topic_id) pairs; needs the node and topic CSVs created above.
data.create_node_topics_list()

    node_id  topic_id
0         3         0
1         3         1
2         3         2
3         6         3
4         6         4
..      ...       ...
84       86        83
85       86        84
86       86        85
87       90        86
88       90        87

[89 rows x 2 columns]
Saved Node Topics List!


In [None]:
# Summarize the generated files: node and edge counts plus how many nodes
# have connections and/or language data (reads the CSVs written above).
data.print_statistics()