## This model constructs a networkx graph from data using the following nodes and edges

### Nodes:
- Station

### Edges:
- Trip: bike trip between stations

In [1]:
import pandas as pd
import networkx as nx

In [25]:
def fix_columns(df):
    '''
    Replace spaces with underscores in the column names of df, in place.

    Only string labels that contain a space are renamed; labels of any other
    type (for example the integer labels of a header-less frame) are left
    untouched instead of raising a TypeError.

    params:
        df: pandas DataFrame whose columns are normalised in place.

    return:
        None. The DataFrame passed in is modified directly.
    '''
    col_rename = {
        c: c.replace(' ', '_')
        for c in df.columns
        if isinstance(c, str) and ' ' in c
    }
    # Skip the rename call entirely when no label needs changing.
    if col_rename:
        df.rename(columns=col_rename, inplace=True)

In [26]:
# Locations of the source CSV files, relative to the working directory.
stations_filename = '../data/201508_station_data.csv'
trips_filename = '../data//201508_trip_data.csv'

# Read each CSV into its own DataFrame (trips first, then stations).
trips_df, stations_df = map(pd.read_csv, (trips_filename, stations_filename))

# Normalise the headers in place: spaces in column names become underscores.
fix_columns(stations_df)
fix_columns(trips_df)


In [27]:
# print(trips_df.columns)

Index(['Trip_ID', 'Duration', 'Start_Date', 'Start_Station', 'Start_Terminal',
       'End_Date', 'End_Station', 'End_Terminal', 'Bike_#', 'Subscriber_Type',
       'Zip_Code'],
      dtype='object')


In [30]:
from enum import Enum

class GraphEdges(Enum):
    '''Edge modes accepted by create_graph through its graph_edges parameter.'''

    # create_graph builds an nx.MultiGraph for any value other than BIDIR.
    UNIDIR = 1
    # create_graph builds an nx.MultiDiGraph when this member is passed.
    BIDIR = 2
    

# Schema describing how the source tables map onto graph nodes and edges.
#   nodes[*]: 'type' labels the node, 'key' names the column holding the node
#             identifier, 'attributes' lists the columns copied from the table.
#   edges[*]: 'type' labels the edge, 'from'/'to' name the edge-table columns
#             ('key') holding the endpoints, 'attributes' lists the columns
#             copied onto each edge.
# 'node_key' names the node-side key the endpoint refers to; it must match the
# node type's 'key' exactly (column names are case-sensitive).
mapper = {
    'nodes': [
        {
            'type': 'Station',
            'key': 'station_id',
            'attributes': ['station_id', 'name', 'lat', 'long', 'landmark'],
        },
    ],

    'edges': [
        {
            'type': 'Trip',
            'from': {
                'key': 'Start_Terminal',
                'node_key': 'station_id',
            },
            'to': {
                'key': 'End_Terminal',
                'node_key': 'station_id',
            },
            'attributes': ['Trip_ID', 'Start_Date', 'End_Date'],
        },
    ],
}

In [35]:
def create_graph(graph_mapper, nodes_from, edges_from, graph_type='networkx',
                 graph_edges=GraphEdges.UNIDIR):
    '''
    Build a networkx multigraph from a node table and an edge table.

    Every entry of graph_mapper['nodes'] contributes one node per row of
    nodes_from: the value in the entry's 'key' column becomes the node
    identifier, the remaining 'attributes' columns become node attributes and
    a '_type_' attribute holds the entry's 'type'.

    Every entry of graph_mapper['edges'] contributes one edge per row of
    edges_from: the endpoints are the values found in the ['from']['key'] and
    ['to']['key'] columns, the 'attributes' columns become edge attributes and
    a '_type_' attribute holds the entry's 'type'.

    params:
        graph_mapper: dict with 'nodes' and 'edges' lists describing the schema.
        nodes_from: pandas DataFrame holding the node rows.
        edges_from: pandas DataFrame holding the edge rows.
        graph_type: 'networkx' | 'graphframe'. Currently not consulted; a
            networkx graph is always built.
        graph_edges: GraphEdges enum (UNIDIR | BIDIR). BIDIR builds an
            nx.MultiDiGraph, any other value an nx.MultiGraph.

    return:
        the populated networkx graph object.

    raises:
        AssertionError: when a mapper attribute is missing from the columns of
            the corresponding DataFrame.
        KeyError: when an edge endpoint column is missing from edges_from, or
            when a node 'key' is not listed in that node's 'attributes'.
    '''
    # NOTE(review): BIDIR selects the *directed* multigraph and UNIDIR the
    # undirected one; this reads inverted relative to the member names.
    # Behavior kept as-is -- confirm the intended meaning before changing it.
    if graph_edges == GraphEdges.BIDIR:
        gObj = nx.MultiDiGraph()
    else:
        gObj = nx.MultiGraph()

    for node_type in graph_mapper['nodes']:
        assert all(c in nodes_from.columns for c in node_type['attributes']), \
                "mismatch between nodes_from and mapper's attributes for node: {}".format(node_type['type'])

        # rename/set_index return new frames, so the caller's DataFrame is
        # never modified and no chained-assignment warning is triggered.
        nodes_df = (
            nodes_from.loc[:, node_type['attributes']]
            .rename(columns={node_type['key']: '_id_'})
            .set_index('_id_')
        )
        nodes_df['_type_'] = node_type['type']
        # to_dict('index') maps node id -> attribute dict, which is exactly
        # the (node, attributes) pair shape add_nodes_from accepts.
        gObj.add_nodes_from(list(nodes_df.to_dict('index').items()))

    for edge_type in graph_mapper['edges']:
        assert all(c in edges_from.columns for c in edge_type['attributes']), \
                "mismatch between edges_from and mapper's attributes for edge: {}".format(edge_type['type'])

        edge_label = edge_type['type']
        attr_names = list(edge_type['attributes'])
        wanted = [edge_type['from']['key'], edge_type['to']['key']] + attr_names

        # Select the needed columns by label and iterate WITHOUT the index.
        # The default itertuples() puts the index at position 0, so indexing
        # its rows with columns.get_loc() reads the column to the left of the
        # one requested.
        rows = edges_from.loc[:, wanted].itertuples(index=False, name=None)

        gObj.add_edges_from(
            (src, dst, {'_type_': edge_label, **dict(zip(attr_names, values))})
            for src, dst, *values in rows
        )

    return gObj

In [36]:
%%time
# Build the graph: stations supply the nodes, trips supply the edges,
# and mapper describes which columns feed which part of the graph.
g = create_graph(mapper, stations_df, trips_df, graph_type='networkx')

CPU times: user 4.27 s, sys: 83.4 ms, total: 4.35 s
Wall time: 4.35 s


In [None]:
# Total number of edges held by the graph.
g.number_of_edges()

In [None]:
# Total number of nodes held by the graph.
g.number_of_nodes()