## Introduction to Vaex

In [1]:
import vaex

import os
import h5py
import numpy as np
import pylab as pb
import pandas as pd

import altair as alt

#### Convert CSV into HDF5 (Hierarchical Data Format)
#### HDF has 3 basic types - Files, Groups and Datasets.
<i>Files may have Groups and Datasets. The data model is like a file system: there is a root group, which may contain several groups or datasets, and groups may in turn contain groups and datasets.</i>

In [2]:
# Path of the source CSV; the HDF5-building cell below reads it in chunks with pd.read_csv.
fileName = "./data/Locality_village_pincode_final_mar-2017.csv"

In [3]:
%%cmd
REM Windows-only: count the lines of the source CSV. The reported count is hard-coded as nLines in the next cell.
SET fileName="./data/Locality_village_pincode_final_mar-2017.csv"
find /c /v "" %fileName%

Microsoft Windows [Version 10.0.18362.836]
(c) 2019 Microsoft Corporation. All rights reserved.

C:\Users\knows\OneDrive\Documents\Studies\Programming\Vaex>SET fileName="./data/Locality_village_pincode_final_mar-2017.csv"

C:\Users\knows\OneDrive\Documents\Studies\Programming\Vaex>find /c /v "" %fileName%

---------- ./DATA/LOCALITY_VILLAGE_PINCODE_FINAL_MAR-2017.CSV: 906268

C:\Users\knows\OneDrive\Documents\Studies\Programming\Vaex>

In [4]:
# Split the file into fixed-size segments of `chunksize` lines each.
nLines = 906_268                 # line count reported by the `find /c` cell above
chunksize = 2_000                # lines per segment
nSegments = nLines // chunksize  # number of complete segments
nSegments

453

In [5]:
# How many lines remain once all complete segments are accounted for
leftover = nLines % chunksize
leftover

268

#### Size Stats
<i>The CSV file size is 55.9 MB.</i><br>
<i>With guesstimated column widths, the HDF5 file size is 239 MB.</i><br>
<i>After compressing the HDF5 file, its size is 36.568 MB.</i><br>

In [22]:
# Create the hdf5 file from the CSV, one chunk of `chunksize` rows at a time.
hdf_key = 'hdf_key'
columns = ['Locality name','Officename','Pincode','Sub-distname','Districtname','StateName']

# mode='w' starts from an empty file, so re-running this cell does not append the
# whole CSV a second time under the same key (the default mode is append).
# The `with` block closes the store even when reading or appending a chunk fails.
with pd.HDFStore('data/IndiaPost12.h5', mode='w', complevel=9, complib='zlib') as store:
    for chunk in pd.read_csv(fileName, chunksize=chunksize, encoding='latin1'):
        # don't index data columns in each iteration - the index is built once below.
        # min_itemsize fixes the string column widths up front for all chunks.
        store.append(hdf_key, chunk, data_columns=columns, index=False,
                     min_itemsize={'Locality name':80,'Officename': 50,
                                   'Sub-distname':50, 'StateName':30, 'Districtname':25}, encoding='latin1')

    # index the data columns in the HDFStore once, after every chunk has been written
    store.create_table_index(hdf_key, columns=columns, optlevel=9, kind='full')


In [3]:
# Read the table stored under 'hdf_key' in the HDF5 file into a pandas DataFrame.
# Naming the key explicitly keeps the read working even if the file ever holds
# more than one object (read_hdf without a key only accepts a single-object file).
pDf = pd.read_hdf('data/IndiaPost12.h5', key='hdf_key')

INFO:MainThread:numexpr.utils:NumExpr defaulting to 4 threads.


In [5]:
# The HDF5 file above was written by pandas, so it is loaded with pandas first and the
# resulting DataFrame is handed to Vaex here.
# NOTE(review): the earlier remark that Vaex lacks HDF5 support and supports "HDFS" looks
# inaccurate - presumably Vaex reads its own HDF5 layout but not the table layout pandas
# writes; confirm against the Vaex documentation.
vDf = vaex.from_pandas(pDf, copy_index=False)

In [19]:
# Load the India postal-code table into pandas (presumably the GeoNames export linked further down - confirm).
pDf1 = pd.read_csv('data/IN/IN.csv')

In [20]:
# Rebinds vDf to a Vaex DataFrame built from pDf1; the frame built from pDf earlier
# is no longer reachable through this name.
vDf = vaex.from_pandas(pDf1, copy_index=False)

In [21]:
# Preview the first rows of the Vaex DataFrame
vDf.head()

#,country code,postal code,Area,State,state code,District,province code,community name,community code,latitude,longitude,accuracy
0,IN,744101,Marine Jetty,Andaman & Nicobar Islands,1,South Andaman,,Portblair,,11.6667,92.75,3
1,IN,744101,Port Blair,Andaman & Nicobar Islands,1,South Andaman,,Port Blair,,11.6667,92.75,4
2,IN,744101,N.S.Building,Andaman & Nicobar Islands,1,South Andaman,,Portblair,,11.6667,92.75,3
3,IN,744102,Haddo,Andaman & Nicobar Islands,1,South Andaman,,Port Blair,,11.6833,92.7167,4
4,IN,744102,Chatham,Andaman & Nicobar Islands,1,South Andaman,,Portblair,,11.7,92.6667,3
5,IN,744102,Herbertabad,Andaman & Nicobar Islands,1,South Andaman,,Ferrargunj,,11.7167,92.6167,4
6,IN,744102,Delanipur,Andaman & Nicobar Islands,1,South Andaman,,Portblair,,11.7,92.6667,3
7,IN,744102,Radio Colony,Andaman & Nicobar Islands,1,South Andaman,,Port Blair,,11.7,92.6667,3
8,IN,744103,Minnie Bay,Andaman & Nicobar Islands,1,South Andaman,,Portblair,,11.6651,92.7121,1
9,IN,744103,Brijgunj,Andaman & Nicobar Islands,1,South Andaman,,Port Blair,,11.6651,92.7121,1


##### Postal code (ZIP) data source: http://www.geonames.org/export/zip/

In [77]:
# Boolean mask selecting the rows whose District is one of the four cities of interest
pDfCity = pDf1['District'].isin(
    ['Mumbai','Ghaziabad', 'Kolkata', 'Chennai']
)
# Display only the matching rows
pDf1.loc[pDfCity]

Unnamed: 0,country code,postal code,Area,State,state code,District,province code,community name,community code,latitude,longitude,accuracy
36230,IN,400001,Mumbai G.P.O.,Maharashtra,16,Mumbai,,Mumbai,,18.6291,72.8919,1
36231,IN,400001,Bazargate,Maharashtra,16,Mumbai,,Mumbai,,18.6291,72.8919,1
36232,IN,400001,Town Hall (Mumbai),Maharashtra,16,Mumbai,,Mumbai,,18.6291,72.8919,1
36233,IN,400001,Tajmahal,Maharashtra,16,Mumbai,,Mumbai,,18.6291,72.8919,1
36235,IN,400001,Stock Exchange,Maharashtra,16,Mumbai,,Mumbai,,18.6291,72.8919,1
...,...,...,...,...,...,...,...,...,...,...,...,...
132994,IN,245304,Anwarpur,Uttar Pradesh,36,Ghaziabad,,Hapur,,28.4703,78.2375,3
132995,IN,245304,Kalchina,Uttar Pradesh,36,Ghaziabad,,Hapur,,28.4703,78.2375,3
132996,IN,245304,Tyori,Uttar Pradesh,36,Ghaziabad,,Hapur,,28.4703,78.2375,3
132997,IN,245304,Pilkhauwa Post Office,Uttar Pradesh,36,Ghaziabad,,Ghaziabad,,28.4703,78.2375,3


In [80]:


# Lift Altair's row limit so the whole filtered frame can be embedded in the chart.
alt.data_transformers.disable_max_rows()

# TopoJSON outline of the Indian states, fetched from GitHub when the chart renders.
url = "https://raw.githubusercontent.com/deldersveld/topojson/master/countries/india/india-states.json"
source = alt.topo_feature(url, "IND_adm1")

# Background layer: state shapes in light grey.
base = (
    alt.Chart(source)
    .mark_geoshape(color='#D3D3D3')
    .encode()
    .properties(width=650, height=900)
)

# Foreground layer: one small circle per postal-code row of the selected districts,
# coloured by district.
# NOTE(review): the tooltip field 'postal code ' carries a trailing space - confirm it
# matches the actual column name in data/IN/IN.csv.
points = (
    alt.Chart(pDf1[pDfCity])
    .mark_circle(size=10)
    .encode(
        longitude='longitude:Q',
        latitude='latitude:Q',
        color=alt.Color('District:N'),
        tooltip=alt.Tooltip(['Area','District', 'State', 'postal code ']),
    )
)

# Stack the points on top of the state outlines and display the result.
mapF = alt.layer(base, points)

mapF

In [81]:
# Export the layered chart to an HTML file in the current working directory
mapF.save('IndiaPinCode4City.html')