# Reading and Writing Data

## A) mat files (scipy.io)

In [5]:
import numpy as np
from scipy.io import loadmat as loadmat #this is the scipy module that loads .mat files
from scipy.io import savemat as savemat #this is the scipy module that saves .mat files

# Read the MATLAB file; the result is indexed by variable name below.
matfile = loadmat('python_test.mat')

# Pull the two stored arrays out of the loaded container in one step.
array1, array2 = matfile['array1'], matfile['array2']

# Inspect the first array: its dimensions and its Python type.
array1.shape
type(array1)

# Write array1 back to a new .mat file under the variable name 'array1'.
savemat('python_test_save.mat', {'array1': array1})

## B) csv, txt, xls with pandas

In [6]:
import pandas as pd #great for reading .csv and .txt files

# Remote CSV file that is read directly over HTTP.
uri1 = 'http://www.ldeo.columbia.edu/~rpa/usgs_earthquakes_2014.csv' #example from Ryan's workshop

# index_col='time' makes the 'time' column the row index of the result.
d1 = pd.read_csv(uri1,index_col='time') #many argument options (see the pandas documentation for all the details)

# Check what kind of object read_csv returned.
type(d1)

# Preview the first rows.
d1.head()

time,latitude,longitude,depth,mag,magType,nst,gap,dmin,rms,net,id,updated,place,type
2014-01-31 23:53:37.000,60.252,-152.7081,90.2,1.1,ml,,,,0.29,ak,ak11155107,2014-02-05T19:34:41.515Z,"26km S of Redoubt Volcano, Alaska",earthquake
2014-01-31 23:48:35.452,37.0703,-115.1309,0.0,1.33,ml,4.0,171.43,0.342,0.0247,nn,nn00436847,2014-02-01T01:35:09.000Z,"32km S of Alamo, Nevada",earthquake
2014-01-31 23:47:24.000,64.6717,-149.2528,7.1,1.3,ml,,,,1.0,ak,ak11151142,2014-02-01T00:03:53.010Z,"12km NNW of North Nenana, Alaska",earthquake
2014-01-31 23:30:54.000,63.1887,-148.9575,96.5,0.8,ml,,,,1.07,ak,ak11151135,2014-01-31T23:41:25.007Z,"22km S of Cantwell, Alaska",earthquake
2014-01-31 23:30:52.210,32.616833,-115.6925,10.59,1.34,ml,6.0,285.0,0.04321,0.2,ci,ci37171541,2014-02-01T00:13:20.107Z,"10km WNW of Progreso, Mexico",earthquake


In [7]:
# Remote CSV file used by both read_csv calls in this cell.
uri2 = 'http://karensmith.squarespace.com/storage/python_test.csv'

d2 = pd.read_csv(uri2) #default case (no arguments)

d2.head()


# index_col=1 uses the second column of the file as the row index; it does
# not reorder the remaining columns. Note that this rebinds d2.
d2 = pd.read_csv(uri2,index_col=1) #index_col selects which column becomes the row index

print(d2)

       date  B    C
A                  
1  20151001  a  2.5
2  20151002  b  5.0
3  20151003  c  7.5


In [8]:
# Remote Excel workbook that is read directly over HTTP.
uri3 = 'http://karensmith.squarespace.com/storage/python_test.xls'

# NOTE(review): reading the legacy .xls format presumably needs an optional
# Excel reader package (e.g. xlrd) to be installed - confirm for your environment.
d3 = pd.read_excel(uri3) #pandas can also be used to read .xls files

# Preview the first rows.
d3.head()

Unnamed: 0,date,A,B,C
0,20151001,1,a,2.5
1,20151002,2,b,5.0
2,20151003,3,c,7.5


In [3]:
# Uses the d1 and d2 DataFrames created by the read_csv cells above, so those
# cells must have been run first.
d1.to_csv('earthquakes_test.csv') #writing our d1 DataFrame object to a .csv file

# NOTE(review): writing the legacy .xls format depends on an optional Excel
# writer engine; recent pandas releases may only support .xlsx - confirm
# against the installed pandas version.
d2.to_excel('new_python_test.xls', sheet_name='Sheet1') #writing our d2 DataFrame object to sheet 'Sheet1' of a .xls file

       date  B    C
A                  
1  20151001  a  2.5
2  20151002  b  5.0
3  20151003  c  7.5


## C) netcdf4

In [9]:
from netCDF4 import Dataset

# Remote dataset URL (presumably an OPeNDAP endpoint, given the trailing
# '/dods' - confirm); it is passed straight to Dataset below.
uri = 'http://iridl.ldeo.columbia.edu/SOURCES/.NOAA/.NCDC/.ERSST/.version4/anom/.sst/T/(days%20since%201960-01-01)/streamgridunitconvert/dods'

#use 'Dataset' to open the remote data through the netCDF4 interface
nc = Dataset(uri)

# Show the dataset summary (dimensions, variables, global attributes).
nc

<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF3_64BIT data model, file format UNDEFINED):
    Conventions: IRIDL
    dimensions(sizes): T(1941), X(180), Y(89), zlev(1)
    variables(dimensions): float32 zlev(zlev), float32 X(X), float32 Y(Y), float32 T(T), int16 sst(T,zlev,Y,X)
    groups: 

In [None]:
# sst is stored as (T, zlev, Y, X) with zlev of size 1 (see the dataset summary above).
SST = nc.variables['sst'][:,0] #this is the same as ['sst'][:,0,:,:] -> gets rid of the degenerate zlev dimension (same as squeeze in matlab)
# Read the full Y and X coordinate variables.
Lat = nc.variables['Y'][:]
Lon = nc.variables['X'][:]

In [4]:
#write a new netcdf file

# Coordinate axes on a 2.5-degree grid. np.arange excludes the stop value
# (like Matlab's start:step:stop-step, not linspace), so this gives
# 72 latitudes (-90 ... 87.5) and 144 longitudes (-180 ... 177.5).
lats = np.arange(-90, 90, 2.5)
lons = np.arange(-180, 180, 2.5)
n_times = 5  # number of time records written below

# The with-block closes the file even if one of the writes below raises.
with Dataset('python_test.nc', 'w', format='NETCDF3_CLASSIC') as new_nc: #w is for writing
    new_nc.description = 'Example data'

    # define dimensions; sizes are taken from the axes so the two cannot drift apart
    new_nc.createDimension('time', None) #record (unlimited) dimension
    new_nc.createDimension('lat', len(lats))
    new_nc.createDimension('lon', len(lons))

    # define variables
    times = new_nc.createVariable('time', 'f8', ('time',))
    latitudes = new_nc.createVariable('latitude', 'f4', ('lat',))
    longitudes = new_nc.createVariable('longitude', 'f4', ('lon',))
    tmp = new_nc.createVariable('tmp', 'f4', ('time', 'lat', 'lon',))

    # fill the coordinate variables
    latitudes[:] = lats
    longitudes[:] = lons

    # Append one record per step along the unlimited time dimension.
    for i in range(n_times):
        # Placeholder time coordinate (the step index); previously the 'time'
        # variable was created but never written, leaving only fill values.
        times[i] = i
        tmp[i,:,:] = np.random.uniform(size=(len(lats), len(lons))) #default uniform distribution between 0 and 1

## D) netcdf3

In [10]:
from scipy.io import netcdf #scipy.io can only read/write netcdf3

#read the file we just created above using netcdf4
f = netcdf.netcdf_file('python_test.nc', 'r')  # 'r' opens the file read-only
f


# Global attribute that was set when the file was written.
print(f.description)
# Variable object for 'latitude'; lat[:] below reads its values.
lat = f.variables['latitude']
print(lat.shape)
print(lat[:])

b'Example data'
(72,)
[-90.  -87.5 -85.  -82.5 -80.  -77.5 -75.  -72.5 -70.  -67.5 -65.  -62.5
 -60.  -57.5 -55.  -52.5 -50.  -47.5 -45.  -42.5 -40.  -37.5 -35.  -32.5
 -30.  -27.5 -25.  -22.5 -20.  -17.5 -15.  -12.5 -10.   -7.5  -5.   -2.5
   0.    2.5   5.    7.5  10.   12.5  15.   17.5  20.   22.5  25.   27.5
  30.   32.5  35.   37.5  40.   42.5  45.   47.5  50.   52.5  55.   57.5
  60.   62.5  65.   67.5  70.   72.5  75.   77.5  80.   82.5  85.   87.5]


In [None]:
# Close the file opened with netcdf.netcdf_file in the cell above.
f.close()
# Any data that is still needed after the netcdf file is closed has to be
# copied into main memory first (see message below).

## E) hdf5

In [None]:
import h5py

## F) Using numpy to import regular columns of data from .CSV file

In [None]:
import numpy
# NOTE(review): file_location_and_name is not defined anywhere in the visible
# notebook - assign it the path of a comma-delimited text file before running.
signal = numpy.loadtxt(file_location_and_name, delimiter=',')

## G) KML files

In [None]:
from fastkml import  kml
# Read the KML document as raw bytes. The Python 2 builtin file() does not
# exist in Python 3; open() inside a with-block also closes the handle.
with open("Allpoints.kml", "rb") as kml_file:
    doc = kml_file.read()

# Parse the document into a KML object.
k = kml.KML()
k.from_string(doc)

# Count the top-level features. list() makes this work whether features()
# returns a list or a generator.
# NOTE(review): confirm the features() API against the installed fastkml version.
len(list(k.features()))

In [None]:
from osgeo import gdal
# NOTE(review): this binds the gdal.Open function itself to `geo` without
# calling it, so no dataset is opened here - the example looks unfinished.
geo = gdal.Open
# Placeholder for the rest of the example.
...

## H) VTK files

In [14]:
import vtk
from vtk.util.numpy_support import vtk_to_numpy

# load a vtk file as input
# NOTE(review): vtkXMLUnstructuredGridReader is the reader for VTK's XML
# unstructured-grid format - confirm the input file is in that format.
reader = vtk.vtkXMLUnstructuredGridReader()
reader.SetFileName("my_input_data.vtk")
reader.Update()
grid = reader.GetOutput()

#Grab the node coordinates and a scalar from the vtk file
# nodes_vtk_array was previously used without ever being defined (NameError).
nodes_vtk_array = grid.GetPoints().GetData()
my_vtk_array = grid.GetPointData().GetArray("my_scalar_name")

#Get the coordinates of the nodes and the scalar values
nodes_nummpy_array = vtk_to_numpy(nodes_vtk_array)
my_numpy_array = vtk_to_numpy(my_vtk_array)

# Split the coordinate array into its three columns.
x, y, z = nodes_nummpy_array[:,0], nodes_nummpy_array[:,1], nodes_nummpy_array[:,2]

ImportError: No module named 'vtk'