# Read a LAS file

With this example the reader reads a LAS file from remote storage and stores it into a DataFrame in Spark.

## Dependencies

In [7]:
#Add all dependencies to PYTHON_PATH
import sys
sys.path.append("/usr/lib/spark/python")
sys.path.append("/usr/lib/spark/python/lib/py4j-0.10.4-src.zip")
sys.path.append("/usr/lib/python3/dist-packages")

#Define environment variables
import os
os.environ["HADOOP_CONF_DIR"] = "/etc/hadoop/conf"
os.environ["PYSPARK_PYTHON"] = "python3"
os.environ["PYSPARK_DRIVER_PYTHON"] = "ipython"
os.environ["PATH"] = "/data/local/jupyterhub/modules/LAStools/bin:" + os.environ["PATH"]

#Load PySpark to connect to a Spark cluster
from pyspark import sql, SparkConf, SparkContext, SQLContext
from pyspark.sql.types import *

from osgeo import gdal
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pylab import *
#To read GeoTiffs as a ByteArray
from io import BytesIO

from laspy.file import File
import urllib  # the lib that handles the url stuff

## Connect to Spark

In [8]:
appName = "read LAS file"
masterURL="spark://emma0.emma.nlesc.nl:7077"

#A context needs to be created if it does not already exist
try:
    saprk.stop()
except NameError:
    print("A  new Spark Context will be created.")

spark = sql.SparkSession.builder.appName(appName).master(masterURL).config("parquet.enable.dictionary", "true").config("parquet.compression", "SNAPPY").getOrCreate()
sc = spark.sparkContext
sqlCtx = SQLContext(sc)

A  new Spark Context will be created.


## Read data

In [9]:
las_url = 'https://geodata.nationaalgeoregister.nl/ahn3/extract/ahn3_laz/C_25EZ2.LAZ'
#data = urllib.request.urlopen(las_url) # it's a file like object and works just like a file
file_path = '/data/local/jupyterhub/C_25EZ2.laz'

In [10]:
import requests
from pathlib import Path

las_file = Path(file_path)
if (las_file.is_file() == False):
    r = requests.get(las_url, allow_redirects=True)
    open(file_path, 'wb').write(r.content)

inFile = File(file_path, mode='r')    

In [11]:
#cols = np.array(['x', 'y', 'z', 'intensity', 'fkag_byte', 'raw_classification', 'scan_angle_rank', 'user_data', 'pt_src_id', 'gps_time'])
cols = np.array(['x', 'y', 'z', 'intensity', 'raw_classification'])

In [12]:
#dataset = pd.DataFrame(columns=cols, dtype=int32)
dataset = pd.DataFrame()#dtype=int32)

In [13]:
#dataset = dataset.assign(**{
#    'x' : inFile.X, 
#    'y' : inFile.X, 
#    'z' : inFile.X, 
#    'intensity' : inFile.intensity,
#    'flag_byte' : inFile.flag_byte,
#    'raw_classfication' : inFile.raw_classification,
#    'user_data' : inFile.user_data,
#    'pt_src_id' : inFile.pt_src_id,
#    'gps_time' : inFile.gps_time
#})

In [14]:
dataset = dataset.assign(**{
    'x' : inFile.X, 
    'y' : inFile.X, 
    'z' : inFile.X, 
    'intensity' : inFile.intensity,
    'raw_classification' : inFile.raw_classification
})

In [16]:
dataset.dtypes

intensity             uint16
raw_classification     uint8
x                      int32
y                      int32
z                      int32
dtype: object

# Create DataFrame

In [17]:
lasSchema = StructType([
    StructField("x", IntegerType(), True),
    StructField("y", IntegerType(), True),
    StructField("z", IntegerType(), True),
    StructField("intensity", ShortType(), True),
    StructField("raw_classification", ByteType(), True)
])

In [18]:
sdf = sqlCtx.createDataFrame(dataset, samplingRatio=None, verifySchema=False, schema=lasSchema)

In [19]:
sdf.write.parquet("/user/emma/C_25EZ2.parquet")

In [20]:
sdf.dtypes

[('x', 'int'),
 ('y', 'int'),
 ('z', 'int'),
 ('intensity', 'smallint'),
 ('raw_classification', 'tinyint')]