In [1]:
import pyspark
# Reuse the running SparkContext when this cell is executed again; constructing a
# second context in the same kernel raises a ValueError.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

In [None]:
!rm ./metastore_db/*.lck

In [5]:
from pyspark.sql import SQLContext

# SQL entry point bound to the SparkContext `sc`; every read below goes through it.
sqlc = SQLContext(sparkContext=sc)

## CSV

In [None]:
!wget https://github.com/databricks/spark-csv/raw/master/src/test/resources/cars.csv

In [None]:
# Load cars.csv through the spark-csv data source, with the "header" and
# "inferSchema" options both switched on.
df_cars = (
    sqlc.read
    .format("com.databricks.spark.csv")
    .options(header="true", inferSchema="true")
    .load("cars.csv")
)

In [None]:
# Display the rows loaded from cars.csv.
df_cars.show()

In [None]:
# Print the schema of df_cars (loaded with inferSchema enabled).
df_cars.printSchema()

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Explicit schema for cars.csv: five columns, each declared as a nullable string.
customSchema = StructType([
    StructField("year", StringType(), nullable=True),
    StructField("make", StringType(), nullable=True),
    StructField("model", StringType(), nullable=True),
    StructField("comment", StringType(), nullable=True),
    StructField("blank", StringType(), nullable=True),
])

In [None]:
# Read cars.csv again, this time applying customSchema instead of inferring types.
df_cars2 = (
    sqlc.read
    .format("com.databricks.spark.csv")
    .schema(customSchema)
    .options(header=True)
    .load("cars.csv")
)

In [None]:
# Print the schema of df_cars2, which was loaded with the explicit customSchema.
df_cars2.printSchema()

In [None]:
!rm -rf newcars.csv

selectedData = df_cars.select("year", "model","comment")
selectedData.coalesce(1).write.format("com.databricks.spark.csv") \
                        .option("header", "true") \
                        .option("nullValue","NA") \
                        .save("newcars.csv") \

In [None]:
# List what the CSV save produced at newcars.csv.
!ls -l newcars.csv

In [None]:
!rm -rf newcars.csv.gz
selectedData.write.format("com.databricks.spark.csv") \
                    .option("header", "true") \
                    .option("codec", "gzip") \
                    .save("newcars.csv.gz")

In [None]:
# List what the gzip save produced at newcars.csv.gz.
!ls -l newcars.csv.gz

## XML

In [2]:
!wget https://github.com/databricks/spark-xml/raw/master/src/test/resources/books.xml

--2017-05-02 14:59:18--  https://github.com/databricks/spark-xml/raw/master/src/test/resources/books.xml
Resolving github.com (github.com)... 192.30.253.113, 192.30.253.112
Connecting to github.com (github.com)|192.30.253.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/databricks/spark-xml/master/src/test/resources/books.xml [following]
--2017-05-02 14:59:19--  https://raw.githubusercontent.com/databricks/spark-xml/master/src/test/resources/books.xml
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5542 (5.4K) [text/plain]
Saving to: ‘books.xml.1’


2017-05-02 14:59:19 (2.80 MB/s) - ‘books.xml.1’ saved [5542/5542]



In [3]:
# Show the raw XML to see the <catalog>/<book> layout that the rowTag option refers to.
!cat books.xml

<?xml version="1.0"?>
<catalog>
   <book id="bk101">
      <author>Gambardella, Matthew</author>
      <title>XML Developer's Guide</title>
      <genre>Computer</genre>
      <price>44.95</price>
      <publish_date>2000-10-01</publish_date>
      <description>


         An in-depth look at creating applications
         with XML.This manual describes Oracle XML DB, and how you can use it to store, generate, manipulate, manage,
         and query XML data in the database.


         After introducing you to the heart of Oracle XML DB, namely the XMLType framework and Oracle XML DB repository,
         the manual provides a brief introduction to design criteria to consider when planning your Oracle XML DB
         application. It provides examples of how and where you can use Oracle XML DB.


         The manual then describes ways you can store and retrieve XML data using Oracle XML DB, APIs for manipulating
         XMLType data, and ways you can view, generate

In [6]:
# Load books.xml through the spark-xml data source, using <book> as the row tag.
df_books = (
    sqlc.read
    .format("com.databricks.spark.xml")
    .options(rowTag="book")
    .load("books.xml")
)

In [7]:
# Print the schema that was derived for the <book> rows.
df_books.printSchema()

root
 |-- _id: string (nullable = true)
 |-- author: string (nullable = true)
 |-- description: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- price: double (nullable = true)
 |-- publish_date: string (nullable = true)
 |-- title: string (nullable = true)



In [None]:
# Display the rows parsed from books.xml.
df_books.show()

In [None]:
!rm -rf newbooks.xml

selectedData = df_books.select("author", "_id")
selectedData.write.format("com.databricks.spark.xml") \
                .option("rootTag", "books") \
                .option("rowTag", "book") \
                .save("newbooks.xml")

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# Explicit schema for the <book> rows: all columns are nullable; price is a
# double, every other column is a string.
customSchema = StructType([
    StructField("_id", StringType(), True),
    StructField("author", StringType(), True),
    StructField("description", StringType(), True),
    StructField("genre", StringType(), True),
    StructField("price", DoubleType(), True),
    StructField("publish_date", StringType(), True),
    StructField("title", StringType(), True),
])

In [None]:
# Re-read books.xml, this time applying the explicit customSchema.
df_books = (
    sqlc.read
    .format("com.databricks.spark.xml")
    .options(rowTag="book")
    .schema(customSchema)
    .load("books.xml")
)

# Save author and _id again; the save mode is set to "overwrite", so an existing
# newbooks.xml does not have to be removed first.
selectedData = df_books.select("author", "_id")
(
    selectedData.write
    .format("com.databricks.spark.xml")
    .options(rootTag="books", rowTag="book")
    .mode("overwrite")
    .save("newbooks.xml")
)

In [None]:
# Stop the SparkContext now that the notebook is finished.
sc.stop()