In [None]:
import pyspark
# Reuse an already-running SparkContext when there is one (for example when this
# cell is executed a second time); otherwise start a local context. Calling the
# SparkContext constructor directly raises if a context is still active.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

In [None]:
!rm ./metastore_db/*.lck

In [None]:
from pyspark.sql import SQLContext
# SQL entry point built on the SparkContext `sc`; its `read` attribute is what
# the cells below use to load files into DataFrames.
sqlc = SQLContext(sc)

## CSV

In [None]:
!wget https://github.com/databricks/spark-csv/raw/master/src/test/resources/cars.csv

In [None]:
# Read cars.csv with the spark-csv data source, with the "header" and
# "inferSchema" options both switched on.
df_cars = (
    sqlc.read
    .format("com.databricks.spark.csv")
    .options(header="true", inferSchema="true")
    .load("cars.csv")
)

In [None]:
# Print the first rows of the loaded CSV as a text table.
df_cars.show()

In [None]:
# Show the column names and types that were picked up with inferSchema enabled.
df_cars.printSchema()

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Explicit schema for cars.csv: five columns, every one of them (including
# "year") declared as a nullable string.
customSchema = StructType(list(
    StructField(column, StringType(), True)
    for column in ("year", "make", "model", "comment", "blank")
))

In [None]:
# Load cars.csv again, this time applying customSchema instead of inferring
# the column types.
df_cars2 = (
    sqlc.read
    .format("com.databricks.spark.csv")
    .schema(customSchema)
    .options(header=True)
    .load("cars.csv")
)

In [None]:
# Show the schema of the DataFrame that was loaded with customSchema.
df_cars2.printSchema()

In [None]:
!rm -rf newcars.csv

selectedData = df_cars.select("year", "model","comment")
selectedData.coalesce(1).write.format("com.databricks.spark.csv") \
                        .option("header", "true") \
                        .option("nullValue","NA") \
                        .save("newcars.csv") \

In [None]:
# List what the CSV write above produced at the newcars.csv path.
!ls -l newcars.csv

In [None]:
!rm -rf newcars.csv.gz
selectedData.write.format("com.databricks.spark.csv") \
                    .option("header", "true") \
                    .option("codec", "gzip") \
                    .save("newcars.csv.gz")

In [None]:
# List what the gzip-compressed write produced at the newcars.csv.gz path.
!ls -l newcars.csv.gz

## XML

In [None]:
!wget https://github.com/databricks/spark-xml/raw/master/src/test/resources/books.xml

In [None]:
# Print the raw XML file so its structure is visible before loading it.
!cat books.xml

In [None]:
# Read books.xml with the spark-xml data source, using "book" as the rowTag.
df_books = (
    sqlc.read
    .format("com.databricks.spark.xml")
    .options(rowTag="book")
    .load("books.xml")
)

In [None]:
# Show the column names and types of the DataFrame loaded from books.xml.
df_books.printSchema()

In [None]:
# Print the first rows of the books DataFrame as a text table.
df_books.show()

In [None]:
!rm -rf newbooks.xml

selectedData = df_books.select("author", "_id")
selectedData.write.format("com.databricks.spark.xml") \
                .option("rootTag", "books") \
                .option("rowTag", "book") \
                .save("newbooks.xml")

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# Explicit schema for books.xml: every field is nullable; "price" is a double
# and all other fields are strings.
customSchema = StructType(list(
    StructField(field_name, field_type, nullable=True)
    for field_name, field_type in (
        ("_id", StringType()),
        ("author", StringType()),
        ("description", StringType()),
        ("genre", StringType()),
        ("price", DoubleType()),
        ("publish_date", StringType()),
        ("title", StringType()),
    )
))

In [None]:
# Load books.xml again, this time applying customSchema.
df_books = (
    sqlc.read
    .format("com.databricks.spark.xml")
    .options(rowTag="book")
    .schema(customSchema)
    .load("books.xml")
)

# Write the author/_id selection back out as XML; mode("overwrite") replaces
# any existing newbooks.xml output.
selectedData = df_books.select("author", "_id")
(
    selectedData.write
    .format("com.databricks.spark.xml")
    .options(rootTag="books", rowTag="book")
    .mode("overwrite")
    .save("newbooks.xml")
)

In [None]:
# Shut down the SparkContext now that the notebook is finished with it.
sc.stop()