# Apache Spark Writing and Reading using MongoDB on Docker 

In [1]:
# findspark is initialised here, before any cell imports from `pyspark`.
# NOTE(review): presumably this relies on a local Spark installation being
# discoverable (e.g. via SPARK_HOME) — confirm for the target environment.
import findspark
findspark.init()

## 1. Spark and MongoDB Configuration

#### Read the CSV file from the local filesystem

In [2]:
# Load the film data from a local CSV file: the first row is the header,
# column types are inferred from the data, and fields are comma-separated.
# NOTE(review): `spark` is not created in any cell above this one — confirm
# where the session comes from before relying on Restart & Run All.
film_df = (
    spark.read
    .format("csv")
    .options(header="True", inferSchema="True", sep=",")
    .load("data/film_data.csv")
)

In [3]:
# Preview the first five rows as a pandas DataFrame. The limit is applied in
# Spark *before* the conversion so only those five rows are collected to the
# driver, instead of materialising the whole dataset just to show its head.
film_df.limit(5).toPandas()

Unnamed: 0,Name,Genre,Length,Score,Country,Year,Budget
0,stand by Me,Adventure,89,8.1,USA,1986,8000000
1,ferris Bueller's Day Off,Comedy,103,7.8,USA,1986,6000000
2,Top Gun,Action,110,6.9,USA,1986,15000000
3,Aliens,Action,137,8.4,USA,1986,18500000
4,Flight of the Navigator,Adventure,90,6.9,USA,1986,9000000


In [4]:
# Print the first five rows using Spark's own tabular renderer.
film_df.show(n=5)

+--------------------+---------+------+-----+-------+----+--------+
|                Name|    Genre|Length|Score|Country|Year|  Budget|
+--------------------+---------+------+-----+-------+----+--------+
|         stand by Me|Adventure|    89|  8.1|    USA|1986| 8000000|
|ferris Bueller's ...|   Comedy|   103|  7.8|    USA|1986| 6000000|
|             Top Gun|   Action|   110|  6.9|    USA|1986|15000000|
|              Aliens|   Action|   137|  8.4|    USA|1986|18500000|
|Flight of the Nav...|Adventure|    90|  6.9|    USA|1986| 9000000|
+--------------------+---------+------+-----+-------+----+--------+
only showing top 5 rows



## 2. Writing data to MongoDB running on Docker

### Method 1

Use this method if the database name and collection name are defined in the SparkSession configuration (as part of the MongoDB connection URI).

In [5]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for Method 1. The MongoDB URIs include
# the database and collection ("filmdb.films"), so reads and writes that use
# the "mongo" format need no further database/collection options.
# The session is bound to `spark`, the name the read cells of this notebook use.
spark = (
    SparkSession.builder
    .appName("WriteToMongoDb")
    .master("local[4]")
    # Key corrected from the misspelled "spark.executer.memory", which is not
    # a Spark property and therefore never set the executor memory.
    .config("spark.executor.memory", "3g")
    .config("spark.driver.memory", "3g")
    .config("spark.mongodb.input.uri", "mongodb://192.168.99.100:27017/filmdb.films")
    .config("spark.mongodb.output.uri", "mongodb://192.168.99.100:27017/filmdb.films")
    .getOrCreate()
)

# Backward-compatible alias: the session used to be stored under this name.
pyspark = spark

In [6]:
# Write the films through the "mongo" data source without naming a database
# or collection here (Method 1 puts them in the session's output URI).
# "append" adds documents to the collection, so re-running this cell inserts
# the same rows again.
(
    film_df.write
    .format("mongo")
    .mode("append")
    .save()
)

### Method 2: Database and collection names are not defined in the SparkSession

Use this method if you did not define the database name and collection name in the SparkSession configuration; they are passed as options on the write instead.

In [7]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for Method 2. The MongoDB URIs carry only
# host and port; database and collection are supplied per read/write call.
# The session is bound to `spark`, the name the read cells of this notebook use.
# NOTE(review): getOrCreate() returns an already-running session if one
# exists, so after running Method 1 in the same kernel these URI settings may
# not replace the earlier ones — confirm, or restart the kernel between methods.
spark = (
    SparkSession.builder
    .appName("WriteToMongoDb")
    .master("local[4]")
    # Key corrected from the misspelled "spark.executer.memory", which is not
    # a Spark property and therefore never set the executor memory.
    .config("spark.executor.memory", "3g")
    .config("spark.driver.memory", "3g")
    .config("spark.mongodb.input.uri", "mongodb://192.168.99.100:27017")
    .config("spark.mongodb.output.uri", "mongodb://192.168.99.100:27017")
    .getOrCreate()
)

# Backward-compatible alias: the session used to be stored under this name.
pyspark = spark

In [8]:
# Write the films through the "mongo" data source, naming the target database
# and collection explicitly as write options.
# "append" adds documents to the collection, so re-running this cell inserts
# the same rows again.
(
    film_df.write
    .format("mongo")
    .option("database", "filmdb")
    .option("collection", "films")
    .mode("append")
    .save()
)

## 3. Reading data from MongoDB on Docker with Spark

In [10]:
# Read the `filmdb.films` collection back into a Spark DataFrame.
# `spark` must be the active SparkSession configured with the MongoDB input URI.
# The former `replaceDocument` option was dropped: it is a write-side setting
# of the MongoDB connector and has no effect on a read.
mongo_df = (
    spark.read
    .format("mongo")
    .option("database", "filmdb")
    .option("collection", "films")
    .load()
)

In [11]:
# Show the schema of the DataFrame read from MongoDB. In the recorded output
# `Year` appears as a string and `_id` as a struct with an `oid` field.
mongo_df.printSchema()

root
 |-- Budget: integer (nullable = true)
 |-- Country: string (nullable = true)
 |-- Genre: string (nullable = true)
 |-- Length: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Score: double (nullable = true)
 |-- Year: string (nullable = true)
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)



In [12]:
# Preview the first three documents as a pandas DataFrame. The limit is applied
# in Spark *before* the conversion so only those three rows are collected to
# the driver, instead of pulling the whole collection just to show its head.
mongo_df.limit(3).toPandas()

Unnamed: 0,Budget,Country,Genre,Length,Name,Score,Year,_id
0,8000000,USA,Adventure,89,stand by Me,8.1,1986,"(5d8c7d0b6c14965ccf1dd18f,)"
1,6000000,USA,Comedy,103,ferris Bueller's Day Off,7.8,1986,"(5d8c7d0b6c14965ccf1dd190,)"
2,15000000,USA,Action,110,Top Gun,6.9,1986,"(5d8c7d0b6c14965ccf1dd191,)"


In [13]:
# Count every document read from the collection, then report it.
total_films = mongo_df.count()
print("All films: ", total_films)

All films:  297


### Filtering movies whose length is 90 or more (`Length >= 90`)

In [14]:
# Keep only the films whose `Length` is at least 90.
length_films = mongo_df.filter(mongo_df["Length"] >= 90)

# Preview five matching rows. The limit is applied in Spark before converting
# to pandas so the full filtered result is not collected to the driver.
length_films.limit(5).toPandas()

Unnamed: 0,Budget,Country,Genre,Length,Name,Score,Year,_id
0,6000000,USA,Comedy,103,ferris Bueller's Day Off,7.8,1986,"(5d8c7d0b6c14965ccf1dd190,)"
1,15000000,USA,Action,110,Top Gun,6.9,1986,"(5d8c7d0b6c14965ccf1dd191,)"
2,18500000,USA,Action,137,Aliens,8.4,1986,"(5d8c7d0b6c14965ccf1dd192,)"
3,9000000,USA,Adventure,90,Flight of the Navigator,6.9,1986,"(5d8c7d0b6c14965ccf1dd193,)"
4,6000000,UK,Drama,120,Platoon,8.1,1986,"(5d8c7d0b6c14965ccf1dd194,)"


### Writing the filtered DataFrame to MongoDB

In [15]:
# Store the filtered films in their own collection, `filmdb.filtered_films`.
# "append" adds documents to the collection, so re-running this cell inserts
# the same rows again.
(
    length_films.write
    .format("mongo")
    .option("database", "filmdb")
    .option("collection", "filtered_films")
    .mode("append")
    .save()
)

In [16]:
# Count the rows that passed the length filter, then report it.
filtered_total = length_films.count()
print("Filtered films: ", filtered_total)

Filtered films:  246
