# Spark Activity 2

#### a. Load the required files from AWS S3 as pandas DataFrames

In [1]:
import pyspark
import pandas as pd

In [2]:
from pyspark import SparkContext
from pyspark import SQLContext

In [3]:
# Start (or reuse) the local Spark context and wrap it in a SQL context for DataFrames.
# getOrCreate keeps this cell re-runnable: constructing SparkContext(...) a second time
# in the same kernel raises "Cannot run multiple SparkContexts at once".
spark_conf = pyspark.SparkConf().setMaster('local').setAppName('WordCountApp')
sc = SparkContext.getOrCreate(spark_conf)
sqlContext = SQLContext(sc)


In [4]:
# The wildcard import is kept on purpose: the schema cell further down uses
# StructType / StructField / StringType / DoubleType / LongType from it.
from pyspark.sql.types import *
from s3fs.core import S3FileSystem

my_bucket = '686springbucket-emmanuel'
my_key1 = 'movie_magic/movie_characters_metadata.csv'
my_key2 = 'movie_magic/movie_titles_metadata.csv'


def _read_s3_csv(fs, bucket, key):
    """Read the CSV stored at ``bucket/key`` on S3 into a pandas DataFrame.

    The S3 file object is opened in binary mode and closed as soon as pandas
    has finished parsing it, so no handle is left dangling.
    """
    with fs.open('{}/{}'.format(bucket, key), mode='rb') as handle:
        return pd.read_csv(handle)


# This is a bit convoluted because the data lives on S3: we first read each
# file into pandas, and convert to Spark DataFrames in a later cell.
s3 = S3FileSystem(anon=False)
df1 = _read_s3_csv(s3, my_bucket, my_key1)  # character metadata
df2 = _read_s3_csv(s3, my_bucket, my_key2)  # movie title metadata

#### b. Load the movie_titles_metadata csv as a pyspark dataframe

In [5]:
# Convert the pandas frames into distributed Spark DataFrames through sqlContext.
# Explicit schemas pin both the column names and the column types, because the
# automatic pandas -> Spark type inference sometimes gets the types wrong.

# Character metadata: every column is a required (non-nullable) string.
character_fields = [
    StructField(column, StringType(), False)
    for column in (
        "character_id",
        "character_name",
        "movie_id",
        "movie_name",
        "character_gender",
        "position_in_credits",
    )
]
movieSchema1 = StructType(character_fields)

# Title metadata: every column is nullable; rating and votes are numeric.
movieSchema2 = StructType([
    StructField("movie_id", StringType(), True),
    StructField("movie_title", StringType(), True),
    StructField("movie_year", StringType(), True),
    StructField("IMDB_rating", DoubleType(), True),
    StructField("IMDB_votes", LongType(), True),
    StructField("genres", StringType(), True),
])

pyDF1 = sqlContext.createDataFrame(df1, schema=movieSchema1)
pyDF2 = sqlContext.createDataFrame(df2, schema=movieSchema2)

In [6]:
# Sanity check: for each Spark DataFrame, print its schema and preview the first five rows.
for frame in (pyDF1, pyDF2):
    frame.printSchema()
    frame.show(5)

root
 |-- character_id: string (nullable = false)
 |-- character_name: string (nullable = false)
 |-- movie_id: string (nullable = false)
 |-- movie_name: string (nullable = false)
 |-- character_gender: string (nullable = false)
 |-- position_in_credits: string (nullable = false)

+------------+--------------+--------+--------------------+----------------+-------------------+
|character_id|character_name|movie_id|          movie_name|character_gender|position_in_credits|
+------------+--------------+--------+--------------------+----------------+-------------------+
|          u0|        BIANCA|      m0|10 things i hate ...|               f|                  4|
|          u1|         BRUCE|      m0|10 things i hate ...|               ?|                  ?|
|          u2|       CAMERON|      m0|10 things i hate ...|               m|                  3|
|          u3|      CHASTITY|      m0|10 things i hate ...|               ?|                  ?|
|          u4|          JOEY|      m0|

#### c. Now use the withColumn transformation to create a new column called IMDB_Movie_Avg which will be the IMDB_rating / IMDB_votes.

In [7]:
# Rating-to-votes ratio, stored under the column name the exercise asks for (IMDB_Movie_Avg).
pyDF2 = pyDF2.withColumn('IMDB_Movie_Avg', pyDF2['IMDB_rating'] / pyDF2['IMDB_votes'])

#### d. Next, filter the pyDFTitle dataframe so that it only contains movies distributed after 1990.

In [8]:
# movie_year is a string column, so comparing it with '1990' would be lexicographic:
# any value carrying extra text after the year (e.g. '1990/I') or any non-numeric
# value would sort after '1990' and be kept. Compare the leading four-digit year
# as an integer instead; values that do not parse become null and are filtered out.
release_year = pyDF2['movie_year'].substr(1, 4).cast('int')
pyDF2_after1990 = pyDF2.filter(release_year > 1990)

#### e. Join the pyDFTitle dataframe (which has movie information) and the pyDF dataframe (which has character IDs) on movie_id (examples in slides). Just use the default join type. Save the result in a dataframe called pyDFCharMovie.

In [9]:
# Default (inner) join of characters to post-1990 titles on equal movie_id.
# The condition is a column expression, so the result keeps both movie_id columns.
same_movie = pyDF1['movie_id'] == pyDF2_after1990['movie_id']
pyDFCharMovie = pyDF1.join(pyDF2_after1990, same_movie)

#### f. Order the pyDFCharMovie by movie_year.

In [10]:
# Sort ascending by movie_year (a string column, so the order is lexicographic).
pyDFCharMovie = pyDFCharMovie.sort('movie_year')

#### g. Finally, write out your csv file to s3. **Note:** fused_data.csv has also been uploaded to the data folder.

In [11]:
# Collect the joined Spark DataFrame to the driver as pandas, render it as CSV text
# (pandas' default settings, so the index column is included), and upload the
# encoded bytes to S3.
fused_csv_text = pyDFCharMovie.toPandas().to_csv(None)
bytes_to_write = fused_csv_text.encode()
with s3.open('s3://686springbucket-emmanuel/movie_magic/fused_data.csv', 'wb') as f:
    f.write(bytes_to_write)