In [None]:
!pip install pyspark
!pip install -U -q PyDrive
!apt update
!apt install openjdk-8-jdk-headless -qq
!apt install default-jre
!apt install default-jdk
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Hit:1 http://security.ubuntu.com/ubuntu bionic-security InRelease
Hit:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Hit:3 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:4 http://archive.ubuntu.com/ubuntu bionic InRelease
Ign:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:7 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:8 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:9 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:10 http://archive.ubuntu.com/ubuntu bionic-backports InRelease
Hit:11 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Hit:12 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext
from pyspark.sql import SQLContext
import pandas as pd

In [None]:
# Authenticate the notebook user with Google (Colab helper), then build a
# PyDrive client that uses the application-default credentials.
# Order matters: the user must be authenticated before the credentials are read.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
# Reference the Drive file by its ID and save its contents to the local
# working directory as disney_movies.csv (the file read by the Spark cell).
downloaded = drive.CreateFile({'id':"17CeUiEqwSMOLf7WBNqqVY7zLyRMqZbl3"})
downloaded.GetContentFile('disney_movies.csv') 

In [None]:
# Get the active SparkSession (creating one if none exists) and keep a handle
# to its SparkContext.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [None]:
# Load the downloaded CSV into a Spark DataFrame: the first line is treated as
# the header and column types are inferred from the data. The DataFrame's rows
# are then exposed as an RDD of Row objects for the RDD-API cells below.
# NOTE(review): SQLContext is deprecated in Spark 3.x -- consider reading via
# the SparkSession once it is confirmed that no other cell needs `sqlContext`.
sqlContext = SQLContext(sc)
df = (
    sqlContext.read
    .options(header=True, inferSchema=True)
    .csv("disney_movies.csv")
)
rdd = df.rdd



In [None]:
# Total number of rows in the dataset.
rdd.count()

579

In [None]:
# Peek at the first Row to see the field names and value types.
rdd.first()

Row(movie_title='Snow White and the Seven Dwarfs', release_date=datetime.datetime(1937, 12, 21, 0, 0), genre='Musical', mpaa_rating='G', total_gross=184925485, inflation_adjusted_gross=5228953251)

In [None]:
# First five rows of the dataset.
rdd.take(5)

[Row(movie_title='Snow White and the Seven Dwarfs', release_date=datetime.datetime(1937, 12, 21, 0, 0), genre='Musical', mpaa_rating='G', total_gross=184925485, inflation_adjusted_gross=5228953251),
 Row(movie_title='Pinocchio', release_date=datetime.datetime(1940, 2, 9, 0, 0), genre='Adventure', mpaa_rating='G', total_gross=84300000, inflation_adjusted_gross=2188229052),
 Row(movie_title='Fantasia', release_date=datetime.datetime(1940, 11, 13, 0, 0), genre='Musical', mpaa_rating='G', total_gross=83320000, inflation_adjusted_gross=2187090808),
 Row(movie_title='Song of the South', release_date=datetime.datetime(1946, 11, 12, 0, 0), genre='Adventure', mpaa_rating='G', total_gross=65000000, inflation_adjusted_gross=1078510579),
 Row(movie_title='Cinderella', release_date=datetime.datetime(1950, 2, 15, 0, 0), genre='Drama', mpaa_rating='G', total_gross=85000000, inflation_adjusted_gross=920608730)]

In [None]:
# Project every row onto its genre and show the first five values.
(
    rdd
    .map(lambda row: row.genre)
    .take(5)
)

['Musical', 'Adventure', 'Musical', 'Adventure', 'Drama']

In [None]:
# Demo of a key/value pair with a composite key: (genre, release_date) mapped
# to a constant (1, "hola") value. Fields are read by name; genre and
# release_date are the columns at positions 2 and 1 of each Row.
(
    rdd
    .map(lambda row: ((row.genre, row.release_date), (1, "hola")))
    .take(5)
)

[(('Musical', datetime.datetime(1937, 12, 21, 0, 0)), (1, 'hola')),
 (('Adventure', datetime.datetime(1940, 2, 9, 0, 0)), (1, 'hola')),
 (('Musical', datetime.datetime(1940, 11, 13, 0, 0)), (1, 'hola')),
 (('Adventure', datetime.datetime(1946, 11, 12, 0, 0)), (1, 'hola')),
 (('Drama', datetime.datetime(1950, 2, 15, 0, 0)), (1, 'hola'))]

In [None]:
# Number of distinct genre values; a missing genre (None) counts as one value.
(
    rdd
    .map(lambda row: row.genre)
    .distinct()
    .count()
)

13

In [None]:
# List the distinct genre values themselves (small result, safe to collect).
(
    rdd
    .map(lambda row: row.genre)
    .distinct()
    .collect()
)

['Musical',
 'Adventure',
 'Drama',
 'Comedy',
 None,
 'Action',
 'Horror',
 'Romantic Comedy',
 'Thriller/Suspense',
 'Western',
 'Black Comedy',
 'Documentary',
 'Concert/Performance']

In [None]:
# Number of movies per genre, returned to the driver as a dict-like mapping
# of genre -> count. Counting the genre values directly gives the same result
# as counting the keys of (genre, 1) pairs.
(
    rdd
    .map(lambda row: row.genre)
    .countByValue()
)

defaultdict(int,
            {'Musical': 16,
             'Adventure': 129,
             'Drama': 114,
             'Comedy': 182,
             None: 17,
             'Action': 40,
             'Horror': 6,
             'Romantic Comedy': 23,
             'Thriller/Suspense': 24,
             'Western': 7,
             'Black Comedy': 3,
             'Documentary': 16,
             'Concert/Performance': 2})

In [None]:
# Most frequent genre: sum a 1 per movie for each genre, then fold the
# (genre, count) pairs down to the one with the largest count. On equal
# counts the right-hand pair of the fold is kept.
(
    rdd
    .map(lambda row: (row.genre, 1))
    .reduceByKey(lambda left, right: left + right)
    .reduce(lambda best, cand: cand if cand[1] >= best[1] else best)
)

('Comedy', 182)

In [None]:
# Genres ranked by number of movies, highest first: the sort key negates the
# count so that takeOrdered (ascending) yields a descending ranking.
# At most 20 (genre, count) pairs are returned.
(
    rdd
    .map(lambda row: (row.genre, 1))
    .reduceByKey(lambda left, right: left + right)
    .takeOrdered(20, key=lambda pair: -pair[1])
)

[('Comedy', 182),
 ('Adventure', 129),
 ('Drama', 114),
 ('Action', 40),
 ('Thriller/Suspense', 24),
 ('Romantic Comedy', 23),
 (None, 17),
 ('Musical', 16),
 ('Documentary', 16),
 ('Western', 7),
 ('Horror', 6),
 ('Black Comedy', 3),
 ('Concert/Performance', 2)]

In [None]:
# Row with the highest inflation-adjusted gross: pairwise fold that keeps the
# larger of the two rows; on a tie the right-hand row is kept.
rdd.reduce(
    lambda a, b: b if a.inflation_adjusted_gross <= b.inflation_adjusted_gross else a
)

Row(movie_title='Snow White and the Seven Dwarfs', release_date=datetime.datetime(1937, 12, 21, 0, 0), genre='Musical', mpaa_rating='G', total_gross=184925485, inflation_adjusted_gross=5228953251)

In [None]:
# The ten rows with the smallest inflation-adjusted gross, in ascending order.
rdd.takeOrdered(10, key=lambda row: row.inflation_adjusted_gross)

[Row(movie_title='The Many Adventures of Winnie the Pooh', release_date=datetime.datetime(1977, 3, 11, 0, 0), genre=None, mpaa_rating=None, total_gross=0, inflation_adjusted_gross=0),
 Row(movie_title='Amy', release_date=datetime.datetime(1981, 3, 20, 0, 0), genre='Drama', mpaa_rating=None, total_gross=0, inflation_adjusted_gross=0),
 Row(movie_title='Condorman', release_date=datetime.datetime(1981, 8, 7, 0, 0), genre='Action', mpaa_rating=None, total_gross=0, inflation_adjusted_gross=0),
 Row(movie_title='Frank McKlusky C.I.', release_date=datetime.datetime(2002, 1, 1, 0, 0), genre=None, mpaa_rating=None, total_gross=0, inflation_adjusted_gross=0),
 Row(movie_title='Zokkomon', release_date=datetime.datetime(2011, 4, 22, 0, 0), genre='Adventure', mpaa_rating='PG', total_gross=2815, inflation_adjusted_gross=2984),
 Row(movie_title='Walt and El Grupo', release_date=datetime.datetime(2009, 9, 10, 0, 0), genre='Documentary', mpaa_rating='PG', total_gross=20521, inflation_adjusted_gross=230