# Instalamos e importamos librerías

In [25]:
!pip install pyspark
!pip install -U -q PyDrive
!apt update
!apt install openjdk-8-jdk-headless -qq
#!apt install default-jre
#!apt install default-jdk+
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:6 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:7 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
22 packages can be upgraded. Run 'apt list --upgradable' to see them.
openjdk-8-jdk-headless is already the newest version (8u382-ga-1~22.04.1).
0 upgraded, 0 newly ins

In [26]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext
from pyspark.sql import SQLContext
import pandas as pd

# Autenticamos con Google Drive

In [27]:
# Authenticate the Colab user first, then hand the resulting
# application-default credentials to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client used to fetch files by their Drive ID.
drive = GoogleDrive(gauth)

# Bajamos los archivos con los datos de Google Play Store

In [28]:
# Google Drive file IDs of the two shared datasets, keyed by the local file
# name each one is saved under. Share links, for provenance:
#   https://drive.google.com/file/d/1y4_n4twjE4VSLSC-V7QsdPJ5JR9fiVgG/view?usp=sharing
#   https://drive.google.com/file/d/1HBgBzv4HU1wQKKblj8p--9lJEMSGYczg/view?usp=sharing
DRIVE_FILE_IDS = {
    'GooglePlayStore.csv': '1y4_n4twjE4VSLSC-V7QsdPJ5JR9fiVgG',
    'GooglePlayStore_User_Reviews.csv': '1HBgBzv4HU1wQKKblj8p--9lJEMSGYczg',
}

# Download every dataset into the working directory under its local name.
for local_name, file_id in DRIVE_FILE_IDS.items():
    drive.CreateFile({'id': file_id}).GetContentFile(local_name)

# Creamos el Spark Context

In [29]:
# Get the active SparkSession, or create one if none exists yet.
spark = SparkSession.builder.getOrCreate()
# Underlying SparkContext of that session.
sc = spark.sparkContext

## Leemos CSV

In [30]:
# NOTE(review): SQLContext appears to be deprecated in Spark 3.x in favour of
# SparkSession (`spark.read`) — confirm before migrating.
sqlContext = SQLContext(sc)

# Both files are read with the same settings: first line as header, column
# types inferred, and mode 'DROPMALFORMED' for rows that fail to parse.
csvReadOptions = {'header': True, 'inferSchema': True, 'mode': 'DROPMALFORMED'}

dfAppDetails = sqlContext.read.csv('GooglePlayStore.csv', **csvReadOptions)
dfAppReviews = sqlContext.read.csv('GooglePlayStore_User_Reviews.csv', **csvReadOptions)

# RDD views of the two DataFrames.
rddAppDetails, rddAppReviews = dfAppDetails.rdd, dfAppReviews.rdd

## Preparación de Datasets

Eliminamos las apps completamente idénticas (filas duplicadas).

In [31]:
# Keep only distinct rows. The name is rebound, so later cells see the
# de-duplicated RDD.
rddAppDetails = rddAppDetails.distinct()

In [32]:
# Show the first five rows as a quick check of the loaded columns.
rddAppDetails.take(5)

[Row(App='Photo Editor & Candy Camera & Grid & ScrapBook', Category='ART_AND_DESIGN', Rating='4.1', Reviews='159', Size='19M', Installs='10,000+', Type='Free', Price='0', Content Rating='Everyone', Genres='Art & Design', Last Updated='January 7, 2018', Current Ver='1.0.0', Android Ver='4.0.3 and up'),
 Row(App='Coloring book moana', Category='ART_AND_DESIGN', Rating='3.9', Reviews='967', Size='14M', Installs='500,000+', Type='Free', Price='0', Content Rating='Everyone', Genres='Art & Design;Pretend Play', Last Updated='January 15, 2018', Current Ver='2.0.0', Android Ver='4.0.3 and up'),
 Row(App='U Launcher Lite – FREE Live Cool Themes, Hide Apps', Category='ART_AND_DESIGN', Rating='4.7', Reviews='87510', Size='8.7M', Installs='5,000,000+', Type='Free', Price='0', Content Rating='Everyone', Genres='Art & Design', Last Updated='August 1, 2018', Current Ver='1.2.4', Android Ver='4.0.3 and up'),
 Row(App='Sketch - Draw & Paint', Category='ART_AND_DESIGN', Rating='4.5', Reviews='215644', S

# Ejercicio 16
Ordenar de forma descendente todas las categorías por cantidad de descargas.

Primero mapeo el RDD de detalles de apps a pares (Category, Installs como entero). Originalmente Installs tiene el formato `x,xxx,xxx+`; se le quitan el signo + y las comas para convertirlo a entero. Utilizo take para ver cómo quedaron.

In [35]:
def _toCategoryAndInstalls(row):
    """Turn an app row into a (Category, install count as int) pair.

    The Installs field is text such as '5,000,000+'; the trailing '+' and
    the thousands separators are removed before converting to int.
    """
    installsText = row.Installs.rstrip('+').replace(',', '')
    return (row.Category, int(installsText))


appsCategoryAndDownloads = rddAppDetails.map(_toCategoryAndInstalls)
# Peek at a few pairs to check the conversion.
appsCategoryAndDownloads.take(3)

[('ART_AND_DESIGN', 10000),
 ('ART_AND_DESIGN', 500000),
 ('ART_AND_DESIGN', 5000000)]

Mediante reduceByKey se obtuvo la suma total de descargas para cada categoría, y luego se ordenó mediante sortBy con key -x[1], con el signo menos ya que por default se ordena de forma ascendente y se pidió hacerlo de forma descendente.

In [36]:
# Sum the installs per category, then order the totals from highest to
# lowest. sortBy sorts ascending, hence the negated key.
appsCategoryAndDownloads = (
    appsCategoryAndDownloads
    .reduceByKey(lambda runningTotal, installs: runningTotal + installs)
    .sortBy(lambda categoryTotal: -categoryTotal[1])
)

Utilizo collect, sabiendo que la cantidad de categorías es acotada, para obtener la lista entera. La cantidad de descargas de cada categoría, ordenada de forma descendente:

In [37]:
# Bring the full result to the driver as a Python list for display.
appsCategoryAndDownloads.collect()

[('GAME', 31544024415),
 ('COMMUNICATION', 24152276251),
 ('SOCIAL', 12513867902),
 ('PRODUCTIVITY', 12463091369),
 ('TOOLS', 11452771915),
 ('FAMILY', 10041692505),
 ('PHOTOGRAPHY', 9721247655),
 ('TRAVEL_AND_LOCAL', 6361887146),
 ('VIDEO_PLAYERS', 6222002720),
 ('NEWS_AND_MAGAZINES', 5393217760),
 ('SHOPPING', 2573348785),
 ('ENTERTAINMENT', 2455660000),
 ('PERSONALIZATION', 2074494782),
 ('BOOKS_AND_REFERENCE', 1916469576),
 ('SPORTS', 1528574498),
 ('HEALTH_AND_FITNESS', 1360022512),
 ('BUSINESS', 863664865),
 ('FINANCE', 770348734),
 ('MAPS_AND_NAVIGATION', 719281890),
 ('LIFESTYLE', 534823539),
 ('EDUCATION', 533952000),
 ('WEATHER', 426100520),
 ('FOOD_AND_DRINK', 257898751),
 ('DATING', 206536107),
 ('HOUSE_AND_HOME', 125212461),
 ('ART_AND_DESIGN', 124338100),
 ('LIBRARIES_AND_DEMO', 62995910),
 ('COMICS', 56086150),
 ('AUTO_AND_VEHICLES', 53130211),
 ('MEDICAL', 42204177),
 ('PARENTING', 31521110),
 ('BEAUTY', 27197050),
 ('EVENTS', 15973161)]