In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField,StringType,IntegerType
from pyspark.sql.functions import *

In [0]:
df = spark.read.csv("/FileStore/tables/googleplaystore.csv", header="true",inferSchema='true')

In [0]:
df.show(5)

+--------------------+--------------+------+-------+--------+----+-----+--------------------+------------------+
|                 App|      Category|Rating|Reviews|Installs|Type|Price|              Genres|       Current Ver|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+------------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|   10000|Free|    0|        Art & Design|             1.0.0|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|  500000|Free|    0|Art & Design;Pret...|             2.0.0|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510| 5000000|Free|    0|        Art & Design|             1.2.4|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5| 215644|50000000|Free|    0|        Art & Design|Varies with device|
|Pixel Draw - Numb...|ART_AND_DESIGN|   4.3|    967|  100000|Free|    0|Art & Design;Crea...|               1.1|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+---

In [0]:
df.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Reviews: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Content Rating: string (nullable = true)
 |-- Genres: string (nullable = true)
 |-- Last Updated: string (nullable = true)
 |-- Current Ver: string (nullable = true)
 |-- Android Ver: string (nullable = true)



In [0]:
df = df.drop("size","Content Rating","Last Updated","Android Ver")
df.show(5)

+--------------------+--------------+------+-------+--------+----+-----+--------------------+------------------+
|                 App|      Category|Rating|Reviews|Installs|Type|Price|              Genres|       Current Ver|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+------------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|   10000|Free|    0|        Art & Design|             1.0.0|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|  500000|Free|    0|Art & Design;Pret...|             2.0.0|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510| 5000000|Free|    0|        Art & Design|             1.2.4|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5| 215644|50000000|Free|    0|        Art & Design|Varies with device|
|Pixel Draw - Numb...|ART_AND_DESIGN|   4.3|    967|  100000|Free|    0|Art & Design;Crea...|               1.1|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+---

In [0]:
df = df.withColumn("Reviews",col("Reviews").cast(IntegerType())).withColumn("Installs",regexp_replace(col("Installs"),"[^0-9]",""))\
    .withColumn("Installs",col("Installs").cast(IntegerType()))\
    .withColumn("Price",regexp_replace(col("Price"),"[$]",""))\
    .withColumn("Price",col("Price").cast(IntegerType()))

In [0]:
df.show(5)

+--------------------+--------------+------+-------+--------+----+-----+--------------------+------------------+
|                 App|      Category|Rating|Reviews|Installs|Type|Price|              Genres|       Current Ver|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+------------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|   10000|Free|    0|        Art & Design|             1.0.0|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|  500000|Free|    0|Art & Design;Pret...|             2.0.0|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510| 5000000|Free|    0|        Art & Design|             1.2.4|
|Sketch - Draw & P...|ART_AND_DESIGN|   4.5| 215644|50000000|Free|    0|        Art & Design|Varies with device|
|Pixel Draw - Numb...|ART_AND_DESIGN|   4.3|    967|  100000|Free|    0|Art & Design;Crea...|               1.1|
+--------------------+--------------+------+-------+--------+----+-----+--------------------+---

In [0]:
df.createOrReplaceTempView("apps")

In [0]:
%sql select * from apps limit 10

App,Category,Rating,Reviews,Installs,Type,Price,Genres,Current Ver
Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,10000,Free,0,Art & Design,1.0.0
Coloring book moana,ART_AND_DESIGN,3.9,967,500000,Free,0,Art & Design;Pretend Play,2.0.0
"U Launcher Lite – FREE Live Cool Themes, Hide Apps",ART_AND_DESIGN,4.7,87510,5000000,Free,0,Art & Design,1.2.4
Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,50000000,Free,0,Art & Design,Varies with device
Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,100000,Free,0,Art & Design;Creativity,1.1
Paper flowers instructions,ART_AND_DESIGN,4.4,167,50000,Free,0,Art & Design,1.0
Smoke Effect Photo Maker - Smoke Editor,ART_AND_DESIGN,3.8,178,50000,Free,0,Art & Design,1.1
Infinite Painter,ART_AND_DESIGN,4.1,36815,1000000,Free,0,Art & Design,6.1.61.1
Garden Coloring Book,ART_AND_DESIGN,4.4,13791,1000000,Free,0,Art & Design,2.9.2
Kids Paint Free - Drawing Fun,ART_AND_DESIGN,4.7,121,10000,Free,0,Art & Design;Creativity,2.8


###Top Installed Apps###

In [0]:
%sql select App,sum(Reviews) from apps group by App order by sum(Reviews) desc limit 10

App,sum(Reviews)
Instagram,266241989
WhatsApp Messenger,207348304
Clash of Clans,179558781
Messenger – Text and Video Chat for Free,169932272
Subway Surfers,166331958
Candy Crush Saga,156993136
Facebook,156286514
8 Ball Pool,99386198
Clash Royale,92530298
Snapchat,68045010


###Top Installed Apps###

In [0]:
%sql select App,sum(Installs) from apps group by App order by sum(Installs) desc limit 10

App,sum(Installs)
Subway Surfers,6000000000
Instagram,4000000000
Hangouts,4000000000
Google Drive,4000000000
Google News,4000000000
Google Photos,4000000000
Candy Crush Saga,3500000000
WhatsApp Messenger,3000000000
Messenger – Text and Video Chat for Free,3000000000
Google Chrome: Fast & Secure,3000000000


###Category wise distribution###

In [0]:
%sql select Category,sum(Installs) from apps group by Category order by sum(Installs) desc limit 10

Category,sum(Installs)
GAME,35086024415
COMMUNICATION,32647276251
PRODUCTIVITY,14176091369
SOCIAL,14069867902
TOOLS,11452771915
FAMILY,10258263505
PHOTOGRAPHY,10088247655
NEWS_AND_MAGAZINES,7496317760
TRAVEL_AND_LOCAL,6868887146
VIDEO_PLAYERS,6222002720


###Top paid apps###

In [0]:
%sql select App,sum(price) from apps where type = "Paid" group by App order by sum(price) desc limit 10

App,sum(price)
I'm Rich - Trump Edition,400
most expensive app (H),399
I am rich(premium),399
I Am Rich Premium,399
I am Rich Plus,399
I'm Rich/Eu sou Rico/أنا غني/我很有錢,399
I Am Rich Pro,399
I AM RICH PRO PLUS,399
I am Rich,399
I am Rich!,399
