In [1]:
import os
import sys

# Point both the Spark driver and its Python workers at the interpreter that is
# running this notebook. Using sys.executable instead of a hard-coded,
# user-specific Anaconda path keeps the notebook runnable on other machines and
# makes the driver and the workers use the same Python.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, LongType

In [3]:
# Create the SparkSession used by the rest of the notebook, or reuse the one
# that already exists in this process.
spark = (
    SparkSession.builder
    .appName("GooglePlayAnalytics")
    .getOrCreate()
)

In [4]:
# Explicit schema for the CSV: (column name, Spark type) pairs in column order.
# Every column is declared nullable.
# NOTE(review): LastUpdated is declared as LongType — confirm the file stores it
# as an integer rather than a formatted date string.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("App", StringType()),
        ("Category", StringType()),
        ("Rating", FloatType()),
        ("Reviews", IntegerType()),
        ("Size", StringType()),
        ("Installs", StringType()),
        ("Type", StringType()),
        ("Price", FloatType()),
        ("ContentRating", StringType()),
        ("Genre", StringType()),
        ("LastUpdated", LongType()),
        ("CurrentVer", StringType()),
        ("AndroidVer", StringType()),
    )
])

In [5]:
# Load the CSV into a DataFrame using the explicit `schema`; the first line of
# the file is treated as a header row.
df = (
    spark.read
    .option("header", "true")
    .schema(schema)
    .csv("C:\\Users\\dmate\\Desktop\\Sample DataSets\\GooglePlayStore\\GooglePlayStore.csv")
)

In [6]:
# Print each column's name, type and nullability to confirm the schema was applied.
df.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: float (nullable = true)
 |-- Reviews: integer (nullable = true)
 |-- Size: string (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: float (nullable = true)
 |-- ContentRating: string (nullable = true)
 |-- Genre: string (nullable = true)
 |-- LastUpdated: long (nullable = true)
 |-- CurrentVer: string (nullable = true)
 |-- AndroidVer: string (nullable = true)



In [7]:
# Number of rows per Category, largest category first.
# NOTE(review): "traffic jams", "1.9" and "Face" are excluded from the Category
# column; presumably they are stray values from malformed rows — confirm
# against the raw file.
categoryGroup = (
    df.select("App", "Category", "Type")
    .where(~func.col("Category").isin("traffic jams", "1.9", "Face"))
    .groupBy("Category")
    .agg(func.count("Category").alias("Quantity"))
    .orderBy(func.col("Quantity").desc())
)

__Quantity of Phone Apps Per Category__

In [8]:
# Display the per-category counts; show() prints only the first 20 rows by default.
categoryGroup.show()

+-------------------+--------+
|           Category|Quantity|
+-------------------+--------+
|             FAMILY|    1972|
|               GAME|    1144|
|              TOOLS|     843|
|            MEDICAL|     463|
|           BUSINESS|     460|
|       PRODUCTIVITY|     424|
|    PERSONALIZATION|     392|
|      COMMUNICATION|     387|
|             SPORTS|     384|
|          LIFESTYLE|     382|
|            FINANCE|     366|
| HEALTH_AND_FITNESS|     340|
|        PHOTOGRAPHY|     335|
|             SOCIAL|     295|
| NEWS_AND_MAGAZINES|     283|
|           SHOPPING|     260|
|   TRAVEL_AND_LOCAL|     258|
|             DATING|     234|
|BOOKS_AND_REFERENCE|     231|
|      VIDEO_PLAYERS|     175|
+-------------------+--------+
only showing top 20 rows



In [9]:
# Average price per Genre, rounded to 2 decimals and sorted from most to least
# expensive. Only rows whose Type is 'Paid' are included in the average.
priceGroup = (
    df.select("App", "Price", "Genre", "Type")
    .where(func.col("Type") == 'Paid')
    .groupBy("Genre")
    .agg(func.round(func.avg("Price"), 2).alias("AveragePrice"))
    .orderBy(func.col("AveragePrice").desc())
)

__Average Price of Paid Apps Per Genre__

In [10]:
# Display the first 20 genres (show() default); long Genre values are cut off
# with "..." in the printed table.
priceGroup.show()

+--------------------+------------+
|               Genre|AveragePrice|
+--------------------+------------+
|             Finance|      170.64|
|           Lifestyle|      124.26|
|              Events|      109.99|
|       Entertainment|       87.64|
|              Casino|        14.0|
|            Business|       13.23|
|             Medical|       13.21|
|        Productivity|        8.96|
|        Role Playing|        8.25|
|           Education|        6.63|
|         Photography|         6.1|
|   Maps & Navigation|        5.39|
|              Social|        5.32|
|          Simulation|        5.21|
|Adventure;Action ...|        4.99|
|           Parenting|        4.79|
|              Dating|        4.49|
|     Auto & Vehicles|        4.49|
|           Adventure|        4.49|
|Educational;Educa...|        4.36|
+--------------------+------------+
only showing top 20 rows



In [31]:
# Average rating per Genre, rounded to 2 decimals, best-rated genre first.
# Rows without a usable rating are dropped before averaging: isNotNull()
# removes missing ratings and ~isnan() removes NaN ratings. This replaces a
# comparison of the float Rating column with the string "NaN", which only
# worked through Spark's implicit string-to-number cast; the same rows are kept.
# NOTE(review): the Genre != "Free" filter presumably removes a malformed row
# whose columns are shifted — confirm against the raw file.
ratingGroup = (
    df.select("App", "Rating", "Genre")
    .filter(df.Genre != "Free")
    .filter(df.Rating.isNotNull() & ~func.isnan(df.Rating))
    .groupBy("Genre")
    .agg(func.round(func.avg("Rating"), 2).alias("AverageRating"))
    .orderBy(func.desc("AverageRating"))
)

__Average App Ratings Per Genre__

In [32]:
# Display the 20 genres with the highest average rating (show() default row
# limit); long Genre values are cut off with "..." in the printed table.
ratingGroup.show()

+--------------------+-------------+
|               Genre|AverageRating|
+--------------------+-------------+
|   Comics;Creativity|          4.8|
|  Board;Pretend Play|          4.8|
|Health & Fitness;...|          4.7|
|Adventure;Brain G...|          4.6|
|    Puzzle;Education|          4.6|
|Strategy;Action &...|          4.6|
|Entertainment;Cre...|         4.53|
| Music;Music & Video|         4.53|
|  Strategy;Education|          4.5|
| Arcade;Pretend Play|          4.5|
|     Tools;Education|          4.5|
| Racing;Pretend Play|          4.5|
|  Casual;Brain Games|         4.47|
|              Events|         4.44|
|Education;Brain G...|         4.43|
|Adventure;Action ...|         4.42|
|Simulation;Action...|         4.42|
|                Word|         4.41|
|Art & Design;Crea...|          4.4|
|   Puzzle;Creativity|          4.4|
+--------------------+-------------+
only showing top 20 rows



In [33]:
# Highest rating per Genre, sorted from highest to lowest.
# Rows without a usable rating are dropped first: isNotNull() removes missing
# ratings and ~isnan() removes NaN ratings. This replaces a comparison of the
# float Rating column with the string "NaN", which only worked through Spark's
# implicit string-to-number cast; the same rows are kept.
# NOTE(review): the Genre != "Free" filter presumably removes a malformed row
# whose columns are shifted — confirm against the raw file.
ratingMaxGroup = (
    df.select("App", "Rating", "Genre")
    .filter(df.Genre != "Free")
    .filter(df.Rating.isNotNull() & ~func.isnan(df.Rating))
    .groupBy("Genre")
    .agg(func.round(func.max("Rating"), 2).alias("MaxRating"))
    .orderBy(func.desc("MaxRating"))
)

__Max App Rating Per Genre__

In [34]:
# Display the first 20 genres by maximum rating (show() default row limit).
ratingMaxGroup.show()

+-----------------+---------+
|            Genre|MaxRating|
+-----------------+---------+
|            Tools|      5.0|
|           Dating|      5.0|
|           Comics|      5.0|
|           Social|      5.0|
|           Trivia|      5.0|
|Books & Reference|      5.0|
|    Entertainment|      5.0|
| Libraries & Demo|      5.0|
|           Arcade|      5.0|
|         Shopping|      5.0|
|   Travel & Local|      5.0|
| Health & Fitness|      5.0|
|          Finance|      5.0|
|     Productivity|      5.0|
|           Racing|      5.0|
|      Photography|      5.0|
| News & Magazines|      5.0|
|           Events|      5.0|
|           Sports|      5.0|
|        Lifestyle|      5.0|
+-----------------+---------+
only showing top 20 rows



In [35]:
# Total number of reviews per ContentRating, largest total first.
# Reviews is an integer column, so its sum is already a whole number; the
# previous round(..., 2) around the sum had no effect and has been removed.
# NOTE(review): "5,000,000+" and "1,000,000+" are excluded from ContentRating;
# presumably they are install counts from rows whose columns are shifted —
# confirm against the raw file.
reviewGroup = (
    df.select("App", "Reviews", "ContentRating")
    .filter(~df.ContentRating.isin("5,000,000+", "1,000,000+"))
    .groupBy("ContentRating")
    .agg(func.sum("Reviews").alias("SumReviews"))
    .orderBy(func.desc("SumReviews"))
)

__Total Quantity of Reviews Per Content Rating__

In [36]:
# Display the review totals for each content rating.
reviewGroup.show()

+---------------+----------+
|  ContentRating|SumReviews|
+---------------+----------+
|       Everyone|2801745128|
|           Teen|1131523721|
|   Everyone 10+| 683997228|
|     Mature 17+| 197164024|
|Adults only 18+|     81348|
|        Unrated|      1187|
+---------------+----------+

