This Python script analyzes a Google Play Store dataset. It aims to extract insights from the dataset, such as the top-reviewed apps, the distribution of installed apps, the top paid apps, etc.

**Dependencies:**
- pandas
- pyspark
- matplotlib
- Apache Spark (and Hadoop for local setup)

## requirements

In [1]:
# Install PySpark into the environment of the running kernel.
# NOTE(review): pandas and matplotlib are imported further down but are not
# installed here — presumably they are already available; confirm.
%pip install pyspark


Note: you may need to restart the kernel to use updated packages.


## import libraries

In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pandas as pd
import matplotlib.pyplot as plt

## Initialize SparkSession

In [None]:
# Create the Spark session for this notebook, or reuse one that is already
# running; the application is named "playstore".
spark = (
    SparkSession.builder
    .appName("playstore")
    .getOrCreate()
)


## create dataframe

In [None]:
# Read the Play Store CSV: the first row is the header, fields are separated
# by commas, and a double quote is the escape character inside quoted fields.
df = spark.read.options(header=True, sep=",", escape='"').csv("data/googlestore.csv")

In [None]:
# Count the rows loaded from the CSV and print the total.
row_total = df.count()
print(row_total)

In [None]:
# Preview the first row. show() prints the table itself and returns None,
# so it is called directly; wrapping it in print() adds a stray "None" line.
df.show(1)

## check schema

In [None]:
# printSchema() writes the column tree to stdout and returns None, so it is
# called directly; wrapping it in print() adds a stray "None" line.
df.printSchema()

## data cleaning


In [None]:
# Remove the columns that none of the later queries in this notebook reference.
unused_columns = ['Size', 'Content Rating', 'Last Updated', 'Current Ver', 'Android Ver']
df = df.drop(*unused_columns)

In [None]:
# Printing the DataFrame object shows only its column names and types (no
# rows); used here to confirm which columns remain after the drop above.
print(df)

In [None]:
# Preview two rows of the trimmed DataFrame. show() prints the table itself
# and returns None, so wrapping it in print() only adds a stray "None" line.
df.show(2)

In [None]:
# Show the schema after the column drop. printSchema() prints directly and
# returns None, so wrapping it in print() only adds a stray "None" line.
df.printSchema()

In [None]:
# Convert the string columns read from the CSV into numeric types.
# - Reviews:  cast straight to integer.
# - Installs: remove every non-digit character first (presumably values look
#             like "10,000+" — confirm against the data), then cast to integer.
# - Price:    remove the dollar sign, then cast to double so the fractional
#             part is kept; an integer cast would truncate e.g. 4.99 to 4.
# - Rating:   cast to double for the same reason (presumably scores such as
#             4.1 — confirm); it needs no "$" stripping, so none is applied.
df = (
    df.withColumn('Reviews', col('Reviews').cast(IntegerType()))
    .withColumn('Installs', regexp_replace(col('Installs'), "[^0-9]", "").cast(IntegerType()))
    .withColumn('Price', regexp_replace(col('Price'), "[$]", "").cast(DoubleType()))
    .withColumn('Rating', col('Rating').cast(DoubleType()))
)


In [None]:
# Preview five rows after the type conversion. show() prints the table itself
# and returns None, so wrapping it in print() only adds a stray "None" line.
df.show(5)

In [None]:
# Register the cleaned DataFrame as the temporary view "apps" so that the
# spark.sql(...) queries later in this notebook can refer to it by name.
df.createOrReplaceTempView("apps")

##  SQL Select query

**Load and activate the SQL extension so that SQL can be executed in the notebook**

In [None]:
# NOTE(review): every query in this notebook runs through spark.sql(...);
# no cell uses the %sql / %%sql magics this extension presumably provides,
# and the requirements cell installs only pyspark — confirm this is needed.
%load_ext sql

In [None]:
# Sanity check: select every column from the "apps" view and display the
# first rows of the result.
all_apps = spark.sql("SELECT * from apps")
all_apps.show()

## Top reviewed apps

In [None]:
# Sum the review counts per app and sort the apps from most to least reviewed.
top_reviews_query = (
    "SELECT App, SUM(Reviews) AS TotalReviews "
    "FROM apps "
    "GROUP BY App "
    "ORDER BY TotalReviews DESC"
)
reviews = spark.sql(top_reviews_query)

In [None]:
# Display the first 20 rows, long values truncated (show()'s defaults, spelled out).
reviews.show(n=20, truncate=True)

In [None]:
# Collect the aggregated Spark result into a pandas DataFrame so that the
# matplotlib cells below can plot it.
df_reviews = reviews.toPandas()

In [None]:
# Bar chart of the ten most-reviewed apps (df_reviews is already sorted in
# descending order by the query, so the first ten rows are the top ten).
most_reviewed = df_reviews.head(10)

plt.figure(figsize=(15, 10))
plt.bar(most_reviewed['App'], most_reviewed['TotalReviews'])
plt.title('Top Reviewed Apps')
plt.xlabel('App Name')
plt.ylabel('Total Reviews')
plt.xticks(rotation=90)  # vertical tick labels
plt.show()

In [None]:
# First ten rows of the sorted result, i.e. the ten most-reviewed apps.
top_10_apps = df_reviews.iloc[:10]

# Pie chart: each app's share of the reviews within that top ten.
plt.figure(figsize=(10, 8))
plt.pie(
    top_10_apps['TotalReviews'],
    labels=top_10_apps['App'],
    autopct='%1.1f%%',
    startangle=140,
)
plt.title('Distribution of Total Reviews Among Top Reviewed Apps')
plt.axis('equal')  # equal aspect ratio keeps the pie circular
plt.show()

## Top 10 installed apps

In [None]:
# Sum installs per (App, Type) pair. GROUP BY 1,2 and ORDER BY 3 refer to the
# select-list positions: App, Type and the install sum. The sum has no alias,
# so later cells read it through the column name 'sum(Installs)'.
top_installs_query = (
    "SELECT App, Type, SUM(Installs) "
    "FROM apps "
    "GROUP BY 1,2 "
    "ORDER BY 3 DESC"
)
installs = spark.sql(top_installs_query)


In [None]:
# Display the first 20 rows, long values truncated (show()'s defaults, spelled out).
installs.show(n=20, truncate=True)

In [None]:
# Collect the per-app install totals into a pandas DataFrame so that the
# matplotlib cells below can plot them.
df_installs = installs.toPandas()

In [None]:
# Bar chart of the ten rows with the highest install totals (the query
# already sorted the rows in descending order).
most_installed = df_installs.head(10)

plt.figure(figsize=(15, 10))
plt.bar(most_installed['App'], most_installed['sum(Installs)'])
plt.title('Top Installed Apps')
plt.xlabel('App Name')
plt.ylabel('Total Installs')
plt.xticks(rotation=90)  # vertical tick labels
plt.show()


In [None]:
# Extract the ten most-installed apps and their install totals for the pie chart.
# The totals must not be stored in `installs`: that name holds the Spark
# DataFrame built by the query cell, and rebinding it to a pandas Series makes
# the earlier `installs.show()` / `installs.toPandas()` cells fail when re-run.
top_apps = df_installs['App'][:10]
top_install_counts = df_installs['sum(Installs)'][:10]

# Create a pie chart of the top installed apps
plt.figure(figsize=(10, 8))
plt.pie(top_install_counts, labels=top_apps, autopct='%1.1f%%', startangle=140)
plt.title('Top Installed Apps Distribution')
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()


## Category-wise Distribution of Installed Apps

In [None]:
# Sum installs per category, largest total first. GROUP BY 1 / ORDER BY 2
# refer to the select-list positions (Category, then the install sum).
category_installs_query = (
    "SELECT Category, SUM(Installs) "
    "FROM apps "
    "GROUP BY 1 "
    "ORDER BY 2 DESC"
)
Category = spark.sql(category_installs_query)


In [None]:
# Display the first 20 rows, long values truncated (show()'s defaults, spelled out).
Category.show(n=20, truncate=True)

In [None]:
# Collect the per-category install totals into a pandas DataFrame so that the
# matplotlib cells below can plot them.
df_Category = Category.toPandas()

In [None]:
# Bar chart of the ten categories with the highest install totals.
# The x axis holds categories (not app names) and the y axis holds the summed
# installs, so the axis labels name those quantities.
plt.figure(figsize=(15, 10))
plt.bar(df_Category['Category'][:10], df_Category['sum(Installs)'][:10])
plt.title('Category Distribution')
plt.xlabel('Category')
plt.ylabel('Total Installs')
plt.xticks(rotation=90)
plt.show()


In [None]:
# Extract the ten categories with the highest install totals for the pie chart.
# The totals must not be stored in `installs`: that name holds the Spark
# DataFrame built by the installs query cell, and rebinding it to a pandas
# Series makes `installs.show()` / `installs.toPandas()` fail when re-run.
top_apps = df_Category['Category'][:10]  # category names, despite the variable name
category_install_counts = df_Category['sum(Installs)'][:10]

# Create a pie chart of installs across the top categories
plt.figure(figsize=(10, 8))
plt.pie(category_install_counts, labels=top_apps, autopct='%1.1f%%', startangle=140)
plt.title('Category Distribution')
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()


## Top Paid Apps

In [None]:
# Sum the price per app over rows whose Type is 'Paid', highest total first.
# GROUP BY 1 / ORDER BY 2 refer to the select-list positions (App, price sum).
paid_apps_query = (
    "SELECT App, SUM(Price) "
    "FROM apps "
    "WHERE Type = 'Paid' "
    "GROUP BY 1 "
    "ORDER BY 2 DESC"
)
paid = spark.sql(paid_apps_query)


In [None]:
# Display the first 20 rows, long values truncated (show()'s defaults, spelled out).
paid.show(n=20, truncate=True)

In [None]:
# Collect the per-app price totals of paid apps into a pandas DataFrame so
# that the matplotlib cells below can plot them.
df_paid = paid.toPandas()

In [None]:
# Bar chart of the ten paid apps with the highest summed price.
# The y axis shows the 'sum(Price)' column, so its label names that quantity
# instead of repeating the chart title.
plt.figure(figsize=(15, 10))
plt.bar(df_paid['App'][:10], df_paid['sum(Price)'][:10])
plt.title('Top Paid Apps')
plt.xlabel('App Name')
plt.ylabel('Total Price')
plt.xticks(rotation=90)
plt.show()


In [None]:
# Extract the ten highest-priced paid apps and their price totals for the pie chart.
# The prices must not be stored in `installs`: that name holds the Spark
# DataFrame built by the installs query cell, and rebinding it to a pandas
# Series makes `installs.show()` / `installs.toPandas()` fail when re-run.
top_apps = df_paid['App'][:10]
paid_price_totals = df_paid['sum(Price)'][:10]

# Create a pie chart of the price totals of the top paid apps
plt.figure(figsize=(10, 8))
plt.pie(paid_price_totals, labels=top_apps, autopct='%1.1f%%', startangle=140)
plt.title('Top Paid Apps')
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()


## Top Rated Paid Apps


In [None]:
# Sum the ratings per app over rows whose Type is 'Paid', highest total first.
# Because this is a SUM, an app that appears in several rows has the ratings
# of all those rows added together.
paid_rating_query = (
    "SELECT App, SUM(Rating) AS TotalRating "
    "FROM apps "
    "WHERE Type = 'Paid' "
    "GROUP BY App "
    "ORDER BY TotalRating DESC"
)
rating_paid = spark.sql(paid_rating_query)

In [None]:
# Display the first 20 rows, long values truncated (show()'s defaults, spelled out).
rating_paid.show(n=20, truncate=True)

In [None]:
# Collect the per-app rating totals of paid apps into a pandas DataFrame so
# that the matplotlib cells below can plot them.
df_rating_paid = rating_paid.toPandas()

In [None]:
# Bar chart of the ten paid apps with the highest rating totals (the query
# already sorted the rows in descending order).
best_rated_paid = df_rating_paid.head(10)

plt.figure(figsize=(15, 10))
plt.bar(best_rated_paid['App'], best_rated_paid['TotalRating'])
plt.title('Top Rated Paid Apps')
plt.xlabel('App Name')
plt.ylabel('Total Rating')
plt.xticks(rotation=90)  # vertical tick labels
plt.show()


In [None]:
# Extract the ten top-rated paid apps and their rating totals for the pie chart.
# The ratings must not be stored in `installs`: that name holds the Spark
# DataFrame built by the installs query cell, and rebinding it to a pandas
# Series makes `installs.show()` / `installs.toPandas()` fail when re-run.
top_apps = df_rating_paid['App'][:10]
paid_rating_totals = df_rating_paid['TotalRating'][:10]

# Create a pie chart of the rating totals of the top-rated paid apps
plt.figure(figsize=(10, 8))
plt.pie(paid_rating_totals, labels=top_apps, autopct='%1.1f%%', startangle=140)
plt.title('Top Paid Rating Apps')
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()
