In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,FloatType

In [8]:
spark = SparkSession.builder.appName("eda").getOrCreate()

In [9]:
spark

In [12]:
schema = StructType([
    StructField('App',
                StringType(), True),
    StructField('Category',
                StringType(), True),
    StructField('Rating',
                FloatType(), True),
    StructField('Reviews',
                IntegerType(), True),
    StructField('Size',
                FloatType(), True),
    StructField('Installs',
                IntegerType(), True),
    StructField('Type',
                StringType(), True),
    StructField('Price',
                FloatType(), True),
    StructField('Content Rating',
                StringType(), True),
    StructField('Genres',
                StringType(), True),
    StructField('Last Updated',
                StringType(), True),
    StructField('Current Ver',
                StringType(), True),
    StructField('Android Ver',
                StringType(), True),
    StructField('day',
                IntegerType(), True),
    StructField('month',
                IntegerType(), True),
    StructField('year',
                IntegerType(), True)
])

In [13]:
df = spark.read.format(
    "csv").schema(schema).option(
    "header", True).load("/config/workspace/clead_google_playstore_data copy.csv")
df.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: float (nullable = true)
 |-- Reviews: integer (nullable = true)
 |-- Size: float (nullable = true)
 |-- Installs: integer (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: float (nullable = true)
 |-- Content Rating: string (nullable = true)
 |-- Genres: string (nullable = true)
 |-- Last Updated: string (nullable = true)
 |-- Current Ver: string (nullable = true)
 |-- Android Ver: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)



In [14]:
import numpy as np
import matplotlib.pyplot as pyplot
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [18]:
df.show(5)

+--------------------+--------------+------+-------+-------+--------+----+-----+--------------+--------------------+------------+------------------+-----------+---+-----+----+
|                 App|      Category|Rating|Reviews|   Size|Installs|Type|Price|Content Rating|              Genres|Last Updated|       Current Ver|Android Ver|day|month|year|
+--------------------+--------------+------+-------+-------+--------+----+-----+--------------+--------------------+------------+------------------+-----------+---+-----+----+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159|19000.0|   10000|Free|  0.0|      Everyone|        Art & Design|  2018-01-07|             1.0.0|     4.0.3 |  7|    1|2018|
| Coloring book moana|ART_AND_DESIGN|   3.9|    967|14000.0|  500000|Free|  0.0|      Everyone|Art & Design;Pret...|  2018-01-15|             2.0.0|     4.0.3 | 15|    1|2018|
|U Launcher Lite –...|ART_AND_DESIGN|   4.7|  87510| 8700.0| 5000000|Free|  0.0|      Everyone|        Art & Design|  20

In [21]:
import pandas as pd
df.toPandas().head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,day,month,year
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159.0,19000.0,10000.0,Free,0.0,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3,7.0,1.0,2018.0
1,Coloring book moana,ART_AND_DESIGN,3.9,967.0,14000.0,500000.0,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3,15.0,1.0,2018.0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510.0,8700.0,5000000.0,Free,0.0,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3,1.0,8.0,2018.0
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644.0,25000.0,50000000.0,Free,0.0,Teen,Art & Design,2018-06-08,Varies with device,4.2,8.0,6.0,2018.0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967.0,2800.0,100000.0,Free,0.0,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4,20.0,6.0,2018.0


In [22]:
df.toPandas().sample()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,day,month,year
3340,NETGEAR WiFi Analytics,TOOLS,4.1,9496.0,696.0,1000000.0,Free,0.0,Everyone,Tools,2014-04-25,1.0.19,2.2,25.0,4.0,2014.0


In [24]:
df.shape()

AttributeError: 'DataFrame' object has no attribute 'shape'

In [25]:
df.count()

10840

In [26]:
print((df.count(), len(df.columns)))

(10840, 16)


In [27]:
df.select([col[0] for col in df.dtypes if col[1] != 'string']).describe().show()

+-------+------------------+-----------------+------------------+-------------------+------------------+------------------+------------------+------------------+
|summary|            Rating|          Reviews|              Size|           Installs|             Price|               day|             month|              year|
+-------+------------------+-----------------+------------------+-------------------+------------------+------------------+------------------+------------------+
|  count|              9364|            10838|              9144|              10838|             10839|             10838|             10838|             10838|
|   mean| 4.191734297720605|444225.1924709356|21508.339238845143|1.546663900046134E7|1.4887600148534719|15.609983391769699| 6.422402657316848|2017.3996124746263|
| stddev|0.5152693805741332|2928025.838407947| 22596.59699806101| 8.50370342845836E7|50.595990194027294| 9.561379976894772|2.5784765206882225|1.1009855900018612|
|    min|               1.0|

In [29]:
df.toPandas().describe()

Unnamed: 0,Rating,Reviews,Size,Installs,Price,day,month,year
count,9364.0,10838.0,9144.0,10838.0,10839.0,10838.0,10838.0,10838.0
mean,4.191734,444225.2,21508.339844,15466640.0,1.48876,15.609983,6.422403,2017.399612
std,0.515269,2928026.0,22596.595703,85037030.0,50.595989,9.56138,2.578477,1.100986
min,1.0,0.0,10.0,0.0,0.0,1.0,1.0,2010.0
25%,4.0,38.0,4900.0,1000.0,0.0,6.0,5.0,2017.0
50%,4.3,2093.5,13000.0,100000.0,0.0,16.0,7.0,2018.0
75%,4.5,54706.0,30000.0,5000000.0,0.0,24.0,8.0,2018.0
max,5.0,78158310.0,100000.0,1000000000.0,5000.0,31.0,12.0,2018.0


In [30]:
df.toPandas().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10840 entries, 0 to 10839
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10840 non-null  object 
 1   Category        10840 non-null  object 
 2   Rating          9364 non-null   float32
 3   Reviews         10838 non-null  float64
 4   Size            9144 non-null   float32
 5   Installs        10838 non-null  float64
 6   Type            10839 non-null  object 
 7   Price           10839 non-null  float32
 8   Content Rating  10840 non-null  object 
 9   Genres          10840 non-null  object 
 10  Last Updated    10840 non-null  object 
 11  Current Ver     10832 non-null  object 
 12  Android Ver     10838 non-null  object 
 13  day             10838 non-null  float64
 14  month           10838 non-null  float64
 15  year            10838 non-null  float64
dtypes: float32(3), float64(5), object(8)
memory usage: 1.2+ MB


In [31]:
df.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: float (nullable = true)
 |-- Reviews: integer (nullable = true)
 |-- Size: float (nullable = true)
 |-- Installs: integer (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: float (nullable = true)
 |-- Content Rating: string (nullable = true)
 |-- Genres: string (nullable = true)
 |-- Last Updated: string (nullable = true)
 |-- Current Ver: string (nullable = true)
 |-- Android Ver: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)



In [33]:
pd.DataFrame(df.select([col[0] for col in df.dtypes if col[1] != 'string']).describe().show())

+-------+------------------+-----------------+------------------+-------------------+------------------+------------------+------------------+------------------+
|summary|            Rating|          Reviews|              Size|           Installs|             Price|               day|             month|              year|
+-------+------------------+-----------------+------------------+-------------------+------------------+------------------+------------------+------------------+
|  count|              9364|            10838|              9144|              10838|             10839|             10838|             10838|             10838|
|   mean| 4.191734297720605|444225.1924709356|21508.339238845143|1.546663900046134E7|1.4887600148534719|15.609983391769699| 6.422402657316848|2017.3996124746263|
| stddev|0.5152693805741332|2928025.838407947| 22596.59699806101| 8.50370342845836E7|50.595990194027294| 9.561379976894772|2.5784765206882225|1.1009855900018612|
|    min|               1.0|

In [34]:
df.select([col[0] for col in df.dtypes if col[1] != 'string']).describe().show()

+-------+------------------+-----------------+------------------+-------------------+------------------+------------------+------------------+------------------+
|summary|            Rating|          Reviews|              Size|           Installs|             Price|               day|             month|              year|
+-------+------------------+-----------------+------------------+-------------------+------------------+------------------+------------------+------------------+
|  count|              9364|            10838|              9144|              10838|             10839|             10838|             10838|             10838|
|   mean| 4.191734297720605|444225.1924709356|21508.339238845143|1.546663900046134E7|1.4887600148534719|15.609983391769699| 6.422402657316848|2017.3996124746263|
| stddev|0.5152693805741332|2928025.838407947| 22596.59699806101| 8.50370342845836E7|50.595990194027294| 9.561379976894772|2.5784765206882225|1.1009855900018612|
|    min|               1.0|

In [37]:
df.describe().show()

+-------+--------------------+-------------+------------------+-----------------+------------------+-------------------+-----------------+------------------+----------------+------+------------+--------------------+-----------+------------------+------------------+------------------+
|summary|                 App|     Category|            Rating|          Reviews|              Size|           Installs|             Type|             Price|  Content Rating|Genres|Last Updated|         Current Ver|Android Ver|               day|             month|              year|
+-------+--------------------+-------------+------------------+-----------------+------------------+-------------------+-----------------+------------------+----------------+------+------------+--------------------+-----------+------------------+------------------+------------------+
|  count|               10840|        10840|              9364|            10838|              9144|              10838|            10839|       

In [38]:
df.summary().show()

[Stage 37:>                                                         (0 + 1) / 1]

+-------+--------------------+-------------+------------------+-----------------+------------------+-------------------+-----------------+------------------+----------------+------+------------+--------------------+-----------+------------------+------------------+------------------+
|summary|                 App|     Category|            Rating|          Reviews|              Size|           Installs|             Type|             Price|  Content Rating|Genres|Last Updated|         Current Ver|Android Ver|               day|             month|              year|
+-------+--------------------+-------------+------------------+-----------------+------------------+-------------------+-----------------+------------------+----------------+------+------------+--------------------+-----------+------------------+------------------+------------------+
|  count|               10840|        10840|              9364|            10838|              9144|              10838|            10839|       

                                                                                

In [43]:
(df.select([col[0] for col in df.dtypes if col[1] != 'string']).describe()).T.sort_index()

AttributeError: 'DataFrame' object has no attribute 'T'

In [45]:
df.describe().toPandas().T

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
App,10840,,,"""""""i DT"""" Fútbol. Todos Somos Técnicos.""",🔥 Football Wallpapers 4K | Full HD Backgrounds 😍
Category,10840,,,traffic jams,WEATHER
Rating,9364,4.191734297720605,0.5152693805741332,1.0,5.0
Reviews,10838,444225.1924709356,2928025.838407947,0,78158306
Size,9144,21508.339238845143,22596.59699806101,10.0,100000.0
Installs,10838,1.546663900046134E7,8.50370342845836E7,0,1000000000
Type,10839,52378.5,70526.12324876507,102248,Paid
Price,10839,1.4887600148534719,50.595990194027294,0.0,5000.0
Content Rating,10840,3000000.0,2828427.12474619,1000000,Unrated


In [46]:
df.toPandas().describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rating,9364.0,4.191734,0.5152694,1.0,4.0,4.3,4.5,5.0
Reviews,10838.0,444225.2,2928026.0,0.0,38.0,2093.5,54706.0,78158310.0
Size,9144.0,21508.34,22596.6,10.0,4900.0,13000.0,30000.0,100000.0
Installs,10838.0,15466640.0,85037030.0,0.0,1000.0,100000.0,5000000.0,1000000000.0
Price,10839.0,1.48876,50.59599,0.0,0.0,0.0,0.0,5000.0
day,10838.0,15.60998,9.56138,1.0,6.0,16.0,24.0,31.0
month,10838.0,6.422403,2.578477,1.0,5.0,7.0,8.0,12.0
year,10838.0,2017.4,1.100986,2010.0,2017.0,2018.0,2018.0,2018.0


In [50]:
df.columns

['App',
 'Category',
 'Rating',
 'Reviews',
 'Size',
 'Installs',
 'Type',
 'Price',
 'Content Rating',
 'Genres',
 'Last Updated',
 'Current Ver',
 'Android Ver',
 'day',
 'month',
 'year']

In [51]:
type(df.columns)

list