## References

- https://towardsdatascience.com/a-neanderthals-guide-to-apache-spark-in-python-9ef1f156d427

In [1]:
# Turn on IPython's "greedy" tab completion for this kernel session.
# NOTE(review): newer IPython releases may flag this option as deprecated — confirm against the installed version.
%config IPCompleter.greedy=True

In [2]:
import findspark
findspark.init()

import pyspark
import random

# Number of random points to draw for the estimate.
# A run with 100000000 samples was left disabled here; it gives a tighter estimate at a much higher cost.
num_samples = 10000

# Start a dedicated Spark application named "Pi" for the Monte Carlo job.
sc = pyspark.SparkContext(appName="Pi")

def inside(p):
    """Sample one point in the unit square and test it against the unit circle.

    Args:
        p: Ignored. The parameter exists only because the function is used
            as a one-argument predicate (it is passed to ``filter``).

    Returns:
        bool: True when the sampled point (x, y), with both coordinates
        drawn from ``random.random()``, satisfies ``x*x + y*y < 1``.
    """
    x = random.random()
    y = random.random()
    squared_distance = x * x + y * y
    return squared_distance < 1

# Monte Carlo estimate of pi: the fraction of sampled points that land inside
# the quarter circle approximates pi / 4.
try:
    # Each element of the range triggers one independent random sample in `inside`.
    count = sc.parallelize(range(num_samples)).filter(inside).count()

    pi = 4 * count / num_samples
    print(pi)
finally:
    # Always release the context, even when the job fails. A context that is
    # left running makes the next `pyspark.SparkContext(...)` call in this
    # kernel fail, so the cell could not simply be re-run.
    sc.stop()

3.1196


In [3]:
from pyspark.sql import SparkSession
# Build (or reuse, if one is already active) the SparkSession used by the
# DataFrame cells below. "local[*]" is the local master with a wildcard
# thread count.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Learning_Spark")
    .getOrCreate()
)

In [4]:
# Load the CSV, treating the first line as column names and letting Spark
# infer each column's type from the values.
data = spark.read.csv(
    "data/vgsales.csv",
    header=True,
    inferSchema=True,
)

In [5]:
# Echo the DataFrame: its repr lists the column names with their inferred types.
data

DataFrame[Rank: int, Name: string, Platform: string, Year: string, Genre: string, Publisher: string, NA_Sales: double, EU_Sales: double, JP_Sales: double, Other_Sales: double, Global_Sales: double]

In [6]:
# Shape of the dataset as (number of rows, number of columns).
(data.count(), len(data.columns))

(16598, 11)

In [7]:
# Preview the first five rows (long strings are truncated by default).
data.show(n=5)

+----+--------------------+--------+----+------------+---------+--------+--------+--------+-----------+------------+
|Rank|                Name|Platform|Year|       Genre|Publisher|NA_Sales|EU_Sales|JP_Sales|Other_Sales|Global_Sales|
+----+--------------------+--------+----+------------+---------+--------+--------+--------+-----------+------------+
|   1|          Wii Sports|     Wii|2006|      Sports| Nintendo|   41.49|   29.02|    3.77|       8.46|       82.74|
|   2|   Super Mario Bros.|     NES|1985|    Platform| Nintendo|   29.08|    3.58|    6.81|       0.77|       40.24|
|   3|      Mario Kart Wii|     Wii|2008|      Racing| Nintendo|   15.85|   12.88|    3.79|       3.31|       35.82|
|   4|   Wii Sports Resort|     Wii|2009|      Sports| Nintendo|   15.75|   11.01|    3.28|       2.96|        33.0|
|   5|Pokemon Red/Pokem...|      GB|1996|Role-Playing| Nintendo|   11.27|    8.89|   10.22|        1.0|       31.37|
+----+--------------------+--------+----+------------+---------+--------+--------+--------+-----------+------------+

In [8]:
# Print the schema as a tree: one line per column with its type and nullability.
data.printSchema()

root
 |-- Rank: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Platform: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Genre: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- NA_Sales: double (nullable = true)
 |-- EU_Sales: double (nullable = true)
 |-- JP_Sales: double (nullable = true)
 |-- Other_Sales: double (nullable = true)
 |-- Global_Sales: double (nullable = true)



In [9]:
# Show the first 15 rows of a four-column projection without truncating
# long game names.
(
    data
    .select("Name", "Platform", "NA_Sales", "EU_Sales")
    .show(n=15, truncate=False)
)

+---------------------------+--------+--------+--------+
|Name                       |Platform|NA_Sales|EU_Sales|
+---------------------------+--------+--------+--------+
|Wii Sports                 |Wii     |41.49   |29.02   |
|Super Mario Bros.          |NES     |29.08   |3.58    |
|Mario Kart Wii             |Wii     |15.85   |12.88   |
|Wii Sports Resort          |Wii     |15.75   |11.01   |
|Pokemon Red/Pokemon Blue   |GB      |11.27   |8.89    |
|Tetris                     |GB      |23.2    |2.26    |
|New Super Mario Bros.      |DS      |11.38   |9.23    |
|Wii Play                   |Wii     |14.03   |9.2     |
|New Super Mario Bros. Wii  |Wii     |14.59   |7.06    |
|Duck Hunt                  |NES     |26.93   |0.63    |
|Nintendogs                 |DS      |9.07    |11.0    |
|Mario Kart DS              |DS      |9.81    |7.57    |
|Pokemon Gold/Pokemon Silver|GB      |9.0     |6.18    |
|Wii Fit                    |Wii     |8.94    |8.03    |
|Wii Fit Plus               |Wi

In [10]:
# Summary statistics (count, mean, stddev, min, max) for the two regional
# sales columns.
data.describe("NA_Sales", "EU_Sales").show()

+-------+-------------------+------------------+
|summary|           NA_Sales|          EU_Sales|
+-------+-------------------+------------------+
|  count|              16598|             16598|
|   mean|0.26466742981084057|0.1466520062658483|
| stddev| 0.8166830292988798|0.5053512312869136|
|    min|                0.0|               0.0|
|    max|              41.49|             29.02|
+-------+-------------------+------------------+



In [11]:
# Number of rows per platform, largest first; show only the ten biggest groups.
(
    data
    .groupBy("Platform")
    .count()
    .orderBy("count", ascending=False)
    .show(n=10)
)

+--------+-----+
|Platform|count|
+--------+-----+
|      DS| 2163|
|     PS2| 2161|
|     PS3| 1329|
|     Wii| 1325|
|    X360| 1265|
|     PSP| 1213|
|      PS| 1196|
|      PC|  960|
|      XB|  824|
|     GBA|  822|
+--------+-----+
only showing top 10 rows



In [14]:
# Keep a row when at least one of NA_Sales / EU_Sales is present ...
condition1 = data["NA_Sales"].isNotNull() | data["EU_Sales"].isNotNull()
# ... and JP_Sales is present as well.
condition2 = data["JP_Sales"].isNotNull()

# NOTE: `data` is rebound to the filtered frame, so the unfiltered frame is
# no longer reachable under this name after this cell runs.
data = data.filter(condition1).filter(condition2)
data

DataFrame[Rank: int, Name: string, Platform: string, Year: string, Genre: string, Publisher: string, NA_Sales: double, EU_Sales: double, JP_Sales: double, Other_Sales: double, Global_Sales: double]