In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from modules.my_pyspark import *
from modules.my_drawer import MyDrawer
from pyspark.sql.functions import mean

**Start a simple Spark Session**

In [3]:
spark = MyPySpark(session=True, sql=True)
drawer = MyDrawer()

In [4]:
spark.context

**Load the Walmart Stock CSV file, have Spark infer the data types**

In [5]:
file_path = r'data/walmart_stock.csv'

In [6]:
walmart_stock = spark.readFile(file_path)

**What are the column names**

In [7]:
walmart_stock.columns

['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']

**What does the scheme look like**

In [8]:
walmart_stock.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



**Print out the first 5 rows**

In [9]:
walmart_stock.show(5)

+----------+------------------+---------+---------+------------------+--------+------------------+
|      Date|              Open|     High|      Low|             Close|  Volume|         Adj Close|
+----------+------------------+---------+---------+------------------+--------+------------------+
|2012-01-03|         59.970001|61.060001|59.869999|         60.330002|12668800|52.619234999999996|
|2012-01-04|60.209998999999996|60.349998|59.470001|59.709998999999996| 9593300|         52.078475|
|2012-01-05|         59.349998|59.619999|58.369999|         59.419998|12768200|         51.825539|
|2012-01-06|         59.419998|59.450001|58.869999|              59.0| 8069400|          51.45922|
|2012-01-09|         59.029999|59.549999|58.919998|             59.18| 6679300|51.616215000000004|
+----------+------------------+---------+---------+------------------+--------+------------------+
only showing top 5 rows



**Use `describe()` to learn about the dataframe**

In [10]:
walmart_stock.describe().show()

+-------+----------+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|summary|      Date|              Open|             High|              Low|            Close|           Volume|        Adj Close|
+-------+----------+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|  count|      1258|              1258|             1258|             1258|             1258|             1258|             1258|
|   mean|      null| 72.35785375357709|72.83938807631165| 71.9186009594594|72.38844998012726|8222093.481717011|67.23883848728146|
| stddev|      null|  6.76809024470826|6.768186808159218|6.744075756255496|6.756859163732991|  4519780.8431556|6.722609449996857|
|    min|2012-01-03|56.389998999999996|        57.060001|        56.299999|        56.419998|          2094900|        50.363689|
|    max|2016-12-30|         90.800003|        90.970001|            89.25|        90.4700

**Create a new dataframe with a column called `HV Ratio` that is the ratio of the `High Price` versus volume of stock traded for a day**

In [11]:
walmart_stock = walmart_stock.withColumn('HV Ratio', walmart_stock['High'] / walmart_stock['Volume'])

In [12]:
walmart_stock.select('HV Ratio').show()

+--------------------+
|            HV Ratio|
+--------------------+
|4.819714653321546E-6|
|6.290848613094555E-6|
|4.669412994783916E-6|
|7.367338463826307E-6|
|8.915604778943901E-6|
|8.644477436914568E-6|
|9.351828421515645E-6|
| 8.29141562102703E-6|
|7.712212102001476E-6|
|7.071764823529412E-6|
|1.015495466386981E-5|
|6.576354146362592...|
| 5.90145296180676E-6|
|8.547679455011844E-6|
|8.420709512685392E-6|
|1.041448341728929...|
|8.316075414862431E-6|
|9.721183814992126E-6|
|8.029436027707578E-6|
|6.307432259386365E-6|
+--------------------+
only showing top 20 rows



**What day had the Peak High in Price**

In [13]:
walmart_stock.orderBy(walmart_stock['High'].desc()).first()['Date']

'2015-01-13'

**What is the mean of the Close column**

In [14]:
walmart_stock.select(mean(walmart_stock['Close'])).show()

+-----------------+
|       avg(Close)|
+-----------------+
|72.38844998012726|
+-----------------+



In [15]:
walmart_stock.agg({'Close': 'avg'}).show()

+-----------------+
|       avg(Close)|
+-----------------+
|72.38844998012726|
+-----------------+



In [16]:
walmart_stock.groupBy().avg('Close').show()

+-----------------+
|       avg(Close)|
+-----------------+
|72.38844998012726|
+-----------------+



**What is the max and min of the Volume column**

In [17]:
walmart_stock.agg({'Volume': 'max'}).show()

+-----------+
|max(Volume)|
+-----------+
|   80898100|
+-----------+



In [18]:
walmart_stock.agg({'Volume': 'min'}).show()

+-----------+
|min(Volume)|
+-----------+
|    2094900|
+-----------+



**How many dates was the Close lower than 60 dollars**

In [19]:
walmart_stock.filter(walmart_stock['Close'] <= 60).count()

81

**What percentage of the time was the High greater than 80 dollars**

In [20]:
walmart_stock.filter(walmart_stock['High'] >= 80).count() / walmart_stock.count()

0.09141494435612083

**What is the Pearson correlation between High and Volume**

In [21]:
walmart_stock.select(corr('High', 'Volume')).show()

NameError: name 'corr' is not defined

**What is the max High per year**

In [40]:
walmart_stock = walmart_stock.withColumn('Year', year('Date'))

In [43]:
walmart_stock.groupBy('Year').max('High').show()

+----+---------+
|Year|max(High)|
+----+---------+
|2015|90.970001|
|2013|81.370003|
|2014|88.089996|
|2012|77.599998|
|2016|75.190002|
+----+---------+



**What is the average Close for each Calendar Month**

In [44]:
walmart_stock = walmart_stock.withColumn('Month', month('Date'))

In [47]:
walmart_stock.select('Month', 'Close').groupBy('Month').mean('Close').orderBy('Month').show()

+-----+-----------------+
|Month|       avg(Close)|
+-----+-----------------+
|    1|71.44801958415842|
|    2|  71.306804443299|
|    3|71.77794377570092|
|    4|72.97361900952382|
|    5|72.30971688679247|
|    6| 72.4953774245283|
|    7|74.43971943925233|
|    8|73.02981855454546|
|    9|72.18411785294116|
|   10|71.57854545454543|
|   11| 72.1110893069307|
|   12|72.84792478301885|
+-----+-----------------+

