In [1]:
import os

import findspark

In [2]:
# Locate the Spark installation. Prefer an externally configured SPARK_HOME so the
# notebook is not tied to one machine's directory layout; fall back to the original
# install location when the variable is not set.
findspark.init(os.environ.get('SPARK_HOME', '/home/ubuntu/spark-2.4.5-bin-hadoop2.7'))

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Create the session for this notebook (or reuse a running one) under the app name 'Samsung'.
spark = (
    SparkSession.builder
    .appName('Samsung')
    .getOrCreate()
)

In [5]:
# Load the price history: the first line supplies the column names and the
# column types are inferred from the data.
df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('Samsung_SSUN.F.csv')
)

In [6]:
# Show the DataFrame's column names.
df.columns

['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']

In [7]:
# Print each column's inferred type and nullability.
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Adj Close: double (nullable = true)
 |-- Volume: integer (nullable = true)



In [8]:
# Peek at the first five records, printing one Row per line.
for record in df.take(5):
    print(record)

Row(Date=datetime.datetime(2015, 3, 2, 0, 0), Open=430.403015, High=434.071991, Low=430.403015, Close=431.451996, Adj Close=369.496979, Volume=972)
Row(Date=datetime.datetime(2015, 3, 3, 0, 0), Open=442.002014, High=444.670013, Low=441.200012, Close=441.200012, Adj Close=377.845184, Volume=585)
Row(Date=datetime.datetime(2015, 3, 4, 0, 0), Open=444.727997, High=450.062012, Low=444.727997, Close=450.062012, Adj Close=385.43457, Volume=596)
Row(Date=datetime.datetime(2015, 3, 5, 0, 0), Open=444.401001, High=448.024994, Low=444.401001, Close=444.434998, Adj Close=380.615631, Volume=509)
Row(Date=datetime.datetime(2015, 3, 6, 0, 0), Open=450.980011, High=458.460999, Low=450.980011, Close=455.0, Adj Close=389.663513, Volume=895)


In [9]:
# Summary statistics (count, mean, stddev, min, max) for the numeric columns.
df.describe().show()

+-------+------------------+-----------------+------------------+------------------+------------------+-----------------+
|summary|              Open|             High|               Low|             Close|         Adj Close|           Volume|
+-------+------------------+-----------------+------------------+------------------+------------------+-----------------+
|  count|              1268|             1268|              1268|              1268|              1268|             1268|
|   mean|  626.833718040221|630.9788122507887| 621.9069379408515| 626.4052489179816| 578.2232215828078|531.9566246056783|
| stddev|164.57297843236915|165.9979473445254|163.10247039214485|164.68402703310642|172.31907072676466|553.1427142820753|
|    min|             306.0|       307.298004|        287.869995|        303.471985|        260.169067|                0|
|    max|            1028.0|           1028.0|            1016.0|            1020.0|            1020.0|             4750|
+-------+------------------+-----------------+------------------+------------------+------------------+-----------------+

In [10]:
# describe() reports every statistic as a string column, which is why the values
# are cast before being formatted further down.
df.describe().printSchema()

root
 |-- summary: string (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)
 |-- Close: string (nullable = true)
 |-- Adj Close: string (nullable = true)
 |-- Volume: string (nullable = true)



In [11]:
from pyspark.sql.functions import format_number

In [12]:
# Keep the summary-statistics DataFrame so its columns can be reformatted.
result = df.describe()

In [13]:
# Readable summary: price statistics rounded to two decimals, Volume as an integer.
# The summary values are strings, so each one is cast before formatting.
result.select(
    [result['summary']]
    + [
        format_number(result[price_col].cast('float'), 2).alias(price_col)
        for price_col in ('Open', 'High', 'Low', 'Close')
    ]
    + [result['Volume'].cast('int').alias('Volume')]
).show()

+-------+--------+--------+--------+--------+------+
|summary|    Open|    High|     Low|   Close|Volume|
+-------+--------+--------+--------+--------+------+
|  count|1,268.00|1,268.00|1,268.00|1,268.00|  1268|
|   mean|  626.83|  630.98|  621.91|  626.41|   531|
| stddev|  164.57|  166.00|  163.10|  164.68|   553|
|    min|  306.00|  307.30|  287.87|  303.47|     0|
|    max|1,028.00|1,028.00|1,016.00|1,020.00|  4750|
+-------+--------+--------+--------+--------+------+



In [14]:
# NOTE(review): this runs describe() on the summary table itself, which is why the
# schema below lists 'summary' twice. result.printSchema() may have been intended — confirm.
result.describe().printSchema()

root
 |-- summary: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)
 |-- Close: string (nullable = true)
 |-- Adj Close: string (nullable = true)
 |-- Volume: string (nullable = true)



In [15]:
# Build df2: df plus a column holding High divided by Volume for each row.
# NOTE(review): the column name 'HV ration' looks like a typo for 'HV ratio'; renaming
# it would also require updating the select in the next cell.
# NOTE(review): Volume has a minimum of 0 in this data, so some rows divide by zero —
# presumably those come out as null; verify before relying on this column.
df2 = df.withColumn("HV ration", df['High']/df['Volume'])

In [16]:
# Display the new ratio column; show() prints only the top 20 rows.
df2.select('HV ration').show()

+-------------------+
|          HV ration|
+-------------------+
|0.44657612242798356|
| 0.7601196803418803|
| 0.7551376040268456|
| 0.8802062750491159|
| 0.5122469262569832|
| 0.8286532664233577|
| 1.0884684587378641|
| 1.0163216958424508|
| 2.0297835714285712|
| 1.3572457457142857|
| 0.6704531150568181|
| 0.9057692307692308|
| 0.7785107562189054|
| 0.7526677406807132|
| 1.8743647581967213|
| 0.6748938097345133|
| 0.4951086956521739|
| 0.8356934142335767|
|                0.3|
|  2.085745351851852|
+-------------------+
only showing top 20 rows



In [17]:
# Return the first eleven rows as a list of Row objects.
df.take(11)

[Row(Date=datetime.datetime(2015, 3, 2, 0, 0), Open=430.403015, High=434.071991, Low=430.403015, Close=431.451996, Adj Close=369.496979, Volume=972),
 Row(Date=datetime.datetime(2015, 3, 3, 0, 0), Open=442.002014, High=444.670013, Low=441.200012, Close=441.200012, Adj Close=377.845184, Volume=585),
 Row(Date=datetime.datetime(2015, 3, 4, 0, 0), Open=444.727997, High=450.062012, Low=444.727997, Close=450.062012, Adj Close=385.43457, Volume=596),
 Row(Date=datetime.datetime(2015, 3, 5, 0, 0), Open=444.401001, High=448.024994, Low=444.401001, Close=444.434998, Adj Close=380.615631, Volume=509),
 Row(Date=datetime.datetime(2015, 3, 6, 0, 0), Open=450.980011, High=458.460999, Low=450.980011, Close=455.0, Adj Close=389.663513, Volume=895),
 Row(Date=datetime.datetime(2015, 3, 9, 0, 0), Open=450.001007, High=454.10199, Low=447.299988, Close=450.121002, Adj Close=385.485138, Volume=548),
 Row(Date=datetime.datetime(2015, 3, 10, 0, 0), Open=447.324005, High=448.449005, Low=443.001007, Close=446

In [18]:
# Row with the highest 'High' price: sort descending and keep only the first row.
df.sort(df['High'].desc()).take(1)

[Row(Date=datetime.datetime(2020, 2, 17, 0, 0), Open=1028.0, High=1028.0, Low=1014.0, Close=1018.0, Adj Close=1018.0, Volume=308)]

In [19]:
# Date on which the highest 'High' occurred, read from the 'Date' field of the top row.
df.sort(df['High'].desc()).take(1)[0]['Date']

datetime.datetime(2020, 2, 17, 0, 0)

In [20]:
from pyspark.sql.functions import mean

In [21]:
# Average closing price over the whole data set.
df.agg(mean('Close')).show()

+-----------------+
|       avg(Close)|
+-----------------+
|626.4052489179816|
+-----------------+



In [22]:
from pyspark.sql.functions import max,min

In [23]:
df.select(max('Volume'),min('Volume')).show()

+-----------+-----------+
|max(Volume)|min(Volume)|
+-----------+-----------+
|       4750|          0|
+-----------+-----------+



In [24]:
# Number of rows that closed below 500, using a SQL-style condition string.
df.where("Close < 500").count()

376

In [25]:
# Same count, this time with the condition written as a Column expression.
df.where(df.Close < 500).count()

376

In [26]:
from pyspark.sql.functions import count

In [27]:
result = df.filter(df['Close']<500)

In [28]:
result.select(count('Close')).show()

+------------+
|count(Close)|
+------------+
|         376|
+------------+



In [29]:
# Percentage of rows whose High exceeded 500; the 1.0 factor forces float division.
(
    df.where(df['High'] > 500).count() * 1.0
    / df.count()
) * 100

70.89905362776025

In [30]:
from pyspark.sql.functions import corr

In [31]:
# Correlation between the daily High and the traded Volume.
df.agg(corr('High', 'Volume')).show()

+-------------------+
| corr(High, Volume)|
+-------------------+
|0.10137876606310263|
+-------------------+



In [32]:
from pyspark.sql.functions import year

In [33]:
# Derive a 'Year' column from each row's Date.
yeardf = df.withColumn('Year', year('Date'))

In [34]:
# Per-year maxima; no column names are passed to max().
# NOTE(review): only max(Low) and max(High) are read afterwards; presumably this also
# aggregates the remaining numeric columns — passing the two names would narrow it. Confirm.
max_df = yeardf.groupBy('Year').max()

In [35]:
# Yearly peak Low and High.
# NOTE(review): no ordering is applied, so the years are not listed chronologically
# (see output); adding orderBy('Year') before show() would sort the table.
max_df.select('Year','max(Low)','max(High)').show()

+----+----------+----------+
|Year|  max(Low)| max(High)|
+----+----------+----------+
|2018|     825.0|     848.0|
|2015|495.100006|499.899994|
|2019|     888.0|     894.0|
|2020|    1016.0|    1028.0|
|2016|575.000977|579.356018|
|2017|     890.0|     987.0|
+----+----------+----------+



In [36]:
from pyspark.sql.functions import month

In [37]:
# Derive a 'Month' column from each row's Date.
monthdf = df.withColumn('Month', month(df['Date']))

In [38]:
# Average the selected columns for each calendar month.
monthavga = (
    monthdf
    .select('Month', 'Open', 'Close')
    .groupBy('Month')
    .avg()
)

In [39]:
# Monthly average Open and Close, listed from month 1 upward.
(
    monthavga
    .select('Month', 'avg(Open)', 'avg(Close)')
    .sort('Month')
    .show()
)

+-----+-----------------+-----------------+
|Month|        avg(Open)|       avg(Close)|
+-----+-----------------+-----------------+
|    1|678.8277130092591|678.6112164814816|
|    2|681.8094853564359|680.8845250792081|
|    3|594.8133805092592|594.2579729814817|
|    4|606.3922738383839| 605.423030030303|
|    5|604.2577951121494| 603.731999457944|
|    6|599.8140474339623|598.9598305188678|
|    7|605.9957000818183|606.9227910454547|
|    8| 597.704955232143|596.9848569910715|
|    9|607.6713208490567| 607.265982528302|
|   10| 634.734324101852|634.6637120925925|
|   11|668.9955277592593|668.1095657314816|
|   12|645.4945698210526|645.5071687578948|
+-----+-----------------+-----------------+

