In [1]:
import findspark

In [2]:
# Point findspark at the Spark installation. Prefer the SPARK_HOME
# environment variable so the notebook is not tied to one machine's
# directory layout; fall back to the original install path when it is unset.
import os

findspark.init(os.environ.get('SPARK_HOME', '/home/ubuntu/spark-2.4.5-bin-hadoop2.7'))

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Create (or reuse) the Spark session used by every later cell.
# The application name was misspelled as 'Hyunadi'; it now matches the
# dataset's company name.
spark = SparkSession.builder.appName('Hyundai').getOrCreate()

In [5]:
# Load the price history. header=True takes column names from the first
# line; inferSchema=True asks Spark to infer each column's type.
df = spark.read.csv(
    'Hyundai Motor Company.csv',
    header=True,
    inferSchema=True,
)

In [6]:
# List the column names of the loaded frame.
df.columns

['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']

In [7]:
# Print each column's inferred type and nullability.
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Adj Close: double (nullable = true)
 |-- Volume: integer (nullable = true)



In [8]:
# Show the first five records, one Row per line.
for row in df.take(5):
    print(row)

Row(Date=datetime.datetime(2015, 3, 2, 0, 0), Open=48.950001, High=48.950001, Low=48.950001, Close=48.950001, Adj Close=39.317558, Volume=0)
Row(Date=datetime.datetime(2015, 3, 3, 0, 0), Open=48.950001, High=48.950001, Low=48.950001, Close=48.950001, Adj Close=39.317558, Volume=500)
Row(Date=datetime.datetime(2015, 3, 4, 0, 0), Open=48.950001, High=48.950001, Low=48.950001, Close=48.950001, Adj Close=39.317558, Volume=0)
Row(Date=datetime.datetime(2015, 3, 5, 0, 0), Open=48.950001, High=48.950001, Low=48.950001, Close=48.950001, Adj Close=39.317558, Volume=0)
Row(Date=datetime.datetime(2015, 3, 6, 0, 0), Open=51.139999, High=51.139999, Low=49.599998, Close=49.599998, Adj Close=39.839649, Volume=31500)


In [9]:
# Summary statistics (count, mean, stddev, min, max) for the numeric columns.
df.describe().show()

+-------+------------------+------------------+-----------------+-----------------+-----------------+-----------------+
|summary|              Open|              High|              Low|            Close|        Adj Close|           Volume|
+-------+------------------+------------------+-----------------+-----------------+-----------------+-----------------+
|  count|              1259|              1259|             1259|             1259|             1259|             1259|
|   mean|39.494741768069844| 39.54116751072275|39.40830014138199| 39.4606194495631| 35.2082239046863| 950.833995234313|
| stddev| 6.559876473950293|6.5369870174235025|6.576103128354254|6.551472221533396|4.286853086778444|4626.548348592509|
|    min|             25.25|             25.25|            25.25|            25.25|        23.582779|                0|
|    max|              58.5|              58.5|             58.5|             58.5|        46.988308|            97900|
+-------+------------------+------------------+-----------------+-----------------+-----------------+-----------------+

In [10]:
# Inspect the schema of the summary frame: every statistic is returned as a
# string, which is why the later formatting cell casts before formatting.
df.describe().printSchema()

root
 |-- summary: string (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)
 |-- Close: string (nullable = true)
 |-- Adj Close: string (nullable = true)
 |-- Volume: string (nullable = true)



In [11]:
from pyspark.sql.functions import format_number

In [12]:
# Keep the summary-statistics frame so its columns can be reformatted below.
result = df.describe()

In [13]:
# Present the summary with prices rounded to two decimals.
# The statistics are stored as strings, so each price column is cast to a
# float before format_number is applied. Volume is cast to int, which drops
# the fractional part of its mean and stddev.
formatted_prices = [
    format_number(result[name].cast('float'), 2).alias(name)
    for name in ['Open', 'High', 'Low', 'Close']
]
result.select(
    [result['summary']]
    + formatted_prices
    + [result['Volume'].cast('int').alias('Volume')]
).show()

+-------+--------+--------+--------+--------+------+
|summary|    Open|    High|     Low|   Close|Volume|
+-------+--------+--------+--------+--------+------+
|  count|1,259.00|1,259.00|1,259.00|1,259.00|  1259|
|   mean|   39.49|   39.54|   39.41|   39.46|   950|
| stddev|    6.56|    6.54|    6.58|    6.55|  4626|
|    min|   25.25|   25.25|   25.25|   25.25|     0|
|    max|   58.50|   58.50|   58.50|   58.50| 97900|
+-------+--------+--------+--------+--------+------+



In [14]:
# Print the schema obtained by calling describe() on the summary frame itself.
# NOTE(review): `result` is already the output of df.describe(), so this
# describes the statistics a second time and yields two `summary` columns.
# If the goal was only to inspect `result`, result.printSchema() is probably
# what was intended -- confirm.
result.describe().printSchema()

root
 |-- summary: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)
 |-- Close: string (nullable = true)
 |-- Adj Close: string (nullable = true)
 |-- Volume: string (nullable = true)



In [15]:
# Add the ratio of the daily High price to the traded Volume.
# Rows whose Volume is 0 produce null in this column.
# NOTE(review): the label 'HV ration' looks like a typo for 'HV ratio'; it is
# kept unchanged because the following cell selects the column by this name.
df2 = df.withColumn(
    "HV ration",
    df.High / df.Volume,
)

In [16]:
# Display the derived High/Volume column (first 20 rows by default).
df2.select(df2['HV ration']).show()

+--------------------+
|           HV ration|
+--------------------+
|                null|
|         0.097900002|
|                null|
|                null|
|0.001623492031746...|
|0.004242734957264957|
|                null|
|           0.0124375|
|               0.245|
|                null|
| 0.01745517206896552|
|                null|
|                null|
|                null|
|         0.127299995|
|0.021956521739130434|
| 0.01596774193548387|
|                null|
|                null|
|                null|
+--------------------+
only showing top 20 rows



In [17]:
# Fetch the first eleven rows as a list of Row objects.
df.take(11)

[Row(Date=datetime.datetime(2015, 3, 2, 0, 0), Open=48.950001, High=48.950001, Low=48.950001, Close=48.950001, Adj Close=39.317558, Volume=0),
 Row(Date=datetime.datetime(2015, 3, 3, 0, 0), Open=48.950001, High=48.950001, Low=48.950001, Close=48.950001, Adj Close=39.317558, Volume=500),
 Row(Date=datetime.datetime(2015, 3, 4, 0, 0), Open=48.950001, High=48.950001, Low=48.950001, Close=48.950001, Adj Close=39.317558, Volume=0),
 Row(Date=datetime.datetime(2015, 3, 5, 0, 0), Open=48.950001, High=48.950001, Low=48.950001, Close=48.950001, Adj Close=39.317558, Volume=0),
 Row(Date=datetime.datetime(2015, 3, 6, 0, 0), Open=51.139999, High=51.139999, Low=49.599998, Close=49.599998, Adj Close=39.839649, Volume=31500),
 Row(Date=datetime.datetime(2015, 3, 9, 0, 0), Open=49.639999, High=49.639999, Low=49.639999, Close=49.639999, Adj Close=39.871784, Volume=11700),
 Row(Date=datetime.datetime(2015, 3, 10, 0, 0), Open=49.639999, High=49.639999, Low=49.639999, Close=49.639999, Adj Close=39.871784,

In [18]:
# Row with the highest High price: sort descending on High, keep the top row.
df.sort(df.High.desc()).head(1)

[Row(Date=datetime.datetime(2015, 4, 27, 0, 0), Open=58.5, High=58.5, Low=58.5, Close=58.5, Adj Close=46.988308, Volume=400)]

In [19]:
# Date on which the highest High price occurred.
# The field is read by name rather than by position, so the result does not
# silently change if the column order of the source file changes.
df.orderBy(df['High'].desc()).head(1)[0]['Date']

datetime.datetime(2015, 4, 27, 0, 0)

In [20]:
from pyspark.sql.functions import mean

In [21]:
# Average closing price over the whole data set.
df.agg(mean('Close')).show()

+----------------+
|      avg(Close)|
+----------------+
|39.4606194495631|
+----------------+



In [22]:
from pyspark.sql.functions import max,min

In [23]:
# Largest and smallest High price. `max` and `min` here are the
# pyspark.sql.functions aggregates imported in the previous cell, not the
# Python builtins.
df.agg(
    max('High'),
    min('High'),
).show()

+---------+---------+
|max(High)|min(High)|
+---------+---------+
|     58.5|    25.25|
+---------+---------+



In [24]:
# Number of days that closed below 28, using a SQL-style condition string.
df.where("Close < 28").count()

38

In [25]:
# Same count as the SQL-string form, written as a Column expression.
df.where(df.Close < 28).count()

38

In [26]:
from pyspark.sql.functions import count

In [27]:
# Keep only the days that closed below 28.
# Note: this rebinds `result`, which earlier held the describe() summary.
result = df.where(df['Close'] < 28)

In [28]:
# Count the filtered rows with the `count` aggregate function.
result.agg(count('Close')).show()

+------------+
|count(Close)|
+------------+
|          38|
+------------+



In [29]:
# Percentage of trading days whose High exceeded 30.
days_above_30 = df.filter(df['High'] > 30).count()
total_days = df.count()
# Multiplying by 1.0 forces floating-point division.
(days_above_30 * 1.0 / total_days) * 100

91.26290706910247

In [30]:
from pyspark.sql.functions import corr

In [31]:
# Correlation between the High price and the Volume.
df.agg(corr('High', 'Volume')).show()

+-------------------+
| corr(High, Volume)|
+-------------------+
|0.05385715370896034|
+-------------------+



In [32]:
from pyspark.sql.functions import year

In [33]:
# Derive a Year column from the Date timestamp for per-year grouping.
yeardf = df.withColumn('Year', year('Date'))

In [34]:
# Per-year maxima; the resulting columns are named like max(Low), max(High).
max_df = yeardf.groupBy('Year').max()

In [35]:
# Show the yearly maximum Low and High prices.
yearly_columns = ['Year', 'max(Low)', 'max(High)']
max_df.select(yearly_columns).show()

+----+---------+---------+
|Year| max(Low)|max(High)|
+----+---------+---------+
|2018|     47.5|     47.5|
|2015|     58.5|     58.5|
|2019|     45.0|     45.0|
|2020|    33.98|    33.98|
|2016|46.349998|46.349998|
|2017|49.650002|     50.0|
+----+---------+---------+



In [36]:
from pyspark.sql.functions import month

In [37]:
# Derive a Month column (1-12) from the Date timestamp.
monthdf = df.withColumn('Month', month(df['Date']))

In [38]:
# Average Open and Close per calendar month, pooled across all years.
monthavga = (
    monthdf
    .select(['Month', 'Open', 'Close'])
    .groupBy('Month')
    .mean()
)

In [39]:
# Display the monthly averages in calendar order.
(
    monthavga
    .select(['Month', 'avg(Open)', 'avg(Close)'])
    .sort('Month')
    .show()
)

+-----+------------------+------------------+
|Month|         avg(Open)|        avg(Close)|
+-----+------------------+------------------+
|    1|  37.9899020490196| 37.86813727450979|
|    2|37.343645708333334|37.485104041666666|
|    3|41.529541339449544| 41.48192671559634|
|    4| 41.29980534951457|  41.2720383300971|
|    5| 42.18158863551404| 42.16551389719628|
|    6|41.648130467289704| 41.61887816822429|
|    7| 39.34276150476192|39.319999609523826|
|    8| 38.29526786607141|  38.2748214642857|
|    9|39.135148415841606|39.105148396039624|
|   10| 38.81639663963962|38.802522756756744|
|   11| 38.55320406796118| 38.47941765048545|
|   12| 37.48766983495146| 37.34815525242719|
+-----+------------------+------------------+

