<a href="https://colab.research.google.com/github/dubeyabhi07/big-data-spark/blob/master/src/main/pyspark/stocks/stockAnalysisDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# 1. nstall all the dependencies in Colab environment i.e. Apache Spark 2.4.4 with hadoop 2.7, Java 8 and Findspark to locate the spark in the system
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://apache.osuosl.org/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
# 2. Setup Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

In [0]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [11]:
stocksToBeLoaded = {"HDFC" : "../DLF.NS.csv",
                    "ICICI" : "../ICICIBANK.NS.csv",
                    "cipla" : "../CIPLA.NS.csv",
                    "HUL" :"../HINDUNILVR.NS.csv",
                    "sunpharma" : "../SUNPHARMA.NS.csv",
                    "DLF":"../DLF.NS.csv",
                    "indiaBulls":"../IBREALEST.NS.csv",                    
                    "reliance":"../RELIANCE.NS.csv",
                    "ONGC":"../ONGC.NS.csv",
                    "tataMotor":"../TATAMOTORS.NS.csv",
                    "hero":"../HEROMOTOCO.NS.csv",
                    "symphony":"../SYMPHONY.NS.csv",
                    "whirlpool":"../WHIRLPOOL.NS.csv",
                    "donear":"../DONEAR.NS.csv",
                    "raymond":"../RAYMOND.NS.csv"}

dataFrameMap = dict()
for key, value in stocksToBeLoaded.items():
  dataFrameMap.update({key:spark.read.csv(stocksToBeLoaded[key], inferSchema = True, header = True)})

for key,value in dataFrameMap.items():
  print(key+" : ")
  value.show(2)

HDFC : 
+-------------------+----------+----------+----------+----------+----------+-------+
|               Date|      Open|      High|       Low|     Close| Adj Close| Volume|
+-------------------+----------+----------+----------+----------+----------+-------+
|2019-01-01 00:00:00|177.100006|182.149994|177.000000|180.300003|177.382294|6246700|
|2019-01-02 00:00:00|179.899994|179.899994|172.250000|173.100006|170.298813|6604177|
+-------------------+----------+----------+----------+----------+----------+-------+
only showing top 2 rows

ICICI : 
+-------------------+----------+----------+----------+----------+----------+--------+
|               Date|      Open|      High|       Low|     Close| Adj Close|  Volume|
+-------------------+----------+----------+----------+----------+----------+--------+
|2019-01-01 00:00:00|361.500000|364.799988|356.399994|363.750000|361.981659| 7759872|
|2019-01-02 00:00:00|361.500000|367.899994|360.299988|364.600006|362.827515|18243156|
+-----------------

In [51]:
from pyspark.sql.window import Window
from pyspark.sql.functions import *

#if this orderBy is added to windowSpec then columns with agg will have cumulative results
windowSpecMonth = Window.partitionBy("month");

'''
     * 1. Most profitable stock-month combination in 2019 (descending order)
     * Description : which stock would have given most profit in any month of year 2019,
     *  assuming it was bought on first day of that month and sold on last day of that month in 2019.
     
'''
derivedMap = dict()

for key,value in dataFrameMap.items():
  value = value.withColumn("stockName",lit(key))\
  .withColumn("month", substring(col("Date"), 6, 2))\
  .withColumn("row", row_number().over(windowSpecMonth.orderBy(col("Date").desc())))\
  .withColumn("maxRow", max(col("row")).over(windowSpecMonth))\
  .withColumn("relevantPrice", when(col("row") == 1, col("Close"))\
    .when(col("row") == col("maxRow"), col("Open") * (-1))\
    .otherwise(0))\
  .withColumn("referencePrice", when(col("row") == col("maxRow"), col("Open"))\
    .otherwise(0))\
  .select(col("stockName"), col("month"), col("maxRow"),
    col("row"), col("relevantPrice"), col("referencePrice"))\
  .where((col("row") == 1) | (col("row") == col("maxRow")))\
  .drop(col("row")).drop(col("maxRow"))\
  .groupBy(col("month"), col("stockName").alias("stock"))\
  .agg(
      sum("relevantPrice").alias("maxProfitPerUnit"),
      sum("referencePrice").alias("referencePrice"))\
  .select(col("month"), col("stock"), col("referenceprice").alias("baseOpeningPrice"), col("maxProfitPerUnit"),
        ((col("maxProfitPerUnit") * 100) / col("referencePrice")).alias("maxProfitPercent"))
  
  derivedMap.update({key:value})

print("Monthly profit/loss for stocks in 2019 : ")
for key,value in derivedMap.items():
  print(key+" : ")
  value.show(100)
  


Monthly profit/loss for stocks in 2019 : 
HDFC : 
+-----+-----+----------------+-------------------+--------------------+
|month|stock|baseOpeningPrice|   maxProfitPerUnit|    maxProfitPercent|
+-----+-----+----------------+-------------------+--------------------+
|   07| HDFC|      189.699997|-11.550003000000004|  -6.088562563340475|
|   11| HDFC|      184.399994| 35.300003000000004|   19.14316927797731|
|   01| HDFC|      177.100006|             -12.25|  -6.916995813088792|
|   09| HDFC|           164.5| -8.899993999999992|  -5.410330699088141|
|   05| HDFC|      171.050003| 20.149993999999992|  11.780177519201793|
|   08| HDFC|      176.949997| -8.800003000000004|  -4.973158038538991|
|   03| HDFC|           166.0| 28.050003000000004|  16.897592168674702|
|   02| HDFC|      164.949997|              -0.25|-0.15156108187137463|
|   06| HDFC|           191.0| -2.449996999999996|  -1.282720942408375|
|   10| HDFC|           156.0| 27.300003000000004|  17.500001923076926|
|   12| HDFC| 

In [61]:
print("Most profitable stock-month combination stock in 2019 (descending order) : ")

stockList = list(derivedMap.keys())
result = derivedMap[stockList[0]]
derivedMap.pop(stockList[0])   
for key,value in derivedMap.items():
  result = finalDf.union(value)

result = result.orderBy(col("maxProfitPercent").desc())
result.show(20)

Most profitable stock-month combination stock in 2019 (descending order) : 
+-----+----------+----------------+------------------+------------------+
|month|     stock|baseOpeningPrice|  maxProfitPerUnit|  maxProfitPercent|
+-----+----------+----------------+------------------+------------------+
|   10| tataMotor|           118.0|59.699996999999996| 50.59321779661016|
|   03|indiaBulls|            71.0|             20.25| 28.52112676056338|
|   11|   raymond|          579.75|             154.0|  26.5631737818025|
|   11|   raymond|          579.75|             154.0|  26.5631737818025|
|   10|indiaBulls|       43.650002|         11.549999|26.460477596312597|
|   08|    donear|            22.9| 5.950000000000003|25.982532751091714|
|   04|indiaBulls|            93.5|21.550003000000004|23.048131550802147|
|   09| whirlpool|          1545.0|342.44995100000006| 22.16504537216829|
|   04| tataMotor|      176.350006|37.949996999999996|21.519702698507416|
|   11|indiaBulls|            56.0|1

In [64]:
'''
     * 2. Most profitable stock for each month in 2019
     * Description :  which stock would have given most profit in each month of year 2019,
     *  assuming it was bought on first day of that month and sold on last day of that month in 2019.  
'''
#groupBy operation here makes it difficult to retrieve the name of stock
result2 = result.withColumn("row", row_number().over(windowSpecMonth.orderBy(col("maxProfitPercent").desc())))\
      .where(col("row") == 1)\
      .drop(col("row"))\
      .orderBy(col("month"))

print("Most profitable stock for each month in 2019 : ")
result2.show()

Most profitable stock for each month in 2019 : 
+-----+----------+----------------+------------------+------------------+
|month|     stock|baseOpeningPrice|  maxProfitPerUnit|  maxProfitPercent|
+-----+----------+----------------+------------------+------------------+
|   01|  reliance|         1125.25|101.90002400000003| 9.055767518329263|
|   02|  symphony|     1173.949951|119.60009799999989|10.187836193367657|
|   03|indiaBulls|            71.0|             20.25| 28.52112676056338|
|   04|indiaBulls|            93.5|21.550003000000004|23.048131550802147|
|   05|       DLF|      171.050003|20.149993999999992|11.780177519201793|
|   06| whirlpool|          1456.0|134.05004899999994| 9.206734134615381|
|   07| sunpharma|           402.0|             24.75| 6.156716417910448|
|   08|    donear|            22.9| 5.950000000000003|25.982532751091714|
|   09| whirlpool|          1545.0|342.44995100000006| 22.16504537216829|
|   10| tataMotor|           118.0|59.699996999999996| 50.593217