#### Enabling a Jupyter cell to print multiple results from a single cell

In [1]:
## Make each cell print multiple results (instead of only the last result)

from IPython.core.interactiveshell import InteractiveShell
# "all" is the setting that makes every result of a cell get displayed, not just the last one.
InteractiveShell.ast_node_interactivity = "all"

###### Import modules from another ipynb (Jupyter notebook written by me)

In [2]:
import ipynb.fs  # Boilerplate required

# Do a full import
# from .full.Chinmay_Utilities import foo

# Do a definitions-only import
from .defs.Chinmay_Utilities import getCallLogXmlFromSuperbackup, getSparkDFfromPandasDF, getJsonFromSparkDF, printTextFile  #, getPandasDFfromSparkDF

# We can "import ipynb.fs.defs.Chinmay_Utilities" instead of two imports "import ipynb.fs" followed by ".defs.Chinmay_Utilities"

#### Printing contents from a plain text file (.txt, .csv, .xml etc.)

<pre>
#from codecs import open
#Print contents of a file
def printTextFile(file_name):
    """Print the entire contents of a UTF-8 encoded text file to standard output.

    Args:
        file_name: Path of the text file to read.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the contents are not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises,
    # which the previous open()/close() pair did not guarantee.
    with open(file_name, 'r', encoding='utf-8') as f:
        file_contents = f.read()
    print (file_contents)
</pre>

###### First define all the local sample data files in local variables to be used in the exercises below

In [3]:
# All sample data files used by the exercises live in the same course folder,
# so the shared prefix is spelled out once and each file name is appended to it.
_SPARK_DF_DIR = "../Python-and-Spark-for-Big-Data-master/Spark_DataFrames"

my_csv_app_stocks = _SPARK_DF_DIR + "/appl_stock.csv"
my_csv_sales_info = _SPARK_DF_DIR + "/sales_info.csv"
my_csv_contains_null = _SPARK_DF_DIR + "/ContainsNull.csv"
my_json_people = _SPARK_DF_DIR + "/people.json"

### Spark DataFrame Experiments

###### Refer to the documentation for the pyspark.sql package at https://spark.apache.org/docs/latest/api/python/pyspark.sql.html

In [4]:
from pyspark.sql import SparkSession

#### Getting help on a method in a builder pattern

In [5]:
# To get help on a method / attribute in the middle of a builder chain:
#     1. split the chain just before that method,
#     2. assign the result of the chain up to that point to a variable,
#        so that the method can be looked up on the variable,
#     3. execute (SHIFT+ENTER) the help syntax on that variable, i.e. "method?" and NOT "method()?".
#
# Below is an example for getting help on the getOrCreate() method in
# "SparkSession.builder.appName('Basics').getOrCreate()"
#
bld = SparkSession.builder.appName('Basics')
# bld.getOrCreate??  ### Uncomment this line to get the help ("?") and code implementation ("??")
#
# Here we cannot use "SparkSession.builder.appName('Basics').getOrCreate?" directly,
# because there is a user input involved (i.e. the parameter of appName())

In [6]:
# Uncomment specific function below and run this shell to get help
# getCallLogXmlFromSuperbackup?
# getSparkDFfromPandasDF?
# getJsonFromSparkDF?
# printTextFile??
# getPandasDFfromSparkDF?


###### Read a json using SparkSession and analyse the resulting dataframe

In [7]:
# Get (or create) the SparkSession with app name 'Basics'; the DataFrame examples below read through it
spark1 = SparkSession.builder.appName('Basics').getOrCreate()

In [8]:
# Print the raw sample JSON file, then read the same file into a Spark DataFrame
printTextFile(my_json_people)
sdf = spark1.read.json(my_json_people)

# Other readers/options available on spark1.read: csv/format/jdbc/json/load/option/options/orc/parquet/schema/table/text

{"name":"Michael"}
{"name":"Andy", "age":30}
{"name":"Justin", "age":19}



In [9]:
# printSchema() prints the schema Spark decided on from the data while reading the JSON
sdf.printSchema()

# List of column names
sdf.columns

# Without parentheses this only echoes the bound method object; no summary is produced
sdf.describe

# Calling it returns another DataFrame, echoed as its column signature rather than its rows
sdf.describe()

# describe() gives a summary (count/mean/stddev/min/max) of the columns; show() prints the rows
sdf.describe().show()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



['age', 'name']

<bound method DataFrame.describe of DataFrame[age: bigint, name: string]>

DataFrame[summary: string, age: string, name: string]

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



###### Modify the JsonSchema using a user defined schema

In [10]:
# Many times spark can not determine the data types in a json correctly and specifies each of the fields as String.
# In this case we can define a schema and attach it to the json
from pyspark.sql.types import StructField, IntegerType, StringType, StructType

In [11]:
# StructField(field_name, field_type(), is_field_nullable)
# To enforce a user-defined schema on a json, pass a list of StructFields, one for each column
# NOTE(review): 'age' is declared non-nullable here, yet the DataFrame read with this schema
#     further down reports "nullable = true" and contains a null age, so the flag does not
#     appear to be enforced on read. Confirm before relying on it.
data_schema = [StructField ('age', IntegerType(), False), 
                StructField('name', StringType(), True)]

final_type = StructType(fields=data_schema)

# Echo the whole schema, then look up individual fields by name
final_type
final_type["age"]
final_type["name"]

StructType(List(StructField(age,IntegerType,false),StructField(name,StringType,true)))

StructField(age,IntegerType,false)

StructField(name,StringType,true)

###### Read the same json using the user defined schema (earlier it was the default one)

In [12]:
# Show the raw file again, then re-read it with the user-defined schema instead of the inferred one
printTextFile(my_json_people)
sdf2 = spark1.read.json(my_json_people, schema=final_type)

# 'age' is now reported as integer rather than the inferred long
sdf2.printSchema()
sdf2.show()

{"name":"Michael"}
{"name":"Andy", "age":30}
{"name":"Justin", "age":19}

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



###### Refer back to the Spark DataFrame with the built-in default schema

In [13]:
# For comparison: the DataFrame read with the default (inferred) schema reports 'age' as long
sdf.printSchema()
sdf.show()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



### Invoking a method to get the sample call log using a module from another ipynb file in the same folder
* ###### sparkCallSession.createDataFrame(p_df) --> converts pandas dataframe into spark dataframe
* ###### s_df.select("*").toPandas() --> converts spark dataframe into pandas dataframe

In [14]:
# Get the call log xml data in a Pandas DataFrame
dfCallLogs = getCallLogXmlFromSuperbackup()   ## Calling from Chinmay_Utilities.ipynb

# Convert the Pandas DataFrame into a Spark DataFrame
# NOTE(review): the recorded output of this cell shows an Arrow (readArrowStreamFromFile) error
#     message, yet sdfCallLogs is used successfully by later cells. Presumably the helper falls
#     back to a non-Arrow conversion; confirm in Chinmay_Utilities.ipynb.
sdfCallLogs = getSparkDFfromPandasDF(dfCallLogs)   ## Calling from Chinmay_Utilities.ipynb

  An error occurred while calling z:org.apache.spark.sql.api.python.PythonSQLUtils.readArrowStreamFromFile.
: java.lang.IllegalArgumentException
	at java.nio.ByteBuffer.allocate(ByteBuffer.java:334)
	at org.apache.arrow.vector.ipc.message.MessageSerializer.readMessage(MessageSerializer.java:543)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$$anon$3.readNextBatch(ArrowConverters.scala:243)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$$anon$3.<init>(ArrowConverters.scala:229)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$.getBatchesFromStream(ArrowConverters.scala:228)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$$anonfun$readArrowStreamFromFile$2.apply(ArrowConverters.scala:216)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$$anonfun$readArrowStreamFromFile$2.apply(ArrowConverters.scala:214)
	at org.apache.spark.util.Utils$.tryWithResource(Utils.scala:2543)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$.readArro

#### Using Select statements with Spark
* We can use limited select statement (upto selection of columns, adding computed columns and without any where clause)
* * spark dataframe can use complex where clause as explained below
* To use full version of select sql along with where clause we need to register the Spark DataFrame as a table using the method below.
* * sparkDF.createOrReplaceTempView(pseudo_tableView_name)
* * This is used only with sparkSessn.sql()

In [15]:
# Register the Spark DataFrame as a table/view named "call_logs" so that it can be
# queried like a standard SQL table through sparkSession.sql()
sdfCallLogs.createOrReplaceTempView("call_logs")

##### Using pure SQL with DataFrames
* Register the dataframe as a table and
* Use sparkSession1.sql(SQL_STMT_table)

##### Filtering using SQL
* ###### sparkSesn.sql(full_sql)
* ###### This works but supports limited where clause

##### Using pure SQL with DataFrames
* Register the dataframe as a table and
* Use sparkSession1.sql(SQL_STMT_table), for example:
  `spark1.sql("SELECT * FROM call_logs WHERE dur > 100 ORDER BY dur DESC").show()`
* The where clause of this sql is not supporting LIKE clause
* This complex where clause (LIKE clause) is possible through direct "where" clause on spark dataframe (next statement)

###### Quick filtering: (one of the two alternate ways on sparkDF)
* ###### sparkDF.select(*).where(my_condition)
* ###### sparkDF.filter(my_condition)

In [16]:
# SQL-style condition string reused by the where()/filter() examples below:
# upper-cased name matches '%SEEC%QA%' and dur is greater than 50
my_where_clause = "upper(name) like '%SEEC%QA%' AND dur > 50"

In [17]:
# Query the Spark DataFrame directly with a WHERE clause.
# A SQL-style condition (e.g. with a LIKE clause) can be passed to where() as a plain string.

# sdfCallLogs.select("*").where("upper(name) like '%SEEC%QA%'").show()

# Convert the filtered data into a pandas DataFrame, which can be processed further or written to a file
df_qa = sdfCallLogs.select("*").where(my_where_clause).toPandas()
df_qa

Unnamed: 0,number,time,date,type,name,dur
0,919490578459,"Apr 14, 2020 10:12:17",1586839337615,2,Ex Seec QA Brahmanand,87
1,919490578459,"Mar 17, 2020 08:00:02",1584412202747,2,Ex Seec QA Brahmanand,698
2,919490578459,"Feb 29, 2020 16:07:50",1582972670439,2,Ex Seec QA Brahmanand,1910
3,919490436219,"Feb 16, 2020 21:27:45",1581868665565,1,Ex Seec Shanmuk Donkada SQA Manager,471
4,919490436219,"Feb 11, 2020 09:11:31",1581392491179,1,Ex Seec Shanmuk Donkada SQA Manager,107
5,919848554459,"Jan 27, 2020 20:39:51",1580137791944,2,Ex Seec QA Brahmanand,2311
6,919848554459,"Jan 27, 2020 19:12:35",1580132555904,1,Ex Seec QA Brahmanand,302
7,919985395760,"Nov 29, 2019 09:40:05",1575000605417,1,Ex Seec QA Srikanth Gunti,1171


In [18]:
# filter() with the same condition string; the recorded output matches the select("*").where(...) form above
sdfCallLogs.filter(my_where_clause).toPandas()

Unnamed: 0,number,time,date,type,name,dur
0,919490578459,"Apr 14, 2020 10:12:17",1586839337615,2,Ex Seec QA Brahmanand,87
1,919490578459,"Mar 17, 2020 08:00:02",1584412202747,2,Ex Seec QA Brahmanand,698
2,919490578459,"Feb 29, 2020 16:07:50",1582972670439,2,Ex Seec QA Brahmanand,1910
3,919490436219,"Feb 16, 2020 21:27:45",1581868665565,1,Ex Seec Shanmuk Donkada SQA Manager,471
4,919490436219,"Feb 11, 2020 09:11:31",1581392491179,1,Ex Seec Shanmuk Donkada SQA Manager,107
5,919848554459,"Jan 27, 2020 20:39:51",1580137791944,2,Ex Seec QA Brahmanand,2311
6,919848554459,"Jan 27, 2020 19:12:35",1580132555904,1,Ex Seec QA Brahmanand,302
7,919985395760,"Nov 29, 2019 09:40:05",1575000605417,1,Ex Seec QA Srikanth Gunti,1171


In [19]:
# Keep four columns of the filtered call log and give them display-friendly names
# before converting to a pandas DataFrame
df_qa2 = (
    sdfCallLogs.filter(my_where_clause)
    .select('name', 'number', 'dur', 'time')
    .withColumnRenamed('dur', 'Duration (Sec)')
    .withColumnRenamed('name', 'Name')
    .withColumnRenamed('time', 'Date')
    .withColumnRenamed('number', 'Phone Number')
    .toPandas()
)

# Same selection, plus a computed column holding twice the value of 'dur'
df_qa2B = (
    sdfCallLogs.filter(my_where_clause)
    .select('name', 'number', 'dur', 'time')
    .withColumn('double_duration', sdfCallLogs['dur'] * 2)
    .toPandas()
)

# Show both pandas DataFrames without the index column
# NOTE(review): Styler.hide_index() is deprecated in newer pandas releases in favour of
#     Styler.hide(axis="index"). Confirm against the installed pandas version.
df_qa2.style.hide_index()
df_qa2B.style.hide_index()

# Write the renamed DataFrame to an Excel file, again without the index column
df_qa2.to_excel('1.xlsx', index=False)

Name,Phone Number,Duration (Sec),Date
Ex Seec QA Brahmanand,919490578459,87,"Apr 14, 2020 10:12:17"
Ex Seec QA Brahmanand,919490578459,698,"Mar 17, 2020 08:00:02"
Ex Seec QA Brahmanand,919490578459,1910,"Feb 29, 2020 16:07:50"
Ex Seec Shanmuk Donkada SQA Manager,919490436219,471,"Feb 16, 2020 21:27:45"
Ex Seec Shanmuk Donkada SQA Manager,919490436219,107,"Feb 11, 2020 09:11:31"
Ex Seec QA Brahmanand,919848554459,2311,"Jan 27, 2020 20:39:51"
Ex Seec QA Brahmanand,919848554459,302,"Jan 27, 2020 19:12:35"
Ex Seec QA Srikanth Gunti,919985395760,1171,"Nov 29, 2019 09:40:05"


name,number,dur,time,double_duration
Ex Seec QA Brahmanand,919490578459,87,"Apr 14, 2020 10:12:17",174
Ex Seec QA Brahmanand,919490578459,698,"Mar 17, 2020 08:00:02",1396
Ex Seec QA Brahmanand,919490578459,1910,"Feb 29, 2020 16:07:50",3820
Ex Seec Shanmuk Donkada SQA Manager,919490436219,471,"Feb 16, 2020 21:27:45",942
Ex Seec Shanmuk Donkada SQA Manager,919490436219,107,"Feb 11, 2020 09:11:31",214
Ex Seec QA Brahmanand,919848554459,2311,"Jan 27, 2020 20:39:51",4622
Ex Seec QA Brahmanand,919848554459,302,"Jan 27, 2020 19:12:35",604
Ex Seec QA Srikanth Gunti,919985395760,1171,"Nov 29, 2019 09:40:05",2342


In [20]:
# Column-expression form of a second filter: dur > 300 AND NOT (dur < 1000).
# & and ~ combine/negate Column conditions; each comparison is parenthesised because
# & and ~ bind more tightly than > and < in Python.
sdfCallLogs.filter(my_where_clause).filter((sdfCallLogs['dur']>300) & ~(sdfCallLogs['dur']<1000)).show()

+-------------+--------------------+-------------+----+--------------------+----+
|       number|                time|         date|type|                name| dur|
+-------------+--------------------+-------------+----+--------------------+----+
|+919490578459|Feb 29, 2020 16:0...|1582972670439|   2|Ex Seec QA Brahma...|1910|
|+919848554459|Jan 27, 2020 20:3...|1580137791944|   2|Ex Seec QA Brahma...|2311|
|+919985395760|Nov 29, 2019 09:4...|1575000605417|   1|Ex Seec QA Srikan...|1171|
+-------------+--------------------+-------------+----+--------------------+----+



In [21]:
# The second filter written as a SQL condition string; the recorded output shows the same
# three rows as the Column-expression form
sdfCallLogs.filter(my_where_clause).filter('dur>300 and dur>=1000').show()

+-------------+--------------------+-------------+----+--------------------+----+
|       number|                time|         date|type|                name| dur|
+-------------+--------------------+-------------+----+--------------------+----+
|+919490578459|Feb 29, 2020 16:0...|1582972670439|   2|Ex Seec QA Brahma...|1910|
|+919848554459|Jan 27, 2020 20:3...|1580137791944|   2|Ex Seec QA Brahma...|2311|
|+919985395760|Nov 29, 2019 09:4...|1575000605417|   1|Ex Seec QA Srikan...|1171|
+-------------+--------------------+-------------+----+--------------------+----+



###### Convert the Spark DataFrame into Json
###### Convert a Pandas DataFrame into Json by first converting into a Spark DataFrame

In [22]:
# Convert the whole call-log Spark DataFrame to JSON; the result is echoed as a single string
getJsonFromSparkDF(sdfCallLogs)   ## Calling from Chinmay_Utilities.ipynb

'{"number": "+919052656567", "time": "May 12, 2020 11:27:23", "date": "1589263043752", "type": "2", "name": "HDFC Preferred Banker", "dur": "111"}, {"number": "+919052656567", "time": "May 12, 2020 11:24:12", "date": "1589262852040", "type": "1", "name": "HDFC Preferred Banker", "dur": "72"}, {"number": "+919052656567", "time": "May 11, 2020 14:36:56", "date": "1589188016416", "type": "1", "name": "HDFC Preferred Banker", "dur": "143"}, {"number": "+919989726495", "time": "May 10, 2020 12:54:39", "date": "1589095479969", "type": "1", "name": "", "dur": "40"}, {"number": "+917978882337", "time": "May 10, 2020 11:27:54", "date": "1589090274761", "type": "2", "name": "Maa", "dur": "376"}, {"number": "+918249658490", "time": "May 10, 2020 11:26:39", "date": "1589090199512", "type": "2", "name": "Mama (Papa)", "dur": "17"}, {"number": "+918249658490", "time": "May 10, 2020 11:10:01", "date": "1589089201436", "type": "2", "name": "Mama (Papa)", "dur": "965"}, {"number": "+919822069274", "tim

In [23]:
# To get JSON from a pandas DataFrame, first convert it into a Spark DataFrame and then get the JSON from that
getJsonFromSparkDF(getSparkDFfromPandasDF(dfCallLogs))   ## Calling from Chinmay_Utilities.ipynb

  An error occurred while calling z:org.apache.spark.sql.api.python.PythonSQLUtils.readArrowStreamFromFile.
: java.lang.IllegalArgumentException
	at java.nio.ByteBuffer.allocate(ByteBuffer.java:334)
	at org.apache.arrow.vector.ipc.message.MessageSerializer.readMessage(MessageSerializer.java:543)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$$anon$3.readNextBatch(ArrowConverters.scala:243)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$$anon$3.<init>(ArrowConverters.scala:229)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$.getBatchesFromStream(ArrowConverters.scala:228)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$$anonfun$readArrowStreamFromFile$2.apply(ArrowConverters.scala:216)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$$anonfun$readArrowStreamFromFile$2.apply(ArrowConverters.scala:214)
	at org.apache.spark.util.Utils$.tryWithResource(Utils.scala:2543)
	at org.apache.spark.sql.execution.arrow.ArrowConverters$.readArro

'{"number": "+919052656567", "time": "May 12, 2020 11:27:23", "date": "1589263043752", "type": "2", "name": "HDFC Preferred Banker", "dur": "111"}, {"number": "+919052656567", "time": "May 12, 2020 11:24:12", "date": "1589262852040", "type": "1", "name": "HDFC Preferred Banker", "dur": "72"}, {"number": "+919052656567", "time": "May 11, 2020 14:36:56", "date": "1589188016416", "type": "1", "name": "HDFC Preferred Banker", "dur": "143"}, {"number": "+919989726495", "time": "May 10, 2020 12:54:39", "date": "1589095479969", "type": "1", "name": "", "dur": "40"}, {"number": "+917978882337", "time": "May 10, 2020 11:27:54", "date": "1589090274761", "type": "2", "name": "Maa", "dur": "376"}, {"number": "+918249658490", "time": "May 10, 2020 11:26:39", "date": "1589090199512", "type": "2", "name": "Mama (Papa)", "dur": "17"}, {"number": "+918249658490", "time": "May 10, 2020 11:10:01", "date": "1589089201436", "type": "2", "name": "Mama (Papa)", "dur": "965"}, {"number": "+919822069274", "tim

##### Check types of DataFrame, Columns and displaying selected columns as DataFrame

In [24]:
# NOTE(review): the schema names this column 'age'; 'Age' still resolves in the recorded
#     outputs, presumably because column-name resolution is case-insensitive here. Confirm.
type(sdf)
type(sdf['Age'])           # indexing by name gives a Column, not a DataFrame
#sdf['Age'].show()         # This line does not work, as a Column cannot be displayed directly
sdf.select('Age').show()   # This returns a DataFrame of the selected columns, which can be shown
type(sdf.select('Age'))

sdf[sdf['Age']>0].show()    # This works similarly to regular pandas DataFrame filtering; the null-age row is dropped

pyspark.sql.dataframe.DataFrame

pyspark.sql.column.Column

+----+
| Age|
+----+
|null|
|  30|
|  19|
+----+



pyspark.sql.dataframe.DataFrame

+---+------+
|age|  name|
+---+------+
| 30|  Andy|
| 19|Justin|
+---+------+



##### Display rows from the top of the dataframe

In [25]:
sdf.head(10) # returns at most the first 10 rows of the DataFrame, echoed as a list of Row objects

[Row(age=None, name='Michael'),
 Row(age=30, name='Andy'),
 Row(age=19, name='Justin')]

##### Renaming a column

In [26]:
# withColumnRenamed returns a DataFrame in which 'age' appears as 'old_age';
# sdf itself still has its 'age' column afterwards (it is used again further down)
sdf1 = sdf.withColumnRenamed('age', 'old_age')
type(sdf1)
sdf1.show()

pyspark.sql.dataframe.DataFrame

+-------+-------+
|old_age|   name|
+-------+-------+
|   null|Michael|
|     30|   Andy|
|     19| Justin|
+-------+-------+



##### Inserting a computed column

In [27]:
# withColumn returns a DataFrame with an extra computed column; a null age gives a null double_age
# NOTE(review): this rebinds sdf2, which earlier held the DataFrame read with the user-defined schema
sdf2 = sdf.withColumn('double_age',sdf['age']*2)
type(sdf2)
sdf2.show()

pyspark.sql.dataframe.DataFrame

+----+-------+----------+
| age|   name|double_age|
+----+-------+----------+
|null|Michael|      null|
|  30|   Andy|        60|
|  19| Justin|        38|
+----+-------+----------+



#### Experimenting with GroupBy and Aggregates

In [3]:
# Repeated definition (for readability): same value as in the sample-file cell near the top,
# so that this GroupBy section can be read on its own
my_csv_sales_info = "../Python-and-Spark-for-Big-Data-master/Spark_DataFrames/sales_info.csv"

In [30]:
# SparkSession is imported again here so this section does not depend on the earlier import cell
from pyspark.sql import SparkSession
# NOTE(review): a session was already created earlier in this notebook, so getOrCreate()
#     presumably hands back that existing session rather than a new "chin_groupby" one. Confirm.
sparkSesnGrpby = SparkSession.builder.appName("chin_groupby").getOrCreate()

In [39]:
# header=True: let Spark treat the first row as the column names
# inferSchema=True: let Spark decide each column's data type from the data values
sdf_sales = sparkSesnGrpby.read.csv(my_csv_sales_info, inferSchema=True, header=True)

In [75]:
# Check the schema Spark inferred for the CSV (Sales comes out as double)
sdf_sales.printSchema()

# Print the rows of the sales data
sdf_sales.show()

root
 |-- Company: string (nullable = true)
 |-- Person: string (nullable = true)
 |-- Sales: double (nullable = true)

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|    Sam|200.0|
|   GOOG|Charlie|120.0|
|   GOOG|  Frank|340.0|
|   MSFT|   Tina|600.0|
|   MSFT|    Amy|124.0|
|   MSFT|Vanessa|243.0|
|     FB|   Carl|870.0|
|     FB|  Sarah|350.0|
|   APPL|   John|250.0|
|   APPL|  Linda|130.0|
|   APPL|   Mike|750.0|
|   APPL|  Chris|350.0|
+-------+-------+-----+



In [53]:
sdf_sales.groupBy("Company")    # returns pyspark.sql.group.GroupedData

# Aggregators on top of GroupedData return a Spark DataFrame for our consumption,
#     e.g. sdf_sales.groupBy("Company").mean().show()
# Various groupBy aggregator functions: count/mean/sum/max/min
sdf_sales.groupBy("Company").count().show()
sdf_sales.groupBy("Company").mean().withColumnRenamed('avg(Sales)','Average Sales').show()
sdf_sales.groupBy("Company").sum().show()
sdf_sales.groupBy("Company").max().show()
sdf_sales.groupBy("Company").min().show()

<pyspark.sql.group.GroupedData at 0x1c140c1dfc8>

+-------+-----+
|Company|count|
+-------+-----+
|   APPL|    4|
|   GOOG|    3|
|     FB|    2|
|   MSFT|    3|
+-------+-----+

+-------+-----------------+
|Company|    Average Sales|
+-------+-----------------+
|   APPL|            370.0|
|   GOOG|            220.0|
|     FB|            610.0|
|   MSFT|322.3333333333333|
+-------+-----------------+

+-------+----------+
|Company|sum(Sales)|
+-------+----------+
|   APPL|    1480.0|
|   GOOG|     660.0|
|     FB|    1220.0|
|   MSFT|     967.0|
+-------+----------+

+-------+----------+
|Company|max(Sales)|
+-------+----------+
|   APPL|     750.0|
|   GOOG|     340.0|
|     FB|     870.0|
|   MSFT|     600.0|
+-------+----------+

+-------+----------+
|Company|min(Sales)|
+-------+----------+
|   APPL|     130.0|
|   GOOG|     120.0|
|     FB|     350.0|
|   MSFT|     124.0|
+-------+----------+



In [56]:
# collect() returns the aggregated rows as a list of Row objects instead of printing a table
sdf_sales.groupBy("Company").sum().collect()

[Row(Company='APPL', sum(Sales)=1480.0),
 Row(Company='GOOG', sum(Sales)=660.0),
 Row(Company='FB', sum(Sales)=1220.0),
 Row(Company='MSFT', sum(Sales)=967.0)]

In [79]:
# Keep the grouped data in a variable so that aggregates can be applied to it later
group_data_by_company = sdf_sales.groupBy('Company')
group_data_by_company.sum().show()

+-------+----------+
|Company|sum(Sales)|
+-------+----------+
|   APPL|    1480.0|
|   GOOG|     660.0|
|     FB|    1220.0|
|   MSFT|     967.0|
+-------+----------+



In [80]:
# The aggregate result is itself a Spark DataFrame, so it can be converted to JSON the same way
getJsonFromSparkDF(group_data_by_company.sum())  # using function from Chinmay_Utilities.ipynb

'{"Company": "APPL", "sum(Sales)": 1480.0}, {"Company": "GOOG", "sum(Sales)": 660.0}, {"Company": "FB", "sum(Sales)": 1220.0}, {"Company": "MSFT", "sum(Sales)": 967.0}'