###### Using the functions from one .ipynb notebook in another .ipynb file
* https://stackoverflow.com/questions/44116194/import-a-function-from-another-ipynb-file
* https://github.com/ipython/ipynb

##### Enable the shell to print multiple results (instead of only the last result)

In [None]:
## Enable the shell to print multiple results (instead of only the last result)
# Setting the option on the InteractiveShell class to "all" is what switches the display from "last result only" to every result.

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

###### Conversion from Spark DataFrame to a Pandas DataFrame and vice versa

In [None]:
from pyspark.sql import SparkSession

def getSparkDFfromPandasDF(pandasDF):
    """
    Build a Spark DataFrame from the given Pandas DataFrame.

    A SparkSession with the application name "chin_conv" is obtained through
    ``getOrCreate()`` and Arrow-based transfer is switched on before the conversion.

    Parameters
    ----------
    pandasDF : the Pandas DataFrame to convert.

    Returns
    -------
    The Spark DataFrame produced by ``createDataFrame(pandasDF)``.
    """
    session = SparkSession.builder.appName("chin_conv").getOrCreate()

    # Switch on Arrow-based columnar data transfers.
    # On some machines this may give a warning which may be ignored.
    # NOTE(review): newer Spark releases appear to prefer the key
    # "spark.sql.execution.arrow.pyspark.enabled" - confirm against the Spark version in use.
    session.conf.set("spark.sql.execution.arrow.enabled", "true")

    return session.createDataFrame(pandasDF)

def getPandasDFfromSparkDF(sparkDF):
    """
    Convert a Spark DataFrame into a Pandas DataFrame.

    Parameters
    ----------
    sparkDF : the Spark DataFrame to convert.

    Returns
    -------
    The result of ``toPandas()`` applied to a select of all columns ("*") of ``sparkDF``.
    """
    allColumns = sparkDF.select("*")
    return allColumns.toPandas()

import json
import pyspark

"""
Here I am importing the full pyspark package for readability.
In the method getJsonFromSparkDF(), I am validating whether the parameter is a Spark DataFrame.
I could have compared with DataFrame after doing a selective import (from pyspark.sql.dataframe import DataFrame),
but users may confuse it with the pandas DataFrame if they accidentally miss the import statement.

Now I am using fully qualified module names for the comparison, thus improving readability.
"""

def getJsonFromSparkDF(sparkDF):
    """
    Serialise the rows of a Spark DataFrame as JSON text.

    The DataFrame is collected into a list of Spark Rows (pyspark.sql.types.Row), each
    Row is converted to a dict with ``asDict()`` and dumped with ``json.dumps``, and the
    per-row JSON objects are joined with ", ".
    Note: for more than one row the result is a comma-separated sequence of JSON
    objects without enclosing brackets, i.e. it is not a JSON array.

    Parameters
    ----------
    sparkDF : a Spark DataFrame (pyspark.sql.dataframe.DataFrame).

    Returns
    -------
    A string holding the JSON of every row. The empty JSON object string '{}' is
    returned when the input is not a Spark DataFrame, when the DataFrame has no rows,
    or when the collected elements are not Spark Rows.
    """
    if not isinstance(sparkDF, pyspark.sql.dataframe.DataFrame):
        return '{}'

    sparkRowList = sparkDF.collect()  # collect() returns the sparkDF rows as a list of Spark Rows

    # An empty DataFrame collects to an empty list; return the fallback instead of
    # failing with an IndexError on sparkRowList[0] below.
    if not isinstance(sparkRowList, list) or not sparkRowList:
        return '{}'
    if not isinstance(sparkRowList[0], pyspark.sql.types.Row):
        return '{}'

    # Join once instead of growing the string row by row.
    return ", ".join(json.dumps(row.asDict()) for row in sparkRowList)

##### Reading call logs (from Superbackup app) in python (where values are node attributes)

In [None]:
import pandas as pd 
import xml.etree.ElementTree as etree

def getCallLogXmlFromSuperbackup(call_log_xml_file="calllogs_20200512130135.xml"):
    """
    Parse the exported call logs (xml) from the SuperBackup android application into a Pandas DataFrame.

    Every child element of the xml root is treated as one log record. The values are read
    from the attributes ["number", "time", "date", "type", "name", "dur"] of each record
    using the module "xml.etree.ElementTree". An attribute that is missing on a record
    gives None in the corresponding cell.

    Parameters
    ----------
    call_log_xml_file : A string representing the xml file name (may be a full path), or a
        file object accepted by ``xml.etree.ElementTree.parse``. This is the log file
        produced by the SuperBackup android application.

    Returns
    -------
    A Pandas DataFrame with one row per log record and the columns
    ["number", "time", "date", "type", "name", "dur"].
    """
    columns = ["number", "time", "date", "type", "name", "dur"]  # The column list is closely tied to the call log xml
    root = etree.parse(call_log_xml_file).getroot()

    # Collect all rows first and build the DataFrame once. Appending row by row with
    # DataFrame.append copied the whole frame on every record and the method no longer
    # exists in pandas 2.0+.
    rows = [[node.attrib.get(column) for column in columns] for node in root]

    # dtype=object matches the frame the row-by-row version started from and keeps
    # missing attributes as None.
    return pd.DataFrame(rows, columns=columns, dtype=object)

#### Reading Plain Text Files

In [None]:
#from codecs import open
# Print contents of a file

def printTextFile(file_name):
    """
    Opens a text file (txt, csv, xml etc..) in 'utf-8' encoding format and prints its contents.

    params
    ------
    file_name : A relative path or a full path to a file in the same file system as this utility.

    The whole file is read into memory before it is printed. Errors from opening or
    decoding the file are not caught here.
    """
    # The with-block closes the file even when read() raises (e.g. on a decoding error).
    with open(file_name, 'r', encoding='utf-8') as f:
        file_contents = f.read()
    print(file_contents)

### Conversion between Pandas DataFrame and Spark DataFrame
###### https://docs.databricks.com/spark/latest/spark-sql/spark-pandas.html
###### https://stackoverflow.com/questions/50958721/convert-a-spark-dataframe-to-pandas-df

<pre>
from pyspark.sql import SparkSession
spark1 = SparkSession.builder.appName("chin_conv").getOrCreate()

import numpy as np
import pandas as pd

* Enable Arrow-based columnar data transfers
spark1.conf.set("spark.sql.execution.arrow.enabled", "true")

* Generate a pandas DataFrame
pdf = pd.DataFrame(np.random.rand(100, 3))

* Create a Spark DataFrame from a pandas DataFrame using Arrow
sdf = spark1.createDataFrame(pdf)

* Convert the Spark DataFrame back to a pandas DataFrame using Arrow
result_pdf = sdf.select("*").toPandas()
</pre>

### TEST CODE

In [None]:
# Paths of the sample input files for the test cells.
# NOTE(review): my_json_file is not referenced by any cell visible here - confirm it is still needed.
my_xml_file = "calllogs_20200512130135.xml"
my_json_file = "../Python-and-Spark-for-Big-Data-master/Spark_DataFrames/people.json"

In [None]:
# Print the raw contents of the sample call log xml
printTextFile(my_xml_file)

In [None]:
#df_Calllogs[df_Calllogs["number"]=="+919052656567"]
#df_Calllogs["duration"].max()
# df_Calllogs

### %timeit getCallLogXmlFromSuperbackup()
### 7.65 s ± 201 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

p_df = getCallLogXmlFromSuperbackup()    # returns the sample call log data as a pandas DataFrame
p_df

In [None]:
# Convert the pandas call log DataFrame into a Spark DataFrame and display it
s_df = getSparkDFfromPandasDF(p_df)
s_df.show()

In [None]:
# Convert the Spark DataFrame back into a pandas DataFrame
temp_p_df = getPandasDFfromSparkDF(s_df)
temp_p_df

In [None]:
getJsonFromSparkDF([1,2,3])  # Testing for wrong input: a plain list is not a Spark DataFrame, so '{}' is returned

# Serialise the rows of the Spark call log DataFrame as JSON text
callog_json = getJsonFromSparkDF(s_df)
callog_json