# PySpark Hive Playground
## A playground for enthusiasts

Author: GGordon (https://github.com/gggordon)

## Utility Functions

In [2]:
def run_process(command):
    """
    Runs a process on the command line (no shell involved) and captures its output.

    params:
        command: str, or list/tuple of arguments.
                 A string is tokenized with shell-like rules (`shlex.split`), so
                 quoted arguments containing spaces stay in one piece and repeated
                 spaces do not produce empty arguments.
                 A list/tuple is used as the argument vector as-is (each item is
                 converted with `str`).

    returns:
        dict with keys
            "output": decoded stdout, or None when stdout is empty / whitespace only
            "error":  decoded stderr, or the exception message when the process
                      could not be started (e.g. executable not found)

    Never raises: any failure is reported through the "error" key.
    """
    import shlex
    import subprocess

    try:
        if isinstance(command, (list, tuple)):
            args = [str(part) for part in command]
        else:
            # POSIX-style tokenization; honours single/double quotes and escapes.
            args = shlex.split(str(command))

        if not args:
            return {"output": None, "error": "empty command"}

        result = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        # errors="replace" keeps non-UTF-8 bytes from discarding the whole output.
        output = result.stdout.decode("utf8", errors="replace")

        return {
            "output": output if output.strip() else None,
            "error": result.stderr.decode("utf8", errors="replace"),
        }
    except Exception as e:
        # Deliberate best-effort: callers test `["output"] is None` instead of
        # handling exceptions, so every failure is folded into the result dict.
        return {"output": None, "error": str(e)}
    
def pfile_exists(file_path):
    """
    Checks whether a path exists on the local filesystem.

    The check is done directly against the filesystem instead of inspecting the
    stdout of `ls`, which reported existing-but-empty directories as missing and
    split paths containing spaces into several arguments.

    params:
        file_path: path to test; converted with `str` before the check

    returns:
        bool: True if the path exists (a broken symlink counts as existing, as it
              would be listed by `ls`), False otherwise
    """
    import os

    return os.path.lexists(str(file_path))

def print_process_output(process_output, separator=None):
    """
    Prints the result of `run_process` as three lines/sections:
    the captured output, a separator, then the captured error text.

    params:
        process_output: dict with keys "output" and "error"; a missing key is
                        printed as an empty string. Nothing is printed when this
                        argument is None.
        separator: str, marker printed between output and error
                   (defaults to 32 dashes when None)
    """
    if process_output is None:
        return

    divider = separator if separator is not None else "-" * 32

    if "output" in process_output:
        print(process_output["output"])
    else:
        print("")

    print(divider)

    if "error" in process_output:
        print(process_output["error"])
    else:
        print("")


# Short alias used by the cells that follow.
ppo = print_process_output

## Install Pyspark

In [3]:
import importlib.util

# Ask the kernel's own import system whether `pyspark` is importable. Looking for
# a `pyspark` launcher on PATH (`which pyspark`) answers a different question: a
# launcher can exist without the package being importable here, and vice versa.
if importlib.util.find_spec("pyspark") is None:
    print("Installing Pyspark...")
    # NOTE(review): this uses whichever `pip` is first on PATH and installs the
    # latest release; confirm it targets the kernel's environment and consider
    # pinning a version for reproducibility.
    result = run_process("pip install pyspark")
    ppo(result)
else:
    print("Pyspark already installed")

Installing Pyspark...
Collecting pyspark
  Downloading pyspark-3.0.2.tar.gz (204.8 MB)
Collecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py): started
  Building wheel for pyspark (setup.py): finished with status 'done'
  Created wheel for pyspark: filename=pyspark-3.0.2-py2.py3-none-any.whl size=205186689 sha256=6b7ef173d2635cf95a3e53f3996569393f70c28c46c8047a312878319d5f78f6
  Stored in directory: /root/.cache/pip/wheels/9a/39/f6/970565f38054a830e9a8593f388b36e14d75dba6c6fdafc1ec
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.2

--------------------------------
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.



## Retrieve retail_db files as JSON

In [4]:
# Uncomment the lines below to download the retail_db sample data (needs wget and network access)
# !mkdir -p data/retaildbjson
# !wget -O data/retaildbjson/products.json https://github.com/dgadiraju/data/raw/master/retail_db_json/products/part-r-00000-158b7037-4a23-47e6-8cb3-8cbf878beff7
# !wget -O data/retaildbjson/orders.json https://github.com/dgadiraju/data/raw/master/retail_db_json/orders/part-r-00000-990f5773-9005-49ba-b670-631286032674
# !wget -O data/retaildbjson/orderitems.json https://github.com/dgadiraju/data/raw/master/retail_db_json/order_items/part-r-00000-6b83977e-3f20-404b-9b5f-29376ab1419e
# !wget -O data/retaildbjson/departments.json https://github.com/dgadiraju/data/raw/master/retail_db_json/departments/part-r-00000-3db7cfae-3ad2-4fc7-88ff-afe0ec709f49
# !wget -O data/retaildbjson/customers.json https://github.com/dgadiraju/data/raw/master/retail_db_json/customers/part-r-00000-70554560-527b-44f6-9e80-4e2031af5994
# !wget -O data/retaildbjson/categories.json https://github.com/dgadiraju/data/raw/master/retail_db_json/categories/part-r-00000-ce1d8208-178d-48d3-bfb2-1a97d9c05094 

# Analytics Using Spark

In [5]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

In [6]:
# Spark master URL handed to the session builder in the next cell; a falsy
# value (None or "") skips the explicit .master(...) call there.
spark_master = "local[*]"

In [7]:
# Configure the builder under its own name so that `sqlContext` only ever refers
# to the finished session object.
session_builder = SparkSession.builder.appName("SCHAD Data Profiling")

# Only set the master explicitly when one was configured.
if spark_master:
    session_builder = session_builder.master(spark_master)

# Despite its name, `sqlContext` is the object returned by the SparkSession builder.
sqlContext = session_builder.getOrCreate()

In [8]:
# Bare expression as the last line of the cell: the notebook shows the session object.
sqlContext

In [9]:
from IPython.display import display as  idisplay

In [10]:
# Smoke test: select from an inline VALUES table to confirm the session runs SQL.
colors_query = """

SELECT color from (
SELECT * from VALUES ("Red"),("Blue"), ("Green"), ("Yellow") as Colors (color)
) colors
"""

colors_df = sqlContext.sql(colors_query)
colors_df.show()

+------+
| color|
+------+
|   Red|
|  Blue|
| Green|
|Yellow|
+------+

