In [None]:
#  Last amended:09th Sep, 2022
#  Myfolder: /home/ashok/Documents/spark
# Ref:
# Tutorials (slightly dated):
#      https://changhsinlee.com/pyspark-dataframe-basics/
#      https://www.analyticsvidhya.com/blog/2016/10/spark-dataframe-and-operations/
# Cheat Sheet
#      https://s3.amazonaws.com/assets.datacamp.com/blog_assets/PySpark_SQL_Cheat_Sheet_Python.pdf

#  Objectives:
#           Dataframe operations in spark cluster

pyspark APIs<br>
> i)  [DataFrame APIs](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#dataframe-apis)<br>
>> df.select(columnName).where(colObject > 30).orderBy(desc(columnName))<br>
>> df.select(columnName).where("colName > 30").orderBy(desc(columnName))<br>

> ii) [Column APIs](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#column-apis)<br>
>> df.select(df.age.isNull())<br>
>> df.select(df["age"].isNull())<br>
>> df.select(col("age").isNull())<br>

> iii) [Data Types](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#data-types)<br>
> iv) [Functions](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#functions)<br>
>> df.select(sum("age"))<br>
>> df.select(sum(col(booleanColumn).cast("int")))<br>
>> <u>but you must import the functions</u>

> v)  [Grouping](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#grouping)<br>

A. Initial operations:
1.0 Start hadoop in a terminal:

            ./allstart.sh
            OR
            ./quick_allstart.sh

## Transfer files to hadoop

In [None]:
# 1.1 Transfer data file 'blackfridayless.csv' to hadoop
#     Linux File folder:  /cdata/misc_datasets/black_friday
#     In Hadoop first make a folder: /user/ashok/datadir 
#     and then transfer the file 'blackfridayless.csv' to 
#     this folder: /user/ashok/datadir

"""

cd ~
hdfs dfs -rm -f -r  /user/ashok/datadir
hdfs dfs -mkdir /user/ashok/datadir
hdfs dfs -put /cdata/misc_datasets/black_friday/blackfridayless.csv  /user/ashok/datadir
hdfs dfs -ls /user/ashok/datadir


"""

!cd ~
!hdfs dfs -rm -f -r  /user/ashok/datadir
!hdfs dfs -mkdir /user/ashok/datadir
!hdfs dfs -put /cdata/misc_datasets/black_friday/blackfridayless.csv  /user/ashok/datadir
!hdfs dfs -ls /user/ashok/datadir




## Set jupyter notebook options
Start pyspark with jupyter notebook interface. There is no need to create SparkContext and Spark session. pyspark creates them when starting.

In [None]:
# 1.2 Display multiple outputs from a cell:
#     with "all", the value of every top-level expression in a cell is
#     displayed, not only that of the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# 1.3 Increase cell width to display wide columnar output.
#     'display' and 'HTML' are imported from the public IPython.display
#     module; importing 'display' from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

### Read the csv file from hadoop

In [None]:
###### Read file 'blackfridayless.csv' from hadoop

# 2.0 What is the URL of the folder in hadoop that holds the blackfriday file?
#     url: "hdfs://localhost:9000/<folderPath>/"
#     Keep the trailing slash: the file name is appended to this string
#     when the file is read.


URL_of_folder= "hdfs://localhost:9000/user/ashok/datadir/"

In [None]:
# 2.1 Load blackfridayless.csv from hadoop into a Spark DataFrame. Takes time.
#     'spark' is the session object that pyspark creates at start-up;
#     its reader is reached through 'spark.read'.

blackfriday = spark.read.csv(
    URL_of_folder + "blackfridayless.csv",   # folder URL + file name
    sep=",",                                 # field separator; could be ; or | etc
    header=True,                             # True or False
    inferSchema=True,                        # True or False
    ignoreLeadingWhiteSpace=True,            # True or False
    ignoreTrailingWhiteSpace=True,           # True or False
)

## Explore the dataframe

In [None]:
# 2.2 Show five rows of data:
#     head(n) returns the first n rows as a list of Row objects;
#     show(n) prints the first n rows as a formatted table.

blackfriday.head(5)
blackfriday.show(5)


In [None]:
# 2.3 Show data columns (the column names of the dataframe)

blackfriday.columns


In [None]:
# 2.4 Show dtypes: one (column name, type name) pair per column

blackfriday.dtypes

In [None]:
# 2.5 Print schema of blackfriday (printed directly; nothing is returned to display):

blackfriday.printSchema()

In [None]:
# 3.0 Summary statistics, a few columns at a time.
#     describe() accepts the column names directly. The first line only
#     builds the summary dataframe; the second also prints it with show().
blackfriday.describe("gender", "age")
blackfriday.describe("gender", "age").show()



In [None]:
# 3.1 Count how many distinct userids are there
#     Use distinct() and count().
#     Without distinct() the result would be the total number of rows,
#     not the number of different users.

blackfriday.select('userid').distinct().count()


In [None]:
# 3.2 Count how many distinct age-groups exist
#     show() lists the distinct age-groups; count() gives how many there are.

blackfriday.select('age').distinct().show()
blackfriday.select('age').distinct().count()

In [None]:
# 4.0 How many null values occur in each column
#     NOTE: importing 'sum' and 'max' by name rebinds those names in the
#     notebook, so from here on plain sum() / max() refer to the pyspark
#     column functions and not to Python's builtins.

from pyspark.sql.functions import isnan, isnull,col, sum, max

In [None]:
# 4.1 Null count of every column, computed in ONE pass over the data.
#     For each column: isNull() gives a boolean, cast("int") turns it into
#     0/1, and sum() (the pyspark function imported in 4.0) adds them up.
#     Building all the aggregates into a single select() runs one Spark job
#     instead of one job per column, and prints one table with a column
#     per original column.
blackfriday.select(
    [sum(col(c).isNull().cast("int")).alias(c) for c in blackfriday.columns]
).show()

Some of these columns have null values. Most probably a null means that the product has no such sub-category. <br>
How would you plan to fill them?<br>
productCat1 :  0 <br>
productCat2 :  31429 <br>
productCat3 :  70100 <br>

In [None]:
# 5.0 Get a list of all integer columns and string columns
#     Use list comprehension along with dtypes.
#     Each element of dtypes is a (column name, type name) pair.

# Integer columns
[ name  for name, dtype in blackfriday.dtypes   if dtype == 'int']

# String columns
[ name  for name, dtype in blackfriday.dtypes   if dtype == 'string']

In [None]:
from pyspark.sql import functions as F

In [None]:
# 5.1 Display maximum of productCat2 and productCat3
#     Use select() along with 'max' function




In [None]:
# 5.2 Find minimum and max values of 'occupation' column



In [None]:
# 5.3 Fill null values in productCat2 and productCat3 with 999
#     Use df.na.fill({})




In [None]:
# 6.0 Transform spark dataframe to pandas dataframe:
#     Use df.toPandas()




In [None]:
# 6.1 Show a value count of levels of column 'cityCategory':
#      Use groupby and count




In [None]:
# 7.0 Perform a stratified sampling of data.
#      Stratified sampling be by column: 'cityCategory'
#        Take 80% from 'B' and 20% from 'C'
#          df.sampleBy('colName', fractions = {})




## Using verbs
>select, <br>
><i>select(x).where()</i>,<br>
><i>select().distinct()</i>,<br>
>filter,<br>
>groupby

### select syntax
> DataFrame.select(\*cols)<br>
> cols: column names (string) or expressions (Column). If one of the column names is ‘*’, that column is expanded to include all columns in the current DataFrame.


In [None]:
# 8.0 Show columns 3rd till 5th



### filter syntax
>DataFrame.filter(condition)<br>
>condition: <i>columnObject > 34</i> or string format: <i>"age > 34"</i>
>>  df.age > 3 or col("age") > 3<br>
>>  "age > 3" <br>
>>Logical Operators<br>
>>> If string: AND OR NOT<br>
>>> If columnObject: &, |, ~ <br>

In [None]:
# 8.1 Filter purchases less than 9000
#      Use filter()



In [None]:
# 8.2 Filter purchases less than 9000 and maritalStatus is 0



In [None]:
# 8.3 Filter purchases less than 9000 or maritalStatus is 0



In [None]:
# 9.0 Combining verbs: select, filter and distinct
#      select columns 'age' ,'purchase' 
#       filter purchases for 'age' of 0-17





## Aggregation with groupby
Use: <i>.agg({'colName1' : 'mean', 'colName2' : 'sum'})</i> <br>
>With <i>agg()</i> one can use only builtin functions and not any other <i>pyspark.sql.function</i>.<br>
Some common functions are: <i>mean, avg, sum, count, first, last,stddev </i>. There is no need to import builtin function in advance.<br>
For a complete list of builtin functions see [here](https://sparkbyexamples.com/pyspark/pyspark-aggregate-functions/).


In [None]:
# Find max of 'occupation' and max of 'purchase'

# Check the column names and types first
blackfriday.dtypes

# Dictionary form of agg(): {'colName': 'builtin function name'}
blackfriday.agg({'occupation': 'max', 'purchase': 'max'}).show()

In [None]:
# 12. groupby. Can apply sum, min, max, count



# 12.1




In [None]:
########### I am done ####################