In [2]:
 
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
from pyspark.sql import SparkSession

import pandas as pd   

data = [['pig1', 'line1','antiox', 50], ['pig2','line2','standard', 45],
        ['pig3','line2','antiox', 54],['pig4','line1','antiox',34]] 
  
# Create the pandas DataFrame 
pandasDF = pd.DataFrame(data, columns = ['animal_code', 'species','diet','m_antiox']) 
  
# print dataframe. 
print(pandasDF)


spark = SparkSession.builder \
    .master("local[1]") \
    .appName("pandas-dataset") \
    .getOrCreate()

sparkDF=spark.createDataFrame(pandasDF) 
sparkDF.printSchema()
sparkDF.show()





#sparkDF=spark.createDataFrame(pandasDF.astype(str)) 
mySchema = StructType([ StructField("animal_code", StringType(), True)\
                         , StructField("species", StringType(), True)\
                         ,StructField("diet", StringType(), True)\
                         ,StructField("m_antiox", IntegerType(), True)])

sparkDF2 = spark.createDataFrame(pandasDF,schema=mySchema)
sparkDF2.printSchema()
sparkDF2.show()






spark.conf.set("spark.sql.execution.arrow.enabled","true")
spark.conf.set("spark.sql.execution.arrow.pyspark.fallback.enabled","true")

pandasDF2=sparkDF2.select("*").toPandas
print(pandasDF2)


test=spark.conf.get("spark.sql.execution.arrow.enabled")
print(test)

test123=spark.conf.get("spark.sql.execution.arrow.pyspark.fallback.enabled")
print(test123)

  animal_code species      diet  m_antiox
0        pig1   line1    antiox        50
1        pig2   line2  standard        45
2        pig3   line2    antiox        54
3        pig4   line1    antiox        34
root
 |-- animal_code: string (nullable = true)
 |-- species: string (nullable = true)
 |-- diet: string (nullable = true)
 |-- m_antiox: long (nullable = true)

+-----------+-------+--------+--------+
|animal_code|species|    diet|m_antiox|
+-----------+-------+--------+--------+
|       pig1|  line1|  antiox|      50|
|       pig2|  line2|standard|      45|
|       pig3|  line2|  antiox|      54|
|       pig4|  line1|  antiox|      34|
+-----------+-------+--------+--------+

root
 |-- animal_code: string (nullable = true)
 |-- species: string (nullable = true)
 |-- diet: string (nullable = true)
 |-- m_antiox: integer (nullable = true)

+-----------+-------+--------+--------+
|animal_code|species|    diet|m_antiox|
+-----------+-------+--------+--------+
|       pig1|  line1| 

In [3]:
inputfile="antiox.csv"
sparkDF3 = spark.read.format("csv") \
      .option("header", True) \
      .schema(mySchema) \
      .load(inputfile)
sparkDF3.show()

+-----------+-------+--------+--------+
|animal_code|species|    diet|m_antiox|
+-----------+-------+--------+--------+
|          1|  line1|  antiox|      55|
|          2|  line2|standard|      45|
|          3|  line2|  antiox|      58|
|          4|  line1|  antiox|      33|
|          5|  line2|standard|      45|
|          6|  line1|standard|      51|
|          7|  line1|  antiox|      47|
|          8|  line2|  antiox|      52|
|          9|  line2|standard|      34|
|         10|  line1|standard|      39|
|         11|  line2|  antiox|      54|
|         12|  line1|standard|      45|
|         13|  line1|standard|      39|
|         14|  line2|  antiox|      34|
|         15|  line1|  antiox|      45|
|         16|  line1|  antiox|      53|
|         17|  line2|standard|      44|
|         18|  line1|standard|      39|
|         19|  line1|  antiox|      20|
|         20|  line2|  antiox|      39|
+-----------+-------+--------+--------+
only showing top 20 rows



In [4]:

sparkDF3.createOrReplaceTempView("EMP")
spark.sql("select species, sum(m_antiox) as sum_m_antiox , avg(m_antiox) as avg_m_antiox  from EMP " +
          "group by species having avg_m_antiox > 40 " + 
          "order by sum_m_antiox desc").show()

+-------+------------+------------------+
|species|sum_m_antiox|      avg_m_antiox|
+-------+------------+------------------+
|  line1|         880|41.904761904761905|
|  line2|         859| 45.21052631578947|
+-------+------------+------------------+



In [16]:

line = [("line1",10,43,"from NZ breeding",1), \
    ("line2",20,48, "local selection",2), \
    ("line3",30,32,"chinese x gascon crossing",3), \
    ("line4",40,32,"stress sensible",4) \
  ]

lineColumns = ["line","efficiency","animal_count","comment","source"]
lineDF = spark.createDataFrame(data=line, schema = lineColumns)
lineDF.printSchema()
lineDF.show(truncate=False)


joinedDF=sparkDF3.join(lineDF,sparkDF3.species ==  lineDF.line,"inner") 

joinedDF.show(truncate=False)
joinedDF.createOrReplaceTempView("EMP2")

spark.sql("select animal_code, species, animal_count,comment  from EMP2 " +
          "where diet='antiox'").show()


root
 |-- line: string (nullable = true)
 |-- efficiency: long (nullable = true)
 |-- animal_count: long (nullable = true)
 |-- comment: string (nullable = true)
 |-- source: long (nullable = true)

+-----+----------+------------+-------------------------+------+
|line |efficiency|animal_count|comment                  |source|
+-----+----------+------------+-------------------------+------+
|line1|10        |43          |from NZ breeding         |1     |
|line2|20        |48          |local selection          |2     |
|line3|30        |32          |chinese x gascon crossing|3     |
|line4|40        |32          |stress sensible          |4     |
+-----+----------+------------+-------------------------+------+

+-----------+-------+--------+--------+-----+----------+------------+----------------+------+
|animal_code|species|diet    |m_antiox|line |efficiency|animal_count|comment         |source|
+-----------+-------+--------+--------+-----+----------+------------+----------------+------

In [29]:
 
source = [(1,"Noorsvaal University","open"), \
    (2,"Lifegenx gmbh","patent"), \
    (3,"European breeding association","member"), \
    (3,"Local farmers association","member")\
  ]

sourceColumns = ["provider_id","provider_name","access"]
sourceDF = spark.createDataFrame(data=source, schema = sourceColumns)
sourceDF.printSchema()
sourceDF.show(truncate=False)


joinedDF2=joinedDF.join(sourceDF,joinedDF.source ==  sourceDF.provider_id,"inner") 

joinedDF2.show(truncate=False)

joinedDF2.createOrReplaceTempView("EMP3")

spark.sql("select animal_code,provider_name, access from EMP3 " +
          "where source in (1,2)").show()


root
 |-- provider_id: long (nullable = true)
 |-- provider_name: string (nullable = true)
 |-- access: string (nullable = true)

+-----------+-----------------------------+------+
|provider_id|provider_name                |access|
+-----------+-----------------------------+------+
|1          |Noorsvaal University         |open  |
|2          |Lifegenx gmbh                |patent|
|3          |European breeding association|member|
|3          |Local farmers association    |member|
+-----------+-----------------------------+------+

+-----------+-------+--------+--------+-----+----------+------------+----------------+------+-----------+--------------------+------+
|animal_code|species|diet    |m_antiox|line |efficiency|animal_count|comment         |source|provider_id|provider_name       |access|
+-----------+-------+--------+--------+-----+----------+------------+----------------+------+-----------+--------------------+------+
|39         |line1  |standard|20      |line1|10        |43 