In [1]:
import pyspark 
# Reuse the already-running SparkContext when there is one. Constructing
# SparkContext(...) directly raises "Cannot run multiple SparkContexts at once"
# if this cell is executed a second time in the same kernel. The master setting
# only takes effect when a new context actually has to be created.
sc = pyspark.SparkContext.getOrCreate(
    pyspark.SparkConf().setMaster('local[*]')
)

# do something to prove it works
rdd = sc.parallelize(range(1000))
# Draw 5 elements without replacement; no seed is given, so the sample
# differs from run to run.
rdd.takeSample(False, 5)

[123, 649, 59, 553, 322]

In [2]:

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType 
from pyspark.sql.types import ArrayType, DoubleType, BooleanType
from pyspark.sql.functions import col,array_contains
from pyspark.sql import functions as f

# Source CSV, and the target path used later for the single-partition export.
inputfile = "activity.csv"
outputfile = "activity-out.csv"

spark = SparkSession.builder.appName('readcsv').getOrCreate()

# 1) Plain read with no options.
df = spark.read.csv(inputfile)
df.printSchema()

# 2) Same file, with the "header" option enabled.
df2 = spark.read.option("header", True).csv(inputfile)
df2.printSchema()

# 3) Same as (2), but passing several options at once via options(...).
df3 = spark.read.options(header='True', delimiter=',').csv(inputfile)
df3.printSchema()

# 4) Explicit schema so the numeric columns get real types.
schema = StructType([
    StructField("animal_id", IntegerType(), True),
    StructField("group", IntegerType(), True),
    StructField("sex", StringType(), True),
    StructField("building", StringType(), True),
    StructField("diet", StringType(), True),
    StructField("weight", DoubleType(), True),
    StructField("activity_score", DoubleType(), True),
])

df4 = (
    spark.read.format("csv")
    .option("header", True)
    .schema(schema)
    .load(inputfile)
)
df4.printSchema()





root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)

root
 |-- animal_id: string (nullable = true)
 |-- group: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- building: string (nullable = true)
 |-- diet: string (nullable = true)
 |-- weight: string (nullable = true)
 |-- activity_score: string (nullable = true)

root
 |-- animal_id: string (nullable = true)
 |-- group: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- building: string (nullable = true)
 |-- diet: string (nullable = true)
 |-- weight: string (nullable = true)
 |-- activity_score: string (nullable = true)

root
 |-- animal_id: integer (nullable = true)
 |-- group: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- building: string (nullable = true)
 |-- diet: string (nullable = t

In [3]:
# Total activity_score for each diet group.
(
    df4.groupBy("diet")
    .sum("activity_score")
    .show()
)

+-----+-------------------+
| diet|sum(activity_score)|
+-----+-------------------+
|prot+|  5.600000000000001|
|prot-|                3.4|
+-----+-------------------+



In [4]:

df4.show()
df4.printSchema()

# Per-diet summary statistics of the weight column.
weight_aggregates = [
    f.sum("weight").alias("sum_weight"),
    f.max("weight").alias("max_weight"),
    f.min("weight").alias("min_weight"),
    f.mean("weight").alias("mean_weight"),
]
dfGroup = df4.groupBy("diet").agg(*weight_aggregates)

dfGroup.show(truncate=False)

 
 

+---------+-----+---+--------+-----+------+--------------+
|animal_id|group|sex|building| diet|weight|activity_score|
+---------+-----+---+--------+-----+------+--------------+
|        1|    1|  M|      12|prot+|  43.0|           0.2|
|        2|    1|  F|      12|prot-|  31.0|           0.9|
|        3|    1|  M|      13|prot+|  45.0|           0.4|
|        4|    2|  M|      13|prot-|  30.0|           0.2|
|        5|    2|  F|       2|prot+|  37.0|           0.5|
|        6|    3|  M|       4|prot+|  33.0|           0.8|
|        7|    3|  M|      11|prot-|  34.0|           0.2|
|        8|    1|  F|      12|prot+|  39.0|           0.4|
|        9|    2|  M|       4|prot-|  35.0|           0.3|
|       10|    3|  F|       3|prot+|  41.0|           0.2|
|       11|    1|  M|      13|prot+|  36.0|           0.6|
|       12|    2|  F|       3|prot+|  34.0|           0.3|
|       13|    3|  F|       4|prot-|  33.0|           0.7|
|       14|    1|  M|      12|prot+|  35.0|           0.

In [7]:
# Keep only the rows whose weight is strictly greater than 40.
dfFilter = df4.where(df4["weight"] > 40)
dfFilter.show()

+---------+-----+---+--------+-----+------+--------------+
|animal_id|group|sex|building| diet|weight|activity_score|
+---------+-----+---+--------+-----+------+--------------+
|        1|    1|  M|      12|prot+|  43.0|           0.2|
|        3|    1|  M|      13|prot+|  45.0|           0.4|
|       10|    3|  F|       3|prot+|  41.0|           0.2|
|       21|    3|  F|       3|prot+|  48.0|           0.4|
+---------+-----+---+--------+-----+------+--------------+



In [19]:

# write csv


import os
import shutil
dir_path = "activity-part"


def _remove_dir_if_present(path):
    """Delete the directory tree at ``path`` so a CSV write can recreate it.

    A path that does not exist is not treated as an error (that is the normal
    situation on the first run). Any other OSError is reported with the path
    that actually failed, and execution continues, keeping the original
    best-effort behavior.
    """
    try:
        shutil.rmtree(path)
    except FileNotFoundError:
        # Nothing to clean up yet.
        pass
    except OSError as e:
        print("Error: %s : %s" % (path, e.strerror))


# Export 1: written with the DataFrame's current partitioning.
_remove_dir_if_present(dir_path)
dfFilter.write.option("header", True).csv(dir_path)

# Export 2: repartitioned to a single partition before writing.
# `outputfile` is defined in the CSV-reading cell.
_remove_dir_if_present(outputfile)
dfFilter.repartition(1).write.option("header", True).csv(outputfile)
print("done")

done
