In [1]:
import findspark

findspark.init('/usr/local/spark/')

In [2]:
# start master 
!$SPARK_HOME/sbin/start-master.sh --host localhost \
    --port 7077 --webui-port 8080
    
# start worker
!$SPARK_HOME/sbin/start-worker.sh spark://localhost:7077 \
    --cores 2 --memory 1g


starting org.apache.spark.deploy.master.Master, logging to /usr/local/spark//logs/spark-saverio-org.apache.spark.deploy.master.Master-1-saverio-PU301LA.out
starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark//logs/spark-saverio-org.apache.spark.deploy.worker.Worker-1-saverio-PU301LA.out


In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("spark://localhost:7077")\
    .appName("First")\
    .getOrCreate()

21/08/30 16:34:06 WARN Utils: Your hostname, saverio-PU301LA resolves to a loopback address: 127.0.1.1; using 192.168.1.24 instead (on interface wlp3s0)
21/08/30 16:34:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
21/08/30 16:34:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
spark

In [5]:
# get spark context -> entry point used to work with RDD
sc = spark.sparkContext
sc

In [6]:
import json
import numpy as np
import time
from pyspark.sql.functions import col, when

In [7]:
from pyspark.sql.functions import from_json, col, when
from pyspark.sql.types import StructField, StructType, DoubleType, IntegerType

schema = StructType(
        [
                StructField("HEAD",        IntegerType()),
                StructField("FPGA",        IntegerType()),
                StructField("TDC_CHANNEL", IntegerType()),
                StructField("ORBIT_CNT",   DoubleType()),
                StructField("BX_COUNTER",  IntegerType()),
                StructField("TDC_MEAS",    DoubleType())
        ]  
)

In [8]:
# load dataset on dataset/lecture2/dimuon
inputDF = spark.read \
    .option("inferTimestamp","false") \
    .option("prefersDecimal","false") \
    .option("header","True")\
    .schema(schema)\
    .format('csv') \
    .load('./Data/data_000000.csv')

In [9]:
#inputDF.show()

In [10]:
results = {}
results["Total Count"] = {}
chamber_name = ["Chamber_1", "Chamber_2", "Chamber_3", "Chamber_4"]
for chamber in chamber_name:
    results[chamber] = {}
    results[chamber]["Count"] = {}
    for hist in ["Hist_1","Hist_2"]:
        results[chamber][hist] = {}
        results[chamber][hist]["Bins"] = {}
        results[chamber][hist]["Counts"] = {}

In [11]:
#Keep the events where "HEAD"=2
cleanDF = inputDF.where(col('HEAD')==2)

In [12]:
#cleanDF.show()

In [13]:
chamberDF = cleanDF.withColumn('chamber',when((col("FPGA") == 0) & (col("TDC_CHANNEL")<=63),1).
                                 when((col("FPGA") == 0) & (col("TDC_CHANNEL")>=64),2).
                                 when((col("FPGA") == 1) & (col("TDC_CHANNEL")<=63),3).
                                 when((col("FPGA") == 1) & (col("TDC_CHANNEL")>=64),4)).\
                                 select([ col('TDC_CHANNEL'), col('ORBIT_CNT'),
                                    col('BX_COUNTER'),col('TDC_MEAS'),
                                    col('chamber')])

In [14]:
start =time.time()
for i in [1,2,3,4]:
    #Now we can count the number of events in each chamber
    chamber = chamberDF.filter(col("chamber") == i).persist()
    results[f"Chamber_{i}"]["Count"] = chamber.count()
    
    if(results[f"Chamber_{i}"]["Count"]!=0):
            
        #Histogram 1
        bins, counts = (
        chamber.select("TDC_CHANNEL")
                .rdd.map(lambda x: x.TDC_CHANNEL)
                .histogram(list(np.arange(0,170,5)))
        )
            
        results[f"Chamber_{i}"]["Hist_1"]["Bins"] = bins
        results[f"Chamber_{i}"]["Hist_1"]["Counts"] = counts
            
        #Histogram 2
        bins, counts = (
        chamber.groupBy("TDC_CHANNEL","ORBIT_CNT")
        .count()
        .select("ORBIT_CNT")
        .rdd.map(lambda x: x.ORBIT_CNT)
        .histogram(list(np.arange(6.e5,1.e7,0.5e6)))
        )
            
        results[f"Chamber_{i}"]["Hist_2"]["Bins"] = bins
        results[f"Chamber_{i}"]["Hist_2"]["Counts"] = counts
    chamber.unpersist()
end =time.time()
print(end-start)



41.836082458496094


                                                                                

In [40]:
from pyspark.sql import Window

In [47]:
def computations_8(DF):
    start=time.time()
    #This function perform the whole operations on the received batch,
    
    #Add a column with the chamber number
    DF_clean     = DF.filter(col("HEAD")==2)
    DF_clean.filter(col("HEAD") < 128 )
    DF_hit = DF_clean.withColumn('CHAMBER',when(col("FPGA") == 0, 
                                                when(col("TDC_CHANNEL")<=63,1).\
                                                otherwise(2)).\
                                           otherwise(when(col("TDC_CHANNEL")<=63,3).\
                                                otherwise(4)
                                           )).\
                                           select([ col('TDC_CHANNEL'), col('ORBIT_CNT'),
                                           col('BX_COUNTER'),col('TDC_MEAS'),
                                           col('CHAMBER')])
    
    #Initialize results dictionary
    results = {}
    results["Total Count"] = {}
    results["Index"] = time.time()
    chamber_name = ["Chamber_1", "Chamber_2", "Chamber_3", "Chamber_4"]
    for chamber in chamber_name:
        results[chamber] = {}
        results[chamber]["Count"] = {}
        for hist in ["Hist_1","Hist_2","Hist_3","Hist_4"]:
            results[chamber][hist] = {}
            results[chamber][hist]["Bins"] = {}
            results[chamber][hist]["Counts"] = {}
            
    #We prepare the scilantor data
    #First we filter the events encoding the passage time,
    #then we try to keep only one scilantor hit per orbit (the samaller)
    DF_hit.filter(col('ORBIT_CNT')==617015).show()
    
    w = Window.partitionBy(['ORBIT_CNT'])
    DF_test = DF_hit.withColumn('minTDC', f.min('TDC_MEAS').over(w))\
        .where(f.col('TDC_MEAS') == f.col('minTDC'))\
        .drop('minTDC')
    
    DF_test.filter(col('ORBIT_CNT')==617015).show()
    '''DF_scilantor = DF_hit.groupBy("ORBIT_CNT").agg(f.max('BX_COUNTER')).\
                      drop("ORBIT_CNT").\
                      withColumnRenamed("min(ORBIT_CNT)","ORBIT_CNT_sci").\
                      withColumnRenamed("min(BX_COUNTER)","BX_COUNTER_sci").\
                      withColumnRenamed("min(TDC_MEAS)","TDC_MEAS_sci")'''
    
    
    DF_scilantor.filter(col('ORBIT_CNT')==617015).show()
    
    #Add the PASSAGETIME time
    DF_scilantor = DF_scilantor.withColumn("PASSAGETIME", 25 * (col("ORBIT_CNT_sci") * 3564 +
                                                                col("BX_COUNTER_sci") + 
                                                                col("TDC_MEAS_sci")/30))
    
    #Drop the columns with null values from DF_hit
    DF_hit.na.drop(subset=["CHAMBER"])
    
    # Compute histograms for each chamber   
    for i in [1,2,3,4]:     
        #Now we can count the number of events in each chamber
        chamber = DF_hit.filter(col("CHAMBER") == i).persist()
        results[f"Chamber_{i}"]["Count"] = chamber.count()
        
        if(results[f"Chamber_{i}"]["Count"]!=0):
            
            #Histogram 1
            bins_1, counts_1 = (
            chamber.select("TDC_CHANNEL")
                 .rdd.map(lambda x: x.TDC_CHANNEL)
                 .histogram(list(np.arange(0,170,5)))
            )
            
            results[f"Chamber_{i}"]["Hist_1"]["Bins"] = bins_1
            results[f"Chamber_{i}"]["Hist_1"]["Counts"] = counts_1
            
            #Histogram 2
            bins_2, counts_2 = (
            chamber.groupBy("TDC_CHANNEL","ORBIT_CNT")
            .count()
            .select("ORBIT_CNT")
            .rdd.map(lambda x: x.ORBIT_CNT)
            .histogram(list(np.arange(6.e5,1.e7,0.5e6)))
            )
            
            results[f"Chamber_{i}"]["Hist_2"]["Bins"] = bins_2
            results[f"Chamber_{i}"]["Hist_2"]["Counts"] = counts_2            
            
            
            #keep only the hits with a scintillator signal within the same orbit
            chamber_sci = chamber.join(DF_scilantor,chamber.ORBIT_CNT ==  DF_scilantor.ORBIT_CNT_sci,"inner")

            #Add the ABSSOLUTETIME 
            chamber_sci = chamber_sci.withColumn("ABSOLUTETIME",
                             25 * (col("ORBIT_CNT") * 3564 + col("BX_COUNTER") + col("TDC_MEAS")/30))

            #Drop useless data
            chamber_sci = chamber_sci.drop("HEAD").drop("FPGA").drop("BX_COUNTER_sci").drop("TDC_MEAS_sci").drop("BX_COUNTER").drop("ORBIT_CNT_sci")
 
            #Add DRIFTIME
            chamber_sci = chamber_sci.withColumn("DRIFTIME",col("ABSOLUTETIME")-col("PASSAGETIME") + 95)#.show()
        
            #Histogram 3
            bins_3, counts_3 = (
            chamber_sci.select("TDC_CHANNEL")
                 .rdd.map(lambda x: x.TDC_CHANNEL)
                 .histogram(list(np.arange(0,170,5)))
            )    
            results[f"Chamber_{i}"]["Hist_3"]["Bins"] = bins_3
            results[f"Chamber_{i}"]["Hist_3"]["Counts"] = counts_3
            
            #Histogram 4
            bins_4, counts_4 = (
            chamber_sci.select("DRIFTIME")
                 .rdd.map(lambda x: x.DRIFTIME)
                 .histogram(list(np.arange(-100,1000,10)))
            )
            results[f"Chamber_{i}"]["Hist_4"]["Bins"] = bins_4
            results[f"Chamber_{i}"]["Hist_4"]["Counts"] = counts_4
            
            
            
        else:
            #Histogram 1
            results[f"Chamber_{i}"]["Hist_1"]["Bins"] = list(np.arange(0,170,5))
            counts = list(np.arange(0,170,5)* 0) 
            results[f"Chamber_{i}"]["Hist_1"]["Counts"] = counts
            
             #Histogram 2
            results[f"Chamber_{i}"]["Hist_2"]["Bins"] = list(np.arange(6.e5,1.e7,0.5e6))
            counts = list(np.arange(6.e5,1.e7,0.5e6)* 0) 
            results[f"Chamber_{i}"]["Hist_2"]["Counts"] = counts
            
            #Histogram 3
            results[f"Chamber_{i}"]["Hist_3"]["Bins"] = list(np.arange(0,170,5))
            counts = list(np.arange(0,170,5)* 0) 
            results[f"Chamber_{i}"]["Hist_3"]["Counts"] = counts
            
             #Histogram 4
            results[f"Chamber_{i}"]["Hist_4"]["Bins"] = list(np.arange(-100,1000,10))
            counts = list(np.arange(-100,1000,10)* 0) 
            results[f"Chamber_{i}"]["Hist_4"]["Counts"] = counts
        chamber.unpersist()
        
    results["Total Count"] = results["Chamber_1"]["Count"] + results["Chamber_2"]["Count"] + \
                             results["Chamber_3"]["Count"] + results["Chamber_4"]["Count"]
    end =time.time()
    print("Time =",end-start)
       
    producer.send(topic="results", value= str(results).encode('utf-8'))
    #producer.flush()

In [48]:
computations_8(cleanDF)

                                                                                

+-----------+---------+----------+--------+-------+
|TDC_CHANNEL|ORBIT_CNT|BX_COUNTER|TDC_MEAS|CHAMBER|
+-----------+---------+----------+--------+-------+
|         12| 617015.0|      3126|    20.0|      3|
|        115| 617015.0|      2678|    17.0|      2|
+-----------+---------+----------+--------+-------+



                                                                                

+-----------+---------+----------+--------+-------+
|TDC_CHANNEL|ORBIT_CNT|BX_COUNTER|TDC_MEAS|CHAMBER|
+-----------+---------+----------+--------+-------+
|        115| 617015.0|      2678|    17.0|      2|
+-----------+---------+----------+--------+-------+



UnboundLocalError: local variable 'DF_scilantor' referenced before assignment