In [1]:
from pyspark.sql.types import StructType, LongType, StringType, StructField, DoubleType

In [2]:
#Presentation libraries
import ipywidgets as widgets
from IPython.display import Markdown, display

In [3]:
# Year of the data to analyse; its value is used to build the input folder path.
year = widgets.IntText(value=2019)
year

IntText(value=2019)

In [4]:
# Month of the data to analyse, as a zero-padded two-digit string ("01".."12")
# so it can be dropped straight into the input folder path.
month_labels = ["{:02d}".format(number) for number in range(1, 13)]
month = widgets.Select(value="06", options=month_labels)
display(month)

Select(index=5, options=('01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'), value='06')

# Load the data from HDFS
We load the data for the selected month, keep only the completed jobs, and discard the metadata.

In [5]:
# Fields read from the nested "data" record of each JSON document.
# Only these fields are parsed; everything else in the document is ignored.
data_fields = [
    StructField("GlobalJobId", StringType(), nullable=False),
    StructField("CommittedTime", LongType(), nullable=False),
    StructField("CpuTimeHr", DoubleType(), nullable=False),
    StructField("Status", StringType(), nullable=True),
    StructField("Site", StringType(), nullable=True),
    StructField("AffiliationCountry", StringType(), nullable=True),
    StructField("AffiliationInstitute", StringType(), nullable=True),
]
schema = StructType([StructField('data', StructType(data_fields))])

In [6]:
# Input folder for the year/month chosen in the widgets above.
folder = "/project/monitoring/archive/condor/raw/metric/{year}/{month}".format(
    year=year.value,
    month=month.value,
)
# Read every entry under the month folder with the explicit schema, flatten the
# nested "data" record into top-level columns and keep completed jobs only.
raw_df = (
    spark.read.option("basePath", folder)
    .json(folder + "/*", schema=schema)
    .select("data.*")
    .filter("Status='Completed'")
)

Affiliation data is only available since the end of May, so we need a workaround for earlier months (in this case we just use "Unknown").

In [7]:
# SQL expressions that produce the Country / Institute columns.
# NOTE(review): the reader is given an explicit schema that declares
# AffiliationCountry, so the fallback branch may never trigger — confirm.
if 'AffiliationCountry' in raw_df.columns:
    # Affiliation columns exist: replace missing values with 'Unknown'.
    aff_select = [
        "coalesce(AffiliationCountry,'Unknown') as Country",
        "coalesce(AffiliationInstitute,'Unknown') as Institute",
    ]
else:
    # No affiliation columns at all: use a constant placeholder.
    aff_select = ["'Unknown' as Country", "'Unknown' as Institute"]


We want only one report per job, so we drop duplicates and project only the columns we actually use.

In [8]:
# Project the columns needed for the report. GlobalJobId is kept only so the
# de-duplication can tell jobs apart, and is dropped right afterwards.
# NOTE(review): dropDuplicates() without a subset only removes rows that are
# identical in every selected column; two reports of the same job with
# different values would both be kept — confirm this is intended.
job_columns = ["Site", "CommittedTime", "CpuTimeHr", "GlobalJobId"] + aff_select
df = (
    raw_df.selectExpr(*job_columns)
    .dropDuplicates()
    .drop("GlobalJobId")
)

We group by Country, Institute and Site (the Site is used later for filtering).

In [9]:
# Total the two numeric columns per (Country, Institute, Site) and cache the
# result, since the cells below query it repeatedly.
grouping_keys = ["Country", "Institute", "Site"]
grouped = df.groupBy(*grouping_keys).sum("CommittedTime", "CpuTimeHr").cache()

In [10]:
# Give the aggregated columns their original names back
# ("sum(CommittedTime)" -> "CommittedTime", and so on).
for summed_column in ("CommittedTime", "CpuTimeHr"):
    grouped = grouped.withColumnRenamed("sum(%s)" % summed_column, summed_column)

## Get the values for the filters
We need the (Country, Institute) pairs and the list of sites in order to filter on them.

In [11]:
# One row per (Country, Institute) pair, collected as a pandas DataFrame;
# the "count" column is the number of grouped rows behind each pair.
affiliation_keys = ['Country', 'Institute']
affiliation = (
    grouped.select(*affiliation_keys)
    .groupby(*affiliation_keys)
    .count()
    .toPandas()
)

In [12]:
# Distinct site names, collected locally as an array of unique values.
site_frame = grouped.select("Site").distinct().toPandas()
sites = site_frame["Site"].unique()

In [13]:
# Sort the site names in place so the site selector lists them in order.
sites.sort()

# Checkpoint
___________

**Once we are here we can run the next cells changing the selections to see different results. Those operations are not as expensive as the previous ones (we only work over grouped data).**

# Select the parameters
You can select the sites you want to take into account, and the country (countries) you are interested in.
You can select multiple elements. If you don't select any value, all the values will be included.

In [14]:
# Site selector; an empty selection means "all sites".
s = widgets.SelectMultiple(
    options=sites,
    description='Site:',
    disabled=False,
)
# Country selector; an empty selection means "all countries".
# The countries are sorted so the list is as easy to scan as the site list,
# which is sorted as well.
countries = sorted(affiliation["Country"].unique())
c = widgets.SelectMultiple(
    options=countries,
    description='Country:',
    disabled=False,
)
display(s)
display(c)

U2VsZWN0TXVsdGlwbGUoZGVzY3JpcHRpb249dSdTaXRlOicsIG9wdGlvbnM9KHUnVDFfREVfS0lUJywgdSdUMV9FU19QSUMnLCB1J1QxX0ZSX0NDSU4yUDMnLCB1J1QxX0lUX0NOQUYnLCB1J1TigKY=


U2VsZWN0TXVsdGlwbGUoZGVzY3JpcHRpb249dSdDb3VudHJ5OicsIG9wdGlvbnM9KHUnQkUnLCB1J1VTJywgdSdFUycsIHUnSVQnLCB1J0lOJywgdSdSVScsIHUnJywgdSdIUicsIHUnS1InLCDigKY=


 **Each time you select the country (or countries) please run the next cell to update the institutes list**

In [15]:
# Restrict the institute choices to the selected countries; with no country
# selected every institute is offered.
selected_affiliation = (
    affiliation[affiliation["Country"].isin(c.value)] if c.value else affiliation
)
# An institute name can appear under more than one country, so de-duplicate it;
# sort the names so the selector matches the ordering of the other selectors.
institutes = sorted(selected_affiliation["Institute"].unique())
i = widgets.SelectMultiple(
    options=institutes,
    description='Institutes',
    disabled=False,
)
i

U2VsZWN0TXVsdGlwbGUoZGVzY3JpcHRpb249dSdJbnN0aXR1dGVzJywgb3B0aW9ucz0odSdVbml2ZXJzaXR5IG9mIFZpcmdpbmlhJywgdSdOb3J0aGVybiBJbGxpbm9pcyBVbml2ZXJzaXR5JyzigKY=


# Apply the filters

The query will be executed with the selected values.

In [16]:
# Start from the grouped data and narrow it by every selector that has a
# selection; a selector with nothing selected leaves its column unfiltered.
query = grouped
for column, selector in (("Site", s), ("Country", c), ("Institute", i)):
    if selector.value:
        query = query.filter(query[column].isin(*selector.value))

# Summarise the active selection; 'All' stands for an empty selection.
chosen = tuple(selector.value or 'All' for selector in (s, c, i))
display(Markdown(
"""
### The current query will be run with:
**Sites:** %s 

**Countries:** %s

**Institutes:** %s
"""% chosen))


### The current query will be run with:
**Sites:** (u'T2_US_Caltech', u'T2_US_Florida', u'T2_US_MIT', u'T2_US_Nebraska', u'T2_US_Purdue', u'T2_US_UCSD', u'T2_US_Wisconsin') 

**Countries:** (u'US',)

**Institutes:** All


In [17]:
# Collapse the filtered rows to one total per (Country, Institute) and give
# the summed columns their plain names back.
res = (
    query.groupBy("Country", "Institute")
    .sum()
    .withColumnRenamed("sum(CommittedTime)", "CommittedTime")
    .withColumnRenamed("sum(CpuTimeHr)", "CpuTimeHr")
)

In [18]:
# Collect the per-institute totals as a pandas DataFrame.
res_pd = res.toPandas()

CommittedTime is expressed in seconds and CpuTimeHr in hours.

In [19]:
# Show the institutes ordered by committed time, largest first.
res_pd.sort_values('CommittedTime',ascending=False)

Unnamed: 0,Country,Institute,CommittedTime,CpuTimeHr
26,US,University of Nebraska Lincoln,124967421612,24588470.0
28,US,Fermi National Accelerator Lab.,571107104,160560.9
31,US,Johns Hopkins University,487379759,126689.8
6,US,Brown University,480699132,155968.9
36,US,The University of Kansas,428823039,107188.2
12,US,Massachusetts Inst. of Technology,400210091,94370.89
5,US,University of Illinois at Chicago,392410713,57380.24
39,US,Cornell University,362521538,84072.85
2,US,University of California Riverside,357961674,85463.13
24,US,Rice University,333031333,51451.36


In [20]:
# Per-country totals of the numeric columns of the filtered result.
countries_total = res.groupBy("Country").sum()

In [21]:
# Collect the per-country totals as a pandas DataFrame for display.
countries_total.toPandas()

Unnamed: 0,Country,sum(CommittedTime),sum(CpuTimeHr)
0,US,131567618362,26294810.0
