In [1]:
import pandas as pd
import numpy as np

# Process Data from ssact
### Sample sacct command for GPU nodes:
sacct -a -P -X -S 071918-00:00:00 -E 072018-23:59:59 --format=Account,AllocTRES,JobName,JobID,User,Partition,Start,State,Submit,ReqMem,Timelimit,NodeList,End -s CANCELLED,TIMEOUT,FAILED,COMPLETED,NODE_FAIL,OUT_OF_MEMORY -r GPU,GPU-shared,GPU-small> today_gpu.csv

### Sample sacct command for RM nodes:
sacct -a -P -X -S 071918-00:00:00 -E 072018-23:59:59 --format=Account,AllocTRES,JobName,JobID,User,Partition,Start,State,Submit,ReqMem,Timelimit,NodeList,End -s CANCELLED,TIMEOUT,FAILED,COMPLETED,NODE_FAIL,OUT_OF_MEMORY -r RM,RM-shared,RM-small> today_rm.csv

### Sample sacct command for LM nodes:
sacct -a -P -X -S 071918-00:00:00 -E 072018-23:59:59 --format=Account,AllocTRES,JobName,JobID,User,Partition,Start,State,Submit,ReqMem,Timelimit,NodeList,End -s CANCELLED,TIMEOUT,FAILED,COMPLETED,NODE_FAIL,OUT_OF_MEMORY -r LM> today_lm.csv

In [2]:
def lm_process_node_partition(df):
    xl = []
    l =[]
    for a in range(df["NodeList"].shape[0]):
        if df["NodeList"][a][0]=="x":
            xl.append(a)
        elif df["NodeList"][a][0]=="l":
            l.append(a)
    #xlm
    xlm=df.iloc[xl,:]
    xlm["NodeList_1"]=xlm["NodeList"].str.replace("xl","")
    xlm.loc[:,"type"]="xl"
    #lm    
    lm=df.iloc[l,:]
    lm["NodeList_1"]=lm["NodeList"].str.replace("l","")
    lm.loc[:,"type"]="l"
    all_lm=pd.concat([lm,xlm])
    return all_lm

In [3]:
def process_AllocTRES(df,fileName):
    if "gpu" in str(fileName):
        df["Alloc_NODE"]=np.where(df['AllocTRES']=='Not Allocated', 0, df['AllocTRES'].str.split(",").str[2].str[5:])
        df["Alloc_gres/gpu"]=np.where(df['AllocTRES']=='Not Allocated', 0,df['AllocTRES'])
        df["Alloc_gres/gpu"]=np.where(df['Alloc_gres/gpu'].str.split(",").str[4].str.contains(":"),df['Alloc_gres/gpu'].str.split(",").str[4].str[9:12],df['Alloc_gres/gpu'].str.split(",").str[5].str[9:12])
        df["Alloc_gres/gpu"]=np.where(df['Alloc_gres/gpu']=='p10', 'p100',df['Alloc_gres/gpu'])
        df["Alloc_GPU"]=np.where(df['AllocTRES']=='Not Allocated',0, df['AllocTRES'])
        df["Alloc_GPU"]=np.where(df['Alloc_GPU'].str.split(",").str[4].str.contains(":"),df['Alloc_GPU'].str.split(",").str[4].str[-2:],df['Alloc_GPU'].str.split(",").str[5].str[-2:])
        df["Alloc_GPU"]=df["Alloc_GPU"].str.replace("=","")
    elif "rm" in str(df):
        df["Alloc_CPU"]=np.where(df['AllocTRES']=='Not Allocated', 0, df['AllocTRES'].str.split(",").str[0].str[4:])
        df["Alloc_MEM"]=np.where(df['AllocTRES']=='Not Allocated', 0, df['AllocTRES'].str.split(",").str[1].str[4:-1])
        df["Alloc_NODE"]=np.where(df['AllocTRES']=='Not Allocated', 0, df['AllocTRES'].str.split(",").str[2].str[5:])
    else:
        df["Alloc_CPU"]=np.where(df['AllocTRES']=='Not Allocated', 0, df['AllocTRES'].str.split(",").str[0].str[4:])
        df["Alloc_MEM"]=np.where(df['AllocTRES']=='Not Allocated', 0, df['AllocTRES'].str.split(",").str[1].str[4:-1])
        df["Alloc_NODE"]=np.where(df['AllocTRES']=='Not Allocated', 0, df['AllocTRES'].str.split(",").str[2].str[5:])



    return df
        
    

In [4]:
def process_NodeList_Row(row):
    allNum = []
    [[allNum.append(i) for i in range(int(a.split("-")[0]), int(a.split("-")[1])+1, 1)] if "-" in a else allNum.append(int(a)) for a in row.split(",")] 
    return allNum

def process_NodeList(df,fileName):
    all_arr=[]
    if "gpu" in str(fileName):
        df["NodeList_1"]=df["NodeList"].str.replace("gpu","")
    elif "rm" in str(fileName):
        df["NodeList_1"]=df["NodeList"].str.replace("r","")
    else:
        df=lm_process_node_partition(df)
    df["NodeList_1"]=df["NodeList_1"].str.replace("[","")
    df["NodeList_1"]=df["NodeList_1"].str.replace("]","")
    for row in df["NodeList_1"]:
        if row=="None assigned":
            all_arr.append(None)
        elif "-" not in row and "," not in row:
            all_arr.append([int(row)])
        else:
            all_arr.append(process_NodeList_Row(row))
    df["nodeArray"]=all_arr
    
    return df
    

In [5]:
#Transform the requested time to seconds
def process_TimeLimit(df):
    df["Timelimit"] = df["Timelimit"].str.replace("-",":") 
    temp=df['Timelimit'].str.split(":")
    df['ReqTime']=np.where(temp.str.len()==3,temp.str[0].astype(int)*3600+temp.str[1].astype(int)*60+temp.str[2].astype(int),temp.str[0].astype(int)*86400+temp.str[1].astype(int)*3600+temp.str[2].astype(int)*60+temp.str[3].fillna(0).astype(int))
    df=df.drop(['Timelimit'], axis=1)
    return df

In [6]:
def sacct_data_cleansing(fileName):

    df = pd.read_csv(fileName, sep='|',error_bad_lines=False)
    df = df.fillna("Not Allocated")
    df = df.sort_values('Start')
    df = process_AllocTRES(df,fileName)
    df = df.reset_index(drop=True)
    df["End"]=np.where(df['End']=='Unknown',None,df['End'])
    df["Start"]=np.where(df['Start']=='Unknown',None,df['Start'])
    df = process_NodeList(df,fileName)
    df=process_TimeLimit(df)
    return df.drop(['AllocTRES','NodeList_1'], axis=1)
    
    

        
df_gpu=sacct_data_cleansing("today_gpu.csv")
df_gpu.to_csv("gpu.csv")
df_rm=sacct_data_cleansing("today_rm.csv")
df_rm.to_csv("rm.csv")
df_lm=sacct_data_cleansing("today_lm.csv")
df_lm.to_csv("lm.csv")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats 

# Process Data from ssactmgr
### Get Events on downed or draining nodes on clusters.
### Sample sacctmgr command:
sacctmgr show events -P event=node Start=071918-00:00:00 End=072018-23:59:59 > today_state.csv

In [7]:
def sacctmgr_data_cleansing(fileName, df):
    df_state=pd.read_csv("today_state.csv", sep='|',error_bad_lines=False)
    df_state=df_state.drop(["Cluster","User"], axis=1)
    
    if "gpu" in fileName:
        df_state= df_state.drop(df_state[df_state["NodeName"].str[:3]!='gpu'].index)
        df_state["Alloc_gres/gpu"]=np.where(df_state["NodeName"].str[3:].astype(int)<17,"k80","p100")
        df_state["nodeArray"]=[[row] for row in df_state["NodeName"].str[3:].astype(int)]
        df_state["Alloc_GPU"]=np.where(df_state["Alloc_gres/gpu"]=="k80",4,2)


    elif "rm" in fileName:
        df_state= df_state.drop(df_state[df_state["NodeName"].str[0]!='r'].index)
        df_state["nodeArray"]=[[row] for row in df_state["NodeName"].str[1:].astype(int)]
        df_state["Alloc_CPU"]=28
        
    else:

        df_lm= df_state.drop(df_state[df_state["NodeName"].str[0]!='l'].index)
        df_lm["nodeArray"]=[[row] for row in df_lm["NodeName"].str[1:].astype(int)]
        df_lm["type"]="l"
        df_lm["Alloc_MEM"]=3000
        df_xlm= df_state.drop(df_state[df_state["NodeName"].str[:2]!='xl'].index)
        df_xlm["type"]="xl"
        df_xlm["Alloc_MEM"]=12000
        df_xlm["nodeArray"]=[[row] for row in df_xlm["NodeName"].str[2:].astype(int)]
        df_state=df_lm.append(df_xlm)
   
    
    df_state["State"]=df_state["State"].str.replace("$","") 
    df_state["State"]=df_state["State"].str.replace("*","") 
    df_state.rename(columns = {'TimeStart':'Start','TimeEnd':'End'}, inplace = True)
    df_state=df_state.drop(["NodeName"], axis=1)
    df_state["Alloc_NODE"]=1
    df_all=df_state.append(df)
    df_all = df_all.reset_index(drop=True)
    df_all["Alloc_NODE"]=df_all["Alloc_NODE"].fillna(0)
    df_all["State"]=np.where(df_all["State"].str.contains("CANCELLED"),"CANCELLED",df_all["State"])
    
    if "gpu" in fileName:
        df_all["Alloc_GPU"]=df_all["Alloc_GPU"].fillna(0)
        df_all["GPU_used"]=df_all["Alloc_GPU"].astype(float)/df_all["Alloc_NODE"].astype(float)
        
    elif "rm" in fileName:

        df_all["Alloc_CPU"]=df_all["Alloc_CPU"].fillna(0)
        df_all["CPU_used"]=df_all["Alloc_CPU"].astype(float)/df_all["Alloc_NODE"].astype(float)
    else:
        df_all["Alloc_CPU"]=df_all["Alloc_CPU"].fillna(0)
        df_all["Mem_used"]=df_all["Alloc_MEM"].astype(float)/df_all["Alloc_NODE"].astype(float)
        df_all["Partition"]=df_all["type"]
        df_all=df_all.drop(["type"], axis=1)
        

    return df_all
       
df_all_gpu=sacctmgr_data_cleansing("today_gpu.csv",df_gpu)
df_all_rm=sacctmgr_data_cleansing("today_rm.csv",df_rm)
df_all_lm=sacctmgr_data_cleansing("today_lm.csv",df_lm)

### Prepare Data for graphs

In [8]:
list(df_all_gpu)

['Account',
 'Alloc_GPU',
 'Alloc_NODE',
 'Alloc_gres/gpu',
 'End',
 'JobID',
 'JobName',
 'NodeList',
 'Partition',
 'Reason',
 'ReqMem',
 'ReqTime',
 'Start',
 'State',
 'Submit',
 'User',
 'nodeArray',
 'GPU_used']

In [9]:
list(df_all_rm)

['Account',
 'Alloc_CPU',
 'Alloc_MEM',
 'Alloc_NODE',
 'End',
 'JobID',
 'JobName',
 'NodeList',
 'Partition',
 'Reason',
 'ReqMem',
 'ReqTime',
 'Start',
 'State',
 'Submit',
 'User',
 'nodeArray',
 'CPU_used']

In [10]:
list(df_all_lm)

['Account',
 'Alloc_CPU',
 'Alloc_MEM',
 'Alloc_NODE',
 'End',
 'JobID',
 'JobName',
 'NodeList',
 'Partition',
 'Reason',
 'ReqMem',
 'ReqTime',
 'Start',
 'State',
 'Submit',
 'User',
 'nodeArray',
 'Mem_used']

### transform start and end time to minute per row
*** This is how I transformed the data (like taking a snapshot at a specific point of time): If a job has a start time of 2018-07-19T13:28:35 and end time of 2018-07-19T13:32:21 in sacct, this single job will be converted to four rows with timestamp like the following:

2018-07-19T13:29:00

2018-07-19T13:30:00

2018-07-19T13:31:00

2018-07-19T13:32:00

### For GPU/RM/LM Utilization graphs

In [14]:
from datetime import date, datetime, timedelta
import dateutil.parser

def perdelta(start, end, delta, job ):
    curr = start
    while curr < end:
        yield curr,job[0],job[1],job[2],job[3],job[4],job[5],job[6],job[7],job[8],job[9],job[10],job[11],job[12],job[13],job[14],job[15],job[16],job[17],job[18]
        curr += delta

In [15]:
def transform_start_end_time(df_util):
    temp_util = []
    for a in range(df_util.shape[0]):
        temp_util_time=[]
        startdate=dateutil.parser.parse(df_util.iloc[a,:]["Start"])
        enddate=dateutil.parser.parse(df_util.iloc[a,:]["fakeEnd"])+timedelta(seconds=1)
        for result in perdelta(startdate,enddate,timedelta(minutes=1),df_util.iloc[a,:]):
            if result[0]==startdate and str(result[0])[17:]!="00":
                pass
            else:
                temp_util_time.append(result)
        temp_util.append(pd.DataFrame(temp_util_time))
    return temp_util

In [16]:
def createTimeMap(df):
    time=list(df[0])
    timeMaps = {}
    for i in range(len(time)):
        if time[i] not in timeMaps:
            timeMaps[time[i]]=[i]
        else:
            timeMaps[time[i]].append(i)
    return timeMaps

In [89]:
def createNodeUsage(fileName,result):
    all_data=[]
    if "gpu" in fileName:  
        for key, value in createTimeMap(result).items():
            states = {}
            gpus = {}
            types = {}
            partitions = {}
            for i in value:
                row=result.iloc[i]
                for a in row[17].split(","):
                    if a not in gpus:
                        states[a] = row[14]
                        partitions[a] = row[9]
                        types[a] = row[4]
                        gpus[a]=row[18]
                    else:
                        if row[16]=="COMPLETED" or row[14]=="TIMEOUT" or row[14]=="CANCELLED" or row[14]=="NODE_FAIL" or row[14]=="OUT_OF_MEMORY":
                            gpus[a] = (gpus.get(a)+row[18])


            minute=pd.concat([pd.DataFrame(list(gpus.items())),pd.DataFrame(list(types.values())),pd.DataFrame(list(states.values())),pd.DataFrame(list(partitions.values())),pd.DataFrame([key]*len(gpus))],axis=1)    
            all_data.append(minute)  
    else:
        for key, value in createTimeMap(result).items():
            states = {}
            cpu_OR_mem = {}
            partitions = {}
            for i in value:
                row=result.iloc[i]
                for a in row[17].split(","):
                    if a not in cpu_OR_mem:
                        
                        states[a] = row[14]
                        partitions[a] = row[9]
                        cpu_OR_mem[a]=row[18]
                    else:
                        if row[16]=="COMPLETED" or row[14]=="TIMEOUT" or row[14]=="CANCELLED" or row[14]=="NODE_FAIL" or row[14]=="OUT_OF_MEMORY":
                            cpu_OR_mem[a] =(cpu_OR_mem.get(a)+row[18])


            minute=pd.concat([pd.DataFrame(list(cpu_OR_mem.items())),pd.DataFrame(list(states.values())),pd.DataFrame(list(partitions.values())),pd.DataFrame([key]*len(cpu_OR_mem))],axis=1)    
            all_data.append(minute)  
        
    return all_data


In [34]:
import datetime as dt
def utilization_graph_data(fileName,df):
    df_util=df.dropna(subset=["Start","End"])
    df_util=df_util.drop(df_util[df_util.End=='Unknown'].index)
    
    if "gpu" in fileName:
        df_util=df_util.dropna(subset=["Alloc_gres/gpu"])
   
    df_util["fakeEnd"]=df_util["End"].str[:17]+"59"
    
    temp_util=transform_start_end_time(df_util)
    result = pd.concat(temp_util)
    result = result.reset_index(drop=True)
    result[0] = pd.to_datetime(result[0])
    result[0]=result[0].map(lambda x: x.replace(second=0))
    result[17]=result[17].astype("str")
    result[17]=result[17].str.replace("[","")
    result[17]=result[17].str.replace("]","")
    all_data=createNodeUsage(fileName,result)
    all_data_df = pd.concat(all_data)
    
    if "gpu" in fileName:
        all_data_df.columns = ["Node","gpu","type","state","partition","time"]
    elif "rm" in fileName:
        all_data_df.columns = ["Node","core","state","partition","time"]
    else:
        all_data_df.columns = ["Node","mem","state","nodeType","time"]
    
    all_data_df2=all_data_df.reset_index(drop=True)
    all_data_df2['time']= all_data_df2['time'].map(lambda x: dt.datetime.strftime(x, '%Y-%m-%dT%H:%M:%SZ'))
    all_data_df2["time"]=all_data_df2["time"].str[:-1]
    
    if "gpu" in fileName:
        all_data_df2["partition"]=all_data_df2["partition"].fillna("NA")
        temp=all_data_df2.groupby(['time',"gpu","type","partition","state"])['Node'].count().reset_index()
        temp.to_csv("gpu_level.csv")
        
    elif "rm" in fileName:
        all_data_df2["partition"]=all_data_df2["partition"].fillna("NA")
        temp=all_data_df2.groupby(['time',"core","partition","state"])['Node'].count().reset_index()
        temp.to_csv("rm_level.csv")
        
    else:
        all_data_df2["nodeType"]=all_data_df2["nodeType"].fillna("NA")
        temp=all_data_df2.groupby(['time',"mem","nodeType","state"])['Node'].count().reset_index()
        temp.to_csv("lm_level.csv")
                
    return temp




In [91]:
%%time
gpu_util_graph_data=utilization_graph_data("today_gpu.csv",df_all_gpu)

CPU times: user 1min 15s, sys: 45 ms, total: 1min 15s
Wall time: 1min 15s


In [35]:
gpu_util_graph_data.head()

NameError: name 'gpu_util_graph_data' is not defined

In [92]:
%%time
rm_util_graph_data=utilization_graph_data("today_rm.csv",df_all_rm)

CPU times: user 10min 10s, sys: 1.55 s, total: 10min 12s
Wall time: 10min 12s


In [120]:
rm_util_graph_data.head()

Unnamed: 0,time,core,partition,state,Node
0,2018-07-16T15:40:00,28.0,RM,TIMEOUT,1
1,2018-07-16T15:41:00,28.0,RM,TIMEOUT,1
2,2018-07-16T15:42:00,28.0,RM,TIMEOUT,1
3,2018-07-16T15:43:00,28.0,RM,TIMEOUT,1
4,2018-07-16T15:44:00,28.0,RM,TIMEOUT,1


In [36]:
%%time

lm_util_graph_data=utilization_graph_data("today_lm.csv",df_all_lm)

TypeError: Parser must be a string or character stream, not float

In [121]:
lm_util_graph_data.head()

Unnamed: 0,time,mem,nodeType,state,Node
0,2018-07-10T23:30:00,720.0,xl,COMPLETED,1
1,2018-07-10T23:31:00,720.0,xl,COMPLETED,1
2,2018-07-10T23:32:00,720.0,xl,COMPLETED,1
3,2018-07-10T23:33:00,720.0,xl,COMPLETED,1
4,2018-07-10T23:34:00,720.0,xl,COMPLETED,1


### For GPU/RM/LM Backlog graphs

In [29]:
def transform_start_end_time(df_backlog):
    temp_backlog = []
    for a in range(df_backlog.shape[0]):
        temp_backlog_time=[]
        startdate=dateutil.parser.parse(df_backlog.iloc[a,:]["Submit"])
        enddate=dateutil.parser.parse(df_backlog.iloc[a,:]["fakeStart"])+timedelta(seconds=1)
        for result in perdelta(startdate,enddate,timedelta(minutes=1),df_backlog.iloc[a,:]):
            if result[0]==startdate and str(result[0])[17:]!="00":
                pass
            else:
                temp_backlog_time.append(result)
        temp_backlog.append(pd.DataFrame(temp_backlog_time))

        
    
    return temp_backlog

In [47]:
def backlog_graph_data(fileName,df):    
    df_backlog=df.dropna(subset=["Start","End","Submit"])
    df_backlog = df_backlog.drop(df_backlog[df_backlog.End=='Unknown'].index)
    df_backlog = df_backlog.drop(df_backlog[df_backlog.NodeList=='None assigned'].index)
    df_backlog=df_backlog.reset_index(drop=True)
    df_backlog["fakeStart"]=df_backlog["Start"].str[:17]+"59"
    
    if "gpu" in fileName:
        df_backlog=df_backlog.dropna(subset=["Alloc_gres/gpu"])    
    
    temp_backlog=transform_start_end_time(df_backlog)
    
    result = pd.concat(temp_backlog)
    result = result.reset_index(drop=True)
    
    if "gpu" in fileName:
        result_small_user=result.iloc[:,[0,4,6,9,12,16]]
        result_small_user.columns=["time","nodeType","jobid","partition","reqTime","user"]

    elif "rm" in fileName:
        result_small_user=result.iloc[:,[0,6,9,12,16]]
        result_small_user.columns=["time","jobid","partition","reqTime","user"]
        

    else:
        result_small_user=result.iloc[:,[0,3,6,9,12,16]]
        result_small_user.columns=["time","mem","jobid","nodeType","reqTime","user"]
        
        
    result_small_user['time']=result_small_user['time'].map(lambda x: dt.datetime.strftime(x, '%Y-%m-%dT%H:%M:%SZ'))
    result_small_user['time']=result_small_user['time'].str[:-1]

    
    if "gpu" in fileName:
        temp=result_small_user.groupby(["time","partition","nodeType","user"]).agg({'reqTime':'sum','jobid':'count'}).reset_index()
        temp.to_csv("backlog_gpu.csv")
    elif "rm" in fileName:
        temp=result_small_user.groupby(["time","partition","user"]).agg({'reqTime':'sum','jobid':'count'}).reset_index()
        temp.to_csv("backlog_rm.csv")

    else:
        temp=result_small_user.groupby(["time","nodeType","user"]).agg({'mem':'sum','reqTime':'sum','jobid':'count'}).reset_index()
        temp.to_csv("backlog_lm.csv") 
    
    
    return temp

    


In [41]:
%%time
gpu_backlog_graph_data=backlog_graph_data("today_gpu.csv",df_all_gpu)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


CPU times: user 4min 39s, sys: 886 ms, total: 4min 40s
Wall time: 4min 40s


In [54]:
gpu_backlog_graph_data.head()

Unnamed: 0,time,partition,nodeType,user,reqTime,jobid
0,2018-07-09T13:26:45,GPU,p100,jqwang,172800.0,1
1,2018-07-09T13:26:48,GPU,p100,jqwang,172800.0,1
2,2018-07-09T13:26:57,GPU,p100,jqwang,172800.0,1
3,2018-07-09T13:27:45,GPU,p100,jqwang,172800.0,1
4,2018-07-09T13:27:48,GPU,p100,jqwang,172800.0,1


In [39]:
%%time
rm_backlog_graph_data=backlog_graph_data("today_rm.csv",df_all_rm)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


CPU times: user 3min 14s, sys: 576 ms, total: 3min 15s
Wall time: 3min 15s


In [53]:
rm_backlog_graph_data.head()

Unnamed: 0,time,partition,user,reqTime,jobid
0,2018-07-12T10:18:25,RM,ahazel3,172800.0,1
1,2018-07-12T10:19:25,RM,ahazel3,172800.0,1
2,2018-07-12T10:20:25,RM,ahazel3,172800.0,1
3,2018-07-12T10:21:25,RM,ahazel3,172800.0,1
4,2018-07-12T10:22:25,RM,ahazel3,172800.0,1


In [48]:
%%time
lm_backlog_graph_data=backlog_graph_data("today_lm.csv",df_all_lm)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


CPU times: user 47.4 s, sys: 170 ms, total: 47.6 s
Wall time: 47.5 s


In [52]:
lm_backlog_graph_data.head()

Unnamed: 0,time,nodeType,user,mem,reqTime,jobid
0,2018-07-10T23:28:38,l,macmanes,720,1209600.0,1
1,2018-07-10T23:28:38,xl,macmanes,720,1209600.0,1
2,2018-07-10T23:29:38,l,macmanes,720,1209600.0,1
3,2018-07-10T23:29:38,xl,macmanes,720,1209600.0,1
4,2018-07-10T23:30:38,l,macmanes,720,1209600.0,1


### For GPU/RM/LM Interactive Jobs Graphs

In [101]:
def process_time(df_interact):
    df_interact['Start'] = pd.to_datetime(df_interact['Start'])
    df_interact['Submit'] = pd.to_datetime(df_interact['Submit'])
    df_interact['Start']= df_interact['Start'].map(lambda x: dt.datetime.strftime(x, '%Y-%m-%dT%H:%M:%SZ'))
    df_interact['Submit']= df_interact['Submit'].map(lambda x: dt.datetime.strftime(x, '%Y-%m-%dT%H:%M:%SZ'))
    df_interact['Start']=df_interact['Start'].str[:-1]
    df_interact['Submit']=df_interact['Submit'].str[:-1]
    df_interact["Start"]=df_interact["Start"]+".000"
    df_interact["Submit"]=df_interact["Submit"]+".000"
    return df_interact



In [110]:
def Interact_graph_data(fileName,df): 
    df_backlog=df.dropna(subset=["Start","End","Submit"])
    if "gpu" in fileName:
        df_backlog=df.dropna(subset=["Alloc_gres/gpu"])
        
    df_backlog = df_backlog.drop(df_backlog[df_backlog.End=='Unknown'].index)
    df_backlog = df_backlog.drop(df_backlog[df_backlog.NodeList=='None assigned'].index)
    df_backlog=df_backlog.reset_index(drop=True)
    
    if "gpu" in fileName:
        df_interact=df_backlog[df_backlog['JobName']=="Interact"].loc[:,["Start","Submit","Partition","Alloc_gres/gpu","Alloc_NODE"]]
    elif "rm" in fileName:
        df_interact=df_backlog[df_backlog['JobName']=="Interact"].loc[:,["Start","Submit","Partition","Alloc_NODE"]]
    else:
        df_interact=df_backlog[df_backlog['JobName']=="Interact"].loc[:,["Start","Submit","Partition","Alloc_MEM","Alloc_NODE"]]

        
    df_interact["waittime"]=[dateutil.parser.parse(df_interact.iloc[a,:]["Start"])-dateutil.parser.parse(df_interact.iloc[a,:]["Submit"]) for a in range (df_interact.shape[0])]    
    df_interact["waittime"]=df_interact["waittime"].dt.total_seconds()
    df_interact=df_interact.reset_index(drop=True)
    
    if "gpu" in fileName:
        df_interact.columns=["Start","Submit","Partition","nodeType","Alloc_NODE","Waittime"]
    elif "rm" in fileName:
        df_interact.columns=["Start","Submit","Partition","Alloc_NODE","Waittime"]
    else:
        df_interact.columns=["Start","Submit","nodeType","Alloc_MEM","Alloc_NODE","Waittime"]

    df_interact=process_time(df_interact)
    df_interact=df_interact.drop(["Start"],axis=1)
    
    if "gpu" in fileName:
        df_interact.to_csv("interact_gpu.csv")
    elif "rm" in fileName:
        df_interact.to_csv("interact_rm.csv")        
    else:
        df_interact.to_csv("interact_lm.csv") 
    
    return df_interact
    


In [111]:
%%time 
gpu_interact_graph_data=Interact_graph_data("today_gpu.csv",df_all_gpu)

CPU times: user 143 ms, sys: 4 µs, total: 143 ms
Wall time: 146 ms


In [125]:
gpu_interact_graph_data.head()

Unnamed: 0,Submit,Partition,nodeType,Alloc_NODE,Waittime
0,2018-07-19T00:14:06.000,GPU-small,p100,1,0.0
1,2018-07-19T01:18:48.000,GPU-small,p100,1,0.0
2,2018-07-19T04:07:01.000,GPU-shared,p100,1,0.0
3,2018-07-19T04:11:05.000,GPU-shared,p100,1,0.0
4,2018-07-19T04:22:39.000,GPU-shared,p100,1,0.0


In [115]:
%%time 
rm_interact_graph_data=Interact_graph_data("today_rm.csv",df_all_rm)


CPU times: user 105 ms, sys: 1e+03 µs, total: 106 ms
Wall time: 107 ms


In [126]:
rm_interact_graph_data.head()

Unnamed: 0,Submit,Partition,Alloc_NODE,Waittime
0,2018-07-18T17:34:47.000,RM,4,0.0
1,2018-07-18T19:40:23.000,RM,4,60.0
2,2018-07-18T19:52:25.000,RM,3,0.0
3,2018-07-18T20:36:09.000,RM,4,0.0
4,2018-07-18T23:39:07.000,RM,4,194.0


In [117]:
%%time 
lm_interact_graph_data=Interact_graph_data("today_lm.csv",df_all_lm)

CPU times: user 27.7 ms, sys: 1 ms, total: 28.7 ms
Wall time: 29 ms


In [127]:
lm_interact_graph_data.head()

Unnamed: 0,Submit,nodeType,Alloc_MEM,Alloc_NODE,Waittime
0,2018-07-18T20:04:35.000,l,200,1,0.0
1,2018-07-18T23:18:50.000,l,200,1,0.0
2,2018-07-19T09:10:35.000,l,200,1,0.0
3,2018-07-19T09:11:19.000,l,200,1,0.0
4,2018-07-19T11:09:11.000,l,128,1,41.0
