## System Resources Analysis

In [1]:
# import necessary python libraries
import pandas as pd
import numpy as np

In [2]:
#define cols and read cpu csv
cols = ['Image', 'PID', 'Description', 'Status', 'Threads', 'Average CPU']
df_cpu = pd.read_csv("cpu.csv", usecols=cols)

In [3]:
#check shape of df
df_cpu.shape

(306, 6)

In [4]:
#check info of df
df_cpu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306 entries, 0 to 305
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Image        306 non-null    object 
 1   PID          306 non-null    object 
 2   Description  290 non-null    object 
 3   Status       306 non-null    object 
 4   Threads      306 non-null    object 
 5   Average CPU  306 non-null    float64
dtypes: float64(1), object(5)
memory usage: 14.5+ KB


In [5]:
#define cols and read disk csv
cols = ['Image', 'PID', 'Total Byte Per Second']
df_disk = pd.read_csv("disk.csv", usecols=cols)

In [6]:
#check shape of df
df_disk.shape

(14, 3)

In [7]:
#check info of df
df_disk.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Image                  14 non-null     object
 1   PID                    14 non-null     int64 
 2   Total Byte Per Second  14 non-null     object
dtypes: int64(1), object(2)
memory usage: 468.0+ bytes


In [8]:
#define cols and read disk csv
cols = ['Image', 'PID', 'Working Set KB']
df_memory = pd.read_csv("memory.csv", usecols=cols)

In [9]:
#check shape of df
df_memory.shape

(306, 3)

In [10]:
#check info of df
df_memory.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306 entries, 0 to 305
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Image           306 non-null    object
 1   PID             306 non-null    int64 
 2   Working Set KB  306 non-null    object
dtypes: int64(1), object(2)
memory usage: 7.3+ KB


### Column Cleaning

In [11]:
# create a func to convert all column name to lowercase and change all spaces in column names to underscores
def CleanColumnHeading(dfx):
    dfx.columns = [x.lower() for x in dfx.columns]
    dfx.columns = dfx.columns.str.replace(' ', '_')
    return dfx

In [12]:
df_cpu.head()

Unnamed: 0,Image,PID,Description,Status,Threads,Average CPU
0,Secure System,140,,Suspended,-,0.0
1,SearchHost.exe,9272,SearchHost,Suspended,68,0.0
2,LockApp.exe,12784,LockApp.exe,Suspended,16,0.0
3,AcrobatNotificationClient.exe,18096,AcrobatNotificationClient,Suspended,13,0.0
4,AdobeNotificationClient.exe,9964,Adobe Notification Client,Suspended,12,0.0


In [13]:
# call func passing in df_cpu 
CleanColumnHeading(df_cpu)

Unnamed: 0,image,pid,description,status,threads,average_cpu
0,Secure System,140,,Suspended,-,0.0
1,SearchHost.exe,9272,SearchHost,Suspended,68,0.0
2,LockApp.exe,12784,LockApp.exe,Suspended,16,0.0
3,AcrobatNotificationClient.exe,18096,AcrobatNotificationClient,Suspended,13,0.0
4,AdobeNotificationClient.exe,9964,Adobe Notification Client,Suspended,12,0.0
...,...,...,...,...,...,...
301,splwow64.exe,20660,Print driver host for applications,Running,7,0.0
302,svchost.exe (PrintWorkflow),13528,Host Process for Windows Services,Running,3,0.0
303,chrome.exe,13556,Google Chrome,Running,13,0.0
304,SearchProtocolHost.exe,7736,Microsoft Windows Search Protocol Host,Running,6,0.0


In [14]:
df_disk.head()

Unnamed: 0,Image,PID,Total Byte Per Second
0,chrome.exe,5448,6278
1,chrome.exe,13324,4940
2,EXCEL.EXE,12132,71807
3,Grammarly.Desktop.exe,19180,6729
4,msedgewebview2.exe,20716,8799


In [15]:
# call func passing in df_disk
CleanColumnHeading(df_disk)

Unnamed: 0,image,pid,total_byte_per_second
0,chrome.exe,5448,6278
1,chrome.exe,13324,4940
2,EXCEL.EXE,12132,71807
3,Grammarly.Desktop.exe,19180,6729
4,msedgewebview2.exe,20716,8799
5,msedgewebview2.exe,14716,5616
6,msedgewebview2.exe,10576,4681
7,msedgewebview2.exe,21848,964
8,MsMpEng.exe,6728,1457
9,OUTLOOK.EXE,7300,1471


In [16]:
df_memory.head()

Unnamed: 0,Image,PID,Working Set KB
0,AcrobatNotificationClient.exe,18096,3632
1,Adobe Crash Processor.exe,7856,15252
2,Adobe Desktop Service.exe,6640,85504
3,AdobeCollabSync.exe,17928,20384
4,AdobeCollabSync.exe,16808,16628


In [17]:
# call func passing in df_memory
CleanColumnHeading(df_memory)

Unnamed: 0,image,pid,working_set_kb
0,AcrobatNotificationClient.exe,18096,3632
1,Adobe Crash Processor.exe,7856,15252
2,Adobe Desktop Service.exe,6640,85504
3,AdobeCollabSync.exe,17928,20384
4,AdobeCollabSync.exe,16808,16628
...,...,...,...
301,WUDFHost.exe,1528,19504
302,WUDFHost.exe,1804,13892
303,WUDFHost.exe,1960,7184
304,WUDFHost.exe,1916,5744


### Data Cleaning

In [18]:
#examine threads column 
df_cpu.head()

Unnamed: 0,image,pid,description,status,threads,average_cpu
0,Secure System,140,,Suspended,-,0.0
1,SearchHost.exe,9272,SearchHost,Suspended,68,0.0
2,LockApp.exe,12784,LockApp.exe,Suspended,16,0.0
3,AcrobatNotificationClient.exe,18096,AcrobatNotificationClient,Suspended,13,0.0
4,AdobeNotificationClient.exe,9964,Adobe Notification Client,Suspended,12,0.0


In [19]:
#display info for the df_cpu
df_cpu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306 entries, 0 to 305
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   image        306 non-null    object 
 1   pid          306 non-null    object 
 2   description  290 non-null    object 
 3   status       306 non-null    object 
 4   threads      306 non-null    object 
 5   average_cpu  306 non-null    float64
dtypes: float64(1), object(5)
memory usage: 14.5+ KB


In [20]:
# remove all characters that are not numbers from threads column
df_cpu["threads"] = df_cpu["threads"].replace('[^0-9]', np.nan, regex=True)

In [21]:
# drop rows if threads column is blank
df_cpu = df_cpu.dropna(subset=['threads'])

In [22]:
# display difference for rows that were modified
df_cpu.head()

Unnamed: 0,image,pid,description,status,threads,average_cpu
1,SearchHost.exe,9272,SearchHost,Suspended,68,0.0
2,LockApp.exe,12784,LockApp.exe,Suspended,16,0.0
3,AcrobatNotificationClient.exe,18096,AcrobatNotificationClient,Suspended,13,0.0
4,AdobeNotificationClient.exe,9964,Adobe Notification Client,Suspended,12,0.0
5,ShellExperienceHost.exe,12220,Windows Shell Experience Host,Suspended,18,0.0


In [23]:
# correct datatype
df_cpu["threads"] = pd.to_numeric(df_cpu["threads"], errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cpu["threads"] = pd.to_numeric(df_cpu["threads"], errors='coerce')


In [24]:
#confirm difference in datatype
df_cpu.info()

<class 'pandas.core.frame.DataFrame'>
Index: 304 entries, 1 to 305
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   image        304 non-null    object 
 1   pid          304 non-null    object 
 2   description  289 non-null    object 
 3   status       304 non-null    object 
 4   threads      304 non-null    int64  
 5   average_cpu  304 non-null    float64
dtypes: float64(1), int64(1), object(4)
memory usage: 16.6+ KB


In [25]:
df_cpu.head()

Unnamed: 0,image,pid,description,status,threads,average_cpu
1,SearchHost.exe,9272,SearchHost,Suspended,68,0.0
2,LockApp.exe,12784,LockApp.exe,Suspended,16,0.0
3,AcrobatNotificationClient.exe,18096,AcrobatNotificationClient,Suspended,13,0.0
4,AdobeNotificationClient.exe,9964,Adobe Notification Client,Suspended,12,0.0
5,ShellExperienceHost.exe,12220,Windows Shell Experience Host,Suspended,18,0.0


In [26]:
#examine total_byte_per_second
df_disk.head()

Unnamed: 0,image,pid,total_byte_per_second
0,chrome.exe,5448,6278
1,chrome.exe,13324,4940
2,EXCEL.EXE,12132,71807
3,Grammarly.Desktop.exe,19180,6729
4,msedgewebview2.exe,20716,8799


In [27]:
#display info for df disk
df_disk.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   image                  14 non-null     object
 1   pid                    14 non-null     int64 
 2   total_byte_per_second  14 non-null     object
dtypes: int64(1), object(2)
memory usage: 468.0+ bytes


In [28]:
#remove commas from total_byte_per_second column 
df_disk["total_byte_per_second"] = df_disk["total_byte_per_second"].replace(',', '', regex=True)

In [29]:
#confirm removal
df_disk.head()

Unnamed: 0,image,pid,total_byte_per_second
0,chrome.exe,5448,6278
1,chrome.exe,13324,4940
2,EXCEL.EXE,12132,71807
3,Grammarly.Desktop.exe,19180,6729
4,msedgewebview2.exe,20716,8799


In [30]:
# remove invalid characters that are not numbers 
df_disk["total_byte_per_second"] = df_disk["total_byte_per_second"].replace('[^0-9]', np.nan, regex=True)

In [31]:
df_disk.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   image                  14 non-null     object
 1   pid                    14 non-null     int64 
 2   total_byte_per_second  14 non-null     object
dtypes: int64(1), object(2)
memory usage: 468.0+ bytes


In [32]:
#drop rows that have a blank value in total_byte_per_second
df_disk.dropna(subset=['total_byte_per_second'])

Unnamed: 0,image,pid,total_byte_per_second
0,chrome.exe,5448,6278
1,chrome.exe,13324,4940
2,EXCEL.EXE,12132,71807
3,Grammarly.Desktop.exe,19180,6729
4,msedgewebview2.exe,20716,8799
5,msedgewebview2.exe,14716,5616
6,msedgewebview2.exe,10576,4681
7,msedgewebview2.exe,21848,964
8,MsMpEng.exe,6728,1457
9,OUTLOOK.EXE,7300,1471


In [33]:
# change column data type
df_disk["total_byte_per_second"] = pd.to_numeric(df_disk["total_byte_per_second"], errors='coerce')

In [34]:
#confirm change of data type
df_disk.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   image                  14 non-null     object
 1   pid                    14 non-null     int64 
 2   total_byte_per_second  14 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 468.0+ bytes


In [35]:
df_disk.head()

Unnamed: 0,image,pid,total_byte_per_second
0,chrome.exe,5448,6278
1,chrome.exe,13324,4940
2,EXCEL.EXE,12132,71807
3,Grammarly.Desktop.exe,19180,6729
4,msedgewebview2.exe,20716,8799


In [36]:
#examine working_set_kb
df_memory.head()

Unnamed: 0,image,pid,working_set_kb
0,AcrobatNotificationClient.exe,18096,3632
1,Adobe Crash Processor.exe,7856,15252
2,Adobe Desktop Service.exe,6640,85504
3,AdobeCollabSync.exe,17928,20384
4,AdobeCollabSync.exe,16808,16628


In [37]:
#display info for df memory
df_memory.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306 entries, 0 to 305
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   image           306 non-null    object
 1   pid             306 non-null    int64 
 2   working_set_kb  306 non-null    object
dtypes: int64(1), object(2)
memory usage: 7.3+ KB


In [38]:
#remove commas from working_set_kb column 
df_memory["working_set_kb"] = df_memory["working_set_kb"].replace(',', '', regex=True)

In [39]:
#confirm removal
df_memory.head()

Unnamed: 0,image,pid,working_set_kb
0,AcrobatNotificationClient.exe,18096,3632
1,Adobe Crash Processor.exe,7856,15252
2,Adobe Desktop Service.exe,6640,85504
3,AdobeCollabSync.exe,17928,20384
4,AdobeCollabSync.exe,16808,16628


In [40]:
# remove invalid characters that are not numbers from working_set_kb
df_memory["working_set_kb"] = df_memory["working_set_kb"].replace('[^0-9]', np.nan, regex=True)

In [41]:
df_memory.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306 entries, 0 to 305
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   image           306 non-null    object
 1   pid             306 non-null    int64 
 2   working_set_kb  306 non-null    object
dtypes: int64(1), object(2)
memory usage: 7.3+ KB


In [42]:
#drop rows that have a blank value in working_set_kb
df_memory.dropna(subset=["working_set_kb"])

Unnamed: 0,image,pid,working_set_kb
0,AcrobatNotificationClient.exe,18096,3632
1,Adobe Crash Processor.exe,7856,15252
2,Adobe Desktop Service.exe,6640,85504
3,AdobeCollabSync.exe,17928,20384
4,AdobeCollabSync.exe,16808,16628
...,...,...,...
301,WUDFHost.exe,1528,19504
302,WUDFHost.exe,1804,13892
303,WUDFHost.exe,1960,7184
304,WUDFHost.exe,1916,5744


In [43]:
#change datatype of working_set_kb column
df_memory["working_set_kb"] = pd.to_numeric(df_memory["working_set_kb"], errors='coerce')

In [44]:
#confirm datatype change
df_memory.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306 entries, 0 to 305
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   image           306 non-null    object
 1   pid             306 non-null    int64 
 2   working_set_kb  306 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 7.3+ KB


In [45]:
#display results
df_memory.head()

Unnamed: 0,image,pid,working_set_kb
0,AcrobatNotificationClient.exe,18096,3632
1,Adobe Crash Processor.exe,7856,15252
2,Adobe Desktop Service.exe,6640,85504
3,AdobeCollabSync.exe,17928,20384
4,AdobeCollabSync.exe,16808,16628


### Data Aggregation

In [46]:
#code aggregates the data based on the requirements provided
def aggregate_data(dfx):
    sum_column_rename = ''
    if 'threads' in dfx.columns:
        sum_column = 'threads'
        sum_column_rename = 'threads_sum'
    elif 'total_byte_per_second' in dfx.columns:
        sum_column = 'total_byte_per_second'
        sum_column_rename = 'total_byte_sum'
    elif 'working_set_kb' in dfx.columns:
        sum_column = 'working_set_kb'
        sum_column_rename = 'working_set_sum'
    else:
        raise ValueError("No aggregate column recognized")

    result_df = dfx.groupby('image').agg(
        process_qty=('image', 'size'),
        sum_value=(sum_column, 'sum')
    ).reset_index().rename(columns={'image': 'image_name', 'sum_value': sum_column_rename})
    
    return result_df

In [47]:
# calling aggregate_data and assigning the result for df_cpu
df_cpu_sum = aggregate_data(df_cpu)

In [48]:
print(df_cpu_sum.head(10))

                      image_name  process_qty  threads_sum
0  AcrobatNotificationClient.exe            1           13
1      Adobe Crash Processor.exe            1            4
2      Adobe Desktop Service.exe            1           47
3            AdobeCollabSync.exe            2           24
4             AdobeIPCBroker.exe            1           26
5    AdobeNotificationClient.exe            1           12
6         AdobeUpdateService.exe            1            5
7             AggregatorHost.exe            1            1
8       ApplicationFrameHost.exe            1            6
9                  CCLibrary.exe            1            1


In [49]:
print(df_cpu_sum.tail(10))

                     image_name  process_qty  threads_sum
140  svchost.exe (osprivacy -p)            1            4
141       svchost.exe (smphost)            1            6
142     svchost.exe (utcsvc -p)            1           12
143               taskhostw.exe            1            8
144                  uihost.exe            1           63
145                unsecapp.exe            1            2
146                vpnagent.exe            1            7
147                 wininit.exe            1            2
148                winlogon.exe            1            3
149                 wlanext.exe            1            2


In [50]:
# calling aggregate_data and assigning the result for df_disk
df_disk_sum = aggregate_data(df_disk)

In [51]:
print(df_disk_sum.head(10))

              image_name  process_qty  total_byte_sum
0              EXCEL.EXE            1           71807
1  Grammarly.Desktop.exe            1            6729
2            MsMpEng.exe            1            1457
3            OUTLOOK.EXE            1            1471
4               Registry            1            5523
5                 System            1          232627
6              Teams.exe            1             547
7           WUDFHost.exe            1           24587
8             chrome.exe            2           11218
9     msedgewebview2.exe            4           20060


In [52]:
print(df_disk_sum.tail(10))

              image_name  process_qty  total_byte_sum
0              EXCEL.EXE            1           71807
1  Grammarly.Desktop.exe            1            6729
2            MsMpEng.exe            1            1457
3            OUTLOOK.EXE            1            1471
4               Registry            1            5523
5                 System            1          232627
6              Teams.exe            1             547
7           WUDFHost.exe            1           24587
8             chrome.exe            2           11218
9     msedgewebview2.exe            4           20060


In [53]:
# calling aggregate_data and assigning the result for df_memory
df_memory_sum = aggregate_data(df_memory)

In [54]:
print(df_memory_sum.head(10))

                      image_name  process_qty  working_set_sum
0  AcrobatNotificationClient.exe            1             3632
1      Adobe Crash Processor.exe            1            15252
2      Adobe Desktop Service.exe            1            85504
3            AdobeCollabSync.exe            2            37012
4             AdobeIPCBroker.exe            1            10568
5    AdobeNotificationClient.exe            1             2836
6         AdobeUpdateService.exe            1             9580
7             AggregatorHost.exe            1             7420
8       ApplicationFrameHost.exe            1            35988
9                  CCLibrary.exe            1             2948


In [55]:
print(df_memory_sum.tail(10))

                     image_name  process_qty  working_set_sum
141  svchost.exe (osprivacy -p)            1            13812
142       svchost.exe (smphost)            1            16624
143     svchost.exe (utcsvc -p)            1            30104
144               taskhostw.exe            1            19400
145                  uihost.exe            1            10472
146                unsecapp.exe            1             7528
147                vpnagent.exe            1            26192
148                 wininit.exe            1             6264
149                winlogon.exe            1            12320
150                 wlanext.exe            1             6764


### Merging Data

In [56]:
# merge df_cpu_sum and df_disk_sum with inner join on image_name
df_new = pd.merge(df_cpu_sum, df_disk_sum, on='image_name', how='inner')

In [57]:
df_new.head()

Unnamed: 0,image_name,process_qty_x,threads_sum,process_qty_y,total_byte_sum
0,EXCEL.EXE,1,64,1,71807
1,Grammarly.Desktop.exe,1,48,1,6729
2,MsMpEng.exe,1,66,1,1457
3,OUTLOOK.EXE,1,89,1,1471
4,Registry,1,4,1,5523


In [58]:
# merge df_new and df_memory_sum with inner join on image_name
df_new = pd.merge(df_new, df_memory_sum, on='image_name', how='inner')

In [59]:
df_new.head()

Unnamed: 0,image_name,process_qty_x,threads_sum,process_qty_y,total_byte_sum,process_qty,working_set_sum
0,EXCEL.EXE,1,64,1,71807,1,298380
1,Grammarly.Desktop.exe,1,48,1,6729,1,273436
2,MsMpEng.exe,1,66,1,1457,1,210204
3,OUTLOOK.EXE,1,89,1,1471,1,447040
4,Registry,1,4,1,5523,1,40864


### Filtering Data

In [60]:
# filter df_new to obtain image_name that had working_set_sum > 200,000
high_memory_image = df_new[df_new["working_set_sum"] > 200000]

In [61]:
# filter df_new to obtain image_name that had thread_sum > 200
high_thread_image = df_new[df_new["threads_sum"] > 200]

In [62]:
print(high_thread_image['image_name'])

5                System
6             Teams.exe
8            chrome.exe
9    msedgewebview2.exe
Name: image_name, dtype: object


In [63]:
# filter df_new to obtain image_name that had working_set_sum > 200,000, threads_sum < 50  and total_byte_sum < 7000
hi_mem_low_thread_low_io = df_new[(df_new["working_set_sum"] > 200000) & (df_new["threads_sum"] < 50) & (df_new["total_byte_sum"] < 7000)]

In [64]:
print(hi_mem_low_thread_low_io['image_name'])

1    Grammarly.Desktop.exe
Name: image_name, dtype: object
