In [1]:
from src.jdv_conn import query_JDV
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option("display.colheader_justify","right")

# Load kcs and cases from the database

In [2]:
import json
kcs_rule_dict = json.load(open( "../data/kcs_rule_dict.json"))
kcs_string = ', '.join(kcs_rule_dict.keys())
#select 10 for tests
kcs_string = '740323, 24651, 1614393, 2065483, 797553, 3415331, 3446331, 4455551, 3991451, 29894'
kcs_rule_dict = {"740323": "filesystem/mounting_fs_with_errors.py", "24651": "filesystem/ext3_lookup_logs_unlinked_inode_error.py", "1614393": "filesystem/xfs_blocksize_issue.py", "2065483": "filesystem/nfs_fragment_too_large_error.py", "797553": "filesystem/nt_status_account_locked_out.py", "3415331": "filesystem/smb2_ntlm_mount_issue.py", "3446331": "filesystem/system_crash_is_size_safe_to_change.py", "4455551": "filesystem/smb3_systemd_issue.py", "3991451": "filesystem/cifs_fips_issue.py", "29894": "filesystem/ext_dx_add_entry.py"}

In [3]:
sql ="""
        SELECT kcs.resource_display_id__c, c.casenumber, c.subject, c.description
        FROM (SELECT  id, casenumber, subject, description
              FROM stg_gss_case
              WHERE stg_curr_flg = true
                  AND case_language__c='en'
                  AND isdeleted = false
                  AND ownerid != '00GA0000000XxxNMAS') c
            INNER JOIN stg_gss_case_rsrc_rltnshp kcs ON kcs.case__c = c.id
        WHERE stg_curr_flg = true
            AND isdeleted = false
            AND type__c = 'Link'  
            AND kcs.resource_display_id__c IN (""" + kcs_string + """)
        ORDER BY kcs.resource_display_id__c
"""

In [4]:
cases = query_JDV(queryString=sql)

In [5]:
cases

Unnamed: 0,resource_display_id__c,casenumber,subject,description
0,1614393,2144975,mount XFS failed: Function not implemented wit...,What problem/issue/behavior are you having tro...
1,1614393,2168396,Disk partitioning - Failed if I use 16K blocksize,"Hi Team,\n\nWe have RHEL 7.5 VM and attached m..."
2,1614393,2532866,We set the block size doesn't reflect to the x...,What problem/issue/behavior are you having tro...
3,1614393,2410915,Want to know correct syntax to format a logica...,What problem/issue/behavior are you having tro...
4,1614393,1734684,Failed to mount XFS file system with 16K block...,What problem/issue/behavior are you having tro...
5,1614393,1906558,Unable to increase XFS block size to 64KB beca...,What problem/issue/behavior are you having tro...
6,1614393,2073948,"mkfs.xfs -f -b size=32k produces ""Function not...",What problem/issue/behavior are you having tro...
7,1614393,2122142,Unable to mount xfs file system if the volume...,What problem/issue/behavior are you having tro...
8,1614393,2468840,Is possible reduce the bs in xfs?,¿Qué problema/comportamiento le está causando ...
9,1614393,2333745,Partition Table Issue for UEFI and 16 Bloxk Size,What problem/issue/behavior are you having tro...


# Single Rank Functions

In [6]:
# skips useless warnings in the pke methods
import logging
logging.basicConfig(level=logging.CRITICAL)
import pke



In [7]:
def preprocessing(text):
    #Replace newlines
    text = text.replace('\n', ' ')
    
    #Remove these unnecessary info from case description that is only need by the support associates
    #For resolution, the issue that customer has is what we need
    # We are going to use this additional text info as seperators and replace with some pattern
    seps = ['What problem/issue/behavior are you having trouble with?  What do you expect to see?', 'Where are you experiencing the behavior?  What environment?', 'When does the behavior occur? Frequently?  Repeatedly?   At certain times?', 'What information can you provide around timeframes and the business impact?']
    for sep in seps:
        text = text.replace(sep, '|#|')
        text_list = [each.strip() for each in text.split('|#|') if each.strip()]
        #print(text_list)
        #print(f"Actual Issue =>>> {text_list[0]}")
        issue = text_list[0]
    return issue

In [8]:
#Execute single rank algorithm to extract key phrases
def keyphrases(text):
    
    # define the set of valid Part Of Speech tags 
    pos = {'NOUN', 'PROPN', 'ADJ'}
    
    #create a SingleRank extractor
    singleRank_extractor = pke.unsupervised.SingleRank()
    
    # load the content of the document
    singleRank_extractor.load_document(input=text, language='en', normalization=None)
    
    # candidate selection (select the longest sequences of nouns and adjectives as candidates)
    singleRank_extractor.candidate_selection(pos)
    
    # candidate_weighing
    # candidate phrases are weighted using sum of their word's scores computed
    # using random walk. In graph, nodes are words of certain part-of-speech(nouns & adjectives)
    # that are connected if they occur in a window of 10 words
    singleRank_extractor.candidate_weighting(window=10, pos=pos)
    
    # rank the keyphrase and get the 10-higest scored candidates
    keyphrases_with_scores = singleRank_extractor.get_n_best(n=10)
    phrases = [keyphrase for keyphrase, score in keyphrases_with_scores]
    
    return phrases

In [9]:

def add_phrases(df):
    df['subject_key_phrases'] = df['subject'].apply(lambda x: keyphrases(preprocessing(x)))
    df['description_key_phrases'] = df['description'].apply(lambda x: keyphrases(preprocessing(x)))
    return df

# Calculate Keywords for cases

In [10]:
import time
start_time = time.time()
df = add_phrases(cases)
elapsed_time = time.time() - start_time

In [11]:
df['description_key_phrases'] = df.apply(lambda x: x['subject_key_phrases'] + x['description_key_phrases'],  axis=1)

In [12]:
df.style.set_properties(**{'text-align': 'left'})


Unnamed: 0,resource_display_id__c,casenumber,subject,description,subject_key_phrases,description_key_phrases
0,1614393,2144975,mount XFS failed: Function not implemented with block size set to 16K,"What problem/issue/behavior are you having trouble with? What do you expect to see? ds-san-dev-lux-01$ sudo mkfs.xfs -b size=16k /dev/xfs_vg01/xfs_lv01 Enter password for yuhuang (QUALPASS): meta-data=/dev/xfs_vg01/xfs_lv01 isize=512 agcount=32, agsize=10485600 blks  = sectsz=512 attr=2, projid32bit=1  = crc=1 finobt=0, sparse=0 data = bsize=16384 blocks=335539200, imaxpct=5  = sunit=1 swidth=1024 blks naming =version 2 bsize=16384 ascii-ci=0 ftype=1 log =internal log bsize=16384 blocks=130432, version=2  = sectsz=512 sunit=1 blks, lazy-count=1 realtime =none extsz=16384 blocks=0, rtextents=0 ds-san-dev-lux-01$ sudo mkdir /local/syncdata02 ds-san-dev-lux-01$ sudo mount /dev/xfs_vg01/xfs_lv01 /local/syncdata02 mount: mount /dev/mapper/xfs_vg01-xfs_lv01 on /local/syncdata02 failed: Function not implemented ds-san-dev-lux-01$ Where are you experiencing the behavior? What environment? try to mount XFS with 16k block size. [root@ds-san-dev-lux-01 ~]# cat /etc/release Red Hat Enterprise Linux Server release 7.5 (Maipo) [root@ds-san-dev-lux-01 ~]# When does the behavior occur? Frequently? Repeatedly? At certain times? try to mount XFS with 16k block size. [root@ds-san-dev-lux-01 ~]# cat /etc/release Red Hat Enterprise Linux Server release 7.5 (Maipo) [root@ds-san-dev-lux-01 ~]# What information can you provide around timeframes and the business impact? the problem is repeatable.","['block size', 'mount xfs', 'function', 'k']","['block size', 'mount xfs', 'function', 'k', 'sudo mkfs.xfs -b', 'sudo mount', 'sunit=1 blks', 'syncdata02 mount', 'rtextents=0 ds', 'ftype=1 log', 'internal log', 'sudo mkdir', 'agsize=10485600 blks', 'bsize=16384']"
1,1614393,2168396,Disk partitioning - Failed if I use 16K blocksize,"Hi Team, We have RHEL 7.5 VM and attached multiple disks. I wanted to partitioning with 64K blocksize for each file system. I am not able to set 64K block size for file system and it is throuwing 'mount failed: Function not implemented' while mounting 64K blksize LVM Please help me to resolve this and provide the steps. Please reach Santosh +61 44 91 91 895 for this issue and updated his mail id.","['k blocksize', 'partitioning']","['k blocksize', 'partitioning', 'k block size', 'k blksize lvm', 'file system', 'k blocksize', 'santosh +61', 'multiple disks', 'issue', 'able', 'vm', 'rhel']"
2,1614393,2532866,We set the block size doesn't reflect to the xfs system.,"What problem/issue/behavior are you having trouble with? What do you expect to see? We set the block size doesn't reflect to the xfs system. [root@cmtoldbcmgpsg01 ~]# /sbin/blockdev --getra /dev/mapper/vg_wi_ssd-lv_data01 16384 [root@cmtoldbcmgpsg01 ~]# xfs_info /dev/mapper/vg_wi_ssd-lv_data01 meta-data=/dev/mapper/vg_wi_ssd-lv_data01 isize=512 agcount=32, agsize=48017600 blks  = sectsz=4096 attr=2, projid32bit=1  = crc=1 finobt=0 spinodes=0 data = bsize=4096 blocks=1536563200, imaxpct=5  = sunit=64 swidth=256 blks naming =version 2 bsize=4096 ascii-ci=0 ftype=1 log =internal bsize=4096 blocks=521728, version=2  = sectsz=4096 sunit=1 blks, lazy-count=1 realtime =none extsz=4096 blocks=0, rtextents=0 Where are you experiencing the behavior? What environment? System slow. When does the behavior occur? Frequently? Repeatedly? At certain times? N/A What information can you provide around timeframes and the business impact? Need to fix it for performance issue asap.","['xfs system', 'block size']","['xfs system', 'block size', 'ci=0 ftype=1 log', 'finobt=0 spinodes=0 data', 'bsize=4096', 'count=1 realtime', 'sunit=1 blks', 'mapper', 'xfs system', 'lv_data01', 'sectsz=4096', 'blockdev --getra']"
3,1614393,2410915,Want to know correct syntax to format a logical volume with xfs filesystem with specific block size,"What problem/issue/behavior are you having trouble with? What do you expect to see? Could you please provide us a detailed document on how to create a logical volume with XFS file system with specific block size. When we use the argument -n in the below command, We doubt it has changed block size to 8192. Any filesystem that is formatted with more than 4KB is not getting mounted. cmaprh3 ~]# mkfs -t xfs -f -s size=4096 -b size=8192 /dev/sdh meta-data=/dev/sdh isize=512 agcount=4, agsize=655360 blks  = sectsz=4096 attr=2, projid32bit=1  = crc=1 finobt=0, sparse=0 data = bsize=8192 blocks=2621440, imaxpct=25  = sunit=0 swidth=0 blks naming =version 2 bsize=8192 ascii-ci=0 ftype=1 log =internal log bsize=8192 blocks=1280, version=2  = sectsz=4096 sunit=1 blks, lazy-count=1 realtime =none extsz=8192 blocks=0, rtextents=0 [root@usdatcmaprh3 ~]# mount /dev/sdh /dummy/ mount: mount /dev/sdh on /dummy failed: Function not implemented [root@usdatcmaprh3 ~]# [root@usdatcmaprh3 ~]# mkfs.xfs -n size=8192 /dev/systemvg/lv_tmp meta-data=/dev/systemvg/lv_tmp isize=512 agcount=4, agsize=983040 blks  = sectsz=512 attr=2, projid32bit=1  = crc=1 finobt=0, sparse=0 data = bsize=4096 blocks=3932160, imaxpct=25  = sunit=0 swidth=0 blks naming =version 2 bsize=8192 ascii-ci=0 ftype=1 log =internal log bsize=4096 blocks=2560, version=2  = sectsz=512 sunit=0 blks, lazy-count=1 realtime =none extsz=4096 blocks=0, rtextents=0 [root@in2itgvlnxbuild01 ~]# xfs_growfs / meta-data=/dev/mapper/rootvg-root isize=256 agcount=8, agsize=2455296 blks  = sectsz=512 attr=2, projid32bit=1  = crc=0 finobt=0 data = bsize=4096 blocks=18996224, imaxpct=25  = sunit=0 swidth=0 blks naming =version 2 bsize=4096 ascii-ci=0 ftype=0 log =internal bsize=4096 blocks=4795, version=2  = sectsz=512 sunit=0 blks, lazy-count=1 realtime =none extsz=4096 blocks=0, rtextents=0 data blocks changed from 18996224 to 20306944 Please provide us the document at the earliest.","['specific block size', 'xfs filesystem', 'logical volume', 'correct syntax']","['specific block size', 'xfs filesystem', 'logical volume', 'correct syntax', 'specific block size', 'sdh meta', 'ci=0 ftype=1 log', 'xfs file system', 'block size', 'rtextents=0 data blocks', 'size=4096 -b size=8192', 'mkfs -t xfs', 'internal log', 'count=1 realtime']"
4,1614393,1734684,Failed to mount XFS file system with 16K block size on RHEL 7 System,"What problem/issue/behavior are you having trouble with? What do you expect to see? Hi RH Support Team, We are facing trouble mounting an xfs file system with block size 16k on RHEL 7 System. To get 3PAR Deduplication, we are going with 16k block size. Can we have your assistance on mounting a file system using 16k block size please? Steps performed: Step 1. Below are 2 5 GB Disks presented to Server /dev/sdc1, /dev/sdd1 [root@vf2lvnetz01 ~]# fdisk -l /dev/sdc1 /dev/sdd1 Disk /dev/sdc1: 5367 MB, 5367660544 bytes, 10483712 sectors Units = sectors of 1 * 512 = 512 bytes Sector size (logical/physical): 512 bytes / 512 bytes I/O size (minimum/optimal): 512 bytes / 512 bytes Disk /dev/sdd1: 5367 MB, 5367660544 bytes, 10483712 sectors Units = sectors of 1 * 512 = 512 bytes Sector size (logical/physical): 512 bytes / 512 bytes I/O size (minimum/optimal): 512 bytes / 512 bytes Step 2. Created Physical Volumes, Volume Groups and Logical Volumes in similar way for both Disks. [root@vf2lvnetz01 ~]# pvcreate /dev/sdc1 ; pvcreate /dev/sdd1 ; vgcreate -s 64m vg02 /dev/sdc1 ; vgcreate -s 64m vg03 /dev/sdd1 ; lvcreate -l 79 -n u01 vg02 ; lvcreate -l 79 -n u02 vg03  Physical volume ""/dev/sdc1"" successfully created  Physical volume ""/dev/sdd1"" successfully created  Volume group ""vg02"" successfully created  Volume group ""vg03"" successfully created  Logical volume ""u01"" created.  Logical volume ""u02"" created. Step 3: Formatted /dev/vg02/u01 as xfs by specifying 16384 (16k) block size. [root@vf2lvnetz01 ~]# mkfs.xfs -b size=16384 /dev/vg02/u01 meta-data=/dev/vg02/u01 isize=256 agcount=4, agsize=80896 blks  = sectsz=512 attr=2, projid32bit=1  = crc=0 finobt=0 data = bsize=16384 blocks=323584, imaxpct=25  = sunit=0 swidth=0 blks naming =version 2 bsize=16384 ascii-ci=0 ftype=0 log =internal log bsize=16384 blocks=640, version=2  = sectsz=512 sunit=0 blks, lazy-count=1 realtime =none extsz=16384 blocks=0, rtextents=0 Step 4: Formatted /dev/vg03/u02 as xfs with default block size. [root@vf2lvnetz01 ~]# mkfs.xfs /dev/vg03/u02 meta-data=/dev/vg03/u02 isize=256 agcount=4, agsize=323584 blks  = sectsz=512 attr=2, projid32bit=1  = crc=0 finobt=0 data = bsize=4096 blocks=1294336, imaxpct=25  = sunit=0 swidth=0 blks naming =version 2 bsize=4096 ascii-ci=0 ftype=0 log =internal log bsize=4096 blocks=2560, version=2  = sectsz=512 sunit=0 blks, lazy-count=1 realtime =none extsz=4096 blocks=0, rtextents=0 Step 5: when trying to mount both file systems, /u02 with default block size mounted fine, but getting error as Function not implemented for 16k block size file system. [root@vf2lvnetz01 ~]# mount /u01 /u02 mount: /u01 is not a block device [root@vf2lvnetz01 ~]# mount -a mount: mount /dev/mapper/vg02-u01 on /u01 failed: Function not implemented We need your urgent help on this as 3PAR suggests to use 16k block size for deduplication and Our Management decided to go with it. Thanks, Ram Where are you experiencing the behavior? What environment? Red Hat Enterprise Linux 7 When does the behavior occur? Frequently? Repeatedly? At certain times? Always on RHEL 7 What information can you provide around timeframes and urgency? Many Server deployments are queued (not delivered to Business) because of this issue and once we release the systems with 4K block size we may not get any downtime to rebuild the systems. We need your urgent help on this as 3PAR recommends to use 16k block size.","['xfs file system', 'k block size', 'system', 'rhel']","['xfs file system', 'k block size', 'system', 'rhel', '16k block size file system', '16k block size', 'block size 16k', 'bytes sector size', 'default block size', 'block size', 'o size', 'xfs file system', 'mount -a mount', 'block device']"
5,1614393,1906558,Unable to increase XFS block size to 64KB because of page size limitation,What problem/issue/behavior are you having trouble with? What do you expect to see? We need to set the block size of a XFS filesystem to 64K but it is dependent on the pagesize. But pagesize is limited by kernel which is hardcoded to 4K. We could like to confirm this to ensure there is no alternative work around to achieve this. What is the maximum supported XFS block size in RHEL 7? https://access.redhat.com/solutions/1614393 How to invoke CONFIG_PAGE_SIZE_8KB directive in RHEL7? https://access.redhat.com/solutions/3056341 What information can you provide around timeframes and the business impact? We would like to confirm this asap so we can proceed with other options.,"['xfs block size', 'page size limitation', 'kb', 'unable']","['xfs block size', 'page size limitation', 'kb', 'unable', 'xfs block size', 'config_page_size_8 kb directive', 'block size', 'xfs filesystem', 'alternative work', 'https://access.redhat.com/solutions/1614393', 'pagesize', 'rhel', 'k', 'dependent']"
6,1614393,2073948,"mkfs.xfs -f -b size=32k produces ""Function not implemented"" on mount","What problem/issue/behavior are you having trouble with? What do you expect to see? What do we have to do in order to have an xfs block size of 32k mount? We understand that we need to likely set the kernel page size cache limit higher somehow. (getconf PAGE_SIZE is 4k by default) This is what we would like to do for the purpose of Oracle Stripe size: mkfs.xfs -f -b size=32k /dev/mapper/vg_07-oradata06a and then mount. Thank you. Where are you experiencing the behavior? What environment? Dev environment currently When does the behavior occur? Frequently? Repeatedly? At certain times? consistently across file systems What information can you provide around timeframes and the business impact? No impact as of yet, we are in the deployment stage","['mkfs.xfs -f -b', 'function', 'size=32k', 'mount']","['mkfs.xfs -f -b', 'function', 'size=32k', 'mount', 'kernel page size cache', 'mkfs.xfs -f -b size=32k', 'oracle stripe size', 'xfs block size', 'getconf page_size', 'mapper', 'purpose', 'vg_07-oradata06a', '32k', 'order']"
7,1614393,2122142,Unable to mount xfs file system if the volume is formated with blocksize of 65536,What problem/issue/behavior are you having trouble with? What do you expect to see? Planned to increase the block size of the xfs file system to increase SAS application performance . but failed to mount it when formatted with 65536 block size. Where are you experiencing the behavior? What environment? Production When does the behavior occur? Frequently? Repeatedly? At certain times? Certain Time What information can you provide around timeframes and the business impact? New setup .,"['mount xfs file system', 'volume', 'unable', 'blocksize']","['mount xfs file system', 'volume', 'unable', 'blocksize', 'xfs file system', 'sas application performance', 'block size']"
8,1614393,2468840,Is possible reduce the bs in xfs?,"¿Qué problema/comportamiento le está causando dificultades? ¿Qué espera ver? The customer need reduce the block size in some file system in RHEL 6, but he need know if is possible and what are the consequences. The environment is RHEL 6 for Oracle Database, but the problem is when restore a backup with commvault over this system the consumption of the IO in the SAN is very high. The SAN administrator says that the performance can better if the size block is minor to 512 bytes. So the customer want know if is possible reduce without destroy the volumes and if he change the bs what are the consequences. ¿En dónde se está presentando el comportamiento? ¿En qué entorno? In RHEL 6 in physical server with 3Par SAN of HPE. ¿Cuándo ocurre este comportamiento? ¿Con frecuencia? ¿Repetidamente? ¿En momentos determinados? Only when restore a backup with commvault. ¿Qué información puede brindar acerca de los plazos y el impacto comercial? A impact in the low general performance of the SAN environment.","['xfs', 'bs', 'possible']","['xfs', 'bs', 'possible', 'de los plazos y el impacto comercial', 'está presentando el comportamiento', 'comportamiento le está causando dificultades', 'qué información puede', 'qué espera ver', 'cuándo ocurre este comportamiento', 'qué entorno', 'qué problema', 'san environment', 'low general performance']"
9,1614393,2333745,Partition Table Issue for UEFI and 16 Bloxk Size,What problem/issue/behavior are you having trouble with? What do you expect to see? vmware admins ask me to format the disks in 16k block size. <%# kind: ptable name: Kickstart default custom oses: - CentOS 5 - CentOS 6 - CentOS 7 - Fedora 16 - Fedora 17 - Fedora 18 - Fedora 19 - Fedora 20 - RedHat 5 - RedHat 6 - RedHat 7 %> zerombr clearpart --drives=vda --all part /boot --fstype=xfs --size=512 --ondisk=vda --asprimary part pv.01 --size=1024 --grow --ondisk=vda --asprimary volgroup vg_<%= @host.shortname %> pv.01 logvol / --fstype=xfs --vgname=vg_<%= @host.shortname %> --name=lv_root -b size=16384 --size=6144 logvol /var --fstype=xfs --vgname=vg_<%= @host.shortname %> --name=lv_var -b size=16384 --size=2048 logvol /home --fstype=xfs --vgname=vg_<%= @host.shortname %> --name=lv_home -b size=16384 --size=2048 logvol /tmp --fstype=xfs --vgname=vg_<%= @host.shortname %> --name=lv_tmp -b size=16384 --size=2048 logvol swap --fstype=swap --vgname=vg_<%= @host.shortname %> --name=lv_swap01 -b size=16384 --size=2048,"['partition table issue', 'bloxk size', 'uefi']","['partition table issue', 'bloxk size', 'uefi', 'lv_tmp -b size=16384 --size=2048 logvol swap', '-b size=16384 --size=2048 logvol', 'lv_root -b size=16384 --size=6144 logvol', '@host.shortname %', '--size=2048 logvol', 'lv_var -b size=16384', '%', 'vda --asprimary part', 'size=16384 --size=2048', 'vda --all part']"


In [13]:
elapsed_time

229.93262028694153

# Calculate Keywords for KCS/rule

In [14]:
df[df['resource_display_id__c'] == '1614393'].shape[0]

29

In [15]:
# 遍历df
rule_tag_dict = {}
for key in kcs_rule_dict.keys():
    temp_df = df[df['resource_display_id__c'] == key]
    case_count = temp_df.shape[0]
    train_count = int(case_count * (4/5))
    temp_train_df = temp_df[:train_count-1]
    kcs_tag = []
    for description_phrases in temp_train_df['description_key_phrases']:
        kcs_tag = kcs_tag + description_phrases    
    rule_tag_dict[key] = kcs_tag   
        

In [16]:
rule_tag_dict

{'740323': ['filesystem',
  'high utilization',
  'filesystem utilization',
  'df -g output',
  'actual size',
  'gb',
  'file system',
  'mail service',
  'regards',
  'zimbra',
  'prioroty',
  'server',
  'e2fsck',
  'checktime',
  'ext3-fs',
  'servers unable',
  'jump server',
  'below output',
  'servers unable',
  'gpd-973',
  'a9e',
  'ff03',
  'mode',
  'team',
  'issue',
  'high swap usage',
  'filesystem errors',
  'logs',
  'swap usage',
  'oracle databases',
  'failover cluster',
  'usage',
  'filesystem errors',
  'node active',
  'few days',
  'server logs',
  'rhel',
  'issue',
  'hardware error',
  'performance issue',
  'dmesg',
  'machine check events',
  'hardware error',
  'root@pruswipprodb1',
  'strange errors',
  'message.log',
  '18th sep log',
  'umount messages',
  'filesystem mount',
  'message.log',
  'reason',
  'possible currupt file system',
  'fsck',
  'mount point',
  'netbackup',
  'root file system',
  'issue',
  'maintenance mode',
  'sos report',
  

# Calculate the match percentage between 'new' cases and rule

In [18]:
performance = []
for key in kcs_rule_dict.keys():
    temp_df = df[df['resource_display_id__c'] == key]
    case_count = temp_df.shape[0]
    train_count = int(case_count * (4/5))
    temp_test_df = temp_df[train_count:]
    for description_phrases in temp_test_df['description_key_phrases']:
        performance.append({'KCS':key, 'case key phrases number': len(description_phrases), 'case matched tags number': len(list(set(description_phrases) & set(rule_tag_dict[key]))), 'rule tags number': len(rule_tag_dict[key])})
      
performance_df = pd.DataFrame(performance)
def calculate_hit_rate(row):
    if row['rule tags number'] == 0:
        return 0
    return row['case matched tags number']/row['rule tags number']
    
performance_df['hit_rate'] = performance_df.apply(calculate_hit_rate, axis=1)
performance_df

Unnamed: 0,KCS,case key phrases number,case matched tags number,rule tags number,hit_rate
0,740323,8,1,144,0.006944
1,740323,3,1,144,0.006944
2,740323,13,6,144,0.041667
3,740323,12,0,144,0.0
4,740323,15,2,144,0.013889
5,24651,13,5,176,0.028409
6,24651,12,3,176,0.017045
7,24651,13,2,176,0.011364
8,24651,11,2,176,0.011364
9,24651,5,1,176,0.005682


# The average match percentage is close to 0

In [19]:
performance_df['hit_rate'].mean()

0.009440565418467538

In [20]:
performance_df['hit_rate'].max()

0.041666666666666664