In [None]:
!pip install pyprind
!pip install graphviz

In [None]:
import nltk

# Fetch the 'vader_lexicon' resource; the notebook later imports
# SentimentIntensityAnalyzer from nltk.sentiment.vader.
nltk.download('vader_lexicon')

In [None]:
import pandas as pd

def read_input_data(path = './IssueReport.xls', sheet_name = 'Raw'):
    """Load the service-request columns used by this notebook from an Excel sheet.

    Parameters
    ----------
    path : str
        Workbook to read; defaults to './IssueReport.xls'.
    sheet_name : str
        Sheet to read; defaults to 'Raw'.

    Returns
    -------
    pandas.DataFrame
        An independent frame holding only 'SR NUMBER', 'SEVERITY',
        'CUSTOMER PRIORITY' and 'SR DESCRIPTION'.

    Raises
    ------
    KeyError
        If any of the four expected columns is missing from the sheet.
    """
    columns = ['SR NUMBER', 'SEVERITY', 'CUSTOMER PRIORITY', 'SR DESCRIPTION']
    sheet = pd.read_excel(io = path, sheet_name = sheet_name)
    # .copy() detaches the selection from the full sheet, so the in-place
    # `df.at[...] = ...` writes done later in the notebook operate on a frame
    # of their own and do not trigger SettingWithCopyWarning.
    return sheet[columns].copy()

# Load the raw report and remember its row count (used to size progress bars).
df = read_input_data()
df_row_count = df.shape[0]

df.head()

In [None]:
import re
from sklearn import preprocessing

def preprocessor_description(text):
    """Reduce a free-text description to letters, spaces and basic punctuation.

    Steps, in order: drop HTML-like ``<...>`` tags, turn newlines into spaces,
    remove every character that is not an ASCII letter, a space or one of
    ``? ! , . : ; ( )`` (this also removes digits), collapse runs of spaces
    and trim both ends.

    Parameters
    ----------
    text : str or None
        Raw description. Missing values (None or NaN) are treated as an
        empty description; other non-string values are converted with str().

    Returns
    -------
    str
        The cleaned description (possibly empty).
    """
    # NaN is the only value that differs from itself.
    if text is None or text != text:
        return ''
    text = str(text)

    text = re.sub(r'<[^>]*>', '', text)
    text = text.replace('\n', ' ')
    # Digits are outside the allowed set, so no separate digit pass is needed.
    text = re.sub(r'[^a-zA-Z ?!,.:;()]', '', text)
    text = re.sub(r' +', ' ', text)
    return text.strip()

def preprocessor_sr_number(text):
    """Normalise an SR number to upper-case letters, digits and hyphens.

    Parameters
    ----------
    text : str, number or None
        Raw identifier. Missing values (None or NaN) yield an empty string;
        numeric cells are converted to text first.

    Returns
    -------
    str
        The upper-cased input with every character outside ``0-9``, ``A-Z``
        and ``-`` removed.
    """
    # NaN is the only value that differs from itself.
    if text is None or text != text:
        return ''
    # A whole-number float such as 12345.0 would otherwise become '123450'
    # once the decimal point is stripped.
    if isinstance(text, float) and text.is_integer():
        text = int(text)
    return ''.join(re.findall(r'[0-9A-Z\-]', str(text).upper()))

def preprocessor_int(text, default = 5):
    """Extract an integer from a cell value.

    For strings, all digit groups are concatenated and parsed as one integer
    (e.g. '3 - Medium' -> 3, '1-2' -> 12). Values that are already numeric
    are converted with int().

    Parameters
    ----------
    text : str, number or None
        Raw cell value.
    default : int
        Returned when no integer can be obtained (no digits in the string,
        None, NaN, infinity). Defaults to 5.

    Returns
    -------
    int
    """
    if not isinstance(text, str):
        # Numeric cells arrive as int/float rather than str; None, NaN and
        # infinity cannot be converted and fall back to the default.
        try:
            return int(text)
        except (TypeError, ValueError, OverflowError):
            return default

    digits = ''.join(re.findall(r'[0-9]+', text))
    return int(digits) if digits else default
    
def preprocessor_severity(severity):
    """Map a raw severity value to an urgency flag.

    The value is converted with ``preprocessor_int``; a result of 2 or lower
    gives 1 (urgent), anything higher gives 0 (not urgent).
    """
    is_urgent = preprocessor_int(severity) <= 2
    return 1 if is_urgent else 0

In [None]:
import pyprind

# Progress bar sized to the number of rows; preprocess_data() advances it once per row.
pbar = pyprind.ProgBar(df_row_count)
# NOTE(review): `le` is not referenced anywhere else in the visible notebook -
# confirm whether this encoder is still needed.
le = preprocessing.LabelEncoder()

def preprocess_data():
    """Clean the module-level ``df`` in place, one row at a time.

    For every row this normalises 'SR NUMBER', stores the cleaned description
    in 'SR DESCRIPTION CLEAN', converts 'CUSTOMER PRIORITY' and 'SEVERITY' to
    integers, and derives the 'URGENT' flag from the raw severity. ``pbar`` is
    advanced once per row, and 'URGENT' is cast to int after the loop.
    """
    for idx, record in df.iterrows():
        # Taken from the row snapshot, i.e. the value before 'SEVERITY' is overwritten.
        raw_severity = record['SEVERITY']

        df.at[idx, 'SR NUMBER'] = preprocessor_sr_number(record['SR NUMBER'])
        df.at[idx, 'SR DESCRIPTION CLEAN'] = preprocessor_description(record['SR DESCRIPTION'])
        df.at[idx, 'CUSTOMER PRIORITY'] = preprocessor_int(record['CUSTOMER PRIORITY'])
        df.at[idx, 'SEVERITY'] = preprocessor_int(raw_severity)
        df.at[idx, 'URGENT'] = preprocessor_severity(raw_severity)

        pbar.update()

    # Cast the flag column to int once every row has been filled in.
    df['URGENT'] = df['URGENT'].astype(int)
    
# Run the cleaning pass, then preview the first rows of the updated frame.
preprocess_data()

df.head()

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Single VADER analyzer instance, used by assign_polarity() and predict().
sentimentAnalyzer = SentimentIntensityAnalyzer()
# Fresh progress bar for the polarity pass (advanced once per row).
pbar = pyprind.ProgBar(df_row_count)

def assign_polarity():
    """Score each cleaned description and store the result in 'POLARITY'.

    Works on the module-level ``df``: for every row the text in
    'SR DESCRIPTION CLEAN' is passed to ``sentimentAnalyzer.polarity_scores``
    and the 'compound' entry of the result is written to 'POLARITY'.
    ``pbar`` is advanced once per row.
    """
    for idx in df.index:
        scores = sentimentAnalyzer.polarity_scores(df.at[idx, 'SR DESCRIPTION CLEAN'])
        df.at[idx, 'POLARITY'] = scores['compound']
        pbar.update()

# Compute the polarity of every row, then arrange the columns for display/export.
assign_polarity()

df = df[[
    'SR NUMBER',
    'SR DESCRIPTION',
    'SR DESCRIPTION CLEAN',
    'POLARITY',
    'CUSTOMER PRIORITY',
    'SEVERITY',
    'URGENT',
]]
df.head(5)

In [None]:
# Persist the enriched frame to disk.
# NOTE(review): writing '.xls' depends on a legacy Excel writer engine (xlwt)
# that recent pandas releases no longer support - confirm the pandas version
# in use, or consider an '.xlsx' target.
df.to_excel('./IssueReport_Polarity_Severity_Urgent.xls')

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix: sentiment polarity and numeric customer priority of every row.
# NOTE(review): train_test_split is imported above but no split is made here -
# all rows are used for fitting.
X = df[['POLARITY', 'CUSTOMER PRIORITY']].values
# Target vector: the 0/1 'URGENT' flag derived from the severity.
y = df['URGENT'].values

In [None]:
from sklearn import tree

# Fix the seed so that repeated runs on the same data build the same tree:
# scikit-learn permutes the candidate features at every split, which makes
# the fitted tree non-deterministic when random_state is left unset.
clf = tree.DecisionTreeClassifier(random_state = 0)
clf.fit(X = X, y = y)

In [None]:
import graphviz

# Render the fitted tree as a Graphviz graph.
# class_names must be given in ascending order of the class labels; the
# target uses 0 for not urgent and 1 for urgent (see preprocessor_severity),
# so 'NOT-URGENT' has to come first.
dot_data = tree.export_graphviz(
    decision_tree = clf,
    out_file = None,
    feature_names = ['POLARITY', 'CUSTOMER PRIORITY'],
    class_names = ['NOT-URGENT', 'URGENT']
)
graph = graphviz.Source(dot_data)
graph

In [None]:
def predict(customer_priority, description):
    """Predict the urgency flag for a single service request.

    The description is cleaned with ``preprocessor_description`` and scored
    with ``sentimentAnalyzer``; its 'compound' polarity together with the
    integer customer priority (via ``preprocessor_int``) forms the one-sample
    feature row handed to ``clf``.

    Returns
    -------
    The result of ``clf.predict`` for that single sample.
    """
    cleaned = preprocessor_description(description)
    compound = sentimentAnalyzer.polarity_scores(cleaned)['compound']
    priority = preprocessor_int(customer_priority)

    return clf.predict([[compound, priority]])

In [None]:
# Start again from the unprocessed sheet; predict() does its own cleaning.
df = read_input_data()
df_row_count = df.shape[0]

# New progress bar for the prediction pass.
pbar = pyprind.ProgBar(df_row_count)

def preprocess_for_prediction():
    """Fill 'URGENT' and 'URGENT_P' on the module-level ``df``, row by row.

    'URGENT' is the flag derived from the row's raw 'SEVERITY' via
    ``preprocessor_severity``; 'URGENT_P' is the flag predicted by
    ``predict`` from the row's 'CUSTOMER PRIORITY' and 'SR DESCRIPTION'.
    ``pbar`` is advanced once per row.
    """
    for index, row in df.iterrows():
        df.at[index, 'URGENT'] = preprocessor_severity(row['SEVERITY'])

        # predict() returns a one-element array (one sample in, one label out).
        # Store the scalar label itself rather than relying on the array being
        # implicitly coerced when written into a single cell.
        prediction = predict(row['CUSTOMER PRIORITY'], row['SR DESCRIPTION'])
        df.at[index, 'URGENT_P'] = prediction[0]

        pbar.update()
    
# Label and predict every row, then preview the result.
preprocess_for_prediction()

df.head(5)

In [None]:
# Keep the input columns plus the derived ('URGENT') and predicted ('URGENT_P') flags.
df = df[['SR NUMBER', 'SR DESCRIPTION', 'CUSTOMER PRIORITY', 'SEVERITY', 'URGENT', 'URGENT_P']]

# NOTE(review): writing '.xls' depends on a legacy Excel writer engine (xlwt)
# that recent pandas releases no longer support - confirm the pandas version
# in use, or consider an '.xlsx' target.
df.to_excel('./IssueReport_Tree_Predicted.xls')

In [None]:
# Fraction of rows whose predicted flag matches the severity-derived flag.
# The frame was re-read from the same sheet the tree was fitted on, so this
# measures agreement on the training rows, not on unseen data.
number_of_equals = sum(
    1 for _, row in df.iterrows() if row['URGENT'] == row['URGENT_P']
)

float(number_of_equals) / df_row_count