In [1]:
import os
import numpy as np
import pandas as pd
import re
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import RegexpTokenizer as regextoken
from nltk import word_tokenize
from nltk.corpus import wordnet
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

  from numpy.core.umath_tests import inner1d


In [2]:
class TextProcessing():

    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    wordnet_lemmatizer = WordNetLemmatizer()

    def __init__(self, domain_stopwords, domain_trailwords, domain_impwords,repeatwords,
                 allowed_word_types=["N","V","J","R"]):
        self.domain_stopwords = domain_stopwords
        self.domain_trailwords = domain_trailwords
        self.domain_impwords = domain_impwords
        self.repeatwords=repeatwords

        self.allowed_word_types = allowed_word_types
        self.domain_stopwords.discard(np.nan)
        self.domain_trailwords.discard(np.nan)
        self.domain_impwords.discard(np.nan)

    def process_document(self, document, add_weightage=False):
        document = str(document)
        document = document.lower()
        document = self.remove_joined_trails(document)
        document = re.sub('[^a-zA-Z0-9\n]',' ', document) # Remove Special Characters
        document = re.sub('(\\b[A-Za-z] \\b|\\b [A-Za-z]\\b)', '', document) # Remove single characters
        document = re.sub(' +', ' ', document) # Remove Extra Spaces
        
        document = self.lemmatize_document(document)
        document = [word for word in document if self.check_word(word)]
        
        if add_weightage:
            document = self.add_domain_impwords(document)
        
        document =  ' '.join([word for word in document])
        #document = re.sub('[nan]', '', document)
        
        return document

    def check_word(self, word):
        if len(word)>1 and word not in self.domain_stopwords and not any(c.isdigit() for c in word):
            return True
        else:
            return False

    def lemmatize_document(self, document):
        all_words=[]
        tokenizer = regextoken(r'\w+')
        words = tokenizer.tokenize(document)
        pos = nltk.pos_tag(words)
        for w in pos:
            word =  w[0].lower()
            pos_tag = w[1]
            if pos_tag[0] in self.allowed_word_types:
                wntag = self.get_wordnet_pos(pos_tag)
                lemmatised_word = self.wordnet_lemmatizer.lemmatize(word,pos=wntag)
                word = lemmatised_word if lemmatised_word in self.english_vocab else word
                all_words.append(word)
        return all_words

    def remove_joined_trails(self, document):
        for trail in self.domain_trailwords:
            if trail in document:
                document = document.replace(trail, "").strip()
                document = document + ' ' + trail
        return document

    def get_wordnet_pos(self, treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return None

    def add_domain_impwords(self, document):
        all_words = list(document)
        for word in document:
            if word in self.repeatwords:
                all_words.append(word)
                all_words.append(word)
        return all_words

In [3]:
### loading stopwords,impwords file
stopwords_df = pd.read_excel("Final_CSC_Clustering.xlsx", sheet_name="stopwords")
conolidated_stopwords = set(stopwords_df.Domain_Stopwords)

conolidated_stopwords.update(set(text.ENGLISH_STOP_WORDS))
conolidated_stopwords.update(set(stopwords.words('english')))

trailwords_df = pd.read_excel("Final_CSC_Clustering.xlsx", sheet_name="trailwords")
trailwords = set(trailwords_df.Trail_Words)

impwords_df = pd.read_excel("Final_CSC_Clustering.xlsx", sheet_name="impwords")
impwords = set(impwords_df.Domain_Impwords)

repeat_df = pd.read_excel("Final_CSC_Clustering.xlsx", sheet_name="repeat")
repeatwords = set(repeat_df.weightage)

conolidated_stopwords = conolidated_stopwords - impwords

In [4]:
#### load training data - December clustering output file
traindata=pd.read_excel("Platform ticket categorization.xlsx")
traindata_descript=traindata['Descript']

In [5]:
#traindata=traindata[traindata['Category'].isna()!=True]

In [6]:
traindata.shape

(81905, 117)

#### Preprocessing

In [7]:
preprocess_obj=TextProcessing(conolidated_stopwords, trailwords, impwords ,repeatwords)

In [8]:
traindata_preprocessed = traindata['Descript'].apply(preprocess_obj.process_document, args=(False,))
traindata_preprocessed=traindata_preprocessed.astype(str)

In [9]:
#traindata_preprocessed

In [10]:
# creating tf-idf matrix for train data
tfidfconverter = TfidfVectorizer(max_features=1500, min_df=3, max_df=0.7, stop_words=stopwords.words('english'))  
X = tfidfconverter.fit_transform(traindata_preprocessed).toarray() 

In [11]:
tfidfconverter

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.7, max_features=1500, min_df=3,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs',... 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"],
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [12]:
#prepare train - test data
y=traindata['Category']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(65524, 1500) (16381, 1500) (65524,) (16381,)


#### Classification

#### RF

In [13]:
classifier = RandomForestClassifier(n_estimators=100, random_state=0)  
classifier.fit(X, y)
print(classifier.score(X,y))

0.9867285269519566


In [14]:
# prediction on test data
y_pred=classifier.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.9876075941639705


#### Test Data

In [15]:
jan_data1=pd.read_excel('NCR T3 September 2019.xlsx')
print(jan_data1.shape)

(6872, 114)


In [21]:
jan_data1['Descript']

0       WND1 - SBWND1SQ8P - Disk E: full - QDB02: 8550572
1       BCN4 - SBBCN15003 - Disk D: Busy - QDB10: 1008...
2       (2685) BCK0006E OBS2crkoralpa001b6CRK_LPA_ORAC...
3       RESOLVED Problem 261 in environment Global Ten...
4       RESOLVED Problem 624 in environment Global Ten...
5       RESOLVED Problem 288 in environment Global Ten...
6       Create a Service Now request for fulfillment q...
7       Create a Service Now request for fulfillment q...
8       Server         :NYHCBORA02Account      :NATAle...
9       Server         :NYHCBORA01Account      :NATAle...
10      Instance Name:NIMSOFT_ROBOT_nyhcbctx117vEvent ...
11      Description:P3 URGENT Fault: TOA - server rest...
12      Instance Name:NIMSOFT_ROBOT_NGUKSVRRTW561Event...
13      BOR1 - SABOR10038 - Paging File Usage High - Q...
14      Server: ukrctsrtw11024 Reporting: Robot ukrcts...
15                             Server: jsy-shpsqlvp001   
16      ptyagi21colapcpcn-csd01NT_Logical_Disk: C: is ...
17      Messag

In [16]:
jan_data=jan_data1[jan_data1['Team Name']=='Platform']

In [17]:
# prediction on jan data
jan_Documents=jan_data['Descript']
Documentsjan=jan_Documents.apply(preprocess_obj.process_document, args=(True,))
X_final = tfidfconverter.transform(Documentsjan)
y_pred2 = classifier.predict(X_final)
print(len(y_pred2))

4134


In [18]:
jan_data['predictions']=y_pred2
jan_data['process text']=Documentsjan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [19]:
jan_data.to_excel('T3 September platform with categories.xlsx',index=False)

#### clustering on nocategory

In [None]:
nocategory=jan_data.loc[jan_data['predictions']=='No Category']

In [None]:
nocategory.shape

In [None]:
nocategory.to_csv('jan_NoCategory.csv')