In [43]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [44]:
raw_data = pd.read_excel('data/Raw_Data_TechDebt.xlsx')

In [45]:
raw_data.head()

Unnamed: 0,Debt Description,Debt Class
0,EPS tries to push script packets and scripts f...,Technical
1,Normalization failure - SAP sytem sends dupli...,Technical
2,Metrica monitoring issue - issue were raised b...,Technical
3,Unable to withdraw entries becasue related scr...,Technical
4,eSpecial Consideration website does not open f...,Technical


In [46]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dattaa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [47]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dattaa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [48]:
type(raw_data['Debt Description'])

pandas.core.series.Series

In [49]:
documents = list(raw_data['Debt Description'])

In [50]:
## initialise the inbuilt Stemmer and the Lemmatizer
stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

In [51]:
def preprocess(document, stem=True):
    'changes document to lower case and removes stopwords'

    # change sentence to lower case
    document = document.lower()

    # tokenize into words
    words = word_tokenize(document)

    # remove stop words
    words = [word for word in words if word not in stopwords.words("english")]
    
    # snowball stemmer
    #stemmer = SnowballStemmer("english")
    #words = [stemmer.stem(token) for token in words]
    
    if stem:
        words = [stemmer.stem(word) for word in words]
    else:
        words = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]

    # join words to make sentence
    document = " ".join(words)
    
    return document

documents = [preprocess(document) for document in documents]
print(documents)

['ep tri push script packet script non-activ session st', 'normal failur - sap sytem send duplic script st , result job failur st', 'metrica monitor issu - issu rais elit speak api directli , tri test data pre product environ', 'unabl withdraw entri becasu relat script packet exist ep', 'especi consider websit open user intermitt', 'spjntap040 apach server issu lead host applic connect problem', "prep centr 's statu 'access grant ' prep centr user still abl login", "centr upload work repositori , longer avail , state 'file found ' .", 'user unabl merg candid cam admin due data cam & interchang', 'number result ukvi flow ibas ukvi .', 'statement result gener ceo candid result applic', 'claim/grade/achiev addit & delet interchang', 'request delet examin em applic', 'problem load entri file pdq modul cca2 seri', 'problem data feed ep st half hourli refresh job . wrong packet id tag script data', 'record process d79 suspens', 'cam gti suspens applic error', "file ci direct applic goe 'ep u

In [52]:
pd.DataFrame(documents).to_csv('debt_desc_stage2.csv')

In [53]:
### Create Bag of words model

In [54]:
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(documents)
print(bow_model)

  (0, 686)	1
  (0, 650)	1
  (0, 25)	1
  (0, 481)	1
  (0, 509)	1
  (0, 636)	2
  (0, 561)	1
  (0, 754)	1
  (0, 264)	1
  (1, 396)	1
  (1, 609)	1
  (1, 243)	1
  (1, 645)	1
  (1, 721)	1
  (1, 623)	1
  (1, 296)	2
  (1, 482)	1
  (1, 686)	2
  (1, 636)	1
  (2, 263)	1
  (2, 553)	1
  (2, 540)	1
  (2, 205)	1
  (2, 733)	1
  (2, 225)	1
  :	:
  (298, 474)	1
  (298, 409)	1
  (298, 595)	1
  (298, 20)	1
  (298, 786)	1
  (299, 87)	1
  (299, 253)	1
  (299, 376)	1
  (299, 300)	1
  (299, 238)	1
  (299, 476)	1
  (299, 474)	1
  (299, 595)	1
  (299, 786)	2
  (299, 264)	1
  (300, 253)	1
  (300, 450)	1
  (300, 182)	1
  (300, 300)	1
  (300, 476)	2
  (300, 147)	1
  (300, 492)	1
  (300, 474)	1
  (300, 224)	1
  (300, 786)	3


In [55]:
print(bow_model.shape)
print(vectorizer.get_feature_names())

(301, 818)
['000', '0042', '0470', '0r', '10', '1124', '114', '11g', '12c', '17th', '2010', '2018', '2019', '24', '25', '3228', '4754', '9239', 'abl', 'accept', 'access', 'account', 'accur', 'achiev', 'action', 'activ', 'actual', 'ad', 'adapt', 'adaptor', 'add', 'addit', 'address', 'adip', 'admin', 'advanc', 'affect', 'aggreg', 'ahead', 'alert', 'align', 'all', 'alloc', 'allow', 'alreadi', 'am', 'amend', 'amount', 'ampersand', 'annual', 'anomali', 'anonym', 'apach', 'api', 'apo', 'app', 'appear', 'applciat', 'applic', 'approv', 'apps', 'area', 'arf', 'articl', 'ascii', 'asfa', 'asia', 'asp', 'assess', 'assign', 'associ', 'at', 'attempt', 'attribut', 'atuo', 'audio', 'audit', 'auto', 'automark', 'automart', 'automat', 'avail', 'avoid', 'award', 'back', 'balanc', 'bank', 'base', 'batchsiz', 'becasu', 'behaviour', 'best', 'beyond', 'big', 'biztalk', 'blank', 'block', 'bmat', 'book', 'box', 'breach', 'breakdown', 'bulat', 'bulk', 'busi', 'button', 'buy', 'c4', 'caap', 'cach', 'calcul', 'ca