In [None]:
# Utilities
from time import time
import os, sys, itertools, re
import warnings, pickle, string
from ftfy import fix_encoding, fix_text, badness

# Translation APIs
from goslate import Goslate # Provided by Google
from pycountry import languages

# Numerical calculation
import numpy as np

# Data Handling
import pandas as pd

# NLP toolkits
import spacy
import nltk
from nltk import tokenize

In [1]:
# Configure for any default setting of any library
warnings.filterwarnings('ignore')

NameError: name 'warnings' is not defined

In [3]:
# Load the dataset into a Pandas dataframe called ticket and check the head of the dataset
ticket = pd.read_excel('Dataset/Dataset.xlsx', )
ticket.head()

Unnamed: 0,Short description,Description,Caller,Assignment group
0,login issue,-verified user details.(employee# & manager na...,spxjnwir pjlcoqds,GRP_0
1,outlook,\r\n\r\nreceived from: hmjdrvpb.komuaywn@gmail...,hmjdrvpb komuaywn,GRP_0
2,cant log in to vpn,\r\n\r\nreceived from: eylqgodm.ybqkwiam@gmail...,eylqgodm ybqkwiam,GRP_0
3,unable to access hr_tool page,unable to access hr_tool page,xbkucsvz gcpydteq,GRP_0
4,skype error,skype error,owlgqjme qhcozdfx,GRP_0


In [4]:
# Check the tail of the dataset
ticket.tail()

Unnamed: 0,Short description,Description,Caller,Assignment group
8495,emails not coming in from zz mail,\r\n\r\nreceived from: avglmrts.vhqmtiua@gmail...,avglmrts vhqmtiua,GRP_29
8496,telephony_software issue,telephony_software issue,rbozivdq gmlhrtvp,GRP_0
8497,vip2: windows password reset for tifpdchb pedx...,vip2: windows password reset for tifpdchb pedx...,oybwdsgx oxyhwrfz,GRP_0
8498,machine nÃ£o estÃ¡ funcionando,i am unable to access the machine utilities to...,ufawcgob aowhxjky,GRP_62
8499,an mehreren pc`s lassen sich verschiedene prgr...,an mehreren pc`s lassen sich verschiedene prgr...,kqvbrspl jyzoklfx,GRP_49


**Comments**
* To take a closer look at the data, pandas library provides **“.head()”** function which returns first five observations and **“.tail()”** function which returns last five observations of the data set.

### Dataset Inspection
The dataset is divided into two parts, namely, **feature matrix** and the **response vector**.

- Feature matrix contains all the vectors (rows) of the dataset, in which each vector consists of the values of the **independent features**. In the above dataset, the features are *Short description*, *Description* and *Caller*.
- Response vector contains the value of **class variable**(prediction or output) for each row of feature matrix. In above dataset, the class variable name is *Assignment group*.

In [5]:
# Get the shape and size of the dataset
print('No of rows:\033[1m', ticket.shape[0], '\033[0m')
print('No of cols:\033[1m', ticket.shape[1], '\033[0m')

No of rows:[1m 8500 [0m
No of cols:[1m 4 [0m


In [6]:
# Get more info on it
# 1. Name of the columns
# 2. Find the data types of each columns
# 3. Look for any null/missing values
ticket.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8500 entries, 0 to 8499
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Short description  8492 non-null   object
 1   Description        8499 non-null   object
 2   Caller             8500 non-null   object
 3   Assignment group   8500 non-null   object
dtypes: object(4)
memory usage: 265.8+ KB


In [7]:
# Describe the dataset with various summary and statistics
ticket.describe()

Unnamed: 0,Short description,Description,Caller,Assignment group
count,8492,8499,8500,8500
unique,7481,7817,2950,74
top,password reset,the,bpctwhsn kzqsbmtp,GRP_0
freq,38,56,810,3976


In [8]:
# Check the Short description of tickets having Description as only 'the'
ticket[ticket.Description == 'the'].head()

Unnamed: 0,Short description,Description,Caller,Assignment group
1049,reset passwords for soldfnbq uhnbsvqd using pa...,the,soldfnbq uhnbsvqd,GRP_17
1054,reset passwords for fygrwuna gomcekzi using pa...,the,fygrwuna gomcekzi,GRP_17
1144,reset passwords for wvdxnkhf jirecvta using pa...,the,wvdxnkhf jirecvta,GRP_17
1184,reset passwords for pxvjczdt kizsjfpq using pa...,the,pxvjczdt kizsjfpq,GRP_17
1292,reset passwords for cubdsrml znewqgop using pa...,the,cubdsrml znewqgop,GRP_17


In [9]:
# Find out the null value counts in each column
ticket.isnull().sum()

Short description    8
Description          1
Caller               0
Assignment group     0
dtype: int64

**Observations**
- The dataset comprises **8500 rows** and **4 columns**.
- All columns are of type object, containing textual information.
- There are **8 null/missing values** in the Short description column and **1 null/missing value** in the Description column.
- **Password reset** is one of the most frequently occurring tickets, as reflected in the Short description column.
- The most frequent Description in the dataset is just the text **'the'**, which carries no meaning on its own. Looking at the Short description of those rows reveals that they are also password-reset tickets.

In [10]:
# Let's look at the rows with null values
ticket[pd.isnull(ticket).any(axis=1)]

Unnamed: 0,Short description,Description,Caller,Assignment group
2604,,\r\n\r\nreceived from: ohdrnswl.rezuibdt@gmail...,ohdrnswl rezuibdt,GRP_34
3383,,\r\n-connected to the user system using teamvi...,qftpazns fxpnytmk,GRP_0
3906,,-user unable tologin to vpn.\r\n-connected to...,awpcmsey ctdiuqwe,GRP_0
3910,,-user unable tologin to vpn.\r\n-connected to...,rhwsmefo tvphyura,GRP_0
3915,,-user unable tologin to vpn.\r\n-connected to...,hxripljo efzounig,GRP_0
3921,,-user unable tologin to vpn.\r\n-connected to...,cziadygo veiosxby,GRP_0
3924,,name:wvqgbdhm fwchqjor\nlanguage:\nbrowser:mic...,wvqgbdhm fwchqjor,GRP_0
4341,,\r\n\r\nreceived from: eqmuniov.ehxkcbgj@gmail...,eqmuniov ehxkcbgj,GRP_0
4395,i am locked out of skype,,viyglzfo ajtfzpkb,GRP_0


### NULL treatment

In [11]:
# NULL replacement
ticket.fillna(str(), inplace=True)
ticket[pd.isnull(ticket).any(axis=1)]

Unnamed: 0,Short description,Description,Caller,Assignment group


In [12]:
# verify the replacement
ticket.isnull().sum()

Short description    0
Description          0
Caller               0
Assignment group     0
dtype: int64

**Comments**:
- We have various ways of treating the NULL/Missing values in the dataset such as 
    - Replacing them with empty string 
    - Replacing them with some default values
    - Duplicating the Short description and Description values wherever one of them is Null
    - Dropping the records with null/missing values completely.
- We're choosing not to drop any record, as we don't want to lose any information. And since we're going to concatenate the Short description and Description columns for each record before feeding them into NLP, we neither want to pollute the data by introducing default values nor bias it by duplicating the description columns.
- Hence our NULL/Missing value treatment replaces the NaN cells with just empty string.

In [13]:
# Write a function to apply to the dataset to detect Mojibakes
def is_mojibake_impacted(text):
    """Screen a single cell value for suspected mojibake.

    NOTE: despite the name, the result is ``True`` when the text looks
    *clean* and ``False`` when it is *suspected* to be mojibake. Callers
    must therefore negate the result to select impacted values.

    Parameters
    ----------
    text : str
        The text to inspect.

    Returns
    -------
    bool
        ``True`` if ``badness.sequence_weirdness(text)`` is falsy, or if the
        text cannot be encoded with the 'sloppy-windows-1252' codec;
        ``False`` if the text is flagged as weird and is encodable with
        that codec.
    """
    if badness.sequence_weirdness(text):
        try:
            text.encode('sloppy-windows-1252')
        except UnicodeEncodeError:
            # Not CP-1252 encodable, probably fine
            return True
        # Weird and encodable as CP-1252: mojibake alert level high
        return False
    # Nothing weird, should be okay
    return True
    
# Check the dataset for mojibake impact
ticket[~ticket.iloc[:,:-1].applymap(is_mojibake_impacted).all(1)]

Unnamed: 0,Short description,Description,Caller,Assignment group
99,password expiry tomorrow,\n\nreceived from: ecprjbod.litmjwsy@gmail.com...,ecprjbod litmjwsy,GRP_0
116,server issues,\r\n\r\nreceived from: bgqpotek.cuxakvml@gmail...,bgqpotek cuxakvml,GRP_0
124,mobile device activation,"from: tvcdfqgp nrbcqwgj \nsent: friday, octobe...",tvcdfqgp nrbcqwgj,GRP_0
164,æ’¤å›ž: ticket_no1564867 -- comments added,\n\nreceived from: abcdri@company.com\n\nwindy...,tycludks cjofwigv,GRP_0
170,[urgent!!] delivery note creation request!!,\n\nreceived from: fbvpcytz.nokypgvx@gmail.com...,fbvpcytz nokypgvx,GRP_18
...,...,...,...,...
8470,please review your recent ticketing_tool ticke...,"from: mikhghytr wafglhdrhjop \nsent: thursday,...",azxhejvq fyemlavd,GRP_16
8471,ç”µè„‘å¼€æœºå¼€ä¸å‡ºæ¥,to å°è´ºï¼Œæ—©ä¸Šç”µè„‘å¼€æœºå¼€ä¸å‡ºæ¥,xqyjztnm onfusvlz,GRP_30
8480,customer group enhanced field,\r\n\r\nreceived from: nlearzwi.ukdzstwi@gmail...,nlearzwi ukdzstwi,GRP_9
8498,machine nÃ£o estÃ¡ funcionando,i am unable to access the machine utilities to...,ufawcgob aowhxjky,GRP_62


### Mojibake
[Mojibake](https://en.wikipedia.org/wiki/Mojibake) is the garbled text that is the result of text being decoded using an unintended character encoding. The result is a systematic replacement of symbols with completely unrelated ones, often from a different writing system.<br/>
This display may include the generic replacement character ("�") in places where the binary representation is considered invalid. A replacement can also involve multiple consecutive symbols, as viewed in one encoding, when the same binary code constitutes one symbol in the other encoding. This is either because of differing constant length encoding (as in Asian 16-bit encodings vs European 8-bit encodings), or the use of variable length encodings (notably UTF-8 and UTF-16). Few such Mojibakes are **¶**, **ç**, **å**, **€**, **æ**, **œ**, **º**, **‡**, **¼**, **¥** etc.

As we're dealing with Natural Language and the source of the data is unknown to us, let's run the encoding check to figure out if the dataset is Mojibake impacted.

The library **ftfy** (Fixes Text For You) has a greater ability to detect, fix and deal with such Mojibakes. It fixes Unicode that’s broken in various ways. The goal of ftfy is to take in bad Unicode and output good Unicode.

Installation:<br/>
using pypi: **`!pip install ftfy`**<br/>
using conda: **`conda install -c conda-forge ftfy`**

In [14]:
# Take an example of row# 8471 Short Desc and fix it
print('Grabled text: \033[1m%s\033[0m\nFixed text: \033[1m%s\033[0m' % (ticket['Short description'][8471], 
                                                                        fix_text(ticket['Short description'][8471])))

# List all mojibakes defined in ftfy library
print('\nMojibake Symbol RegEx:\n', badness.MOJIBAKE_SYMBOL_RE.pattern)

Grabled text: [1mç”µè„‘å¼€æœºå¼€ä¸å‡ºæ¥[0m
Fixed text: [1m电脑开机开不出来[0m

Mojibake Symbol RegEx:
 [ÂÃĂ][-€ƒ‚„†‡ˆ‰‹Œ“•˜œŸ¡¢£¤¥¦§¨ª«¬¯°±²³µ¶·¸¹º¼½¾¿ˇ˘˝]|[ÂÃĂ][›»‘”´©™]\w|[¬√][ÄÅÇÉÑÖÜáàâäãåçéèêëíìîïñúùûü†¢£§¶ß®©™≠ÆØ¥ªæø≤≥]|\w√[±∂]\w|[ðđ][Ÿ]|â€|вЂ[љћ¦°№™ќ“”]


In [15]:
# Sanitize the dataset from Mojibakes
ticket['Short description'] = ticket['Short description'].apply(fix_text)
ticket['Description'] = ticket['Description'].apply(fix_text)

# Visualize that row# 8471
ticket.iloc[8471,:]

Short description             电脑开机开不出来
Description           to 小贺,早上电脑开机开不出来
Caller               xqyjztnm onfusvlz
Assignment group                GRP_30
Name: 8471, dtype: object

In [17]:
# Serialize the mojibake treated dataset
ticket.to_csv('Dataset/mojibake_treated.csv', index=False, encoding='utf_8_sig')
with open('Dataset/mojibake_treated.pkl', 'wb') as handle:
    pickle.dump(ticket, handle, protocol=pickle.HIGHEST_PROTOCOL)

**Comments:**
- `badness.sequence_weirdness()` determines how often a text has unexpected characters or sequences of characters. This metric is used to disambiguate when text should be re-decoded or left as is.
- We're successfully able to get the garbled characters back into their original form using **`ftfy.fix_text()`**; however, it is observed that row# 8471 is not English but Mandarin.
- So the data in our hand is multilingual, and it is not really feasible to derive embeddings for a mix of multiple languages. We're going to translate the entire dataset into a single language, English.

### Language Translation (Goslate: Free Google Translate API)
Goslate is an open source Python library that implements the Google Translate API. It uses the [Google Translate Ajax API](https://translate.google.com/) to make calls to methods such as detect and translate. It is chosen over another library, Googletrans, because Goslate is developed to bypass the ticketing mechanism that prevents simple crawler programs from accessing the Ajax API. Hence Goslate, with multiple service URLs, is able to translate the entire dataset in very few iterations without the user's IP address getting blocked.

Installation:<br/>
using pypi: **`!pip install goslate`**<br/>
using conda: **`conda install -c conda-forge goslate`**

Service URLs used:
**``translate.google.com``**, **``translate.google.com.au``**, **``translate.google.com.ar``**, **``translate.google.co.kr``**, **``translate.google.co.in``**, **``translate.google.co.jp``**, **``translate.google.at``**, **``translate.google.de``**, **``translate.google.ru``**, **``translate.google.ch``**, **``translate.google.fr``**, **``translate.google.es``**, **``translate.google.ae``**

In [17]:
# Define and construct the service urls
svc_domains = ['.com','.com.au','.com.ar','.co.kr','.co.in','.co.jp','.at','.de','.ru','.ch','.fr','.es','.ae']
svc_urls = ['http://translate.google' + domain for domain in svc_domains]

# # Take an example of row# 8471 Short Desc and fix it
gs = Goslate(service_urls=svc_urls)
trans_8471 = gs.translate(ticket['Short description'][8471], target_language='en', source_language='auto')
print('Original text: \033[1m%s\033[0m\nFixed text: \033[1m%s\033[0m' % (ticket['Short description'][8471], trans_8471))

Original text: [1m电脑开机开不出来[0m
Fixed text: [1mBoot the computer does not really come out[0m


In [19]:
# List of column data to consider for translation
trans_cols = ['Short description','Description']

# Add a new column to store the detected language.
# - object dtype, so per-row string assignment does not have to upcast a float column
# - guarded, so re-running this cell does not insert a second 'Language' column
if 'Language' not in ticket.columns:
    ticket.insert(loc=2, column='Language',
                  value=pd.Series(np.nan, index=ticket.index, dtype=object))

# NOTE: label-based access with range() assumes the default RangeIndex (0..n-1)
for idx in range(ticket.shape[0]):
    # Instantiate Goslate class in each iteration
    gs = Goslate(service_urls=svc_urls)
    row_texts = ticket.loc[idx, trans_cols].tolist()
    # Record the language detected for THIS row, based on the original
    # (pre-translation) text. Assigning once after the loop would stamp the
    # language of the last row onto every record.
    ticket.loc[idx, 'Language'] = gs.detect(' '.join(row_texts))
    row_iter = gs.translate(row_texts,
                            target_language='en',
                            source_language='auto')
    ticket.loc[idx, trans_cols] = list(row_iter)

ticket.head()

Unnamed: 0,Short description,Description,Language,Caller,Assignment group
0,login issue,-verified user details.(employee# & manager na...,English,spxjnwir pjlcoqds,GRP_0
1,outlook,received from: hmjdrvpb.komuaywn@gmail.com\n\n...,English,hmjdrvpb komuaywn,GRP_0
2,cant log in to vpn,received from: eylqgodm.ybqkwiam@gmail.com\n\n...,English,eylqgodm ybqkwiam,GRP_0
3,unable to access hr_tool page,unable to access hr_tool page,English,xbkucsvz gcpydteq,GRP_0
4,Error skype,Error skype,English,owlgqjme qhcozdfx,GRP_0


In [20]:
# Serialize the translated dataset
ticket.to_csv('Dataset/translated_ticket.csv', index=False, encoding='utf_8_sig')
with open('Dataset/translated_ticket.pkl','wb') as f:
    pickle.dump(ticket, f, pickle.HIGHEST_PROTOCOL)

In [4]:
# Reload a previously translated dataset from CSV (fallback in case the IP gets blocked).
# NOTE(review): this reads 'Dataset/translated_df.csv', while the serialization cell of this
# notebook writes 'Dataset/translated_ticket.csv' -- confirm which file is intended.
ticket = (
    pd.read_csv('Dataset/translated_df.csv', index_col=[0])
    .fillna(str())  # missing cells become empty strings
)
# Map language codes to readable names: "zh-CN" is special-cased as "Chinese",
# every other value is looked up as an alpha_2 code through pycountry.
# NOTE(review): a code unknown to pycountry would make `.name` fail -- verify the codes in the file.
ticket["Language"] = ticket["Language"].apply(
    lambda code: "Chinese" if code == "zh-CN" else languages.get(alpha_2=code).name
)
ticket.head()

Unnamed: 0,Short description,Description,Language,Caller,Assignment group
0,login issue,-verified user details.(employee# & manager na...,English,spxjnwir pjlcoqds,0
1,outlook,received from: hmjdrvpb.komuaywn@gmail.com\n\n...,English,hmjdrvpb komuaywn,0
2,cant log in to vpn,received from: eylqgodm.ybqkwiam@gmail.com\n\n...,English,eylqgodm ybqkwiam,0
3,unable to access hr_tool page,unable to access hr_tool page,English,xbkucsvz gcpydteq,0
4,skype error,skype error,English,owlgqjme qhcozdfx,0


**Comments**:
- Unless a paid service is used, Google blocks repetitive hits to its Ajax API, whether via Googletrans or Goslate, after a certain number of iterations by blocking the IP address.
- Using this list of various domains of the translation API as service URLs helped distribute the traffic among them, in turn allowing a longer buffer before the IP gets blocked.

### Text Preprocessing
Text preprocessing is the process of transferring text from human language to machine-readable format for further processing. After a text is obtained, we start with text normalization. Text normalization includes:
- converting all letters to lower or upper case
- converting numbers into words or removing numbers
- removing punctuations, accent marks and other diacritics
- removing white spaces
- removing stop words, sparse terms, and particular words
- text canonicalization

In [5]:
# Define regex patterns
EMAIL_PATTERN = r"([\w.+-]+@[a-z\d-]+\.[a-z\d.-]+)"
PUNCT_PATTERN = r"[,|@|\|?|\\|$&*|%|\r|\n|.:|\s+|/|//|\\|/|\||-|<|>|;|(|)|=|+|#|-|\"|[-\]]|{|}]"
# Negative Lookbehind for EmailId replacement- Don't match any number which follows the text "RetainedEmailId"
NUMER_PATTERN = r"(?<!RetainedEmailId)(\d+(?:\.\d+)?)"
# Translation table that deletes every character of string.punctuation
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)

# Define a function to treat the texts
def cleanseText(text):
    """Normalize a ticket text while keeping e-mail addresses intact.

    Steps:
      1. Convert the input to ``str`` and lower-case it.
      2. Split the text on ``EMAIL_PATTERN`` so that e-mail addresses are
         isolated from the surrounding text.
      3. In the non-e-mail pieces only: remove numbers (``NUMER_PATTERN``)
         and delete punctuation (characters are dropped, not replaced by a
         space, so e.g. ``hr_tool`` becomes ``hrtool``).
      4. Collapse every run of whitespace into a single blank and strip.

    E-mail addresses are never passed through the number/punctuation
    filters, so no placeholder substitution is needed. This avoids
    corrupting addresses when a text holds ten or more of them, or when one
    address is a substring of another.

    Parameters
    ----------
    text : Any
        Value to clean; it is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # Make the text unicase (lower)
    text = str(text).lower()
    # EMAIL_PATTERN has exactly one capturing group, so re.split returns
    # [text, email, text, email, ..., text]: even indices are ordinary text,
    # odd indices are the e-mail addresses to preserve verbatim.
    pieces = re.split(EMAIL_PATTERN, text)
    for pos in range(0, len(pieces), 2):
        # Remove all numbers
        without_numbers = re.sub(NUMER_PATTERN, '', pieces[pos])
        # Delete all punctuation characters
        pieces[pos] = without_numbers.translate(_PUNCT_TABLE)
    text = ''.join(pieces)
    # Collapse whitespace runs (spaces, tabs, line breaks) into one blank
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def extract_email(text):
    """Map placeholder keys to the distinct e-mail addresses found in ``text``.

    Addresses are numbered in order of first appearance, so the mapping is
    deterministic across runs.

    Parameters
    ----------
    text : str
        Text to scan with ``EMAIL_PATTERN``.

    Returns
    -------
    dict
        ``{'RetainedEmailId0': <first address>, 'RetainedEmailId1': ...}``;
        empty when no address is found.
    """
    # dict.fromkeys de-duplicates while preserving first-seen order
    unique_emailid = dict.fromkeys(re.findall(EMAIL_PATTERN, text))
    return {f'RetainedEmailId{idx}': email
            for idx, email in enumerate(unique_emailid)}

In [6]:
# Take an example of row# 32 Description and fix it
print('\033[1mOriginal text:\033[0m')
print(ticket['Description'][32])
print('_'*100)
print('\033[1mCleaned text:\033[0m')
print(cleanseText(ticket['Description'][32]))

[1mOriginal text:[0m
received from: kxsceyzo.naokumlb@gmail.com

gentles,

i have two devices that are trying to share an ip address. they are trying to share 96.26.27.9619. one is a printer with the hostname of prtjc0074, and the other is a new display for erp. the display is using dhcp to get its address assigned and the printer is hard coded.

my guess is that the address 96.26.27.9619 did not get set to a static address in dhcp. i need this corrected so the display will pick up another address.
____________________________________________________________________________________________________
[1mCleaned text:[0m
received from kxsceyzo.naokumlb@gmail.com gentles i have two devices that are trying to share an ip address they are trying to share one is a printer with the hostname of prtjc and the other is a new display for erp the display is using dhcp to get its address assigned and the printer is hard coded my guess is that the address did not get set to a static address in dhc

In [7]:
# Apply the cleaning function to entire dataset
ticket['Description'] = ticket['Description'].apply(cleanseText)
ticket['Short description'] = ticket['Short description'].apply(cleanseText)

# Verify the data
ticket.tail()

Unnamed: 0,Short description,Description,Language,Caller,Assignment group
8495,emails not coming in from zz mail,received from avglmrts.vhqmtiua@gmail.com good...,German,avglmrts vhqmtiua,29
8496,telephonysoftware issue,telephonysoftware issue,German,rbozivdq gmlhrtvp,0
8497,vip windows password reset for tifpdchb pedxruyf,vip windows password reset for tifpdchb pedxruyf,German,oybwdsgx oxyhwrfz,0
8498,machine não está funcionando,i am unable to access the machine utilities to...,German,ufawcgob aowhxjky,62
8499,various prgramdntyme can not be opened on mult...,various prgramdntyme can not be opened on mult...,German,kqvbrspl jyzoklfx,49


**Comments:**
- Entire dataset is converted into lower case
- Users' email addresses add little value to our analysis, since the user id is already given in the Caller column. Note, however, that the current implementation of `cleanseText` does not remove them (the removal step is commented out): email addresses are shielded from the number and punctuation filters and are retained verbatim in the cleaned text.
- All numerals are removed because they would otherwise dominate the dataset if we converted them into their word representation.
- All punctuation marks are removed, as they were a hindrance in lemmatization.
- All occurrences of more than one blank space, horizontal tab, new line break etc. have been replaced with a single blank space.

Now with a nice and cleaner data in our hand let's proceed towards Lemmatization.

### Stemming and Lemmatization
Stemming and Lemmatization are Text Normalization (or sometimes called Word Normalization) techniques in the field of Natural Language Processing that are used to prepare text, words, and documents for further processing.<br/>
In grammar, inflection is known as the modification of a word to express different grammatical categories such as tense, case, voice, aspect, person, number, gender, and mood. An inflection expresses one or more grammatical categories with a prefix, suffix or infix, or another internal modification such as a vowel change.

**Stemming**<br/>
Stemming is the process of reducing inflection in words to their root forms such as mapping a group of words to the same stem even if the stem itself is not a valid word in the Language.

**Lemmatization**<br/>
Lemmatization, unlike Stemming, reduces the inflected words properly ensuring that the root word belongs to the language. In Lemmatization root word is called Lemma. A lemma (plural lemmas or lemmata) is the canonical form, dictionary form, or citation form of a set of words.

### spaCy
The [spaCy](https://spacy.io/) library is one of the most popular NLP libraries along with NLTK. Rather than offering many alternative algorithms for each task, it ships a single, carefully chosen implementation per task. Once it is downloaded and installed, the next step is to download the language model, which is used to perform a variety of NLP tasks.

Installation:<br/>
using pypi: **`!pip install spacy`**<br/>
using conda: **`conda install -c conda-forge spacy`**

Language Model Download:<br/>
**``$ python -m spacy download en_core_web_md``**

In [8]:
# Load the medium English spaCy model with the parser and NER components disabled
nlp = spacy.load('en_core_web_md', disable=['parser', 'ner'])

# Define a function to lemmatize the descriptions
def lemmatizer(sentence):
    """Return ``sentence`` with every token replaced by its lemma.

    The sentence is processed with the module-level ``nlp`` pipeline.
    Tokens whose lemma is the literal marker ``'-PRON-'`` are dropped, and
    the remaining lemmas are joined with single blanks.

    Parameters
    ----------
    sentence : str
        Text to lemmatize.

    Returns
    -------
    str
        Space-separated lemmas.
    """
    kept_lemmas = []
    for token in nlp(sentence):
        lemma = token.lemma_
        if lemma == '-PRON-':
            # Skip the '-PRON-' marker lemma
            continue
        kept_lemmas.append(lemma)
    return " ".join(kept_lemmas)

In [9]:
# Take an example of row# 43 Description and lemmatize it
print('\033[1mOriginal text:\033[0m')
print(ticket['Description'][43])
print('_'*100)
print('\033[1mLemmatized text:\033[0m')
print(lemmatizer(ticket['Description'][43]))

[1mOriginal text:[0m
received from yisohglr.uvteflgb@gmail.com hi the printer printer is not working and needs a part replaced can you reroute the jobs in queue to printer printer wihuyjdo qpogfwkb has indicated that prqos needs a new part and it may not deliver for a few days so the inwarehousetools will need to print on printer for now this needs to be taken care of today since the inwarehousetools are printed and are picked up by an outside vendor at pm in usa on a daily basis please contact dkmcfreg anwmfvlgenkataramdntyana if you have questions about the jobs in queue for today
____________________________________________________________________________________________________
[1mLemmatized text:[0m
receive from yisohglr.uvteflgb@gmail.com hi the printer printer be not work and need a part replace can reroute the job in queue to printer printer wihuyjdo qpogfwkb have indicate that prqos need a new part and may not deliver for a few day so the inwarehousetool will need to print

In [10]:
# Apply the Lemmatization to entire dataset
ticket['Description'] = ticket['Description'].apply(lemmatizer)
ticket['Short description'] = ticket['Short description'].apply(lemmatizer)

# Verify the data
ticket.tail()

Unnamed: 0,Short description,Description,Language,Caller,Assignment group
8495,email not come in from zz mail,receive from avglmrts.vhqmtiua@gmail.com good ...,German,avglmrts vhqmtiua,29
8496,telephonysoftware issue,telephonysoftware issue,German,rbozivdq gmlhrtvp,0
8497,vip windows password reset for tifpdchb pedxruyf,vip windows password reset for tifpdchb pedxruyf,German,oybwdsgx oxyhwrfz,0
8498,machine não está funcionando,i be unable to access the machine utility to f...,German,ufawcgob aowhxjky,62
8499,various prgramdntyme can not be open on multip...,various prgramdntyme can not be open on multip...,German,kqvbrspl jyzoklfx,49


In [13]:
# Derive two features per description column: character length and word count.
# Each insert position refers to the frame as it stands after the previous
# insert, so the new columns land directly behind their source column.
# NOTE: insert() raises if the column already exists, so this cell is not re-runnable as is.
ticket.insert(1, 'sd_len',
              ticket['Short description'].astype(str).map(len))
ticket.insert(2, 'sd_word_count',
              ticket['Short description'].map(lambda text: len(str(text).split())))
ticket.insert(4, 'desc_len',
              ticket['Description'].astype(str).map(len))
ticket.insert(5, 'desc_word_count',
              ticket['Description'].map(lambda text: len(str(text).split())))
ticket.head()

Unnamed: 0,Short description,sd_len,sd_word_count,Description,desc_len,desc_word_count,Language,Caller,Assignment group
0,login issue,11,2,verified user detailsemployee manager name che...,177,31,English,spxjnwir pjlcoqds,0
1,outlook,7,1,receive from hmjdrvpb.komuaywn@gmail.com hello...,163,23,English,hmjdrvpb komuaywn,0
2,can not log in to vpn,21,6,receive from eylqgodm.ybqkwiam@gmail.com hi i ...,72,12,English,eylqgodm ybqkwiam,0
3,unable to access hrtool page,28,5,unable to access hrtool page,28,5,English,xbkucsvz gcpydteq,0
4,skype error,11,2,skype error,11,2,English,owlgqjme qhcozdfx,0


In [15]:
ticket.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sd_len,8500.0,39.335882,22.444755,0.0,23.0,33.0,50.0,147.0
sd_word_count,8500.0,6.452235,3.699941,0.0,4.0,6.0,8.0,26.0
desc_len,8500.0,156.313294,324.75203,0.0,37.0,76.0,173.0,7110.0
desc_word_count,8500.0,24.273294,51.277937,0.0,6.0,10.0,26.0,1091.0
Assignment group,8500.0,9.061647,12.97975,0.0,0.0,3.0,13.0,73.0


In [16]:
# Serialize the preprocessed dataset
ticket.to_csv('Dataset/preprocessed_ticket.csv', index=False, encoding='utf_8_sig')
with open('Dataset/preprocessed_ticket.pkl','wb') as f:
    pickle.dump(ticket, f, pickle.HIGHEST_PROTOCOL)