# Overview
Incident Management and Response is a key component of any IT Service Management Strategy. These are the typical steps involved in the Incident Management Process:
- Receipt of the issue 
- Create a ticket
- Review of the ticket by L1/L2 teams
- Attempt to resolve the ticket using Standard Operating Procedures by L1/L2
- If needed, transfer the ticket to the appropriate L3 team for further review and resolving.


# Current ‘Pain’ Points
Currently the organization sees these issues in the Incident Ticket Management Process:
The process is largely ‘manual’. L1/L2 teams need to spend time reviewing Standard Operating Procedures (SOPs) before assigning tickets to functional teams. At least 25-30% of incidents need to be checked against SOPs before ticket assignment. 

- Minimum 1 FTE effort needed only for incident assignment to L3 teams

- Human error - incidents are often assigned to the wrong L3 team, so additional effort is needed to re-review the ticket and reassign it to the correct team. This not only increases the manual effort BUT also leads to customer dissatisfaction: the customer who opened the ticket is left frustrated while the ticket sits in limbo, tossed between various teams before reaching the team that can actually resolve the issue.
 

# Objective of this Project
Create various Machine Learning Models that can help classify incidents and assign them to the right Functional Group. Our objective is to create NLP models that can predict with at least 85% accuracy.





In [None]:
!pip install wordcloud
#!pip install langdetect
#!pip install googletrans
#!pip install textblob
!pip install spacy
#!python -m spacy download en
#!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_md
#!python -m spacy download en_core_web_lg
!pip install -U spacy-lookups-data
#!pip install langid
!pip install google_trans_new
#!pip uninstall googletrans
!pip install autocorrect
!pip install ftfy
!pip install seaborn
!pip install nltk
!pip install bs4
!pip install xgboost
!pip install nbconvert[webpdf]

In [None]:
import itertools
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud, STOPWORDS 
#from langdetect import detect
from itertools import cycle
#import googletrans
#from googletrans import Translator
from google_trans_new import google_translator 
from multiprocessing.dummy import Pool as ThreadPool
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score, confusion_matrix, classification_report
from sklearn import preprocessing
#from textblob import TextBlob
#from textblob.translate import NotTranslated
import random
import operator
import math
import tqdm
import time
import spacy
import json
#import langid
from bs4 import BeautifulSoup
from string import digits

from autocorrect import Speller
from ftfy import fix_encoding, fix_text, fix_text_segment, badness


### This section below contains  Useful Functions 
- As we find new functions, we will create them here.

In [None]:
# Lookup table: contraction (or common apostrophe-less misspelling) -> expansion.
# Keys are case-sensitive, which is why both "I'd" and "i'd" are listed.
contractions_dict = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"didnt": "did not",
"doesnt": "does not",
"thats": "that is",
"wasnt": "was not",
"weren": "were not",
"werent": "were not",
"theyre": "they are",
"dont": "do not",
"cant": "cannot",
"arent": "are not",
"whats": "what is",
"you've": "you have"
}


def _build_contractions_re(mapping):
    """Compile one regex that matches any key of *mapping* as a whole token."""
    # Longest keys first so "can't've" wins over its prefix "can't".
    longest_first = sorted(mapping, key=len, reverse=True)
    alternation = "|".join(re.escape(key) for key in longest_first)
    # Look-arounds instead of \b because some keys start with an apostrophe
    # ("'cause"); they also stop "whats" from matching inside "whatsapp".
    return re.compile(r"(?<!\w)(?:" + alternation + r")(?!\w)")


# Pattern for the default table, compiled once at definition time.
_DEFAULT_CONTRACTIONS = contractions_dict
contractions_re = _build_contractions_re(contractions_dict)
_DEFAULT_CONTRACTIONS_RE = contractions_re


# Function for expanding contractions
def expand_contractions(text,contractions_dict=contractions_dict):
    """Replace every contraction found in *text* with its expanded form.

    Args:
        text: The string to process.
        contractions_dict: Mapping of contraction -> expansion. Defaults to
            the module-level table; a custom mapping gets its own pattern.

    Returns:
        *text* with each whole-token, case-sensitive match of a key replaced
        by the corresponding value.
    """
    if not contractions_dict:
        # An empty alternation would match the empty string everywhere.
        return text

    if contractions_dict is _DEFAULT_CONTRACTIONS:
        pattern = _DEFAULT_CONTRACTIONS_RE
    else:
        pattern = _build_contractions_re(contractions_dict)

    def replace(match):
        return contractions_dict[match.group(0)]

    return pattern.sub(replace, text)

In [None]:
def _top_ngram_counts(corpus, n, ngram_range=(1, 1), max_features=None):
    """Return the *n* most frequent n-grams of *corpus* as (term, count) pairs.

    Shared implementation behind the three public helpers below. Terms are
    ordered by descending total count; ties keep the vectorizer's vocabulary
    order because the sort is stable.
    """
    vectorizer = CountVectorizer(ngram_range=ngram_range,
                                 max_features=max_features)
    bag_of_words = vectorizer.fit_transform(corpus)
    # Column sums give the corpus-wide count of every vocabulary term.
    totals = bag_of_words.sum(axis=0)
    words_freq = [(term, totals[0, col])
                  for term, col in vectorizer.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]


#Most frequently occurring words
def get_top_n_words(corpus, n=None):
    """Return the top *n* single words of *corpus* (all of them if n is None)."""
    return _top_ngram_counts(corpus, n)


#Most frequently occurring Bi-grams
def get_top_n2_words(corpus, n=None):
    """Return the top *n* bi-grams, drawn from a vocabulary capped at 2000."""
    return _top_ngram_counts(corpus, n, ngram_range=(2, 2), max_features=2000)


#Most frequently occurring Tri-grams
def get_top_n3_words(corpus, n=None):
    """Return the top *n* tri-grams, drawn from a vocabulary capped at 2000."""
    return _top_ngram_counts(corpus, n, ngram_range=(3, 3), max_features=2000)

#Function for sorting tf_idf in descending order
from scipy.sparse import coo_matrix
def sort_coo(coo_matrix):
    """Return (column, value) pairs of a COO matrix, largest value first.

    Pairs with equal values are ordered by descending column index.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first *topn* (index, score) pairs to {feature name: score}.

    feature_names is indexed by the first element of each pair; scores are
    rounded to three decimals. Only the leading *topn* entries of
    sorted_items are used, in the order given.
    """
    return {
        feature_names[col]: round(weight, 3)
        for col, weight in sorted_items[:topn]
    }

def fn_word_cloud(column):
    """Build a WordCloud from every value in *column*.

    Each value is converted to str, split on whitespace and lower-cased; the
    resulting tokens are joined with single spaces and handed to WordCloud
    together with the wordcloud package's built-in STOPWORDS.

    Returns:
        The generated WordCloud object (800x800, white background).
    """
    # Named so it does not shadow the nltk `stopwords` import of this file.
    cloud_stopwords = set(STOPWORDS)

    tokens = [token.lower() for val in column for token in str(val).split()]

    # One join instead of repeated string concatenation (which is quadratic).
    # The empty sentinels keep the leading and trailing space of the text.
    comment_words = ' '.join([''] + tokens + [''])

    wordcloud = WordCloud(width = 800, height = 800,
                    background_color ='white',
                    stopwords = cloud_stopwords,
                    min_font_size = 10).generate(comment_words)

    return wordcloud

def removeString(data, regex):
    """Lower-case a string Series and blank out every match of *regex*.

    Args:
        data: pandas Series of strings.
        regex: Regular-expression pattern, matched case-insensitively.

    Returns:
        A new Series, lower-cased, with each match replaced by one space.
    """
    # Compile with IGNORECASE rather than lower-casing the pattern text:
    # lower-casing flips escapes such as \S, \W and \D to their opposites.
    pattern = re.compile(regex, flags=re.IGNORECASE)
    # regex=True is explicit because the pandas default differs by version.
    return data.str.lower().str.replace(pattern, ' ', regex=True)

def preprocess(dataset, columnsToPreprocess, regexList):
    """Run clean_step2 over each listed column of *dataset*, in place.

    Args:
        dataset: DataFrame whose columns are overwritten with cleaned text.
        columnsToPreprocess: Iterable of column names to clean.
        regexList: Accepted for interface compatibility; not used here.

    Returns:
        The same *dataset* object, after modification.
    """
    for name in columnsToPreprocess:
        cleaned = dataset[name].apply(clean_step2)
        dataset[name] = cleaned
    return dataset

def clean_step2(text):
    """Normalise one ticket text to lower-case ASCII words.

    Steps, in order: strip HTML tags, drop non-ASCII characters, lower-case,
    remove e-mail addresses and hyperlinks, remove punctuation and digits,
    turn underscores into spaces, and collapse whitespace.

    E-mail and hyperlink removal run BEFORE punctuation stripping: once '@',
    ':' and '/' are gone those two patterns can no longer match.

    Args:
        text: Raw text (may contain HTML markup).

    Returns:
        The cleaned string, with words separated by single spaces.
    """
    # 1) remove html tags
    text = BeautifulSoup(text, "html.parser").get_text(separator="")

    # 2) remove non-ASCII characters
    text = text.encode("ascii", "ignore").decode()

    # 3) lower case
    text = text.lower()

    # 4) remove emails (needs the '@' still present)
    text = re.sub(r'\S*@\S*\s?', '', text)

    # 5) remove hyperlinks; \S+ stops at the end of the URL instead of
    #    greedily consuming everything up to the last '/' in the text
    text = re.sub(r'https?://\S+', '', text)

    # 6) remove punctuation (this also removes backslashes)
    text = re.sub(r'[^\w\s]', '', text)

    # 7) remove digits
    text = text.translate(str.maketrans('', '', digits))

    # 8) underscores count as \w and survive step 6; treat them as separators
    text = text.replace("_", " ")

    # 9) collapse whitespace last so the gaps left by earlier steps are closed
    return " ".join(text.split())

def getRegexList():
    '''
    Return the ordered list of regex patterns used to flush unnecessary text
    (mail headers, image cids, URLs, e-mail addresses, numbers, stray
    characters) out of the given data set.

    Patterns containing regex escapes are written as raw strings, or with
    doubled backslashes, so that Python does not see invalid string escape
    sequences; the resulting pattern text is unchanged.
    '''
    return [
        'From:(.*)\r\n',  # from line
        'Sent:(.*)\r\n',  # sent to line
        'received from:(.*)\r\n',  # received data line
        'received',  # received data line
        'To:(.*)\r\n',  # to line
        'CC:(.*)\r\n',  # cc line
        '(.*)infection',  # footer
        r'\[cid:(.*)]',  # images cid
        'https?:[^\\]\n\r]+',  # https & http
        'Subject:',
        r'[\w\d\-\_\.]+@[\w\d\-\_\.]+',  # emails are not required
        r'[0-9][\-0–90-9 ]+',  # phones are not required
        '[0-9]',  # numbers not needed
        '[^a-zA-Z 0-9]+',  # anything that is not a letter (was the A-z typo)
        '[\r\n]',  # \r\n
        ' [a-zA-Z] ',  # single letters makes no sense
        ' [a-zA-Z][a-zA-Z] ',  # two-letter words makes no sense
        "  ",  # double spaces

        r'^[_a-z0-9-]+(\.[_a-z0-9-]+)*@[a-z0-9-]+(\.[a-z0-9-]+)*(\.[a-z]{2,4})$',
        r'[\w\d\-\_\.]+ @ [\w\d\-\_\.]+',
        'Subject:',
        '[^a-zA-Z]',
    ]


def lemmatize(stringlist):
    """Remove stop words from each document and lemmatize what remains.

    Every token is lemmatized three times in sequence: as a noun, then as a
    verb, then as an adjective.

    Args:
        stringlist: Iterable of document strings.

    Returns:
        A list with one space-joined string of lemmas per input document.

    NOTE(review): relies on a module-level `stop_words` collection that is
    defined elsewhere in the notebook - confirm it exists before calling.
    """
    # Built once for all documents instead of once per document.
    wordnet_lemmatizer = WordNetLemmatizer()
    # A set gives O(1) membership tests whatever container stop_words is.
    stop_set = set(stop_words)

    processed_all_documents = []
    for desc in stringlist:
        # Removing Stopwords
        kept_tokens = [w for w in word_tokenize(desc) if w not in stop_set]

        # Lemmatization: noun -> verb -> adjective
        lemma_word = []
        for w in kept_tokens:
            as_noun = wordnet_lemmatizer.lemmatize(w, pos="n")
            as_verb = wordnet_lemmatizer.lemmatize(as_noun, pos="v")
            lemma_word.append(wordnet_lemmatizer.lemmatize(as_verb, pos="a"))

        processed_all_documents.append(' '.join(lemma_word))
    return processed_all_documents


# Write a function to apply to the dataset to detect garbage data
def detect_garbage(text):
    """Return True when *text* looks clean, False when it looks like mojibake.

    Text is flagged (False) only if ftfy reports a non-zero sequence
    weirdness AND the text can be encoded as 'sloppy-windows-1252'.
    """
    if badness.sequence_weirdness(text):
        try:
            text.encode('sloppy-windows-1252')
        except UnicodeEncodeError:
            # Cannot be expressed in CP-1252, so probably genuine text
            return True
        # Weird and CP-1252 encodable: very likely mojibake
        return False
    # Nothing weird was detected
    return True
    


# Milestone 1: Pre-Processing, Data Visualisation and EDA

1. Exploring the given Data files
2. Understanding the structure of data
3. Missing points in data
4. Finding inconsistencies in the data
5. Visualizing different patterns
6. Visualizing different text features
7. Dealing with missing values
8. Text preprocessing
9. Creating word vocabulary from the corpus of report text data
10. Creating tokens as required

This notebook contains the detailed steps on our path to accomplishing the goals set for Milestone 1

# EDA

##### We manually create the csv file from the excel and use pandas to read the csv.
- For some reason, when we use the read_excel function the number of NaN values increases to 8 

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
mydata = pd.read_csv('drive/My Drive/datasets/input_data.csv')
#mydata = pd.read_excel("datasets/input_data.xlsx")

In [None]:
mydata.head(20)

In [None]:
mydata.shape

###### Observation:
- There are 8500 records in the dataset
- Each Dataset contains 4 columns
- The column 'Caller' seems to contain only junk. We will drop it. 

In [None]:
mydata = mydata.drop('Caller',axis=1)

In [None]:
mydata.info()

In [None]:
# Missing data check #1:
mydata.describe(include='all') 

### We also notice some records with junks characters in Short Description and Description

In [None]:
# Cast the text columns to str. Missing values are replaced by '' first,
# because astype(str) alone would turn NaN into the literal text "nan".
for text_column in ['Short description', 'Description']:
    mydata[text_column] = mydata[text_column].fillna('').astype(str)
mydata['Assignment group'] = mydata['Assignment group'].astype(str)

In [None]:
# Check the dataset for garbage data
mydata[~mydata.iloc[:,:-1].applymap(detect_garbage).all(1)]
mydata['Description'].apply(detect_garbage)

In [None]:
print(mydata.iloc[7126]['Short description'])
print(mydata.iloc[7969]['Description'])

In [None]:
# Take an example of row# 7126 Short Desc and fix it
print('Junk text: \033[1m%s\033[0m\nFixed text: \033[1m%s\033[0m' % (mydata['Short description'][7126], 
                                                                        fix_text(mydata['Short description'][7126])))

# List all mojibakes defined in ftfy library
print('\nMojibake Symbol RegEx:\n', badness.MOJIBAKE_SYMBOL_RE.pattern)

In [None]:
# Sanitize the dataset from Mojibakes
mydata['Short description'] = mydata['Short description'].apply(fix_text_segment)
mydata['Description'] = mydata['Description'].apply(fix_text)

# Visualize that row# 7126
mydata.iloc[7126,:]

#  Observation
- There seem to be a few invalid values in Short description & Description.
- On further checking we find that they can be converted to valid non english alphabets using ftfy library

In [None]:
mydata.iloc[1081,:]

In [None]:
df = mydata.query('Description == ""')
df

In [None]:
## Missing data check #2 : 
## Are there any null values
mydata.isna().apply(pd.value_counts)
## Short Description contains 2 nulls and Description contains 1 null 

In [None]:
##Reconfirmation
null_data = mydata[mydata.isnull().any(axis=1)]
null_data

In [None]:
#This method is useful because it shows count, mean, and standard deviation along with the 5 point summary
mydata.describe().T

#### Number of classes in the Assignment Group 

In [None]:
len(mydata['Assignment group'].unique())

#### Assignment Group Values

In [None]:
mydata['Assignment group'].unique()

###### Assignment Group Distribution

In [None]:
# Tickets per assignment group. The column names are pinned explicitly
# ('index' = group label, 'Assignment group' = ticket count) because the
# names produced by a bare value_counts().reset_index() depend on the
# pandas version.
df_assignment_group_dist = (
    mydata['Assignment group']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='Assignment group')
)
group_ticket_counts = df_assignment_group_dist['Assignment group']
df_assignment_group_dist['percentage'] = (group_ticket_counts / group_ticket_counts.sum()) * 100
df_assignment_group_dist.head(20)

In [None]:
# Plot to visualize the percentage data distribution across different groups
sns.set(style="whitegrid")
plt.figure(figsize=(20,5))
order = mydata["Assignment group"].value_counts().index

# One colour spec outlines every bar; a string of repeated "k" characters is
# not a valid colour on current matplotlib releases.
ax = sns.countplot(x="Assignment group", data=mydata, order=order, linewidth=2,
                  edgecolor="k", palette='Set1')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)

# Label each bar with its share of all tickets.
total_tickets = len(mydata.index)
for p in ax.patches:
  ax.annotate(f"{p.get_height() / total_tickets * 100:.2f}%",
              (p.get_x() + p.get_width() / 2., p.get_height()), ha = 'center', va = 'bottom',
              rotation=90, xytext = (0, 10), textcoords = 'offset points')

# Observation
- Group 0 has the most entries - this is expected because we guess Grp_0 is L1 - so gets the most tickets and also resolves them directly based on SOPs (Standard Operating Procedures)

#### Top 20 Assignment groups with highest number of tickets

In [None]:
# 20 largest groups; column names pinned ('index' = group label,
# 'Assignment group' = ticket count) so they do not vary with pandas version.
df_top_20 = (
    mydata['Assignment group']
    .value_counts()
    .nlargest(20)
    .rename_axis('index')
    .reset_index(name='Assignment group')
)
df_top_20

In [None]:
colors = ['red', 'blue', 'green']
i = -1
def getCycledColor():
    """Return the next colour from `colors`, wrapping around at the end.

    Advances the module-level cursor `i` on every call. The wrap-around call
    now returns the first colour again instead of falling through and
    returning None.
    """
    global i
    i = (i + 1) % len(colors)
    return colors[i]
plt.figure(figsize=(12,6))
bars = plt.bar(df_top_20['index'],df_top_20['Assignment group'], facecolor=getCycledColor())
plt.title('Top 20 Assignment groups with highest number of Tickets')
plt.xlabel('Assignment Group')
plt.xticks(rotation=90)
plt.ylabel('Number of Tickets')

for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x(), yval + .005, yval)
plt.tight_layout()
plt.show()

#### Bottom 20 Assignment groups with least number of tickets

In [None]:
# 20 smallest groups; column names pinned ('index' = group label,
# 'Assignment group' = ticket count) so they do not vary with pandas version.
df_bottom_20 = (
    mydata['Assignment group']
    .value_counts()
    .nsmallest(20)
    .rename_axis('index')
    .reset_index(name='Assignment group')
)
df_bottom_20

In [None]:
plt.figure(figsize=(12,6))
bars = plt.bar(df_bottom_20['index'],df_bottom_20['Assignment group'], color='green')
plt.title('Bottom 20 Assignment groups with small number of Tickets')
plt.xlabel('Assignment Group')
plt.xticks(rotation=90)
plt.ylabel('Number of Tickets')
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x(), yval + .005, yval)
plt.tight_layout()
plt.show()

#### Distribution of tickets counts in various bins

In [None]:
# Number of assignment groups falling into each ticket-volume bucket.
_group_ticket_counts = df_assignment_group_dist['Assignment group']

def _n_groups_between(low=None, high=None):
    """Count groups whose ticket count is within [low, high]; None = unbounded."""
    in_range = pd.Series(True, index=_group_ticket_counts.index)
    if low is not None:
        in_range = in_range & (_group_ticket_counts >= low)
    if high is not None:
        in_range = in_range & (_group_ticket_counts <= high)
    return int(in_range.sum())

one_ticket = {'Description':'1 ticket','Ticket Count':_n_groups_between(high=1)}
_2_5_ticket = {'Description':'2-5 ticket','Ticket Count':_n_groups_between(2, 5)}
_10_ticket = {'Description':' 6-10 ticket','Ticket Count':_n_groups_between(6, 10)}
_10_20_ticket = {'Description':' 11-20 ticket','Ticket Count':_n_groups_between(11, 20)}
_20_50_ticket = {'Description':' 21-50 ticket','Ticket Count':_n_groups_between(21, 50)}
_51_100_ticket = {'Description':' 51-100 ticket','Ticket Count':_n_groups_between(51, 100)}
_100_ticket = {'Description':' >100 ticket','Ticket Count':_n_groups_between(low=101)}

# Build the frame in one go; DataFrame.append was removed in pandas 2.0.
df_bins = pd.DataFrame([one_ticket,_2_5_ticket,_10_ticket,
                        _10_20_ticket,_20_50_ticket,_51_100_ticket,_100_ticket],
                       columns=['Description','Ticket Count'])

df_bins

In [None]:
plt.figure(figsize=(10, 8))
plt.pie(df_bins['Ticket Count'],labels=df_bins['Description'],autopct='%1.1f%%', startangle=15, shadow = True);
plt.title('Assignment Groups Distribution')
plt.axis('equal');

# Fetch wordcount for each Ticket in its raw state 
- (so far we have handled only junk characters and replaced any Nans with empty strings)
- We will merge the Short and Description fields just to perform EDA on the tickets. We will redo this (merge step) later after translation

In [None]:
#merging  the 2 preprocessed columns to a single column without duplicate words
mydata['Raw Combined description'] = mydata['Short description'] .map(str) + ' ' +  mydata['Description'].map(str)
# dict.fromkeys drops repeated words while keeping first-seen order;
# pd.unique on a plain Python list is deprecated.
mydata['Raw Combined description'] = mydata['Raw Combined description'].apply(lambda x: ' '.join(dict.fromkeys(x.split())))

# split() without an argument counts an empty text as 0 words;
# split(" ") would report 1 for it.
mydata['raw_word_count'] = mydata['Raw Combined description'].apply(lambda x: len(str(x).split()))
mydata[['Raw Combined description','raw_word_count']].head()

In [None]:
df = mydata.query('Description == ""')
df

In [None]:
#Identify common words
freq = pd.Series(' '.join(mydata['Raw Combined description']).split()).value_counts()[:20]
freq

# We now will list the most common words used - in this round will NOT remove any stop words - we will do that later and repeat this step 

In [None]:
top_df = pd.DataFrame(freq)
top_df.reset_index(level=0, inplace=True) 
top_df.columns=["Word", "Freq"]
#Barplot of most freq words
import seaborn as sns
sns.set(rc={'figure.figsize':(13,8)})
g = sns.barplot(x="Word", y="Freq", data=top_df)
g.set_xticklabels(g.get_xticklabels(), rotation=90)

In [None]:
#Identify uncommon words
freq1 =  pd.Series(' '.join(mydata['Raw Combined description']).split()).value_counts()[-20:]
freq1

In [None]:
top_df = pd.DataFrame(freq1)
top_df.reset_index(level=0, inplace=True) 
top_df.columns=["Word", "Freq"]
#Barplot of most freq words
import seaborn as sns
sns.set(rc={'figure.figsize':(13,8)})
g = sns.barplot(x="Word", y="Freq", data=top_df)
g.set_xticklabels(g.get_xticklabels(), rotation=90)

In [None]:
#data = mydata
#data["Assignment group"] = data["Assignment group"].apply(lambda x: x.replace("GRP_", ""))
#data["Assignment group"] = data["Assignment group"].astype(int)
#data

##### Now let's cleanup the null values in Short Description and Description fields

In [None]:
mydata[mydata['Description'].isnull()]

In [None]:
mydata[mydata['Short description'].isnull()]

In [None]:
#Replace NaN values in Short Description and Description columns
#mydata['Short description'] = mydata['Short description'].replace(np.nan, '', regex=True)
#mydata['Description'] = mydata['Description'].replace(np.nan, '', regex=True)

In [None]:
##Reconfirmation
null_data = mydata[mydata.isnull().any(axis=1)]
null_data

In [None]:
mydata.info()

In [None]:
mydata.iloc[1178,:]

In [None]:
mydata.isna().apply(pd.value_counts)

In [None]:
mydata.head(20)

##### Now let's merge the Short Description and Description to a new field - Combined Description . This will help us create a rich corpus
- Please note we are doing this now to help us with the word cloud step. 
- We will repeat this step later again if we find non english characters that we need to translate. This step will be repeated after the translation is done

In [None]:
mydata2 = mydata.copy()

In [None]:
mydata.iloc[1178,:]

In [None]:
#merging  the 2 preprocessed columns to a single column without duplicate words
mydata2['Combined description'] = mydata2['Short description'] .map(str) + ' ' +  mydata2['Description'].map(str)

# dict.fromkeys drops repeated words while keeping first-seen order;
# pd.unique on a plain Python list is deprecated.
mydata2['Combined description'] = mydata2['Combined description'].apply(lambda x: ' '.join(dict.fromkeys(x.split())))

#testing on single entry
print(mydata2.iloc[279]['Short description'])
print(mydata2.iloc[279]['Description'])
print(mydata2.iloc[279]['Combined description'])
print(mydata2.iloc[7126]['Short description'])
print(mydata2.iloc[7126]['Combined description'])
print(mydata2.iloc[7969]['Description'])
print(mydata2.iloc[7969]['Combined description'])

# now let's print the word cloud
- Word clouds (also known as text clouds or tag clouds) work in a simple way: the more a specific word appears in a source of textual data (such as a speech, blog post, or database), the bigger and bolder it appears in the word cloud.

- A word cloud is a collection, or cluster, of words depicted in different sizes. The bigger and bolder the word appears, the more often it’s mentioned within a given text and the more important it is.

- Also known as tag clouds or text clouds, these are ideal ways to pull out the most pertinent parts of textual data, from blog posts to databases. They can also help business users compare and contrast two different pieces of text to find the wording similarities between the two. 

#### We will print the word cloud for the top 5 groups - GRP_0, GRP_8, GRP_24, GRP_12, GRP_9

In [None]:
wordcloud = fn_word_cloud(mydata2[mydata2['Assignment group']=='GRP_0']["Combined description"])
# plot the WordCloud image                        
plt.figure(figsize = (8, 8), facecolor = None) 
plt.imshow(wordcloud) 
plt.axis("off") 
plt.tight_layout(pad = 0) 
  
plt.show() 

In [None]:
wordcloud = fn_word_cloud(mydata2[mydata2['Assignment group']=='GRP_8']["Combined description"])
# plot the WordCloud image                        
plt.figure(figsize = (8, 8), facecolor = None) 
plt.imshow(wordcloud) 
plt.axis("off") 
plt.tight_layout(pad = 0) 
  
plt.show() 

In [None]:
wordcloud = fn_word_cloud(mydata2[mydata2['Assignment group']=='GRP_12']["Combined description"])
# plot the WordCloud image                        
plt.figure(figsize = (8, 8), facecolor = None) 
plt.imshow(wordcloud) 
plt.axis("off") 
plt.tight_layout(pad = 0) 
  
plt.show()

In [None]:
wordcloud = fn_word_cloud(mydata2[mydata2['Assignment group']=='GRP_9']["Combined description"])
# plot the WordCloud image                        
plt.figure(figsize = (8, 8), facecolor = None) 
plt.imshow(wordcloud) 
plt.axis("off") 
plt.tight_layout(pad = 0) 
  
plt.show()

In [None]:
wordcloud = fn_word_cloud(mydata2[mydata2['Assignment group']=='GRP_24']["Combined description"])
# plot the WordCloud image                        
plt.figure(figsize = (8, 8), facecolor = None) 
plt.imshow(wordcloud) 
plt.axis("off") 
plt.tight_layout(pad = 0) 
  
plt.show() 

In [None]:
wordcloud = fn_word_cloud(mydata2[mydata2['Assignment group']=='GRP_30']["Combined description"])
# plot the WordCloud image                        
plt.figure(figsize = (8, 8), facecolor = None) 
plt.imshow(wordcloud) 
plt.axis("off") 
plt.tight_layout(pad = 0) 
  
plt.show()

#### Observation
- Many non english words in GRP_24
- In GRP_30 there are many special characters 
#### Let us take a quick diversion to look into this further a little bit more
- We will first run the google's language detect in multi-threaded fashion

In [None]:
def request(text):
    """Return the language detection result for *text* from google_translator.

    A fresh translator client (20 s timeout) is created per call, so each
    worker thread uses its own client.
    """
    return google_translator(timeout=20).detect(text)

if __name__ == "__main__" :
      time1 = time.time()
      data = mydata2['Short description'].values.tolist()
      # The context manager tears the pool down even when a request raises;
      # previously a failure skipped close()/join() and leaked the workers.
      with ThreadPool(20) as pool:  # Threads
          results = pool.map(request, data)

      time2 = time.time()
      print("Detecting %s Short Desciptions, a total of %s s"%(len(data),time2 - time1))

#### We will load the results to a dataframe and print the last few rows

In [None]:
df = pd.DataFrame (results,columns=['language', 'language name'])
df.tail()

In [None]:
mydata2.isna().apply(pd.value_counts)

In [None]:
df.isna().apply(pd.value_counts)

#### Counts by language

In [None]:
df["language"].value_counts()

### We will graph the distribution of languages

In [None]:
cycol = cycle('bgrcmk')
# Records per detected language, ordered by language code.
x = df["language"].value_counts().sort_index()
plt.figure(figsize=(10,6))
# x/y are passed by keyword: seaborn >= 0.12 rejects positional data arguments.
ax = sns.barplot(x=x.index, y=x.values, alpha=0.8)
plt.title("Distribution of text by language")
plt.ylabel('number of records')
plt.xlabel('Language')
# Write the record count above each bar.
for rect, label in zip(ax.patches, x.values):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')
plt.show();

# Observation
- Most items are in English followed by German
- The other languages are in low single digits - a couple are in low double digits. 

In [None]:
##Reconfirmation
null_data = mydata2[mydata2.isnull().any(axis=1)]
null_data

#### We will merge the language columns into the main dataframe

In [None]:
df

In [None]:
mydata3=mydata.copy()
mydata.isna().apply(pd.value_counts)

In [None]:
mydata.isna().apply(pd.value_counts)

In [None]:
mydata = mydata2.join(df)
mydata.tail()

#### Observation
- It is interesting to see row 8498. Short description is in Portuguese but Description is in English. 
- The Combined description gets interpreted as English (we ran the detect algorithm separately to confirm this)

#### This is the reason we decided to translate Short description and Description independently and then merge them.

In [None]:
##Reconfirmation
null_data = mydata[mydata.isnull().any(axis=1)]
null_data

In [None]:
##Reconfirmation
empty_space = mydata[mydata['Combined description'] == ""]
#empty_space[['Raw Combined description', 'Assignment group']]
empty_space

In [None]:
# Select columns for cleaning
#columnsToPreprocess = ['Short description', 'Description']
#columnsToPreprocess = ['Combined description', 'Short description', 'Description']
# Create list of regex to remove sensitive data
# Clean dataset and remove sensitive data
#mydata = preprocess(mydata, columnsToPreprocess, getRegexList())

In [None]:
##Reconfirmation
null_data = mydata[mydata.isnull().any(axis=1)]
null_data

In [None]:
##Reconfirmation
empty_space = mydata[mydata['Combined description'] == ""]
empty_space

### We will attempt Translation into english all the non-english rows

In [None]:
def request(text):
    """Return *text* translated to English by google_translator.

    A fresh translator client (20 s timeout) is created per call, so each
    worker thread uses its own client.
    """
    return google_translator(timeout=20).translate(text, lang_tgt='en')

if __name__ == "__main__" :
      time1 = time.time()
      data = mydata['Short description'].values.tolist()
      # The context manager tears the pool down even when a request raises;
      # previously a failure skipped close()/join() and leaked the workers.
      with ThreadPool(20) as pool:  # Threads
          results = pool.map(request, data)

      time2 = time.time()

In [None]:
# Report how many short descriptions were translated and the elapsed seconds.
print("Translating %s Short Descriptions, a total of %s s"%(len(data),time2 - time1))

In [None]:
# Wrap the translation results in a one-column frame so they can be joined
# onto mydata by index.
df = pd.DataFrame (results,columns=['Translated Short description'])
df.tail()

In [None]:
## Missing data check #2 : 
## Are there any null values
# pd.Series.value_counts replaces the deprecated top-level pd.value_counts
# and returns the same per-column table.
mydata.isna().apply(pd.Series.value_counts)
## Short Description contains 2 nulls and Description contains 1 null 

In [None]:
# Tally null vs non-null cells in df (pd.Series.value_counts replaces the
# deprecated top-level pd.value_counts and returns the same table).
df.isna().apply(pd.Series.value_counts)

In [None]:
##Reconfirmation
null_data = mydata[mydata.isnull().any(axis=1)]
null_data

In [None]:
##Reconfirmation
# Rows whose 'Combined description' is exactly the empty string.
empty_space = mydata.loc[mydata['Combined description'].eq("")]

#### We will merge the Translated Short description column into the main dataframe

In [None]:
# mydata2 keeps a reference to the pre-join frame (plain assignment, not a
# copy); join returns a new frame, so mydata2 is not modified by the next line.
mydata2 = mydata
# Add df's columns to mydata, aligned on the index.
mydata = mydata.join(df)
mydata.tail()

In [None]:
# Translate every 'Description' to English. Each translation is a blocking
# network call, so the requests are fanned out over a thread pool.
pool = ThreadPool(20) # Threads

def request(text):
    """Translate *text* (stripped of surrounding whitespace) to English."""
    t = google_translator(timeout=25)
    return t.translate(text.strip(), lang_tgt='en')

if __name__ == "__main__" :
    time1 = time.time()
    data = mydata['Description'].values.tolist()
    try:
        # pool.map keeps the results in the same order as `data`.
        results = pool.map(request, data)
    finally:
        # Always release the worker threads, even when a translation raises;
        # the exception itself still propagates to the caller unchanged.
        pool.close()
        pool.join()

    time2 = time.time()
    print("Translating %s Descriptions, a total of %s s"%(len(data),time2 - time1))

In [None]:
 # Report how many descriptions were translated and the elapsed seconds.
 print("Translating %s Descriptions, a total of %s s"%(len(data),time2 - time1))

In [None]:
# Wrap the translation results in a one-column frame so they can be joined
# onto mydata by index.
df = pd.DataFrame (results,columns=['Translated Description'])
df.tail()

In [None]:
## Missing data check #2 : 
## Are there any null values
# pd.Series.value_counts replaces the deprecated top-level pd.value_counts
# and returns the same per-column table.
mydata.isna().apply(pd.Series.value_counts)
## Short Description contains 2 nulls and Description contains 1 null 

In [None]:
# Tally null vs non-null cells in df (pd.Series.value_counts replaces the
# deprecated top-level pd.value_counts and returns the same table).
df.isna().apply(pd.Series.value_counts)

#### We will merge the Translated Description column into the main dataframe

In [None]:
# mydata2 keeps a reference to the pre-join frame (plain assignment, not a
# copy); join returns a new frame, so mydata2 is not modified by the next line.
mydata2 = mydata
# Add df's columns to mydata, aligned on the index.
mydata = mydata.join(df)
mydata.tail()

### We will merge the Short description and Description Columns again

In [None]:
#merging  the 2 preprocessed columns to a single column without duplicate words
mydata['Combined description'] = mydata['Translated Short description'] .map(str) + ' ' +  mydata['Translated Description'].map(str)
                    
mydata['Combined description'] = mydata['Combined description'].apply(lambda x: ' '.join(pd.unique(x.split()))) 
   
#testing on single entry
print(mydata.iloc[279]['Short description'])
print(mydata.iloc[279]['Description'])
print(mydata.iloc[279]['Combined description']) 
print(mydata.iloc[7126]['Short description'])
print(mydata.iloc[7126]['Combined description'])
print(mydata.iloc[7969]['Description'])
print(mydata.iloc[7969]['Combined description'])

In [None]:
# Tally null vs non-null cells per column (pd.Series.value_counts replaces the
# deprecated top-level pd.value_counts and returns the same table).
mydata.isna().apply(pd.Series.value_counts)

In [None]:
# Spot-check the record at position 8499.
rows = mydata.iloc[8499]
rows

In [None]:
# Show the last rows of mydata.
mydata.tail()

In [None]:
# Spot-check the record at position 1954.
row = mydata.iloc[1954]
row

In [None]:
# Rows whose 'Description' is exactly the empty string (note: rebinds df).
df = mydata.loc[mydata['Description'] == ""]
df

In [None]:
##Reconfirmation
empty_space = mydata[mydata['Combined description'] == ""]
empty_space

In [None]:
# Show every column of the record at position 1178.
mydata.iloc[1178]

# Preprocessing
### We will now attempt to remove unwanted text in the columns of interest to us: 
- Combined description

In [None]:
# Select columns for cleaning
#columnsToPreprocess = ['Short description', 'Description']
columnsToPreprocess = ['Combined description']
# Create list of regex to remove sensitive data
# Clean dataset and remove sensitive data
# NOTE(review): preprocess() and getRegexList() are defined outside this
# section; presumably the regex list is applied to each listed column - confirm.
mydata = preprocess(mydata, columnsToPreprocess, getRegexList())

In [None]:
# Show the first 20 rows of mydata.
mydata.head(20)

In [None]:
##Reconfirmation
empty_space = mydata[mydata['Combined description'] == ""]
empty_space

In [None]:
# Overwrite the combined text of rows 8043 and 8072 with their original
# 'Description'. Both sides use label-based .at so that the row that is read
# is always the row that is written, whatever the index looks like.
mydata.at[8043, 'Combined description'] = mydata.at[8043, 'Description']
mydata.at[8072, 'Combined description'] = mydata.at[8072, 'Description']

# Observation
- For some reason the translation got confused, marked these rows as Greek and also translated them to Greek. We are manually fixing these back to English.

In [None]:
# Expand contractions in the combined text.

# One alternation pattern covering every key of contractions_dict.
contractions_re = re.compile('(%s)' % '|'.join(contractions_dict))
# Run expand_contractions over each description.
mydata['Combined description'] = mydata['Combined description'].apply(expand_contractions)

In [None]:
# Tally null vs non-null cells per column (pd.Series.value_counts replaces the
# deprecated top-level pd.value_counts and returns the same table).
mydata.isna().apply(pd.Series.value_counts)

In [None]:
# Spell-correct each whitespace-separated token; fast=True trades accuracy for speed.
spell = Speller('en', fast=True)
mydata['Combined description'] = mydata['Combined description'].apply(
    lambda text: ' '.join(spell(token) for token in text.split())
)

In [None]:
# Tally null vs non-null cells per column (pd.Series.value_counts replaces the
# deprecated top-level pd.value_counts and returns the same table).
mydata.isna().apply(pd.Series.value_counts)

In [None]:
# Spot-check the record at position 255.
mydata.iloc[255]

In [None]:
# Remove non-English words: keep a token when its lowercase form is in the
# NLTK word list, or when it is not purely alphabetic (numbers, punctuation).
nltk.download('words')
from nltk.corpus import words
Word = list(set(words.words()))
# Test membership against a set: the answers are identical to testing against
# the list above, but each lookup is O(1) instead of a scan of the whole list.
word_lookup = frozenset(Word)
mydata['Combined description'] = [
    " ".join(
        w for w in nltk.wordpunct_tokenize(x)
        if w.lower() in word_lookup or not w.isalpha()
    )
    for x in mydata['Combined description']
]

# Spot-check a single entry.
print(mydata.iloc[255]['Combined description'])

In [None]:
# Collect the rows whose combined text is now empty and print their labels.
rows = mydata.loc[mydata['Combined description'].eq("")]
print(rows.index)
rows

# Observation
- For some reason the translation and regex-removal algorithms got confused and did not do their job properly for these rows. We are manually fixing these entries
- We can drop rows  2045, 2070, 2192 because they look like junk BUT at this point we will just keep them

In [None]:
# Restore the raw combined text for every row collected in `rows`.
# rows.index holds index labels, so both sides are addressed by label (.loc);
# reading with positional .iloc would fetch a different row whenever the
# index is not exactly 0..n-1.
mydata.loc[rows.index, 'Combined description'] = mydata.loc[rows.index, 'Raw Combined description']

In [None]:
# Re-check: rows whose combined text is still the empty string.
rows = mydata.loc[mydata['Combined description'].eq("")]
rows

# We are creating the dataset required for Deep Learning first
- This data set contains all the stop words too which are important for DL algorithms for Context retention
- This dataset will not contain the lemmatization  which is to follow later

In [None]:
# Build the deep-learning dataset: text (stop words kept, not lemmatized) and
# target, with the text column renamed and a fresh 0..n-1 index.
# Rows with empty text are kept here.
mydata_dl = (
    mydata[['Combined description', 'Assignment group']]
    .rename(columns={'Combined description': 'Combined Description Cleaned'})
    .reset_index(drop=True)
)
mydata_dl.to_csv('drive/My Drive/datasets/input_data_after_preprocessing_for_dl.csv')
mydata_dl.describe()

In [None]:
# Display the deep-learning dataset.
mydata_dl

In [None]:
# Per-row token count (split on single spaces) and the corpus-wide total.
mydata['CombinedWordCount'] = mydata['Combined description'].map(
    lambda text: len(text.split(' '))
)
wordCount_before_lemmatization = mydata['CombinedWordCount'].sum()
print("Total Corpus Word Count before lemmatization: ", wordCount_before_lemmatization)

In [None]:
# Tally null vs non-null cells per column (pd.Series.value_counts replaces the
# deprecated top-level pd.value_counts and returns the same table).
mydata.isna().apply(pd.Series.value_counts)

In [None]:
import nltk
nltk.download('stopwords')
# English stop words from NLTK plus the ticket filler word 'please'.
stop_words = {*stopwords.words('english'), 'please'}
stop_words

In [None]:
# Download the NLTK 'punkt' and 'wordnet' resources, then store the output of
# lemmatize() in a new column.
# NOTE(review): lemmatize() is defined outside this section; presumably it
# relies on the two resources downloaded here - confirm.
nltk.download('punkt')
nltk.download('wordnet')
mydata['Combined Description Cleaned'] = lemmatize(mydata['Combined description'])

In [None]:
# Per-row token count (split on single spaces) of the lemmatized text, then
# the corpus total, the longest document and the average document length.
mydata['CombinedWordCountCleaned'] = mydata['Combined Description Cleaned'].map(
    lambda text: len(text.split(' '))
)
wordCount_after_lemmatization = mydata['CombinedWordCountCleaned'].sum()
print("Total Corpus Word Count after lemmatization: ", wordCount_after_lemmatization)
print("Max word count of a Document: ", mydata['CombinedWordCountCleaned'].max())
print("Mean word count of Documents: ", mydata['CombinedWordCountCleaned'].mean())

### Creating a vector of word counts
- We will use the CountVectorizer to tokenize the text and build a vocabulary of known words. 
- We first create a variable “cv” of the CountVectorizer class, and then invoke the fit_transform function to learn and build the vocabulary.

###### Parameters used
- cv=CountVectorizer(max_df=0.8,stop_words=stop_words, max_features=10000, ngram_range=(1,3))
- max_df — When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words). This is to ensure that we only have words relevant to the context and not commonly used words.
- max_features — determines the number of columns in the matrix.
- n-gram range — we would want to look at a list of single words, two words (bi-grams) and three words (tri-gram) combinations.

In [None]:
# Build a 1- to 3-gram vocabulary capped at 10,000 features, ignoring terms
# that appear in more than 80% of the documents.
# stop_words is a set; CountVectorizer documents this parameter as a list (or
# 'english'/None) and recent scikit-learn releases reject a set, so pass a
# sorted list. The vectorizer treats it as the same collection of words.
cv = CountVectorizer(
    max_df=0.8,
    stop_words=sorted(stop_words),
    max_features=10000,
    ngram_range=(1, 3),
)
# NOTE(review): the vocabulary is fitted on the un-lemmatized
# 'Combined description' column, while later cells analyse
# 'Combined Description Cleaned' - confirm this is intended.
X = cv.fit_transform(mydata['Combined description'])

### Visualize top 20 uni-grams, bi-grams & tri-grams

In [None]:
#Convert most freq words to dataframe for plotting bar plot
top_words = get_top_n_words(mydata['Combined Description Cleaned'], n=50)
top_df = pd.DataFrame(top_words)
top_df.columns=["Word", "Freq"]

#Barplot of most freq words
sns.set(rc={'figure.figsize':(24,8)})
g = sns.barplot(x="Word", y="Freq", data=top_df)
g.set_xticklabels(g.get_xticklabels(), rotation=90)

In [None]:
# 50 most frequent bi-grams as a two-column frame for plotting.
top2_words = get_top_n2_words(mydata['Combined Description Cleaned'], n=50)
top2_df = pd.DataFrame(top2_words, columns=["Bi-gram", "Freq"])

# Bar chart of bi-gram frequencies with vertical tick labels.
sns.set(rc={'figure.figsize': (24, 8)})
h = sns.barplot(data=top2_df, x="Bi-gram", y="Freq")
h.set_xticklabels(h.get_xticklabels(), rotation=90)

In [None]:
import seaborn as sns

# 50 most frequent tri-grams as a two-column frame for plotting.
top3_words = get_top_n3_words(mydata['Combined Description Cleaned'], n=50)
top3_df = pd.DataFrame(top3_words, columns=["Tri-gram", "Freq"])

# Bar chart of tri-gram frequencies with vertical tick labels.
sns.set(rc={'figure.figsize': (24, 8)})
j = sns.barplot(data=top3_df, x="Tri-gram", y="Freq")
j.set_xticklabels(j.get_xticklabels(), rotation=90)

#### Based on the TF-IDF scores, we can extract the words with the highest scores to get the keywords for a document.

In [None]:
# iterate over rows with iterrows()
doc = ' '
for index, row in mydata.iterrows():
     # access data using column names
     doc = doc + row['Combined Description Cleaned']

In [None]:
# Display the concatenated corpus string.
doc

In [None]:
# Learn IDF weights from the count matrix X built by cv.
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(X)

# Vocabulary terms in column order. get_feature_names() was removed from
# newer scikit-learn releases in favour of get_feature_names_out(); use the
# new name when it exists and fall back to the old one otherwise.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# fetch document for which keywords needs to be extracted
#doc=mydata['Combined Description Cleaned'][0]

# generate tf-idf for the given document
tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))
# sort the tf-idf vectors by descending order of scores
sorted_items = sort_coo(tf_idf_vector.tocoo())
# extract only the top n; n here is 50
keywords = extract_topn_from_vector(feature_names, sorted_items, 50)

# now print the results
print("\nKeywords:")
for k in keywords:
    print(k,keywords[k])

In [None]:
# Tally null vs non-null cells per column (pd.Series.value_counts replaces the
# deprecated top-level pd.value_counts and returns the same table).
mydata.isna().apply(pd.Series.value_counts)

### Milestone 1 - So far we have performed these steps

1. Exploring the given Data files
2. Understanding the structure of data
3. Missing points in data
4. Finding inconsistencies in the data
5. Visualizing different patterns
6. Visualizing different text features
7. Dealing with missing values
8. Text preprocessing
9. Creating word vocabulary from the corpus of report text data
10. Creating tokens as required

### Now we will run a quick model on how it performs in predicting the group with the data we have.
### Then we will explore different data augmentation techniques (in a different notebook)

In [None]:
# Tally null vs non-null cells per column (pd.Series.value_counts replaces the
# deprecated top-level pd.value_counts and returns the same table).
mydata.isna().apply(pd.Series.value_counts)

In [None]:
#mydata['Combined Description Cleaned'] = mydata['Combined Description Cleaned'].replace(np.nan, '', regex=True)

# We will now create the preprocessed dataset required for Machine Learning

In [None]:
# Build the machine-learning dataset from the lemmatized text and the target.
mydata_ml = mydata[['Combined Description Cleaned', 'Assignment group']]
# Drop rows whose cleaned text is empty, then renumber the index.
# Both steps act on mydata_ml: applying them to mydata_dl would overwrite the
# selection above and export the deep-learning dataset a second time.
mydata_ml = mydata_ml.drop(mydata_ml.index[mydata_ml["Combined Description Cleaned"] == ''])
mydata_ml = mydata_ml.reset_index(drop=True)
mydata_ml.to_csv('drive/My Drive/datasets/input_data_after_preprocessing_for_ml.csv')
mydata_ml.describe()

In [None]:
##Reconfirmation
null_data = mydata[mydata.isnull().any(axis=1)]
null_data

In [None]:
##Reconfirmation
empty_space = mydata[mydata['Combined Description Cleaned'] == ""]
empty_space.describe()

##### Label encode the target column

In [None]:
# Map each assignment group name to an integer id and report the class count.
le = preprocessing.LabelEncoder()
target_column = "Assignment group"
mydata["LabelEncodings"] = le.fit_transform(mydata[target_column])
y_classes_len = le.classes_.shape[0]
print(y_classes_len)

In [None]:
# Target vector: the encoded labels as a NumPy array.
y = mydata['LabelEncodings'].to_numpy()

In [None]:
# Bag-of-words counts over the lemmatized text, using CountVectorizer's
# default settings; the printed shape is (documents, vocabulary size).
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(mydata['Combined Description Cleaned'])
print(X_train_counts.shape)

In [None]:
# Re-weight the raw counts with TF-IDF. This rebinds tfidf_transformer to a
# new transformer fitted on X_train_counts.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

In [None]:
# Hold out 30% of the rows for evaluation. A fixed random_state makes the
# split, and therefore any score computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_tfidf, y, test_size=0.3, random_state=42
)

In [None]:
# Number of distinct terms in the vocabulary learned by count_vect.
print(len(count_vect.vocabulary_))

In [None]:
# Shapes of the training features and training labels.
print(X_train.shape)
print(y_train.shape)

In [None]:
# Shapes of the held-out features and held-out labels.
print(X_test.shape)
print(y_test.shape)

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier. With the lbfgs solver and more than two classes,
# scikit-learn already fits a multinomial model by default, so the deprecated
# multi_class argument is not passed.
clf = LogisticRegression(solver='lbfgs', max_iter=1000).fit(X_train, y_train)
y_pred = clf.predict(X_test)

acc_score = accuracy_score(y_test, y_pred)
print("Logistic Regression Score: ", acc_score)

# Weighted F1 over every class present in the test data. Classes the model
# never predicts contribute an F1 of 0 (zero_division=0) instead of being left
# out, which would overstate the score.
f_sc = f1_score(y_test, y_pred, average='weighted', zero_division=0)
print("Logistic Regression F1 Score: ", f_sc)

In [None]:
##Reconfirmation
empty_space = mydata_dl[mydata_dl['Combined Description Cleaned'] == ""]
empty_space