<a href="https://colab.research.google.com/github/guptapawan227/Capstone_AIML/blob/Ashish/Recreated_16thDec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Mounting Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
!pip3 install ftfy

Importing Libraries

In [None]:
# Using TensorFlow 1.x only in colab as found a issue with 2.3 version used by colab while working with DNN model fit. Did not observe any issue with Tensor flow 2.1 version on local jupyter enviornment.
%tensorflow_version 1.x

In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import time, os, sys, itertools, re 
from PIL import Image
import warnings, pickle, string
from dateutil import parser
%matplotlib inline

# Data Visualization
import cufflinks as cf
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot

from ftfy import fix_text, badness

# Traditional Modeling
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Sequential Modeling
import keras.backend as K
from keras.models import Sequential, Model
from keras.layers.merge import Concatenate
from keras.layers import Input, Dropout, Flatten, Dense, Embedding, LSTM, GRU
from keras.layers import BatchNormalization, TimeDistributed, Conv1D, MaxPooling1D
from keras.constraints import max_norm, unit_norm
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Tools & Evaluation metrics
from sklearn.metrics import confusion_matrix, classification_report, auc
from sklearn.metrics import roc_curve, accuracy_score, precision_recall_curve
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


Reading the data from excel 

In [None]:
data=pd.read_excel('/content/drive/MyDrive/Capstone/input_data.xlsx')
#data=pd.read_excel('input_data.xlsx')
data.info()

## Exploratory Data Analysis

## Univariate visualization
Single-variable or univariate visualization is the simplest type of visualization which consists of observations on only a single characteristic or attribute. Univariate visualization includes histogram, bar plots and line charts.

### The distribution of Assignment groups
Plots how the assignments groups are scattered across the dataset. The bar chart, histogram and pie chart tells the frequency of any ticket assigned to any group OR the tickets count for each group.

In [None]:
data.head()

In [None]:
assignment_group_count=data['Assignment group'].value_counts()
assignment_group_count.describe()

In [None]:
plt.subplots(figsize=(50,10))
ax=sns.countplot(x='Assignment group', data=data)
ax.set_xticklabels(ax.get_xticklabels(), rotation=30)
plt.tight_layout
plt.show()


In [None]:
assignment_group_count.head(50)

In [None]:
assignment_group_count.tail(24)

### Check Missing Values in dataframe

In [None]:
data.isnull().sum()

In [None]:
data[data["Short description"].isnull()]

### Copy Short Description to Description if the Description value is NaN

In [None]:
data.Description.fillna(data["Short description"], inplace = True)

In [None]:
data[data["Description"].isnull()]

In [None]:
data['Short description'] = data['Short description'].replace(np.nan, '', regex=True)

In [None]:
data.isnull().sum()

In [None]:
init_notebook_mode()
cf.go_offline()

# Assignment group distribution
print('\033[1mTotal assignment groups:\033[0m', data['Assignment group'].nunique())

# Histogram
data['Assignment group'].iplot(
    kind='hist',
    xTitle='Assignment Group',
    yTitle='count',
    title='Assignment Group Distribution- Histogram (Fig-1)')

# Pie chart
assgn_grp = pd.DataFrame(data.groupby('Assignment group').size(),columns = ['Count']).reset_index()
assgn_grp.iplot(
    kind='pie', 
    labels='Assignment group', 
    values='Count', 
    title='Assignment Group Distribution- Pie Chart (Fig-2)', 
    hoverinfo="label+percent+name", hole=0.25)

### Lets visualize the percentage of incidents per assignment group

In [None]:
# Plot to visualize the percentage data distribution across different groups
sns.set(style="whitegrid")
plt.figure(figsize=(20,5))
ax = sns.countplot(x="Assignment group", data=data, order=data["Assignment group"].value_counts().index)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
for p in ax.patches:
  ax.annotate(str(format(p.get_height()/len(data.index)*100, '.2f')+"%"), (p.get_x() + p.get_width() / 2., p.get_height()), ha = 'center', va = 'bottom', rotation=90, xytext = (0, 10), textcoords = 'offset points')

### Top 20 and Bottom 20 assignment groups

In [None]:
top_20 = data['Assignment group'].value_counts().nlargest(20).reset_index()

In [None]:
plt.figure(figsize=(12,6))
bars = plt.bar(top_20['index'],top_20['Assignment group'])
plt.title('Top 20 Assignment groups with highest number of Tickets')
plt.xlabel('Assignment Group')
plt.xticks(rotation=90)
plt.ylabel('Number of Tickets')

for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x(), yval + .005, yval)
plt.tight_layout()
plt.show()

In [None]:
bottom_20 = data['Assignment group'].value_counts().nsmallest(20).reset_index()

In [None]:
plt.figure(figsize=(12,6))
bars = plt.bar(bottom_20['index'],bottom_20['Assignment group'])
plt.title('Bottom 20 Assignment groups with small number of Tickets')
plt.xlabel('Assignment Group')
plt.xticks(rotation=90)
plt.ylabel('Number of Tickets')
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x(), yval + .005, yval)
plt.tight_layout()
plt.show()

### The distribution of Callers
Plots how the callers are associated with tickets and what are the assignment groups they most frequently raise tickets for.

In [None]:
# Find out top 10 callers in terms of frequency of raising tickets in the entire dataset
print('\033[1mTotal caller count:\033[0m', data['Caller'].nunique())
df = pd.DataFrame(data.groupby(['Caller']).size().nlargest(10), columns=['Count']).reset_index()
df.iplot(kind='pie',
         labels='Caller', 
         values='Count', 
         title='Top 10 caller- Pie Chart (Fig-7)',
         colorscale='-spectral',
         pull=[0,0,0,0,0.05,0.1,0.15,0.2,0.25,0.3])

### Top 5 callers in each assignment group

In [None]:
top_n = 5
s = data['Caller'].groupby(data['Assignment group']).value_counts()
caller_grp = pd.DataFrame(s.groupby(level=0).nlargest(top_n).reset_index(level=0, drop=True))
caller_grp.head(15)

### The distribution of description lengths
Plots the variation of length and word count of new description attribute

In [None]:
data.insert(1, 'desc_len', data['Description'].astype(str).apply(len))
data.insert(5, 'desc_word_count', data['Description'].apply(lambda x: len(str(x).split())))
data.head()

In [None]:
# Description text length
data['desc_len'].iplot(
    kind='bar',
    xTitle='text length',
    yTitle='count',
    colorscale='-ylgn',
    title='Description Text Length Distribution (Fig-11)')

# Description word count
data['desc_word_count'].iplot(
    kind='bar',
    xTitle='word count',
    linecolor='black',
    yTitle='count',
    colorscale='-bupu',
    title='Description Word Count Distribution (Fig-12)')

## Create a rule based engine

In [None]:
df_rules = pd.read_csv('/content/drive/MyDrive/Capstone/Rule_matrix.csv')
#df_rules = pd.read_csv("Rule_matrix1.csv")

In [None]:
def applyRules(datadf,rulesdf,Description,ShortDescription):
    datadf['pred_group'] = np.nan
    for i, row in rulesdf.iterrows():                  
        for j, row in datadf.iterrows():
            if pd.notna(datadf[ShortDescription][j]):
                if (('erp' in datadf[ShortDescription][j]) and (('EU_tool' in datadf[ShortDescription][j]))):
                        datadf['pred_group'][j] = 'GRP_25'
        for j, row in datadf.iterrows():
            if pd.notna(datadf[Description][j]):
                if (datadf[Description][j] == 'the'):
                    datadf['pred_group'][j] = 'GRP_17' 
                
                if (('finance_app' in ((datadf[ShortDescription][j]) or datadf[Description][j])) and ('HostName_1132' not in datadf[ShortDescription][j])):
                    datadf['pred_group'][j] = 'GRP_55'
                
                if (('processor' in datadf[Description][j]) and ('engg' in datadf[Description][j])):
                    datadf['pred_group'][j] = 'GRP_58'
                
                                     
        if rulesdf['Short Desc Rule'][i] == 'begins with' and rulesdf['Desc Rule'][i] == 'begins with' and pd.isna(rulesdf['User'][i]):
            for j, row in datadf.iterrows():
                if pd.notna(datadf[ShortDescription][j]) and pd.notna(datadf[Description][j]):
                    if ((datadf[ShortDescription][j].startswith(rulesdf['Short Dec Keyword'][i])) and (datadf[Description][j].startswith(rulesdf['Dec keyword'][i]))):
                        datadf['pred_group'][j] = rulesdf['Group'][i]
                        
        if pd.isna(rulesdf['Short Desc Rule'][i]) and rulesdf['Desc Rule'][i] == 'begins with' and pd.notna(rulesdf['User'][i]):
            for j, row in datadf.iterrows():
                if pd.notna(datadf[Description][j]) and pd.notna(datadf['Caller'][j]):
                    if ((datadf[Description][j].startswith(rulesdf['Desc Rule'][i]) and (rulesdf['User'][i] == datadf['Caller'][j]))):
                        datadf['pred_group'][j] = rulesdf['Group'][i]
                        
        if rulesdf['Short Desc Rule'][i] == 'contains' and pd.notna(rulesdf['User'][i]):
            for j, row in datadf.iterrows():
                if (pd.notna(datadf[ShortDescription][j]) and pd.notna(datadf['Caller'][j])):
                     if ((rulesdf['Short Dec Keyword'][i] in datadf[ShortDescription][j]) and (rulesdf['User'][i] == datadf['Caller'][j])):
                        datadf['pred_group'][j] = rulesdf['Group'][i]
        if rulesdf['Short Desc Rule'][i] == 'contains' and pd.isna(rulesdf['Desc Rule'][i]) and pd.isna(rulesdf['User'][i]):
            for j, row in datadf.iterrows():
                if pd.notna(datadf[ShortDescription][j]):
                    if (rulesdf['Short Dec Keyword'][i] in datadf[ShortDescription][j]):
                        datadf['pred_group'][j] = rulesdf['Group'][i]
        if pd.isna(rulesdf['Short Desc Rule'][i]) and rulesdf['Desc Rule'][i] == 'begins with' and pd.isna(rulesdf['User'][i]):
            for j, row in datadf.iterrows():
                if pd.notna(datadf[Description][j]):
                    if (datadf[Description][j].startswith(rulesdf['Dec keyword'][i])):
                        datadf['pred_group'][j] = rulesdf['Group'][i]
        if pd.isna(rulesdf['Short Desc Rule'][i]) and rulesdf['Desc Rule'][i] == 'contains' and pd.isna(rulesdf['User'][i]):
            for j, row in datadf.iterrows():
                if pd.notna(datadf[Description][j]):
                    if (rulesdf['Dec keyword'][i] in datadf[Description][j]):
                        datadf['pred_group'][j] = rulesdf['Group'][i]
        if pd.isna(rulesdf['Short Desc Rule'][i]) and rulesdf['Desc Rule'][i] == 'not contain' and pd.isna(rulesdf['User'][i]):
            for j, row in datadf.iterrows():
                if pd.notna(datadf[Description][j]):
                    if (rulesdf['Dec keyword'][i] in datadf[Description][j]):
                        datadf['pred_group'][j] = rulesdf['Group'][i]


        if rulesdf['Short Desc Rule'][i] == 'not contain' and pd.isna(rulesdf['Desc Rule'][i]) and pd.isna(rulesdf['User'][i]):
            for j, row in datadf.iterrows():

                if pd.notna(datadf[ShortDescription][j]):
                    if (rulesdf['Short Dec Keyword'][i] in datadf[ShortDescription][j]):
                        datadf['pred_group'][j] = rulesdf['Group'][i]
        if pd.isna(rulesdf['Short Desc Rule'][i]) and rulesdf['Desc Rule'][i] == 'not contain' and pd.isna(rulesdf['User'][i]):
            for j, row in datadf.iterrows():
                if pd.notna(datadf[Description][j]):
                    if (datadf[Description][j].startswith(rulesdf['Dec keyword'][i])):
                        datadf['pred_group'][j] = rulesdf['Group'][i]
        if pd.isna(rulesdf['Short Desc Rule'][i]) and rulesdf['Desc Rule'][i] == 'contains' and pd.isna(rulesdf['User'][i]):
            for j, row in datadf.iterrows():
                if pd.notna(datadf[Description][j]):
                    if (rulesdf['Dec keyword'][i] in datadf[Description][j]):
                        datadf['pred_group'][j] = rulesdf['Group'][i]

    return datadf

In [None]:
rules_applied_df = applyRules(data,df_rules,'Description','Short description')
rules_applied_df

In [None]:
rules_applied_df.info()

In [None]:
rules_applied_df = rules_applied_df[(rules_applied_df['pred_group'].isna())]
rules_applied_df.info()

In [None]:
assignment_group_count=rules_applied_df['Assignment group'].value_counts()
assignment_group_count.describe()

### Concatenate Short Description and Description Column into New Description, drop the previous columns

In [None]:
#Concatenate Short Description and Description columns
rules_applied_df['New Description'] = rules_applied_df['Description'] + ' ' +rules_applied_df['Short description']

clean_data=rules_applied_df.drop(['Short description', 'Description', 'pred_group', 'desc_len', 'desc_word_count'], axis=1)


In [None]:
clean_data.info()

## Fixing Garbled Text/ Mojibake using ftfy library

In [None]:
# Write a function to apply to the dataset to detect Mojibakes
def is_mojibake_impacted(text):
    if not badness.sequence_weirdness(text):
        # nothing weird, should be okay
        return True
    try:
        text.encode('sloppy-windows-1252')
    except UnicodeEncodeError:
        # Not CP-1252 encodable, probably fine
        return True
    else:
        # Encodable as CP-1252, Mojibake alert level high
        return False
# Check the dataset for mojibake impact
clean_data[~clean_data.iloc[:,:].applymap(is_mojibake_impacted).all(1)]

In [None]:
# Take an example of row# 8471 Short Desc and fix it
print('Grabled text: \033[1m%s\033[0m\nFixed text: \033[1m%s\033[0m' % (clean_data['New Description'][8471], 
                                                                        fix_text(clean_data['New Description'][8471])))

# List all mojibakes defined in ftfy library
print('\nMojibake Symbol RegEx:\n', badness.MOJIBAKE_SYMBOL_RE.pattern)

In [None]:
# Sanitize the dataset from Mojibakes
clean_data['New Description'] = clean_data['New Description'].apply(fix_text)

# Visualize that row# 8471
clean_data.loc[8471]

## Cleaning & Processing the data

In [None]:
def date_validity(date_str):
    try:
        parser.parse(date_str)
        return True
    except:
        return False

In [None]:

def process(text_string):
    text=text_string.lower()
    text_string = ' '.join([w for w in text_string.split() if not date_validity(w)])
    text_string = re.sub(r"received from:",'',text_string)
    text_string = re.sub(r"from:",' ',text_string)
    text_string = re.sub(r"to:",' ',text_string)
    text_string = re.sub(r"subject:",' ',text_string)
    text_string = re.sub(r"sent:",' ',text_string)
    text_string = re.sub(r"ic:",' ',text_string)
    text_string = re.sub(r"cc:",' ',text_string)
    text_string = re.sub(r"bcc:",' ',text_string)
    text_string = re.sub(r'\S*@\S*\s?', '', text_string)
    text_string = re.sub(r'\d+','' ,text_string)
    text_string = re.sub(r'\n',' ',text_string)
    text_string = re.sub(r'#','', text_string)
    text_string = re.sub(r'&;?', 'and',text_string)
    text_string = re.sub(r'\&\w*;', '', text_string)
    text_string = re.sub(r'https?:\/\/.*\/\w*', '', text_string)  
    #text_string= ''.join(c for c in text_string if c <= '\uFFFF') 
    text_string = text_string.strip()
    #text_string = ' '.join(re.sub("[^\u0030-\u0039\u0041-\u005a\u0061-\u007a]", " ", text_string).split())
    text_string = re.sub(r"\s+[a-zA-Z]\s+", ' ', text_string)
    text_string = re.sub(' +', ' ', text_string)
    text_string = text_string.replace(r'\b\w\b','').replace(r'\s+', ' ')
    text_string = text_string.strip()
    return text_string


In [None]:
clean_data["Clean_Description"] = clean_data["New Description"].apply(process)

In [None]:
clean_data

## Language Translation

#### Load the consolidated final translated pickle file which contains the language translations. The Process used for language translation is commented below

In [2]:
#with open('/content/drive/MyDrive/Capstone/Final_Translated_combined.pkl','rb') as f:
with open('Final_Translated_combined.pkl','rb') as f:
    clean_data = pickle.load(f)

In [3]:
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8466 entries, 0 to 48
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Caller             8466 non-null   object
 1   Assignment group   8466 non-null   object
 2   New Description    8466 non-null   object
 3   Clean_Description  8466 non-null   object
 4   language           8466 non-null   object
 5   Translated Text    8466 non-null   object
dtypes: object(6)
memory usage: 463.0+ KB


In [4]:
clean_data.tail()

Unnamed: 0,Caller,Assignment group,New Description,Clean_Description,language,Translated Text
44,wgmqlnzh vpebwoat,GRP_30,早上开机后显示器不出图像。 显示器不亮,早上开机后显示器不出图像。 显示器不亮,zh-cn,The display does not appear in the morning. Di...
45,rtjwbuev gfpwdetq,GRP_31,"prtSID_737--文件无法打印到打印机,提示打印机错误。 文件无法打印到打印机,提示打...","prtSID_--文件无法打印到打印机,提示打印机错误。 文件无法打印到打印机,提示打印机错误。",zh-cn,The prtsid _- file cannot be printed to the pr...
46,fupikdoa gjkytoeh,GRP_48,"客户提供的在线送货单生成系统打不开,需尽快解决 客户提供的在线系统打不开","客户提供的在线送货单生成系统打不开,需尽快解决 客户提供的在线系统打不开",zh-cn,The online delivery unit provided by the custo...
47,kyagjxdh dmtjpbnz,GRP_30,"进行采购时显示""找不到员工1111154833的数据,请通知系统管理员"" erp无法进行采购...","进行采购时显示""找不到员工的数据,请通知系统管理员"" erp无法进行采购(转给贺正平)",zh-cn,"Show ""Data from the employee, please notify th..."
48,xqyjztnm onfusvlz,GRP_30,"to 小贺,早上电脑开机开不出来 电脑开机开不出来","to 小贺,早上电脑开机开不出来 电脑开机开不出来",zh-cn,"To small congratulations, the computer does no..."


In [5]:
assignment_group_cnt=clean_data['Assignment group'].value_counts()
assignment_group_cnt.describe()

count      43.000000
mean      196.883721
std       596.778064
min        16.000000
25%        31.000000
50%        68.000000
75%       145.500000
max      3941.000000
Name: Assignment group, dtype: float64

## Data Augmentation

In [6]:
!pip3 install nltk
import nltk 
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import wordnet

Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com/simple, https://urm.nvidia.com/artifactory/api/pypi/sw-colossus-pypi/simple


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aroy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aroy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
from collections import OrderedDict
from nltk.tokenize import word_tokenize
def find_synonyms(word):
  synonyms = []
  for synset in wordnet.synsets(word):
    for syn in synset.lemma_names():
      synonyms.append(syn)

  # using this to drop duplicates while maintaining word order (closest synonyms comes first)
  synonyms_without_duplicates = list(OrderedDict.fromkeys(synonyms))
  return synonyms_without_duplicates

In [8]:
def create_set_of_new_sentences(sentence, max_syn_per_word = 3):
  count = 0
  new_sentences = []
  for word in word_tokenize(sentence):
    if len(word)<=3 : continue 
    for synonym in find_synonyms(word)[0:max_syn_per_word]:
      synonym = synonym.replace('_', ' ') #restore space character
      new_sentence = sentence.replace(word,synonym)
      if count <= 4:
        new_sentences.append(new_sentence)
        count += 1    
  return new_sentences

In [9]:
#Create a new dataframe with records not in GRP_0
new_dataframe = clean_data[clean_data["Assignment group"] != 'GRP_0']
zero_dataframe = clean_data[clean_data["Assignment group"] == 'GRP_0']
new_dataframe.head()


Unnamed: 0,Caller,Assignment group,New Description,Clean_Description,language,Translated Text
6,jyoqwxhz clhxsoqy,GRP_1,event: critical:HostName_221.company.com the v...,event: critical:HostName_.company.com the valu...,en,event: critical:HostName_.company.com the valu...
17,sigfdwcj reofwzlm,GRP_3,"when undocking pc , screen will not come back ...","when undocking pc , screen will not come back ...",en,"When undocking pc , screen want distress come ..."
32,kxsceyzo naokumlb,GRP_4,\n\nreceived from: kxsceyzo.naokumlb@gmail.com...,"gentles, have two devices that are trying to s...",en,"gentles, have two devices did are trying to sh..."
43,yisohglr uvteflgb,GRP_5,\n\nreceived from: yisohglr.uvteflgb@gmail.com...,hi - the printer printer is not working and ne...,en,Hi - the printer printer is distress working a...
47,bpctwhsn kzqsbmtp,GRP_6,received from: monitoring_tool@company.com\n\n...,job Job_ failed in job_scheduler at: job Job_ ...,en,job Job_ failed in job_scheduler at: job Job_ ...


In [10]:
new_dataframe.shape, clean_data.shape

((4525, 6), (8466, 6))

In [11]:
maxsyn=1
new_dataframe["Augmented_data"] = new_dataframe.apply(lambda x: create_set_of_new_sentences(x['Translated Text'], maxsyn),axis=1)
new_dataframe

Unnamed: 0,Caller,Assignment group,New Description,Clean_Description,language,Translated Text,Augmented_data
6,jyoqwxhz clhxsoqy,GRP_1,event: critical:HostName_221.company.com the v...,event: critical:HostName_.company.com the valu...,en,event: critical:HostName_.company.com the valu...,[event: critical:HostName_.company.com the val...
17,sigfdwcj reofwzlm,GRP_3,"when undocking pc , screen will not come back ...","when undocking pc , screen will not come back ...",en,"When undocking pc , screen want distress come ...","[When undock pc , screen want distress come ba..."
32,kxsceyzo naokumlb,GRP_4,\n\nreceived from: kxsceyzo.naokumlb@gmail.com...,"gentles, have two devices that are trying to s...",en,"gentles, have two devices did are trying to sh...","[pacify, have two devices did are trying to sh..."
43,yisohglr uvteflgb,GRP_5,\n\nreceived from: yisohglr.uvteflgb@gmail.com...,hi - the printer printer is not working and ne...,en,Hi - the printer printer is distress working a...,[Hi - the printer printer is distress working ...
47,bpctwhsn kzqsbmtp,GRP_6,received from: monitoring_tool@company.com\n\n...,job Job_ failed in job_scheduler at: job Job_ ...,en,job Job_ failed in job_scheduler at: job Job_ ...,[job Job_ fail in job_scheduler at: job Job_ f...
...,...,...,...,...,...,...,...
44,wgmqlnzh vpebwoat,GRP_30,早上开机后显示器不出图像。 显示器不亮,早上开机后显示器不出图像。 显示器不亮,zh-cn,The display does not appear in the morning. Di...,[The display does not appear in the morning. D...
45,rtjwbuev gfpwdetq,GRP_31,"prtSID_737--文件无法打印到打印机,提示打印机错误。 文件无法打印到打印机,提示打...","prtSID_--文件无法打印到打印机,提示打印机错误。 文件无法打印到打印机,提示打印机错误。",zh-cn,The prtsid _- file cannot be printed to the pr...,[The prtsid _- file cannot be printed to the p...
46,fupikdoa gjkytoeh,GRP_48,"客户提供的在线送货单生成系统打不开,需尽快解决 客户提供的在线系统打不开","客户提供的在线送货单生成系统打不开,需尽快解决 客户提供的在线系统打不开",zh-cn,The online delivery unit provided by the custo...,[The on-line delivery unit provided by the cus...
47,kyagjxdh dmtjpbnz,GRP_30,"进行采购时显示""找不到员工1111154833的数据,请通知系统管理员"" erp无法进行采购...","进行采购时显示""找不到员工的数据,请通知系统管理员"" erp无法进行采购(转给贺正平)",zh-cn,"Show ""Data from the employee, please notify th...","[show ""Data from the employee, please notify t..."


In [12]:
s = new_dataframe.apply(lambda x: pd.Series(x['Augmented_data']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'Final_Text'
new_dataframe_aug = new_dataframe.drop(['New Description','Augmented_data', 'Clean_Description', 'Translated Text'],axis=1).join(s)
new_dataframe_aug





Unnamed: 0,Caller,Assignment group,language,Final_Text
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello it's happened again The PC has been rele...
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello it's happen again The PC has been releas...
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello it's happened again The PC has been rele...
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello it's happened again The PC has be releas...
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello it's happened again The PC has been let ...
...,...,...,...,...
8498,ufawcgob aowhxjky,GRP_62,en,i at the unable to access the machine utilitie...
8498,ufawcgob aowhxjky,GRP_62,en,i at the unable to entree the machine utilitie...
8498,ufawcgob aowhxjky,GRP_62,en,i at the unable to access the machine utilitie...
8498,ufawcgob aowhxjky,GRP_62,en,i at the unable to access the machine utility ...


In [13]:
#dataframes=[clean_data_aug1,clean_data_aug2,clean_data_aug3,clean_aug4]
#dataframes=[clean_data_aug1,clean_data_aug2,clean_data_aug3]
zero_dataframe = zero_dataframe.rename(columns={"Translated Text": "Final_Text"})
zero_dataframe = zero_dataframe.drop(['New Description', 'Clean_Description'], axis = 1)
dataframes=[new_dataframe_aug, zero_dataframe]
clean_data_result= pd.concat(dataframes)
clean_data_result

Unnamed: 0,Caller,Assignment group,language,Final_Text
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello it's happened again The PC has been rele...
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello it's happen again The PC has been releas...
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello it's happened again The PC has been rele...
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello it's happened again The PC has be releas...
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello it's happened again The PC has been let ...
...,...,...,...,...
87,gasbfqvp fmvqgjih,GRP_0,de,"On my part, the password was incorrectly enter..."
97,nizholae bjnqikym,GRP_0,de,Stephryhan Needs Access to Below Collaboration...
100,bmhrsxlf ukatbwyi,GRP_0,de,benefits issue benefits issue
101,sjxhcyrq iupxtjcf,GRP_0,de,Security Error in travel expenses Billing Prog...


In [None]:
# Assignment group distribution
print('\033[1mTotal assignment groups:\033[0m', clean_data_result['Assignment group'].nunique())

# Histogram
clean_data_result['Assignment group'].iplot(
    kind='hist',
    xTitle='Assignment Group',
    yTitle='count',
    title='Assignment Group Distribution- Histogram (Fig-5)')

In [14]:
# Serialize the Augmented dataset for later use
clean_data_result.to_csv('Interim_data.csv', index=False, encoding='utf_8_sig')
#with open('/content/Interim_data.pkl','wb') as f:
with open('Interim_data.pkl','wb') as f:
    pickle.dump(clean_data_result, f, pickle.HIGHEST_PROTOCOL)

## Stop words removal and Lemmatise text

In [15]:
clean_data_result.isnull().sum()

Caller                0
Assignment group      0
language              0
Final_Text          197
dtype: int64

In [16]:
clean_data_result['Final_Text'] = clean_data_result['Final_Text'].fillna("")

In [17]:
import re
import string
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english')) 

processed_all_documents = list()

for desc in clean_data_result['Final_Text']:
    word_tokens = word_tokenize(desc) 
    
    filtered_sentence = [] 

    # Removing Stopwords
    for w in word_tokens: 
        if w not in stop_words: 
            filtered_sentence.append(w) 

    words = ' '.join(filtered_sentence)
    processed_all_documents.append(words)  

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aroy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
clean_data_result['Final_Text'] = processed_all_documents

In [19]:
clean_data_result.head(50)

Unnamed: 0,Caller,Assignment group,language,Final_Text
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello 's happened The PC released repeated tim...
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello 's happen The PC released repeated times...
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello 's happened The PC released repeated tim...
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello 's happened The PC released repeated tim...
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello 's happened The PC let go repeated times...
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello Ben Tige Number Block Keyboard R Left Ha...
0,vrfpyjwi nzhvgqiw,GRP_24,de,Hello Ben Tige number Block Keyboard R Left Ha...
0,vrfpyjwi nzhvgqiw,GRP_24,de,Hello Ben Tige Number block Keyboard R Left Ha...
0,vrfpyjwi nzhvgqiw,GRP_24,de,Hello Ben Tige Number Block keyboard R Left Ha...
0,vrfpyjwi nzhvgqiw,GRP_24,de,Hello Ben Tige Number Block Keyboard R left Ha...


In [20]:
clean_data_result.dropna()

Unnamed: 0,Caller,Assignment group,language,Final_Text
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello 's happened The PC released repeated tim...
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello 's happen The PC released repeated times...
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello 's happened The PC released repeated tim...
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello 's happened The PC released repeated tim...
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello 's happened The PC let go repeated times...
...,...,...,...,...
87,gasbfqvp fmvqgjih,GRP_0,de,"On part , password incorrectly entered please ..."
97,nizholae bjnqikym,GRP_0,de,Stephryhan Needs Access Below Collaboration Pl...
100,bmhrsxlf ukatbwyi,GRP_0,de,benefits issue benefits issue
101,sjxhcyrq iupxtjcf,GRP_0,de,Security Error travel expenses Billing Program...


In [21]:
clean_data_result.isnull().sum()

Caller              0
Assignment group    0
language            0
Final_Text          0
dtype: int64

In [22]:
clean_data_result['Final_Text'] = clean_data_result['Final_Text'].replace(np.nan, '', regex=True)

In [23]:
#Lemmatisation using spacy library
!pip install spacy

Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com/simple, https://urm.nvidia.com/artifactory/api/pypi/sw-colossus-pypi/simple


In [24]:
!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz

Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com/simple, https://urm.nvidia.com/artifactory/api/pypi/sw-colossus-pypi/simple
Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz (12.0 MB)
Building wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py): started
  Building wheel for en-core-web-sm (setup.py): finished with status 'done'
  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.3.1-py3-none-any.whl size=12047113 sha256=2e3655aefcac3d591cfa31ffe961fff3be83dcc6522e8bc7631daf029c0b54d2
  Stored in directory: c:\users\aroy\appdata\local\pip\cache\wheels\10\6f\a6\ddd8204ceecdedddea923f8514e13afb0c1f0f556d2c9c3da0
Successfully built en-core-web-sm


In [25]:
!pip3 install spacy

Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com/simple, https://urm.nvidia.com/artifactory/api/pypi/sw-colossus-pypi/simple


In [26]:
# Need to run "python -m spacy download en" in anaconda prompt to avoid 'en' not found issue.

In [27]:
import spacy

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']
def lemmatize_text(text):
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc])

clean_data_result['Final_Text'] = clean_data_result['Final_Text'].apply(lemmatize_text)

In [28]:
clean_data_result

Unnamed: 0,Caller,Assignment group,language,Final_Text
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello be happen the pc release repeat time blu...
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello be happen the pc release repeat time blu...
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello be happen the pc release repeat time blu...
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello be happen the pc release repeat time blu...
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello be happen the pc let go repeat time blue...
...,...,...,...,...
87,gasbfqvp fmvqgjih,GRP_0,de,"on part , password incorrectly enter please pa..."
97,nizholae bjnqikym,GRP_0,de,Stephryhan need Access below Collaboration Pla...
100,bmhrsxlf ukatbwyi,GRP_0,de,benefit issue benefit issue
101,sjxhcyrq iupxtjcf,GRP_0,de,Security Error travel expense Billing ProgramD...


In [29]:
# Serialize the translated dataset
clean_data_result.to_csv('Final_data.csv', index=False, encoding='utf_8_sig')
#with open('/content/Final_data.pkl','wb') as f:
with open('Final_data.pkl','wb') as f:
    pickle.dump(clean_data_result, f, pickle.HIGHEST_PROTOCOL)

In [30]:
# Load the translated pickle file 
#with open('/content/Final_data.pkl','rb') as f:
with open('Final_data.pkl','rb') as f:
    clean_data = pickle.load(f)


### Univariate visualization
Single-variable or univariate visualization is the simplest type of visualization which consists of observations on only a single characteristic or attribute. Univariate visualization includes histogram, bar plots and line charts.

#### The distribution of Assignment groups
Plots how the assignments groups are scattered across the dataset. The bar chart, histogram and pie chart tells the frequency of any ticket assigned to any group OR the tickets count for each group.

In [None]:
# Assignment group distribution
print('\033[1mTotal assignment groups:\033[0m', clean_data['Assignment group'].nunique())

# Histogram
clean_data['Assignment group'].iplot(
    kind='hist',
    xTitle='Assignment Group',
    yTitle='count',
    title='Assignment Group Distribution- Histogram (Fig-1)')

# Pie chart
assgn_grp = pd.DataFrame(clean_data.groupby('Assignment group').size(),columns = ['Count']).reset_index()
assgn_grp.iplot(
    kind='pie', 
    labels='Assignment group', 
    values='Count', 
    title='Assignment Group Distribution- Pie Chart (Fig-2)', 
    hoverinfo="label+percent+name", hole=0.25)


### Lets visualize the percentage of incidents per assignment group

In [None]:
# Plot to visualize the percentage data distribution across different groups
sns.set(style="whitegrid")
plt.figure(figsize=(20,5))
ax = sns.countplot(x="Assignment group", data=clean_data, order=clean_data["Assignment group"].value_counts().index)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
for p in ax.patches:
  ax.annotate(str(format(p.get_height()/len(clean_data.index)*100, '.2f')+"%"), (p.get_x() + p.get_width() / 2., p.get_height()), ha = 'center', va = 'bottom', rotation=90, xytext = (0, 10), textcoords = 'offset points')

In [None]:
top_20 = clean_data['Assignment group'].value_counts().nlargest(20).reset_index()

In [None]:
plt.figure(figsize=(12,6))
bars = plt.bar(top_20['index'],top_20['Assignment group'])
plt.title('Top 20 Assignment groups with highest number of Tickets')
plt.xlabel('Assignment Group')
plt.xticks(rotation=90)
plt.ylabel('Number of Tickets')

for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x(), yval + .005, yval)
plt.tight_layout()
plt.show()

In [None]:
bottom_20 = clean_data['Assignment group'].value_counts().nsmallest(20).reset_index()

In [None]:
plt.figure(figsize=(12,6))
bars = plt.bar(bottom_20['index'],bottom_20['Assignment group'])
plt.title('Bottom 20 Assignment groups with small number of Tickets')
plt.xlabel('Assignment Group')
plt.xticks(rotation=90)
plt.ylabel('Number of Tickets')
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x(), yval + .005, yval)
plt.tight_layout()
plt.show()

#### The distribution of Callers
Plots how the callers are associated with tickets and what are the assignment groups they most frequently raise tickets for.

In [None]:
# Find out top 10 callers in terms of frequency of raising tickets in the entire dataset
print('\033[1mTotal caller count:\033[0m', clean_data['Caller'].nunique())
df = pd.DataFrame(clean_data.groupby(['Caller']).size().nlargest(10), columns=['Count']).reset_index()
df.iplot(kind='pie',
         labels='Caller', 
         values='Count', 
         title='Top 10 caller- Pie Chart (Fig-7)',
         colorscale='-spectral',
         pull=[0,0,0,0,0.05,0.1,0.15,0.2,0.25,0.3])

In [None]:
# Top 5 callers in each assignment group
top_n = 5
s = clean_data['Caller'].groupby(clean_data['Assignment group']).value_counts()
caller_grp = pd.DataFrame(s.groupby(level=0).nlargest(top_n).reset_index(level=0, drop=True))
caller_grp.head(15)

#### The distribution of description lengths
Plots the variation of length and word count of new description attribute

In [None]:
clean_data.insert(1, 'desc_len', clean_data['Final_Text'].astype(str).apply(len))
clean_data.insert(5, 'desc_word_count', clean_data['Final_Text'].apply(lambda x: len(str(x).split())))
clean_data.head()

In [None]:
# Description text length
clean_data['desc_len'].iplot(
    kind='bar',
    xTitle='text length',
    yTitle='count',
    colorscale='-ylgn',
    title='Description Text Length Distribution (Fig-11)')

# Description word count
clean_data['desc_word_count'].iplot(
    kind='bar',
    xTitle='word count',
    linecolor='black',
    yTitle='count',
    colorscale='-bupu',
    title='Description Word Count Distribution (Fig-12)')

### N-Grams
N-gram is a contiguous sequence of N items from a given sample of text or speech, in the fields of computational linguistics and probability. The items can be phonemes, syllables, letters, words or base pairs according to the application. N-grams are used to describe the number of words used as observation points, e.g., unigram means singly-worded, bigram means 2-worded phrase, and trigram means 3-worded phrase. 

We'll be using scikit-learn’s CountVectorizer function to derive n-grams and compare them before and after removing stop words. Stop words are a set of commonly used words in any language. We'll be using english corpus stopwords and extend it to include some business specific common words considered to be stop words in our case.

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from sklearn.feature_extraction.text import CountVectorizer

# Extend the English Stop Wordss
STOP_WORDS = STOPWORDS.union({'yes','na','hi',
                              'receive','hello',
                              'regards','thanks',
                              'from','greeting',
                              'forward','reply',
                              'will','please',
                              'see','help','able'})

# Generic function to derive top N n-grams from the corpus
def get_top_n_ngrams(corpus, top_n=None, ngram_range=(1,1), stopwords=None):
    vec = CountVectorizer(ngram_range=ngram_range, 
                          stop_words=stopwords).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:top_n]

### Top Unigrams

In [None]:
# Top 50 Unigrams before removing stop words
top_n = 50
ngram_range = (1,1)
uni_grams = get_top_n_ngrams(clean_data.Final_Text, top_n, ngram_range)

df = pd.DataFrame(uni_grams, columns = ['Final_Text' , 'count'])
df.groupby('Final_Text').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', 
    yTitle='Count', 
    linecolor='black', 
    colorscale='piyg',
    title=f'Top {top_n} Unigrams in Final_Text')

# Top 50 Unigrams after removing stop words
uni_grams_sw = get_top_n_ngrams(clean_data.Final_Text, top_n, ngram_range, stopwords=STOP_WORDS)

df = pd.DataFrame(uni_grams_sw, columns = ['Final_Text' , 'count'])
df.groupby('Final_Text').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', 
    yTitle='Count', 
    linecolor='black',
    colorscale='-piyg',
    title=f'Top {top_n} Unigrams in Final_Text without stop words')

### Top Bigrams

In [None]:
# Top 50 Bigrams before removing stop words
top_n = 50
ngram_range = (2,2)
bi_grams = get_top_n_ngrams(clean_data.Final_Text, top_n, ngram_range)

df = pd.DataFrame(bi_grams, columns = ['Final_Text' , 'count'])
df.groupby('Final_Text').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', 
    yTitle='Count', 
    linecolor='black', 
    colorscale='piyg',
    title=f'Top {top_n} Bigrams in Final_Text')

# Top 50 Bigrams after removing stop words
bi_grams_sw = get_top_n_ngrams(clean_data.Final_Text, top_n, ngram_range, stopwords=STOP_WORDS)

df = pd.DataFrame(bi_grams_sw, columns = ['Final_Text' , 'count'])
df.groupby('Final_Text').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', 
    yTitle='Count', 
    linecolor='black',
    colorscale='-piyg',
    title=f'Top {top_n} Bigrams in Final_Text without stop words')

### Top Trigrams

In [None]:
# Top 50 Trigrams before removing stop words
top_n = 50
ngram_range = (3,3)
tri_grams = get_top_n_ngrams(clean_data.Final_Text, top_n, ngram_range)

df = pd.DataFrame(tri_grams, columns = ['Final_Text' , 'count'])
df.groupby('Final_Text').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', 
    yTitle='Count', 
    linecolor='black', 
    colorscale='piyg',
    title=f'Top {top_n} Trigrams in Final_Text')

# Top 50 Trigrams after removing stop words
tri_grams_sw = get_top_n_ngrams(clean_data.Final_Text, top_n, ngram_range, stopwords=STOP_WORDS)

df = pd.DataFrame(tri_grams_sw, columns = ['Final_Text' , 'count'])
df.groupby('Final_Text').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', 
    yTitle='Count', 
    linecolor='black',
    colorscale='-piyg',
    title=f'Top {top_n} Trigrams in Final_Text without stop words')

### Word Cloud
Let us attempt to visualize this as a word cloud for top three groups that has got maximum records. A word cloud enables us to visualize the data as cluster of words and each words displayed in different font size based on the number of occurences of that word . Basically; the bolder and bigger the word show up in the visualization, it implies its more often it’s mentioned within a given text compared to other words in the cloud and therefore would be more important for us.

Let's write a generic method to generate Word Clouds for both Short and Long Description columns.

In [None]:
# replace any single word character with a word boundary
#clean_data.Final_Text.str.replace(r'\b\w\b','').str.replace(r'\s+', ' ')

In [None]:
def generate_word_cloud(corpus):
        # Instantiate the wordcloud object
    wordcloud = WordCloud(width = 800, height = 800, 
                    background_color ='white', 
                    stopwords=STOP_WORDS,
                    # mask=mask,
                    min_font_size = 10).generate(corpus)

    # plot the WordCloud image                        
    plt.figure(figsize = (12, 12), facecolor = None) 
    plt.imshow(wordcloud) 
    plt.axis("off") 
    plt.tight_layout(pad = 0) 

    plt.show()

In [None]:
# Word Cloud for all tickets assigned to GRP_0
generate_word_cloud(' '.join(clean_data[clean_data['Assignment group'] == 'GRP_0'].Final_Text.str.strip()))

In [None]:
# Word Cloud for all tickets assigned to GRP_8
generate_word_cloud(' '.join(clean_data[clean_data['Assignment group'] == 'GRP_8'].Final_Text.str.strip()))

In [None]:
# Word Cloud for all tickets assigned to GRP_25
generate_word_cloud(' '.join(clean_data[clean_data['Assignment group'] == 'GRP_25'].Final_Text.str.strip()))

In [None]:
# Generate wordcloud for Final_Text field
generate_word_cloud(' '.join(clean_data.Final_Text.str.strip()))

## Prepping Dataframe for Model Building

In [None]:
'''# Create a target categorical column
clean_data['Assignment group OneHotEncoded'] = clean_data['Assignment group'].astype('category').cat.codes
clean_data.info()'''

In [None]:
'''# Import OneHot encoder 
from sklearn.preprocessing import LabelBinarizer
from sklearn import preprocessing 
clean_data['Assignment group OneHotEncoded'] = np.nan
# OneHot_encoder object knows how to understand word labels. 
#onehot_encoder = preprocessing.OneHotEncoder() #categories=62
onehot_encoder = LabelBinarizer()
onehot_encoder.fit(clean_data['Assignment group'])
# Encode labels in column
#transformed = onehot_encoder.fit_transform(clean_data['Assignment group'])
#temp_df = pd.DataFrame(transformed, columns=onehot_encoder.get_feature_names())
transformed = onehot_encoder.transform(clean_data['Assignment group'])
temp_df = pd.DataFrame(transformed)
clean_data = pd.concat([clean_data, temp_df], axis=1)
#clean_data
#clean_data['Assignment group OneHotEncoded'].unique()
clean_data'''

In [31]:
clean_data

Unnamed: 0,Caller,Assignment group,language,Final_Text
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello be happen the pc release repeat time blu...
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello be happen the pc release repeat time blu...
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello be happen the pc release repeat time blu...
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello be happen the pc release repeat time blu...
0,vrfpyjwi nzhvgqiw,GRP_24,de,hello be happen the pc let go repeat time blue...
...,...,...,...,...
87,gasbfqvp fmvqgjih,GRP_0,de,"on part , password incorrectly enter please pa..."
97,nizholae bjnqikym,GRP_0,de,Stephryhan need Access below Collaboration Pla...
100,bmhrsxlf ukatbwyi,GRP_0,de,benefit issue benefit issue
101,sjxhcyrq iupxtjcf,GRP_0,de,Security Error travel expense Billing ProgramD...


In [32]:
# Import label encoder 
from sklearn import preprocessing 
  
# label_encoder object knows how to understand word labels. 
label_encoder = preprocessing.LabelEncoder() 
  
# Encode labels in column 'species'. 
clean_data['Assignment group LabelEncoded']= label_encoder.fit_transform(clean_data['Assignment group']) 
  
clean_data['Assignment group LabelEncoded'].unique()

array([17, 25, 18,  4, 35, 26, 24, 32, 21,  1,  8, 12, 27, 13,  6, 23,  2,
       22, 29,  5, 42, 36, 19, 34, 37, 40, 41, 10,  3,  7,  9, 11, 14, 15,
       16, 20, 28, 30, 31, 33, 38, 39,  0])

In [33]:
label_encoded_dict = dict(zip(clean_data['Assignment group'].unique(), clean_data['Assignment group LabelEncoded'].unique()))
len(label_encoded_dict)

43

## Feature Extraction : Bag of Words using CountVectorizer

In [34]:
from sklearn.feature_extraction.text import CountVectorizer

CV = CountVectorizer(max_features=2000)

X_BoW = CV.fit_transform(clean_data['Final_Text']).toarray()
y = clean_data['Assignment group LabelEncoded']

print("Shape of Input Feature :",np.shape(X_BoW))
print("Shape of Target Feature :",np.shape(y))

Shape of Input Feature : (27478, 2000)
Shape of Target Feature : (27478,)


In [35]:
# Splitting Train Test 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_BoW, y, test_size=0.3, random_state = 0, stratify=y)
print('\033[1mShape of the training set:\033[0m', X_train.shape, X_test.shape)
print('\033[1mShape of the test set:\033[0m', y_train.shape, y_test.shape)

[1mShape of the training set:[0m (19234, 2000) (8244, 2000)
[1mShape of the test set:[0m (19234,) (8244,)


In [36]:
def run_classification(estimator, X_train, X_test, y_train, y_test, arch_name=None, pipelineRequired=True, isDeepModel=False):
    # train the model
    clf = estimator

    if pipelineRequired :
        clf = Pipeline([('tfidf', TfidfTransformer()),
                     ('clf', estimator),
                     ])
      
    if isDeepModel :
        clf.fit(X_train, y_train, validation_data=(X_test, y_test),epochs=25, batch_size=128,verbose=1,callbacks=call_backs(arch_name))
        # predict from the clasiffier
        y_pred = clf.predict(X_test)
        y_pred = np.argmax(y_pred, axis=1)
        y_train_pred = clf.predict(X_train)
        y_train_pred = np.argmax(y_train_pred, axis=1)
    else :
        clf.fit(X_train, y_train)
        # predict from the clasiffier
        y_pred = clf.predict(X_test)
        y_train_pred = clf.predict(X_train)
    
    print('Estimator:', clf)
    print('='*80)
    print('Training accuracy: %.2f%%' % (accuracy_score(y_train,y_train_pred) * 100))
    print('Testing accuracy: %.2f%%' % (accuracy_score(y_test, y_pred) * 100))
    print('='*80)
    print('Confusion matrix:\n %s' % (confusion_matrix(y_test, y_pred)))
    print('='*80)
    print('Classification report:\n %s' % (classification_report(y_test, y_pred)))
    

## Logistic Regression

In [None]:
run_classification(LogisticRegression(), X_train, X_test, y_train, y_test)

## Naive Bayes Classifier

In [None]:
run_classification(MultinomialNB(), X_train, X_test, y_train, y_test)

## K-nearest Neighbor

#### Applying Gridsearch to find Best Parameters

In [None]:
param = {'n_neighbors': [3,5,7], 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan']}
gs_knn = GridSearchCV(KNeighborsClassifier(), param, verbose = 1, cv = 3, n_jobs = -1)
gs_knn_results = gs_knn.fit(X_train, y_train)

In [None]:
gs_knn_results.best_score_

In [None]:
gs_knn.best_estimator_

In [None]:
gs_knn.best_params_

In [None]:
knn_clf = gs_knn.best_estimator_

run_classification(knn_clf, X_train, X_test, y_train, y_test)

In [None]:
#run_classification(KNeighborsClassifier(), X_train, X_test, y_train, y_test)

## Support Vector Machine (SVM)

#### Applying Gridsearch to find Best Parameters

In [None]:
?SVC

In [78]:
model = SVC()
param = {'C':[0.5,1,2.5],'kernel':['linear','rbf']}
gs_svm = GridSearchCV (model,param_grid=param, cv=3,scoring ='accuracy')
gs_svm_results = gs_svm.fit(X_train,y_train)

KeyboardInterrupt: 

In [None]:
gs_svm_results.best_score_

In [None]:
gs_svm.best_estimator_

In [None]:
gs_svm.best_params_

In [None]:
svc_clf = gs_svm.best_estimator_

run_classification(svc_clf, X_train, X_test, y_train, y_test)

## Decision Tree

In [None]:
#Using GRIDSearch CV to find hyper parameters

params = {'criterion': ['entropy', 'gini'], 'max_depth': [None,3,4,5,9,10 ],            
               'min_samples_leaf': [ 2, 3, 5,7, 10,20]}
DTreg = DecisionTreeClassifier()
gs_DT = GridSearchCV(estimator = DTreg,
                           param_grid = params,
                           scoring = 'accuracy',
                           cv = 3,)
gs_DT_results = gs_DT.fit(X_train, y_train)


In [None]:
#Using GRIDSearch CV to find hyper parameters

gs_DT_results.best_score_

In [None]:
gs_DT.best_params_

In [None]:
gs_DT.best_estimator_

In [None]:
DT_clf = gs_DT.best_estimator_

In [None]:
run_classification(DT_clf,X_train, X_test, y_train, y_test)

In [None]:
#run_classification(DecisionTreeClassifier(), X_train, X_test, y_train, y_test)

## Random Forest

In [58]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

In [59]:
#Using RandomizedSearchCV (GRIDSearch is taing longer) to find hyper parameters

params = {'n_estimators': [75, 100, 250,500], 'max_depth': [3,5,10,15,25]}
rfc = RandomForestClassifier(class_weight = 'balanced', n_jobs=1)
rs_rfc = RandomizedSearchCV(estimator = rfc,
                           param_distributions = params,
                           cv = 3,random_state = 42, n_jobs = 1, return_train_score= True)
rs_rfc_results = rs_rfc.fit(X_train, y_train)


In [60]:
rs_rfc_results.best_score_

0.5571905031443053

In [61]:
rs_rfc.best_params_

{'n_estimators': 250, 'max_depth': 25}

In [62]:
rs_rfc.best_estimator_

RandomForestClassifier(class_weight='balanced', max_depth=25, n_estimators=250,
                       n_jobs=1)

In [63]:
rfc_clf = rs_rfc.best_estimator_

In [64]:
run_classification(rfc_clf, X_train, X_test, y_train, y_test)

Estimator: Pipeline(steps=[('tfidf', TfidfTransformer()),
                ('clf',
                 RandomForestClassifier(class_weight='balanced', max_depth=25,
                                        n_estimators=250, n_jobs=1))])
Training accuracy: 63.11%
Testing accuracy: 56.91%
Confusion matrix:
 [[604   0   9 ...   9   2   0]
 [  0  30   0 ...   0   0   0]
 [  0   0 125 ...   0   0  11]
 ...
 [  0   0   0 ...  85   0   0]
 [  0   0   4 ...   0 362 110]
 [  4   0   0 ...   0   0 157]]
Classification report:
               precision    recall  f1-score   support

           0       0.96      0.51      0.67      1182
           1       0.79      0.65      0.71        46
           2       0.82      0.68      0.74       184
           3       0.64      0.94      0.76        48
           4       0.81      0.57      0.67       503
           5       0.74      0.86      0.80       225
           6       0.84      0.90      0.87       184
           7       0.94      1.00      0.97      

In [None]:
#run_classification(RandomForestClassifier(n_estimators=100, random_state=0), X_train, X_test, y_train, y_test)

## GradientBoosting

In [41]:
from sklearn.ensemble import GradientBoostingClassifier
#run_classification(GradientBoostingClassifier(n_estimators=100, random_state=0), X_train, X_test, y_train, y_test)

In [42]:
params ={'max_depth':[3,5,10,15],'n_estimators':[5,10,30,50,100]}

In [43]:
gbc = GradientBoostingClassifier(n_estimators=100, random_state=42)
rs_gbc = RandomizedSearchCV(estimator = gbc,
                           param_distributions = params,
                           cv = 3,random_state = 42, n_jobs = 1, return_train_score= True)
#Fitting randomsearch model
rs_gbc_results = rs_gbc.fit(X_train, y_train)

In [44]:
rs_gbc_results.best_score_

0.6972551271388764

In [45]:
rs_gbc.best_params_

{'n_estimators': 50, 'max_depth': 15}

In [46]:
rs_gbc.best_estimator_

GradientBoostingClassifier(max_depth=15, n_estimators=50, random_state=42)

In [47]:
gbc_clf = rs_gbc.best_estimator_

In [48]:
run_classification(gbc_clf, X_train, X_test, y_train, y_test)

Estimator: Pipeline(steps=[('tfidf', TfidfTransformer()),
                ('clf',
                 GradientBoostingClassifier(max_depth=15, n_estimators=50,
                                            random_state=42))])
Training accuracy: 81.88%
Testing accuracy: 71.18%
Confusion matrix:
 [[968   0   3 ...   3   1   4]
 [  0  31   0 ...   0   3   0]
 [  0   0 149 ...   0  13   0]
 ...
 [  1   0   0 ...  90   0   0]
 [  1   0   4 ...   0 551   2]
 [  3   0   0 ...   0  88  96]]
Classification report:
               precision    recall  f1-score   support

           0       0.85      0.82      0.83      1182
           1       0.66      0.67      0.67        46
           2       0.79      0.81      0.80       184
           3       0.76      0.85      0.80        48
           4       0.75      0.70      0.72       503
           5       0.97      0.89      0.93       225
           6       0.92      0.91      0.91       184
           7       0.93      0.93      0.93        58
      

## XGBoosting

In [49]:
!pip install xgboost

Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com/simple, https://urm.nvidia.com/artifactory/api/pypi/sw-colossus-pypi/simple


In [50]:
from xgboost import XGBClassifier
#run_classification(XGBClassifier(), X_train, X_test, y_train, y_test)

In [51]:
params ={'max_depth':[3,5,10,15],'n_estimators':[5,10,30,50,100]}

In [52]:
xgbc = XGBClassifier(class_weight ='balanced', n_jobs=1)
rs_xgbc = RandomizedSearchCV(estimator = xgbc,
                           param_distributions = params,
                           cv = 3,random_state = 42, n_jobs = 1, return_train_score= True)
#Fitting randomsearch model
rs_xgbc_results = rs_xgbc.fit(X_train, y_train)





Parameters: { class_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { class_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { class_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { class_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings bu

In [53]:
rs_xgbc_results.best_score_

0.7107206325139271

In [54]:
rs_xgbc.best_params_

{'n_estimators': 50, 'max_depth': 15}

In [55]:
rs_xgbc.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', class_weight='balanced',
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=15, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=50, n_jobs=1,
              num_parallel_tree=1, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [56]:
xgbc_clf = rs_xgbc.best_estimator_

In [57]:
run_classification(xgbc_clf, X_train, X_test, y_train, y_test)





Parameters: { class_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Estimator: Pipeline(steps=[('tfidf', TfidfTransformer()),
                ('clf',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               class_weight='balanced', colsample_bylevel=1,
                               colsample_bynode=1, colsample_bytree=1, gamma=0,
                               gpu_id=-1, importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=15, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=50,
                               n_jobs=1, num_parallel_

## Bagging

In [70]:
from sklearn.ensemble import BaggingClassifier
#run_classification(BaggingClassifier(n_estimators=10, random_state=0), X_train, X_test, y_train, y_test)

In [71]:
params ={'n_estimators':[5,10,30,50,100],'random_state' :[1,10,20]}

?BaggingClassifier

In [72]:
bgc = BaggingClassifier()
rs_bgc = RandomizedSearchCV(estimator = bgc,
                           param_distributions = params,
                           cv = 3,random_state = 42, n_jobs = 1, return_train_score= True)
#Fitting randomsearch model
rs_bgc_results = rs_bgc.fit(X_train, y_train)

In [73]:
rs_bgc_results.best_score_

0.7156597595128124

In [74]:
rs_bgc.best_params_

{'random_state': 10, 'n_estimators': 100}

In [75]:
rs_bgc.best_estimator_

BaggingClassifier(n_estimators=100, random_state=10)

In [76]:
bgc_clf = rs_bgc.best_estimator_

In [77]:
run_classification(bgc_clf, X_train, X_test, y_train, y_test)

Estimator: Pipeline(steps=[('tfidf', TfidfTransformer()),
                ('clf', BaggingClassifier(n_estimators=100, random_state=10))])
Training accuracy: 82.18%
Testing accuracy: 73.31%
Confusion matrix:
 [[994   0   2 ...   2   8   2]
 [  0  29   0 ...   0   3   0]
 [  0   0 148 ...   0  13   0]
 ...
 [  1   0   0 ...  90   0   0]
 [  1   0   4 ...   0 565   2]
 [  1   0   0 ...   0  97 101]]
Classification report:
               precision    recall  f1-score   support

           0       0.93      0.84      0.88      1182
           1       0.74      0.63      0.68        46
           2       0.78      0.80      0.79       184
           3       0.88      0.94      0.91        48
           4       0.67      0.70      0.69       503
           5       0.96      0.90      0.93       225
           6       0.93      0.91      0.92       184
           7       0.93      0.98      0.96        58
           8       0.84      0.95      0.89       134
           9       0.80      0.97  

## Stacking

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier

estimators = [('rf', RandomForestClassifier(n_estimators=100, random_state=42)), ('svr', make_pipeline(StandardScaler(with_mean=False), LinearSVC(random_state=42)))]

run_classification(StackingClassifier(estimators=estimators, final_estimator=DecisionTreeClassifier()), X_train, X_test, y_train, y_test)

## Voting

In [None]:
from sklearn.ensemble import VotingClassifier

estimators = [('rf', RandomForestClassifier(n_estimators=100, random_state=42)), ('dtc', DecisionTreeClassifier(random_state=42)), ('lsvc', LinearSVC(random_state=42))]

run_classification(VotingClassifier(estimators=estimators, voting='hard'), X_train, X_test, y_train, y_test)

In [None]:
from sklearn.ensemble import VotingClassifier

estimators = [('rf', RandomForestClassifier(n_estimators=100, random_state=42)), ('dtc', DecisionTreeClassifier(random_state=42)), ('lsvc', SVC(kernel='linear',probability=True))]

run_classification(VotingClassifier(estimators=estimators, voting='soft'), X_train, X_test, y_train, y_test)

## Deep Neural Networks

In [None]:
# Load the augmented data from pickle file 
with open('/content/Interim_data.pkl','rb') as f:
    clean_data_DL = pickle.load(f)

In [None]:
clean_data_DL.isnull().sum()

In [None]:
clean_data_DL['Final_Text'] = clean_data_DL['Final_Text'].replace(np.nan, '', regex=True)

In [None]:
clean_data_DL.info()

In [None]:
# Import label encoder 
from sklearn import preprocessing 
  
# label_encoder object knows how to understand word labels. 
label_encoder = preprocessing.LabelEncoder() 
  
# Encode labels in column 'species'. 
clean_data_DL['Assignment group LabelEncoded']= label_encoder.fit_transform(clean_data_DL['Assignment group']) 
  
clean_data_DL['Assignment group LabelEncoded'].unique()

In [None]:
onehot_encoded_dict = dict(zip(clean_data_DL['Assignment group'].unique(), clean_data_DL['Assignment group LabelEncoded'].unique()))
len(onehot_encoded_dict)

In [None]:
# Splitting Train Test 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(clean_data_DL['Final_Text'], clean_data_DL['Assignment group LabelEncoded'], test_size=0.3, random_state = 0, stratify=clean_data_DL['Assignment group LabelEncoded'])
print('\033[1mShape of the training set:\033[0m', X_train.shape, X_test.shape)
print('\033[1mShape of the test set:\033[0m', y_train.shape, y_test.shape)

### Create checkpoints function

In [None]:
#Path where you want to save the weights, model and checkpoints
model_path = "Weights/"
%mkdir Weights

# Define model callbacks
def call_backs(name):
    early_stopping = EarlyStopping(monitor='val_loss', mode='min', min_delta=0.01, patience=3)
    model_checkpoint =  ModelCheckpoint(model_path + name + '_epoch{epoch:02d}_loss{val_loss:.4f}.h5',
                                                               monitor='val_loss',
                                                               verbose=1,
                                                               save_best_only=True,
                                                               save_weights_only=False,
                                                               mode='min',
                                                               period=1)
    return [model_checkpoint, early_stopping]

In [None]:
# Function to build Neural Network
def Build_Model_DNN_Text(shape, nClasses, dropout=0.3):
    """
    buildModel_DNN_Tex(shape, nClasses,dropout)
    Build Deep neural networks Model for text classification
    Shape is input feature space
    nClasses is number of classes
    """
    model = Sequential()
    node = 512 # number of nodes
    nLayers = 4 # number of  hidden layer
    model.add(Dense(node,input_dim=shape,activation='relu'))
    model.add(Dropout(dropout))
    for i in range(0,nLayers):
        model.add(Dense(node,input_dim=node,activation='relu'))
        model.add(Dropout(dropout))
        model.add(BatchNormalization())
    model.add(Dense(nClasses, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    print(model.summary())
    return model

In [None]:
Tfidf_vect = TfidfVectorizer(max_features=2000)
Tfidf_vect.fit(clean_data_DL.Final_Text.astype(str))
X_train_tfidf = Tfidf_vect.transform(X_train)
X_test_tfidf = Tfidf_vect.transform(X_test)

# Instantiate the network
model_DNN = Build_Model_DNN_Text(X_train_tfidf.shape[1], 43)

In [None]:
run_classification(model_DNN, X_train_tfidf, X_test_tfidf, y_train, y_test,pipelineRequired = False,isDeepModel=True, arch_name='DNN')

'''model_DNN.fit(X_train_tfidf, y_train,
                              validation_data=(X_test_tfidf, y_test),
                              callbacks=call_backs("NN"),
                              epochs=10,
                              batch_size=128,
                              verbose=2)
predicted = model_DNN.predict(X_test_tfidf)'''

### Extract Glove Embeddings

In [None]:
#download the glove embedding zip file from http://nlp.stanford.edu/data/wordvecs/glove.6B.zip
from zipfile import ZipFile
# Check if it is already extracted else Open the zipped file as readonly
if not os.path.isfile('glove.6B/glove.6B.200d.txt'):
    #glove_embeddings = 'glove.6B.zip'
    glove_embeddings = '/content/drive/MyDrive/Capstone/glove.6B.zip'
    with ZipFile(glove_embeddings, 'r') as archive:
        archive.extractall('glove.6B')

# List the files under extracted folder
os.listdir('glove.6B')

## Convolutional Neural Networks (CNN)

In [None]:
#gloveFileName = 'glove.6B/glove.6B.200d.txt'
gloveFileName = '/content/glove.6B/glove.6B.200d.txt'
MAX_SEQUENCE_LENGTH = 500
EMBEDDING_DIM=200
MAX_NB_WORDS=75000

# Function to generate Embedding
def loadData_Tokenizer(X_train, X_test,filename):
    np.random.seed(7)
    text = np.concatenate((X_train, X_test), axis=0)
    text = np.array(text)
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    word_index = tokenizer.word_index
    text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Found %s unique tokens.' % len(word_index))
    indices = np.arange(text.shape[0])
    # np.random.shuffle(indices)
    text = text[indices]
    print(text.shape)
    X_train = text[0:len(X_train), ]
    X_test = text[len(X_train):, ]
    embeddings_index = {}
    f = open(filename, encoding="utf8")
    for line in f:
        values = line.split()
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except:
            pass
        embeddings_index[word] = coefs
    f.close()
    print('Total %s word vectors.' % len(embeddings_index))
    return (X_train, X_test, word_index,embeddings_index)


embedding_matrix = []

def buildEmbed_matrices(word_index,embedding_dim):
    embedding_matrix = np.random.random((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            if len(embedding_matrix[i]) !=len(embedding_vector):
                print("could not broadcast input array from shape",str(len(embedding_matrix[i])), "into shape",str(len(embedding_vector)),
                      " Please make sure your"" EMBEDDING_DIM is equal to embedding_vector file ,GloVe,")
                exit(1)
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [None]:
# Generate Glove embedded datasets
X_train_Glove, X_test_Glove, word_index, embeddings_index = loadData_Tokenizer(X_train,X_test,gloveFileName)
embedding_matrix = buildEmbed_matrices(word_index,EMBEDDING_DIM)

In [None]:
def Build_Model_CNN_Text(word_index, embeddings_matrix, nclasses,dropout=0.5):
    """
        def buildModel_CNN(word_index, embeddings_index, nclasses, MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=50, dropout=0.5):
        word_index in word index ,
        embeddings_index is embeddings index, look at data_helper.py
        nClasses is number of classes,
        MAX_SEQUENCE_LENGTH is maximum lenght of text sequences,
        EMBEDDING_DIM is an int value for dimention of word embedding look at data_helper.py
    """
    model = Sequential()
    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embeddings_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)
    # applying a more complex convolutional approach
    convs = []
    filter_sizes = []
    layer = 5
    print("Filter  ",layer)
    for fl in range(0,layer):
        filter_sizes.append((fl+2))
    node = 128
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    for fsz in filter_sizes:
        l_conv = Conv1D(node, kernel_size=fsz, activation='relu')(embedded_sequences)
        l_pool = MaxPooling1D(5)(l_conv)
        #l_pool = Dropout(0.25)(l_pool)
        convs.append(l_pool)
    l_merge = Concatenate(axis=1)(convs)
    l_cov1 = Conv1D(node, 5, activation='relu')(l_merge)
    l_cov1 = Dropout(dropout)(l_cov1)
    l_batch1 = BatchNormalization()(l_cov1)
    l_pool1 = MaxPooling1D(5)(l_batch1)
    l_cov2 = Conv1D(node, 5, activation='relu')(l_pool1)
    l_cov2 = Dropout(dropout)(l_cov2)
    l_batch2 = BatchNormalization()(l_cov2)
    l_pool2 = MaxPooling1D(30)(l_batch2)
    l_flat = Flatten()(l_pool2)
    l_dense = Dense(1024, activation='relu')(l_flat)
    l_dense = Dropout(dropout)(l_dense)
    l_dense = Dense(512, activation='relu')(l_dense)
    l_dense = Dropout(dropout)(l_dense)
    preds = Dense(nclasses, activation='softmax')(l_dense)
    model = Model(sequence_input, preds)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    
    print(model.summary())
    return model

In [None]:
# Train the network and run classification
model_CNN = Build_Model_CNN_Text(word_index,embedding_matrix, 43)
run_classification(model_CNN, X_train_Glove, X_test_Glove, y_train, y_test,pipelineRequired = False,isDeepModel=True, arch_name='CNN')

## Recurrent Neural Networks (RNN) --> Gated Recurrent Unit (GRU)

In [None]:
def Build_Model_RNN_Text(word_index, embeddings_matrix, nclasses,dropout=0.5):
    """
    def buildModel_RNN(word_index, embeddings_matrix, nclasses,  MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=100, dropout=0.5):
    word_index in word index ,
    embeddings_matrix is embeddings_matrix, look at data_helper.py
    nClasses is number of classes,
    MAX_SEQUENCE_LENGTH is maximum lenght of text sequences
    """
    model = Sequential()
    hidden_layer = 3
    gru_node = 32
    
    model.add(Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embeddings_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True))
    print(gru_node)
    for i in range(0,hidden_layer):
        model.add(GRU(gru_node,return_sequences=True, recurrent_dropout=0.2))
        model.add(Dropout(dropout))
        model.add(BatchNormalization())
    model.add(GRU(gru_node, recurrent_dropout=0.2))
    model.add(Dropout(dropout))
    model.add(BatchNormalization())
    model.add(Dense(256, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dense(nclasses, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy',
                      optimizer='sgd',
                      metrics=['accuracy'])
    
    print(model.summary())
    return model

In [None]:
# Train the network and run classification
model_RNN = Build_Model_RNN_Text(word_index,embedding_matrix, 43)
run_classification(model_RNN, X_train_Glove, X_test_Glove, y_train, y_test,pipelineRequired = False,isDeepModel=True, arch_name='RNN')

## RNN with LSTM networks

In [None]:
EMBEDDING_DIM = 200
#gloveFileName = 'glove.6B/glove.6B.100d.txt'
gloveFileName = '/content/glove.6B/glove.6B.200d.txt'

from keras.models import Sequential
from keras.layers import Dense, LSTM, TimeDistributed, Activation
from keras.layers import Flatten, Permute, merge, Input
from keras.layers import Embedding
from keras.models import Model
from keras.layers import Input, Dense, multiply, concatenate, Dropout
from keras.layers import GRU, Bidirectional


def Build_Model_LTSM_Text(word_index, embeddings_matrix, nclasses):
    kernel_size = 2
    filters = 256
    pool_size = 2
    gru_node = 256
    
    model = Sequential()
    model.add(Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embeddings_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True))
    model.add(Dropout(0.25))
    model.add(Conv1D(filters, kernel_size, activation='relu'))
    model.add(MaxPooling1D(pool_size=pool_size))
    model.add(Conv1D(filters, kernel_size, activation='relu'))
    model.add(MaxPooling1D(pool_size=pool_size))
    model.add(Conv1D(filters, kernel_size, activation='relu'))
    model.add(MaxPooling1D(pool_size=pool_size))
    model.add(Conv1D(filters, kernel_size, activation='relu'))
    model.add(MaxPooling1D(pool_size=pool_size))
    model.add(Bidirectional(LSTM(gru_node, return_sequences=True, recurrent_dropout=0.2)))
    model.add(Bidirectional(LSTM(gru_node, return_sequences=True, recurrent_dropout=0.2)))
    model.add(Bidirectional(LSTM(gru_node, return_sequences=True, recurrent_dropout=0.2)))
    model.add(Bidirectional(LSTM(gru_node, recurrent_dropout=0.2)))
    model.add(Dense(1024,activation='relu'))
    model.add(Dense(nclasses))
    model.add(Activation('softmax'))
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    
    print(model.summary())
    return model

In [None]:
X_train_Glove,X_test_Glove, word_index,embeddings_index = loadData_Tokenizer(X_train,X_test,gloveFileName)
embedding_matrix = buildEmbed_matrices(word_index,EMBEDDING_DIM)

model_LTSM = Build_Model_LTSM_Text(word_index,embedding_matrix, 43)
run_classification(model_LTSM, X_train_Glove, X_test_Glove, y_train, y_test,pipelineRequired = False,isDeepModel=True, arch_name='LSTM')