## Database Connection

In [None]:
# Placeholder connection code; adapt it to the actual database type, credentials, and table/file names before use.

# import mysql.connector
# mydb = mysql.connector.connect(
#   host="localhost",
#   user="yourusername",
#   password="yourpassword"
# )

# mycursor = mydb.cursor()
# mycursor.execute("SELECT * FROM data")
# data = mycursor.fetchall()

# print(mydb)

## Train model and create the model_files

In [None]:
import os
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re
import spacy
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import string
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import CountVectorizer
import nltk 
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Create the directory that receives every training artifact written below.
# exist_ok=True replaces the check-then-create pair, which could race with
# another process and is a no-op when the directory already exists.
os.makedirs('model_files', exist_ok=True)

def remove_punct(text):
    """Return ``text`` without ASCII punctuation characters and digit runs.

    Characters listed in ``string.punctuation`` are dropped one by one, then
    every run of the digits 0-9 is removed. Whitespace is left untouched.
    """
    kept_chars = filter(lambda ch: ch not in string.punctuation, text)
    without_punct = "".join(kept_chars)
    return re.sub('[0-9]+', '', without_punct)

# English stop-word list from the NLTK 'stopwords' corpus.
stopword = nltk.corpus.stopwords.words('english')


def remove_stopwords(text):
    """Split ``text`` on non-word characters and drop stop words.

    Args:
        text: The string to split.

    Returns:
        A list of the pieces produced by ``re.split`` that are not in the
        module-level ``stopword`` list. No explicit filtering of empty
        strings (which ``re.split`` can yield at the edges) is performed.
    """
    # Raw string: '\W' inside a plain literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    return [word for word in re.split(r'\W+', text) if word not in stopword]

# Shared Porter stemmer instance used by stemming().
ps = nltk.PorterStemmer()


def stemming(text):
    """Return a list with the Porter stem of every item in ``text``.

    ``text`` is iterated item by item, so it is expected to be a sequence of
    tokens rather than a single string.
    """
    return list(map(ps.stem, text))

# Shared WordNet lemmatizer instance used by lemmatizer().
wn = nltk.WordNetLemmatizer()


def lemmatizer(text):
    """Lemmatize every item of ``text`` and join the results with spaces.

    Unlike stemming(), this returns a single string, not a list.
    """
    lemmas = map(wn.lemmatize, text)
    return ' '.join(lemmas)

def spell_correction(text):
    """Return the result of TextBlob's ``correct()`` for ``text``.

    The value returned is whatever ``TextBlob(text).correct()`` produces; it
    is not converted to ``str`` here.

    NOTE(review): ``TextBlob`` is not imported in the visible part of this
    file, so calling this function raises NameError unless the name is
    brought into scope elsewhere - confirm before enabling it.
    """
    return TextBlob(text).correct()

def clean_data(x):
    """Normalise one raw utterance into a space-joined string of tokens.

    Steps, in order: lower-case, drop non-ASCII characters, remove URLs,
    remove punctuation and digits (``remove_punct``), remove stop words
    (``remove_stopwords``), then join the remaining tokens with spaces.

    Args:
        x: The raw text; must support ``str`` methods.

    Returns:
        The cleaned text as a single string.
    """
    x = x.lower()
    # Encoding to ASCII with errors ignored drops every non-ASCII character.
    x = x.encode('ascii', 'ignore').decode()
    # Raw string: '\S' inside a plain literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    x = re.sub(r'https*\S+', '', x)  # remove urls
    # spell_correction, stemming and lemmatizer are available as optional
    # steps but are currently disabled.
    x = remove_punct(x)  # remove punctuation and digits
    tokens = remove_stopwords(x)  # remove stopwords
    return ' '.join(tokens)
    

### Load the training data
# sheet_name=None loads every sheet of the workbook into a dict keyed by
# sheet name; only the 'train data' sheet is used here.
data = pd.read_excel('RNN-Data_2.xlsx', sheet_name=None)
df_train = (
    data['train data']
    .rename(columns={'utterance': 'text', 'intent': 'label'})
    [['text', 'label']]
)
# Clean every utterance, then record how many whitespace-separated tokens
# are left after cleaning.
df_train['text'] = df_train['text'].apply(clean_data)
df_train['text_length'] = df_train['text'].apply(lambda cleaned: len(cleaned.split()))
import pickle
# Persist the distinct label values in their order of first appearance.
with open('model_files/labels.pkl', 'wb') as handle:
    pickle.dump(df_train['label'].unique().tolist(), handle)

### Encode the labels, then build the vocabulary
from sklearn import preprocessing
# Fit the encoder on the label column and overwrite that column with the
# resulting integer class ids in a single step.
le = preprocessing.LabelEncoder()
df_train['label'] = le.fit_transform(df_train['label'])
# Persist the fitted encoder so integer ids can be mapped back to labels.
with open('model_files/label_encoder.pkl', 'wb') as handle:
    pickle.dump(le, handle)

reviews = df_train.text
# NOTE(review): the 'en' shortcut name is only resolvable on older spaCy
# releases; newer ones need a full package name such as 'en_core_web_sm' -
# confirm against the pinned spaCy version.
tok = spacy.load('en')

# Compiled once at import time instead of on every tokenize() call.
_NON_ASCII_RE = re.compile(r"[^\x00-\x7F]+")
# Matches ASCII punctuation, digits, carriage returns, tabs and newlines.
_PUNCT_DIGIT_RE = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')


def tokenize(text):
    """Split ``text`` into lower-cased tokens using the spaCy tokenizer.

    Runs of non-ASCII characters are replaced by a single space; each
    punctuation character, digit, CR, tab or newline is replaced by a space;
    the text is lower-cased and handed to ``tok.tokenizer``.

    Args:
        text: The string to tokenize.

    Returns:
        A list with the ``.text`` of every token the tokenizer produces.
    """
    text = _NON_ASCII_RE.sub(" ", text)
    nopunct = _PUNCT_DIGIT_RE.sub(" ", text.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

from collections import Counter
# Count how often each token occurs across all cleaned training texts.
counts = Counter()
for text in df_train['text']:
    counts.update(tokenize(text))
# Drop tokens seen fewer than two times.
print("num_words before:",len(counts.keys()))
rare_words = [token for token, freq in counts.items() if freq < 2]
for token in rare_words:
    del counts[token]
print("num_words after:",len(counts.keys()))
# Index 0 is reserved for the empty padding token and index 1 for "UNK";
# the surviving tokens follow in the order they are stored in `counts`.
words = ["", "UNK"]
words.extend(counts)
vocab2index = {token: position for position, token in enumerate(words)}

# Persist the token -> index mapping.
with open('model_files/vocab.pkl','wb') as vocab:
    pickle.dump(vocab2index,vocab)
    
def encode_sentence(text, vocab2index, N=10):
    """Encode ``text`` as a fixed-length array of vocabulary indices.

    Args:
        text: The string to encode; it is tokenized with ``tokenize``.
        vocab2index: Mapping from token to integer index. Tokens missing
            from it are mapped to ``vocab2index["UNK"]``.
        N: Length of the returned array.

    Returns:
        A tuple ``(encoded, length)``. ``encoded`` is an integer array of
        length ``N``, zero-padded on the right or truncated to the first
        ``N`` tokens; ``length`` is the number of real tokens stored in it.
    """
    tokens = tokenize(text)
    ids = np.array([vocab2index.get(piece, vocab2index["UNK"]) for piece in tokens])
    length = min(len(ids), N)
    # Start from all zeros (the padding index) and overwrite the prefix.
    encoded = np.zeros(N, dtype=int)
    encoded[:length] = ids[:length]
    return encoded, length
# encode_sentence returns an (array, length) pair. That pair is ragged, so
# dtype=object is required: recent NumPy releases raise ValueError when asked
# to build an array from a ragged sequence without it. The result is a
# 2-element object array holding [index_array, length].
df_train['encoded'] = df_train['text'].apply(
    lambda x: np.array(encode_sentence(x, vocab2index), dtype=object)
)

### Build training and validation functions
X = list(df_train['encoded'])
y = list(df_train['label'])
from sklearn.model_selection import train_test_split
# Hold out 10% of the samples for validation (the split is not seeded).
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1)

class ReviewsDataset(Dataset):
    """Dataset over pre-encoded samples and their labels.

    Each entry of ``X`` is indexable so that position 0 holds the array of
    token indices and position 1 holds the unpadded length; ``Y`` holds the
    label for the sample at the same position.
    """

    def __init__(self, X, Y):
        self.X, self.y = X, Y

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(token_tensor, label, length)`` for sample ``idx``."""
        sample = self.X[idx]
        # Indices are cast to int32 before being wrapped as a tensor.
        token_tensor = torch.from_numpy(sample[0].astype(np.int32))
        return token_tensor, self.y[idx], sample[1]

# Wrap the training and validation splits as datasets.
train_ds = ReviewsDataset(X_train, y_train)
valid_ds = ReviewsDataset(X_valid, y_valid)
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def train_model(model, epochs=10, lr=0.001):
    """Train ``model`` with Adam and cross-entropy loss.

    Batches come from the module-level ``train_dl``; validation uses the
    module-level ``val_dl`` via ``validation_metrics``. Tensors are moved to
    the module-level ``device``.

    Args:
        model: Module called as ``model(x, l)`` that returns class logits.
        epochs: Number of passes over ``train_dl``.
        lr: Learning rate passed to Adam.

    Returns:
        None. Validation runs after every epoch; the sample-weighted mean
        training loss, validation loss and validation accuracy are printed
        on every fifth epoch, starting with the first.
    """
    # Only parameters that require gradients are handed to the optimizer.
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(trainable, lr=lr)
    for epoch in range(epochs):
        # validation_metrics switches to eval mode, so re-enable training
        # mode at the start of every epoch.
        model.train()
        running_loss, seen = 0.0, 0
        for x, y, l in train_dl:
            inputs = x.long().to(device)
            targets = y.long().to(device)
            logits = model(inputs, l)
            optimizer.zero_grad()
            loss = F.cross_entropy(logits, targets)
            loss.backward()
            optimizer.step()
            # Weight the batch-mean loss by batch size so the epoch average
            # is per sample.
            batch_n = targets.shape[0]
            running_loss += loss.item() * batch_n
            seen += batch_n
        val_loss, val_acc = validation_metrics(model, val_dl)
        if epoch % 5 == 0:
            print("train loss %.3f, test loss %.3f, test accuracy %.3f" % (running_loss / seen, val_loss, val_acc))

def validation_metrics(model, valid_dl):
    """Compute mean cross-entropy loss and accuracy of ``model`` on a loader.

    The model is switched to eval mode and left in that mode on return.
    Tensors are moved to the module-level ``device``.

    Args:
        model: Module called as ``model(x, l)`` that returns class logits.
        valid_dl: Iterable yielding ``(x, y, l)`` batches.

    Returns:
        A tuple ``(loss, accuracy)``: the sample-weighted mean loss as a
        float, and the fraction of correct predictions (a tensor, because
        the correct count is accumulated as a tensor).

    Raises:
        ZeroDivisionError: If ``valid_dl`` yields no samples.
    """
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    # Evaluation needs no gradients; disabling autograd avoids building a
    # graph for every validation batch.
    with torch.no_grad():
        for x, y, l in valid_dl:
            x = x.long().to(device)
            y = y.long().to(device)
            y_hat = model(x, l)
            loss = F.cross_entropy(y_hat, y)
            # Index of the largest logit is the predicted class.
            pred = torch.max(y_hat, 1)[1]
            correct += (pred == y).float().sum()
            total += y.shape[0]
            sum_loss += loss.item() * y.shape[0]
    return sum_loss / total, correct / total

# Number of samples per batch for both loaders.
batch_size = 1000
# Size of the embedding table: one row per entry in `words`.
vocab_size = len(words)
# Training batches are reshuffled every epoch; validation order is fixed.
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=batch_size)

class LSTM_fixed_len(torch.nn.Module):
    """Embedding -> dropout -> single-layer LSTM -> linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        hidden_dim: Size of the LSTM hidden state.
        num_classes: Number of output classes. When omitted, it falls back
            to ``df_train.label.nunique()``, which requires the module-level
            ``df_train`` frame to exist.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes=None):
        super().__init__()
        if num_classes is None:
            # Backward-compatible default: derive the output size from the
            # training frame, as the constructor did before.
            num_classes = df_train.label.nunique()
        # Index 0 is the padding index; its embedding receives no gradient.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x, l):
        """Return class logits for a batch of encoded sentences.

        Args:
            x: Integer tensor of token indices; assumed to be
                (batch, seq_len) since the LSTM uses batch_first=True.
            l: Sequence lengths. Accepted for interface compatibility but
                not used, so padded positions are fed through the LSTM too.

        Returns:
            The linear layer applied to the final hidden state ``ht[-1]``.
        """
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])

# Build the classifier with embedding size 50 and hidden size 500, move it
# to the selected device and train it for 50 epochs.
model = LSTM_fixed_len(vocab_size, 50, 500)
model.to(device)
train_model(model, epochs=50, lr=0.01)

# Saves the whole module object rather than only its state_dict, so loading
# it later needs the LSTM_fixed_len class to be importable under the same name.
torch.save(model, 'model_files/model.pth')

## Deployment

In [27]:
%%writefile app.py
import os
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re
import spacy
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import string
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import CountVectorizer
import nltk 
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

from flask import Flask
from string import Template
from flask import Flask, render_template, request

# Flask application object that the route decorators below attach to.
app = Flask(__name__)
# spaCy pipeline; only its tokenizer is used, by tokenize().
# NOTE(review): the 'en' shortcut name is only resolvable on older spaCy
# releases - confirm against the pinned spaCy version.
tok = spacy.load('en')

def remove_punct(text):
    """Return ``text`` without ASCII punctuation characters and digit runs.

    Characters listed in ``string.punctuation`` are dropped one by one, then
    every run of the digits 0-9 is removed. Whitespace is left untouched.
    """
    kept_chars = filter(lambda ch: ch not in string.punctuation, text)
    without_punct = "".join(kept_chars)
    return re.sub('[0-9]+', '', without_punct)

# English stop-word list from the NLTK 'stopwords' corpus.
stopword = nltk.corpus.stopwords.words('english')


def remove_stopwords(text):
    """Split ``text`` on non-word characters and drop stop words.

    Args:
        text: The string to split.

    Returns:
        A list of the pieces produced by ``re.split`` that are not in the
        module-level ``stopword`` list. No explicit filtering of empty
        strings (which ``re.split`` can yield at the edges) is performed.
    """
    # Raw string: '\W' inside a plain literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    return [word for word in re.split(r'\W+', text) if word not in stopword]

# Shared Porter stemmer instance used by stemming().
ps = nltk.PorterStemmer()


def stemming(text):
    """Return a list with the Porter stem of every item in ``text``.

    ``text`` is iterated item by item, so it is expected to be a sequence of
    tokens rather than a single string.
    """
    return list(map(ps.stem, text))

# Shared WordNet lemmatizer instance used by lemmatizer().
wn = nltk.WordNetLemmatizer()


def lemmatizer(text):
    """Lemmatize every item of ``text`` and join the results with spaces.

    Unlike stemming(), this returns a single string, not a list.
    """
    lemmas = map(wn.lemmatize, text)
    return ' '.join(lemmas)

def spell_correction(text):
    """Return the result of TextBlob's ``correct()`` for ``text``.

    The value returned is whatever ``TextBlob(text).correct()`` produces; it
    is not converted to ``str`` here.

    NOTE(review): ``TextBlob`` is not imported in the visible part of this
    file, so calling this function raises NameError unless the name is
    brought into scope elsewhere - confirm before enabling it.
    """
    return TextBlob(text).correct()

def clean_data(x):
    """Normalise one raw utterance into a space-joined string of tokens.

    Steps, in order: lower-case, drop non-ASCII characters, remove URLs,
    remove punctuation and digits (``remove_punct``), remove stop words
    (``remove_stopwords``), then join the remaining tokens with spaces.

    Args:
        x: The raw text; must support ``str`` methods.

    Returns:
        The cleaned text as a single string.
    """
    x = x.lower()
    # Encoding to ASCII with errors ignored drops every non-ASCII character.
    x = x.encode('ascii', 'ignore').decode()
    # Raw string: '\S' inside a plain literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    x = re.sub(r'https*\S+', '', x)  # remove urls
    # spell_correction, stemming and lemmatizer are available as optional
    # steps but are currently disabled.
    x = remove_punct(x)  # remove punctuation and digits
    tokens = remove_stopwords(x)  # remove stopwords
    return ' '.join(tokens)
    
# Re-binds `tok`; the same pipeline is also loaded right after the Flask app
# is created.
tok = spacy.load('en')

# Compiled once at import time instead of on every tokenize() call.
_NON_ASCII_RE = re.compile(r"[^\x00-\x7F]+")
# Matches ASCII punctuation, digits, carriage returns, tabs and newlines.
_PUNCT_DIGIT_RE = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')


def tokenize(text):
    """Split ``text`` into lower-cased tokens using the spaCy tokenizer.

    Runs of non-ASCII characters are replaced by a single space; each
    punctuation character, digit, CR, tab or newline is replaced by a space;
    the text is lower-cased and handed to ``tok.tokenizer``.

    Args:
        text: The string to tokenize.

    Returns:
        A list with the ``.text`` of every token the tokenizer produces.
    """
    text = _NON_ASCII_RE.sub(" ", text)
    nopunct = _PUNCT_DIGIT_RE.sub(" ", text.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

def encode_sentence(text, vocab2index, N=10):
    """Encode ``text`` as a fixed-length array of vocabulary indices.

    Args:
        text: The string to encode; it is tokenized with ``tokenize``.
        vocab2index: Mapping from token to integer index. Tokens missing
            from it are mapped to ``vocab2index["UNK"]``.
        N: Length of the returned array.

    Returns:
        A tuple ``(encoded, length)``. ``encoded`` is an integer array of
        length ``N``, zero-padded on the right or truncated to the first
        ``N`` tokens; ``length`` is the number of real tokens stored in it.
    """
    tokens = tokenize(text)
    ids = np.array([vocab2index.get(piece, vocab2index["UNK"]) for piece in tokens])
    length = min(len(ids), N)
    # Start from all zeros (the padding index) and overwrite the prefix.
    encoded = np.zeros(N, dtype=int)
    encoded[:length] = ids[:length]
    return encoded, length
    
class ReviewsDataset(Dataset):
    """Dataset over pre-encoded samples and their labels.

    Each entry of ``X`` is indexable so that position 0 holds the array of
    token indices and position 1 holds the unpadded length; ``Y`` holds the
    label for the sample at the same position.
    """

    def __init__(self, X, Y):
        self.X, self.y = X, Y

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(token_tensor, label, length)`` for sample ``idx``."""
        sample = self.X[idx]
        # Indices are cast to int32 before being wrapped as a tensor.
        token_tensor = torch.from_numpy(sample[0].astype(np.int32))
        return token_tensor, self.y[idx], sample[1]

def prep_data(text):
    """Clean and encode one raw message for the model.

    Args:
        text: The raw message string.

    Returns:
        A 2-element object array ``[encoded, length]`` built from the
        ``(encoded, length)`` pair returned by ``encode_sentence`` with the
        module-level ``vocab2index``.
    """
    text = clean_data(text)
    # The (array, length) pair is ragged, so dtype=object is required:
    # recent NumPy releases raise ValueError without it.
    return np.array(encode_sentence(text, vocab2index), dtype=object)

class LSTM_fixed_len(torch.nn.Module):
    """Embedding -> dropout -> single-layer LSTM -> linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        hidden_dim: Size of the LSTM hidden state.
        num_classes: Number of output classes. When omitted, it falls back
            to ``df_train.label.nunique()``.
            NOTE(review): no ``df_train`` is defined in the visible serving
            code, so constructing the class here without ``num_classes``
            would raise NameError - pass it explicitly.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes=None):
        super().__init__()
        if num_classes is None:
            # Backward-compatible default: derive the output size from the
            # training frame, as the constructor did before.
            num_classes = df_train.label.nunique()
        # Index 0 is the padding index; its embedding receives no gradient.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x, l):
        """Return class logits for a batch of encoded sentences.

        Args:
            x: Integer tensor of token indices; assumed to be
                (batch, seq_len) since the LSTM uses batch_first=True.
            l: Sequence lengths. Accepted for interface compatibility but
                not used, so padded positions are fed through the LSTM too.

        Returns:
            The linear layer applied to the final hidden state ``ht[-1]``.
        """
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])
import pickle
# SECURITY NOTE: pickle.load and torch.load execute code embedded in the
# file being read; only load artifacts produced by a trusted training run.
with open('model_files/label_encoder.pkl', 'rb') as handle:
    le = pickle.load(handle)
with open('model_files/labels.pkl', 'rb') as handle:
    labels = pickle.load(handle)
# map_location remaps stored tensors onto the serving device, so a model
# saved on a GPU machine still loads on a CPU-only host.
# NOTE(review): recent PyTorch releases default torch.load to
# weights_only=True, which rejects fully pickled modules - pass
# weights_only=False if the installed version requires it.
model = torch.load('model_files/model.pth', map_location=device)
# Request tensors are moved to `device`, so the model must live there too.
model.to(device)
# Serving is inference only: eval mode disables dropout.
model.eval()

with open('model_files/vocab.pkl', 'rb') as vocab:
    vocab2index = pickle.load(vocab)

@app.route('/')
def home():
	"""Render the ``home.html`` template for the root URL."""
	return render_template('home.html')

@app.route('/predict',methods=['POST'])
def predict():
    """Classify the submitted message and render the result page.

    Reads the ``message`` form field, encodes it with ``prep_data``, runs
    the module-level ``model`` on it and maps the highest-scoring class
    index back to its label with the module-level label encoder ``le``.

    Returns:
        The rendered ``result.html`` template with ``prediction`` set to
        the predicted label.
    """
    # The route is registered for POST only, so no method check is needed;
    # dropping it also removes the path on which `pred` was never assigned.
    message = request.form['message']
    # Shape (1, N): a batch containing the single encoded message.
    x = torch.from_numpy(prep_data(message)[0].astype(np.int32)).reshape(1, -1).long().to(device)
    # Make sure dropout is disabled regardless of the mode the model was
    # saved in, and skip autograd bookkeeping during inference.
    model.eval()
    with torch.no_grad():
        logits = model(x, None)
    pred = le.inverse_transform(np.argmax(logits.cpu().numpy(), axis=1))[0]
    print(pred)
    return render_template('result.html', prediction=pred)


if __name__ == "__main__":
	# Decide what port to run the app on: the PORT environment variable,
	# falling back to 5000 when it is not set.
	port = int(os.environ.get('PORT', 5000))
	# Run the app on the given port; host 0.0.0.0 listens on all interfaces.
	app.run(host='0.0.0.0', port=port)
	# Optional, if we want to run in debugging mode:
	#app.run(debug=True)

Overwriting app.py


In [None]:
!python app.py