In [1]:
import os
import json
import codecs
import pandas as pd
import numpy as np
import random

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn import metrics

In [2]:
import random

In [3]:
import os
from google.colab import drive
drive.mount('/content/gdrive')

# Change working directory to be current folder
os.chdir('/content/gdrive/My Drive/MyColab/PLP')
!ls

Mounted at /content/gdrive
 bigram_vectorizer.pkl		   SBert_embedder.pkl
 ch21.pkl			   Small_talk_Intent.csv
 chatbot_intent			   Small_talk_Intent.xlsx
 clr_svm.pkl			  'spi_index_labelled vFinal.csv'
 corpus_embeddings.pkl		  'spi_index_labelled vFinal.db'
 gqlmodel.h5			   terms_questions.csv
 GQLquestion.pkl		   textsql.csv
 investopedia_corpus.csv	   tfidf_matrix.pkl
 investopedia_terms_question.csv   tfidf.pkl


In [4]:
casual_conversation_df = pd.read_excel('Small_talk_Intent.xlsx').drop_duplicates(['Utterances', 'Answer'])
casual_conversation_df.head()

Unnamed: 0,Utterances,Answer,Intent
0,very funny,erm okay.,smalltalk_emotions_ha_ha
1,a bad time to talk,Ok! Bye!,smalltalk_user_does_not_want_to_talk
2,a good day,Yes anything I can help?,smalltalk_greetings_hello
3,a good morning,Good morning!,smalltalk_greetings_goodmorning
4,a great day isn't it?,Yes anything I can help?,smalltalk_greetings_hello


In [5]:
#preparing training data for the chatterbot
groupIntentCasualConversation = list(casual_conversation_df.groupby(['Intent'], group_keys=True))
for groupDF in groupIntentCasualConversation:
  groupname = groupDF[0]
  dataframe = groupDF[1].loc[:,['Utterances','Answer']]
  with open(f'chatbot_intent/{groupname}.yml','w') as f:
    f.write(f'categories:\n- {groupname}\n')
    f.write(f'conversations:\n')
    for data in dataframe.iloc:
      f.write(f'- - {data["Utterances"]}\n')
      f.write(f'  - {data["Answer"]}\n')

In [6]:
#print out one of the sample data
fileList = os.listdir('chatbot_intent')
print(fileList[0])
with open(f'chatbot_intent/{fileList[0]}','r') as f:
  for line in f.readlines():
    print(line)

smalltalk_agent_acquaintance.yml
categories:

- smalltalk_agent_acquaintance

conversations:

- - about yourself

  - I am a financially knowledgeable bot

- - all about you

  - I am a financially knowledgeable bot

- - define yourself

  - I am a financially knowledgeable bot

- - describe yourself

  - I am a financially knowledgeable bot

- - give me more about your personality

  - I am a financially knowledgeable bot

- - i want to get to know you better

  - I am a financially knowledgeable bot

- - I want to know more about you

  - I am a financially knowledgeable bot

- - i want to know more about you

  - I am a financially knowledgeable bot

- - i want to know something about you

  - I am a financially knowledgeable bot

- - I want to know you better

  - I am a financially knowledgeable bot

- - i want to know you better

  - I am a financially knowledgeable bot

- - i'd like to know more about you

  - I am a financially knowledgeable bot

- - introduce yourself

  - I a

In [7]:
#prepare the questions for investopedia terms
import random
import csv
investopedia_corpus = pd.read_csv('investopedia_corpus.csv')
investopedia_terms = list(investopedia_corpus['term'].map(lambda x: x.lower()))
question_tags = ["what is",
"what do you mean by",
"explain",
"define",
"can you tell me about",
"what is meant by",
"can you give me an idea about",
"can you please give short description about",
"briefly explain",
"what do you know about",
"please explain",
"can you say about",
"what is meant by",
"describe",
"what do you know",
"help me with"]
terms_question_corpus = []
for terms in investopedia_terms:
  tempQuestionTag = question_tags.copy()
  for i in range(3):
    question_tag = random.choice(tempQuestionTag)
    tempQuestionTag.remove(question_tag)
    terms_question_corpus.append([terms,f'{question_tag} {terms}'])
terms_question =random.choices([ question for terms, question in terms_question_corpus],k=3000)

In [8]:
investopedia_terms_question = pd.read_csv('terms_questions.csv')
investopedia_terms_question.shape

(17312, 2)

In [9]:
import pickle
with open('GQLquestion.pkl','rb') as f:
  gql_questions = pickle.load(f)
#random pick 3000 questions from gql training data
gql_questions = random.choices(gql_questions,k=3000)

In [12]:
#random pick 3000 questions from sql training data
sql_questions_df = pd.read_csv('textsql.csv')
sql_questions = random.choices(list(sql_questions_df['query']),k=3000)

In [10]:
#random pick 3000 questions from chatterbot training data
casual_conversation = random.choices(list(casual_conversation_df['Utterances'].apply(str)),k=3000)

In [18]:
#prepare train and test data
dataList = gql_questions + sql_questions + casual_conversation + terms_question
intentList = len(gql_questions)*['gql'] + len(sql_questions)*['sql'] + len(casual_conversation)*['casual'] + len(terms_question)*['terms']
random.seed(4)
suffledData = list(tuple(zip(dataList,intentList)))
random.shuffle(suffledData)
X = [data[0] for data in suffledData]
y = [data[1] for data in suffledData]
text_train, text_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [19]:
#apply bigram vectoriazation to perform natural language language and detect the differences across each intent.
bigram_vectorizer = TfidfVectorizer(ngram_range=(1, 2),token_pattern=r'\b\w+\b', min_df=1)
train_bigram_vectors = bigram_vectorizer.fit_transform(text_train)
test_bigram_vectors = bigram_vectorizer.transform(text_test)

print(train_bigram_vectors.shape)
print(test_bigram_vectors.shape)

#apply ch21 test to select features according to the 1000 highest scores.
ch21 = SelectKBest(chi2, k=1000)
train_bigram_Kbest = ch21.fit_transform(train_bigram_vectors, y_train)
test_bigram_Kbest = ch21.transform(test_bigram_vectors)

print(train_bigram_Kbest.shape)
print(test_bigram_Kbest.shape)

(8400, 19138)
(3600, 19138)
(8400, 1000)
(3600, 1000)


In [20]:
#apply logistic regression to perform intent classification
clf_ME = LogisticRegression(random_state=0, solver='lbfgs').fit(train_bigram_Kbest, y_train)
predME = clf_ME.predict(test_bigram_Kbest)
pred = list(predME)
print(metrics.confusion_matrix(y_test, pred))
print(metrics.classification_report(y_test, pred))

#apply SVM to perform intent classification
model_svm = SVC(C=5000.0, gamma="auto", kernel='rbf')
clr_svm = model_svm.fit(train_bigram_Kbest, y_train)   
predicted = clr_svm.predict(test_bigram_Kbest)
print(metrics.confusion_matrix(y_test, predicted))
print(np.mean(predicted == y_test) )
print(metrics.classification_report(y_test, predicted))

[[891   9   1  14]
 [  6 867   0   3]
 [  0   0 923   0]
 [  0   3   0 883]]
              precision    recall  f1-score   support

      casual       0.99      0.97      0.98       915
         gql       0.99      0.99      0.99       876
         sql       1.00      1.00      1.00       923
       terms       0.98      1.00      0.99       886

    accuracy                           0.99      3600
   macro avg       0.99      0.99      0.99      3600
weighted avg       0.99      0.99      0.99      3600

[[903   6   0   6]
 [  3 868   0   5]
 [  0   0 923   0]
 [  0   1   0 885]]
0.9941666666666666
              precision    recall  f1-score   support

      casual       1.00      0.99      0.99       915
         gql       0.99      0.99      0.99       876
         sql       1.00      1.00      1.00       923
       terms       0.99      1.00      0.99       886

    accuracy                           0.99      3600
   macro avg       0.99      0.99      0.99      3600
weighted avg

In [21]:
queryTest = random.choice(gql_questions)
print(queryTest)
test_bigram_vectors = bigram_vectorizer.transform([queryTest])
test_bigram_Kbest = ch21.transform(test_bigram_vectors)
predME = clr_svm.predict(test_bigram_Kbest)
predME

"No one has come out and denied it as far as I can tell," said George Goncalves, head of fixed income strategy at what company?


array(['gql'], dtype='<U6')

In [22]:
queryTest = random.choice(list(sql_questions_df['query']))
print(queryTest)
test_bigram_vectors = bigram_vectorizer.transform([queryTest])
test_bigram_Kbest = ch21.transform(test_bigram_vectors)
predME = clr_svm.predict(test_bigram_Kbest)
print(predME)

Countries having Data Sources Score between 50 and 82 in 2016
['sql']


In [23]:
queryTest = 'please explain abbreviated new drug submission'
print(queryTest)
test_bigram_vectors = bigram_vectorizer.transform([queryTest])
test_bigram_Kbest = ch21.transform(test_bigram_vectors)
predME = clr_svm.predict(test_bigram_Kbest)
print(predME)

please explain abbreviated new drug submission
['terms']


In [24]:
with open('bigram_vectorizer.pkl', 'wb') as f:
  pickle.dump(bigram_vectorizer,f)
with open('ch21.pkl', 'wb') as f:
  pickle.dump(ch21,f)
with open('clr_svm.pkl', 'wb') as f:
  pickle.dump(clr_svm,f)