-
Notifications
You must be signed in to change notification settings - Fork 4
/
nsitbot.py
247 lines (185 loc) · 8.01 KB
/
nsitbot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
from fuzzywuzzy import fuzz
import pandas as pd
import os
from trainer import trainer as Train
"""
NSIT Bot main class
"""
"""
method to find similarity in the strings
"""
def sim(str1,str2):
return fuzz.token_sort_ratio(str1,str2) * 0.01
class NSITBot(object):
def __init__(self,**kwargs):
self.__name__ = kwargs.get('name','NSITBot')
self.__owner__ = kwargs.get('owner','NSIT 2020')
self.__version__ = kwargs.get('version','Test')
"""
creating an instance of trainer inside NSITBot
"""
self.trainer = Train(
name = 'TrainerBot'
)
self.hello_bot = ['bot_action','bot_author','bot_name','bot_version','hello','endings']
"""
fetching the classifier and vectorizer that
the NSITBot will use to predict the
classes of the recieved messaged from
the user end.
"""
self.classifier,self.vectorizer = self.trainer.getClassifier()
def predict(self,message):
"""
take the message fromt the user and transform it.
1. convert the message into lower case so
that the vectorizer can perfomr operations
on it.
2. convert it into a LIST of message
'[message]'.
3. transform the list using the vectorizer.transform()
function so that further predictions can be
carried out on the message string.
"""
self.message = message
self.message_toLower = self.message.lower()
self.messageList = [self.message_toLower]
self.transformed_message = self.vectorizer.transform(self.messageList)
self.prediction = self.classifier.predict(self.transformed_message)
"""
the function will now return the prediction as a list
"""
return [self.message_toLower,self.prediction[0]]
def generateResponse(self,botPredict):
"""
method to generate responses based on the query/
message and the type of message
"""
message_tolower = botPredict[0]
prediction = botPredict[1]
if prediction in self.hello_bot:
self.response = self.getBotResponse(message_tolower,prediction)
else:
res = self.getStreamBasedResponse(message_tolower,prediction)
self.response = str(res[1]) + " : " + str(res[0])
return self.response
def getBotResponse(self,message_tolower,prediction):
"""
method to predict the response if the prediction
of the incoming message was among the following:
self.hello_bot = ['bot_aciton','bot_author','bot_name','bot_version','hello']
the response will be generated using the responses,csv
or the hello_bot.csv
"""
if message_tolower:
message_tolower = message_tolower.replace('?','')
df = pd.read_csv("responses/hello_bot.csv")
df = df[df.class_name == prediction]
for index,message in zip(df['message'].index,df['message'].values):
df.set_value(index,'percentage_match',sim(message,message_tolower))
ndf = df.sort_values('percentage_match',ascending=False)
apt_response = ndf['response1'].iloc[0]
return apt_response
def getBest_QueryID(self,message_topredict):
"""
getBest_QueryID() will return the query ID,
i.e., the best possible match from the csv.
the 'status_id' field inside the csv contains
the unique id for every status.'status_id' has
data in format GROUP_ID + '_' + STATUS_ID
each and every status will be checked and
compared with the 'message'.
a unique score or 'percentage_match' will
determine the match %.
the onw with the best match will be returned.
"""
queryDataFrame = pd.read_csv('status.csv')
status_queryDataFrame = queryDataFrame[queryDataFrame.status_type == 'status']
"""
status_queryDataFrame now contains all the
posts with TYPE = 'status'
this sorting is done to only get written
messages from the csv file.
"""
for index,message in zip(status_queryDataFrame['status_message'].index,status_queryDataFrame['status_message'].values):
status_queryDataFrame.set_value(index,'percentage_match',sim(message,message_topredict))
finalFrame = status_queryDataFrame.sort_values('percentage_match',ascending=False)
apt_id = finalFrame['status_id'].iloc[0]
apt_author = finalFrame['status_author'].iloc[0]
#print apt_id,apt_author
return [apt_id,apt_author]
def getBest_Response(self,queryID,queryAuthor):
"""
method for generating the best possible
response for the incoming message using the
queryID generated by matching the incoming
message with each and every message.
"""
commentDataFrame = pd.read_csv('comments.csv')
newDF = commentDataFrame[commentDataFrame.status_id == queryID]
newDF = newDF[newDF.comment_author != queryAuthor]
"""
newDF is the new DF which
contains all the comments for the queryID.
all the comments from the queryAuthor are
removed in order to get the comments from others
only. as the author's comments may contains questions
and will definitely not contain the answer to his/her
query. XD
"""
"""
the dataframe must have a field called 'rating'.
'rating' will be used to rate each and every
comment and then the best of all will be chosen.
"""
for index,likes,loves,wows,hahas,sads,angrys in zip(newDF['num_likes'].index,newDF['num_likes'].values,newDF['num_loves'].values,newDF['num_wows'].values,newDF['num_hahas'].values,newDF['num_sads'].values,newDF['num_angrys'].values):
rating = float(1*int(likes) + 0.5*int(loves) + 1.5*(wows) - (int(angrys) + int(sads)) - 1.5*int(hahas))
newDF.set_value(index,'rating',rating)
"""
now the dataFrame 'newDF' contains
the rating for all the comments in response
to the queryID.
"""
sortedDF = newDF.sort_values('rating',ascending=False)
apt_comment = sortedDF['comment_message'].iloc[0]
apt_author = sortedDF['comment_author'].iloc[0]
#print apt_author,apt_comment
return [apt_comment,apt_author]
def getStreamBasedResponse(self,message_tolower,prediction):
"""
test method for StreamBasedResponse
"""
message_topredict = message_tolower
if message_topredict:
message_topredict = message_topredict.replace('?','')
"""
Two csv files are used.
first file is used to match with the best query
pertaining in the group.
second file contains the comments to all the
queries and will be used to predict the
apt_response for the particular message.
"""
best_query = self.getBest_QueryID(message_topredict)
queryID = best_query[0]
queryAuthor = best_query[1]
best_Response = self.getBest_Response(queryID,queryAuthor)
"""
best_response conatins ==> [response,author]
"""
return best_Response
"""
There is no need to train the bot as the bot is trained automatically before
the prediction phase starts.
prediction function can be found inside the trainer.py file and may be changed
depending upon the user.
the 'getClassifier' function inside the main class automatically invokes
the trainer to train our bot on all previously available data in form of
.TRAINER files
"""
bot = NSITBot(
name="My First Bot",
owner="Manish Devgan",
version="0.0.3"
)
#print bot.getStreamBasedResponse(str(raw_input("Question / Stream Based: ")),"stream_based")