In [1]:
import spacy
import time
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
# Load the spaCy pipeline once at the top; `tokenizeText` reads it as a global.
parser = spacy.load("en_core_web_lg")

In [2]:
def timeit(method):
    """Decorator that prints how long each call to ``method`` took.

    After every call the wrapper prints one line containing the function's
    name, the positional arguments, the keyword arguments and the elapsed
    seconds, then returns the wrapped function's result unchanged.

    Parameters
    ----------
    method : callable
        The function to time.

    Returns
    -------
    callable
        A wrapper with the same call signature, name and docstring as
        ``method``.
    """
    # Local import keeps this cell self-contained.
    import functools

    # Without wraps() the decorated function would be reported as 'timed'
    # and lose its docstring.
    @functools.wraps(method)
    def timed(*args, **kw):
        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments the way time.time() can.
        ts = time.perf_counter()
        result = method(*args, **kw)
        te = time.perf_counter()

        # NOTE: args/kw are printed with %r, so large arguments produce
        # correspondingly large output.
        print('%r (%r, %r) %2.2f sec' %
              (method.__name__, args, kw, te - ts))
        return result

    return timed

In [3]:
# Load the labelled utterances; row 0 of the CSV supplies the column names.
df = pd.read_csv(
    '../datasets/restaurant data.csv',
    header=0,
)

In [4]:
# Peek at the first five rows to confirm the `text` / `intent` layout.
print(df.head(n=5))

                                                text      intent
0  How do I make a table registration using Zomato ?     inquiry
1                    I want to cancel my reservation        help
2               Do you offer food delivery as well ?     inquiry
3  Can you suggest Vietnamese restaurants in Korm...  suggestion
4  How do I get myself added as a blogger using z...     inquiry


In [5]:

from sklearn.model_selection import train_test_split

# Inputs are the raw utterance strings, targets are their intent labels.
X = np.array(df['text'])
y = np.array(df['intent'])

# Fix the shuffle seed so the split, and every score computed from it, is the
# same on each Restart & Run All instead of changing from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=True, random_state=42
)

In [6]:
# Re-pair each split's utterances with their labels and wrap them in
# two-column frames; the columns get the default labels 0 (text) and 1 (intent).
train_data = np.array([(text, intent) for text, intent in zip(X_train, y_train)])
TRAIN = pd.DataFrame(train_data)

test_data = np.array([(text, intent) for text, intent in zip(X_test, y_test)])
TEST = pd.DataFrame(test_data)

In [7]:
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [8]:
def tokenizeText(sample):
    """Embed ``sample`` as a single vector.

    Despite the name, no token list is returned: the text is run through the
    module-level spaCy pipeline ``parser`` and the ``.vector`` attribute of
    the parsed result is returned.
    """
    return parser(sample).vector
    
#     tokens = parser(sample)
#     lemmas = []
#     for tok in tokens:
#         lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
#     tokens = lemmas
#     #tokens = [word.lemma_ for word in tokens if word.lemma_ != "-PRON-"]
#     #tokens = [tok for tok in tokens if tok not in set(SYMBOLS)|STOP_WORDS]
    
#     while "" in tokens:
#         tokens.remove("")
#     while " " in tokens:
#         tokens.remove(" ")
#     while "\n" in tokens:
#         tokens.remove("\n")
#     while "\n\n" in tokens:
#         tokens.remove("\n\n")

#     return tokens

In [9]:
def append_vectors(row):
    """Attach the embedding of a row's text as a new ``'vector'`` entry.

    Intended for ``DataFrame.apply(append_vectors, axis=1)``.

    Parameters
    ----------
    row : pandas.Series
        One row whose first field (by position) is the utterance text.

    Returns
    -------
    pandas.Series
        The same row with ``row['vector']`` set to ``tokenizeText(text)``.
    """
    # Read the text by position explicitly. `row[0]` is a label lookup when the
    # columns are numbered 0, 1, ... but falls back to positional access when
    # the columns are named (e.g. the 'utterance' frame built in `predict`),
    # and pandas has deprecated that fallback.
    sample = row.iloc[0]
    row['vector'] = tokenizeText(sample)
    return row


In [10]:
def featurize(dataset):
    """Return ``dataset`` with a ``'vector'`` entry added to every row.

    ``append_vectors`` is applied row by row (``axis=1``) and the resulting
    frame is returned.
    """
    return dataset.apply(append_vectors, axis=1)


In [11]:
# Embed every training utterance, then replace the default 0/1 column labels
# with readable names before previewing the first five rows.
training_data = featurize(TRAIN)
training_data.columns = ['text', 'intent', 'vector']
training_data.head(n=5)

Unnamed: 0,text,intent,vector
0,"Quirky ambience , decent service and awesome P...",compliment,"[0.126237, 0.099554, -0.0978204, 0.00943531, 0..."
1,The empty plate tells a story. It is hands dow...,compliment,"[0.0211373, 0.123354, -0.0391419, -0.0698811, ..."
2,I want an invoice for my order.,help,"[-0.0666859, 0.220541, -0.22886, -0.0182869, 0..."
3,I want to edit my Zomato blogger profile.,help,"[0.061845, 0.109341, -0.310407, 0.0497672, 0.0..."
4,How do I make a table registration using Zomato ?,inquiry,"[-0.0346227, 0.0484739, -0.280154, -0.0162976,..."


In [12]:
# Split the featurized training frame into model inputs (embeddings) and
# targets (intent labels).
X_train, y_train = training_data['vector'], training_data['intent']

In [13]:
clf = LinearSVC()

# Stack the per-row embeddings into one 2-D float array (one row per training
# utterance). The builtin `float` replaces the `np.float` alias, which was
# deprecated in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
X_train = np.array(list(X_train), dtype=float)

In [14]:
# Fit the linear SVM on the embedded utterances; the fitted estimator is the
# cell's displayed value.
clf.fit(X=X_train, y=y_train)
#print(cross_val_score(clf,X_train,y_train))

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [15]:
def append_svc_response(row):
    """Classify one featurized row with the fitted ``clf`` and record the label.

    Intended for ``DataFrame.apply(append_svc_response, axis=1)``.

    Parameters
    ----------
    row : pandas.Series
        A row carrying the utterance embedding under ``'vector'``.

    Returns
    -------
    pandas.Series
        The same row with ``row['predicted']`` set to the predicted label.

    Notes
    -----
    The previous ``try/except`` wrapped only the assignment of the result, so
    it could never catch a prediction failure; it has been removed. Errors
    raised by ``clf.predict`` propagate to the caller, exactly as before.
    """
    # The classifier is called on a 2-D input, so present the single embedding
    # as a one-row matrix. np.asarray leaves an ndarray untouched and also
    # accepts a plain sequence of numbers.
    features = np.asarray(row['vector']).reshape(1, -1)
    row['predicted'] = clf.predict(features)[0]
    return row

In [16]:
def label_correct_responses(row):
    """Flag whether a row's prediction matches its expected intent.

    Compares ``row[1]`` (the expected label) with ``row['predicted']`` and
    stores 1 in ``row['Correct']`` on a match, 0 otherwise. The same row is
    returned.
    """
    is_match = row[1] == row['predicted']
    row['Correct'] = 1 if is_match else 0
    return row

In [17]:
@timeit
def svc_test(test, write = False):
    """Score a featurized frame with the classifier and report accuracy.

    Each row is passed through ``append_svc_response`` and then
    ``label_correct_responses``; the mean of the resulting ``'Correct'``
    column is printed. When ``write`` is truthy the scored frame is also
    saved to ``'Word2vec with SVC.csv'`` (a path relative to the working
    directory). The scored frame is returned.
    """
    scored = (
        test
        .apply(append_svc_response, axis=1)
        .apply(label_correct_responses, axis=1)
    )
    print(scored['Correct'].mean())

    if write:
        scored.to_csv('Word2vec with SVC.csv')

    return scored


In [18]:
# Embed the held-out utterances the same way as the training set.
test_featurized = featurize(TEST)

# Score them without writing a CSV. The flag is passed positionally on
# purpose: `timeit` echoes args and kwargs separately, so the call form is
# part of the printed output.
test_output = svc_test(test_featurized, False)
test_output.columns = [
    'utterance',
    'expected intent',
    'vector',
    'predicted',
    'Correct',
]
test_output

0.72
'svc_test' ((                                                    0           1  \
0                Do you offer food delivery as well ?     inquiry   
1   Where can I find pet friendly cafes in Ahmedab...  suggestion   
2   Best place for a get together and team dinners...  compliment   
3   Biryani was below average, rice and gravy was ...       issue   
4   The food is really yummy and the ambience is s...  compliment   
5   I want to unsubscribe from Zomato newsletter a...        help   
6            When will Zomato gold be available again     inquiry   
7   Why was my blog post deleted from the Zomato s...        help   
8   One of the oldest brewery in the city with a g...  compliment   
9    Will Swiggy be accountable for quality/quantity?     inquiry   
10       Delivery agent took over 2 hours to  deliver       issue   
11          I have a sponsorship proposal for Swiggy.        help   
12  Are there any Japanese restaurants in Gurgaon ...  suggestion   
13       Did not

Unnamed: 0,utterance,expected intent,vector,predicted,Correct
0,Do you offer food delivery as well ?,inquiry,"[-0.173946, 0.219213, -0.161819, -0.212255, 0....",inquiry,1
1,Where can I find pet friendly cafes in Ahmedab...,suggestion,"[0.114198, 0.218588, -0.18886, -0.197642, 0.38...",suggestion,1
2,Best place for a get together and team dinners...,compliment,"[-0.138739, 0.0298069, 0.0109568, -0.0734907, ...",issue,0
3,"Biryani was below average, rice and gravy was ...",issue,"[-0.210329, 0.242726, -0.0421341, -0.12756, 0....",issue,1
4,The food is really yummy and the ambience is s...,compliment,"[-0.0843063, 0.131123, -0.156124, -0.111926, 0...",compliment,1
5,I want to unsubscribe from Zomato newsletter a...,help,"[-0.0296337, 0.168432, -0.240266, 0.06516, 0.0...",help,1
6,When will Zomato gold be available again,inquiry,"[0.016505, 0.103658, -0.0460616, -0.036939, 0....",issue,0
7,Why was my blog post deleted from the Zomato s...,help,"[-0.0882711, 0.201262, -0.276794, 0.106356, -0...",issue,0
8,One of the oldest brewery in the city with a g...,compliment,"[0.0144093, 0.0573523, -0.0166555, -0.166056, ...",compliment,1
9,Will Swiggy be accountable for quality/quantity?,inquiry,"[-0.194883, 0.277057, -0.125059, -0.0127154, 0...",issue,0


In [19]:
def predict(utterance):
    """Return the intent label the fitted ``clf`` predicts for one utterance.

    Parameters
    ----------
    utterance : str
        The text to classify.

    Returns
    -------
    The first (and only) element of ``clf.predict`` for this utterance.
    """
    # Embed the text directly instead of round-tripping through a one-row
    # DataFrame: that frame had a named column, so the row-wise featurizer's
    # `row[0]` depended on pandas' deprecated positional fallback.
    # The builtin `float` replaces `np.float`, which NumPy removed in 1.24.
    vector = np.asarray(tokenizeText(utterance), dtype=float)

    # One sample -> one-row 2-D input for the classifier.
    return clf.predict(vector.reshape(1, -1))[0]

# r = pd.DataFrame({'utterance':[utterance]})
# r = featurize(r)
# #ip = pd.Series(r['vector'])
# ip = np.array(list(r['vector']), dtype=np.float)

# clf.predict(ip)
# #response = clf.predict(ip)[0]
# #response

In [20]:
# Persist the scored test set (utterance, expected intent, vector, prediction,
# correctness flag) next to the source dataset.
test_output.to_csv(
    path_or_buf='../datasets/Restaurant test(Word2vec with SVC).csv'
)

In [21]:
# Spot-check the classifier on a single hand-written utterance; the predicted
# intent label is the cell's displayed value.
utterance = "Where can I get Ethiopian food ?"
predict(utterance)

'suggestion'

In [22]:
# Persist the featurized training set (text, intent, vector) next to the
# source dataset.
training_data.to_csv(
    path_or_buf='../datasets/Restaurant train(Word2vec with SVC).csv'
)

In [23]:
# List the distinct intent labels present in the training split.
set(training_data['intent'].unique())

{'compliment', 'help', 'inquiry', 'issue', 'suggestion'}