In [1]:
import os
import json
import codecs
import pandas as pd
import numpy as np
import random
import pickle

from joblib import dump, load
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

In [2]:
# Accumulator for [utterance, intent_label] pairs gathered from every intent file.
train_list = list()

In [3]:
# Load the "Exit" intent samples. The context manager closes the file handle
# as soon as parsing finishes instead of leaving it open in the kernel.
file_location = "./data/intent/train_Exit.json"
with codecs.open(file_location, 'r', 'utf-8') as file_stream:
    jdata = json.load(file_stream)

# Peek at the first record to confirm the {'data': <utterance>} layout.
print(jdata['Exit'][0])

{'data': 'Exit'}


In [4]:
# Store every "Exit" utterance lower-cased, paired with its intent label.
train_list.extend([sample['data'].lower(), 'exit'] for sample in jdata['Exit'])
print(len(train_list))

13


In [5]:
# Load the "GenerateLyrics" intent samples. The context manager closes the file
# handle as soon as parsing finishes instead of leaving it open in the kernel.
file_location = "./data/intent/train_GenerateLyrics.json"
with codecs.open(file_location, 'r', 'utf-8') as file_stream:
    jdata = json.load(file_stream)

# Peek at the first record to confirm the {'data': <utterance>} layout.
print(jdata['GenerateLyrics'][0])

{'data': 'Generate Lyrics'}


In [6]:
# Store every "GenerateLyrics" utterance lower-cased, paired with its intent label.
train_list.extend(
    [sample['data'].lower(), 'generate'] for sample in jdata['GenerateLyrics']
)
print(len(train_list))

27


In [7]:
# Spot-check a few of the collected [utterance, label] pairs (indices 9 through 12).
train_list[9:13]

[['i want to exit', 'exit'],
 ['i want to leave', 'exit'],
 ['restart', 'exit'],
 ['cancel', 'exit']]

In [8]:
# Load the "MatchSong" intent samples. The context manager closes the file
# handle as soon as parsing finishes instead of leaving it open in the kernel.
file_location = "./data/intent/train_MatchSong.json"
with codecs.open(file_location, 'r', 'utf-8') as file_stream:
    jdata = json.load(file_stream)

# Peek at the first record to confirm the {'data': <utterance>} layout.
print(jdata['MatchSong'][0])

{'data': 'Find similar songs for me'}


In [9]:
# Store every "MatchSong" utterance lower-cased, paired with its intent label.
train_list.extend([sample['data'].lower(), 'find'] for sample in jdata['MatchSong'])
print(len(train_list))

47


In [10]:
# Load the "RecommendSong" intent samples. The context manager closes the file
# handle as soon as parsing finishes instead of leaving it open in the kernel.
file_location = "./data/intent/train_RecommendSong.json"
with codecs.open(file_location, 'r', 'utf-8') as file_stream:
    jdata = json.load(file_stream)

# Peek at the first record to confirm the {'data': <utterance>} layout.
print(jdata['RecommendSong'][0])

{'data': 'Recommend a song for me'}


In [11]:
# Store every "RecommendSong" utterance lower-cased, paired with its intent label.
train_list.extend(
    [sample['data'].lower(), 'recom'] for sample in jdata['RecommendSong']
)
print(len(train_list))

61


In [12]:
# Load the "Greeting" intent samples. The context manager closes the file
# handle as soon as parsing finishes instead of leaving it open in the kernel.
file_location = "./data/intent/train_Greeting.json"
with codecs.open(file_location, 'r', 'utf-8') as file_stream:
    jdata = json.load(file_stream)

# Peek at the first record to confirm the {'data': <utterance>} layout.
print(jdata['Greeting'][0])

{'data': 'Hello'}


In [13]:
# Store every "Greeting" utterance lower-cased, paired with its intent label.
train_list.extend([sample['data'].lower(), 'greet'] for sample in jdata['Greeting'])
print(len(train_list))

75


In [14]:
# Fixed seed so the in-place shuffle yields the same sample order on every run.
SHUFFLE_SEED = 4
random.seed(SHUFFLE_SEED)
random.shuffle(train_list)

In [15]:
# Separate the shuffled [utterance, label] pairs into parallel feature/target lists.
X_train = [utterance for utterance, _ in train_list]
Y_train = [label for _, label in train_list]

In [16]:
# TF-IDF over unigrams and bigrams. The custom token pattern keeps single-character
# words, and min_df=1 retains every term that appears at least once.
bigram_vectorizer = TfidfVectorizer(
    min_df=1,
    token_pattern=r'\b\w+\b',
    ngram_range=(1, 2),
)
train_bigram_vectors = bigram_vectorizer.fit_transform(X_train)

In [17]:
# Dimensions of the fitted TF-IDF matrix: one row per training utterance,
# one column per extracted unigram/bigram term.
train_bigram_vectors.shape

(75, 215)

In [18]:
# Persist the TF-IDF training matrix. The context manager guarantees the file is
# flushed and closed, instead of relying on garbage collection of an anonymous handle.
# NOTE(review): this pickles the transformed training matrix, not the fitted
# `bigram_vectorizer`; new text cannot be vectorized from this file alone.
# Confirm which object downstream consumers expect.
with open("tfidf_vector.pkl", "wb") as tfidf_file:
    pickle.dump(train_bigram_vectors, tfidf_file)

In [19]:
# Logistic-regression intent classifier trained on the TF-IDF features.
maxent_model = LogisticRegression(solver='lbfgs', random_state=0)
clf_ME = maxent_model.fit(train_bigram_vectors, Y_train)

In [20]:
# RBF-kernel SVM trained on the same TF-IDF features; fit() returns the fitted estimator.
model_svm = SVC(
    kernel='rbf',
    gamma="auto",
    C=5000.0,
)
clr_svm = model_svm.fit(train_bigram_vectors, Y_train)

In [21]:
# Serialize the fitted logistic-regression classifier with joblib; the returned
# list of written filenames is displayed as the cell output.
dump(clf_ME, 'clf_ME.joblib')

['clf_ME.joblib']

In [22]:
# Serialize the fitted SVM classifier with joblib; the returned list of written
# filenames is displayed as the cell output.
dump(clr_svm, 'clr_svm.joblib')

['clr_svm.joblib']

In [23]:
# Reload both classifiers from disk so the predictions below exercise the saved artifacts.
clf_ME_test, clf_svm_test = (
    load(model_path) for model_path in ('clf_ME.joblib', 'clr_svm.joblib')
)

In [24]:
# Hand-written utterances for a quick sanity check of both classifiers.
X_test = [
    'leave system',
    'Give lyrics please',
    'Get me a song for me',
    'Could you please generate lyrics for me',
    'I want to find a similar song',
    'I want some popular songs',
    'Any hippop songs for me',
    'Morning',
]

# Reuse the vectorizer fitted on the training data: transform only, never re-fit.
test_bigram_vectors = bigram_vectorizer.transform(X_test)

In [25]:
# Predict intent labels for the test utterances with each reloaded classifier.
predME, predSVM = (
    classifier.predict(test_bigram_vectors)
    for classifier in (clf_ME_test, clf_svm_test)
)

In [26]:
# Print the logistic-regression predictions followed by the SVM predictions
# so the two label arrays can be compared by eye.
print(predME, predSVM)

['exit' 'generate' 'find' 'generate' 'find' 'recom' 'find' 'greet'] ['exit' 'generate' 'find' 'generate' 'find' 'recom' 'find' 'greet']
