In [6]:
import pandas as pd
smalltalk_df = pd.read_csv('data/smalltalk_intent.csv')
smalltalk_df['Output'] = smalltalk_df['Intent'].str.split('_').str[1:].apply('_'.join)
smalltalk_df = smalltalk_df.rename(columns= {'Utterances':'Input'})
smalltalk_df = smalltalk_df[['Input','Output']]
smalltalk_df = smalltalk_df.drop_duplicates()
smalltalk_df.head()


Unnamed: 0,Input,Output
0,who are you?,agent_acquaintance
1,all about you,agent_acquaintance
2,what is your personality,agent_acquaintance
3,define yourself,agent_acquaintance
4,what are you,agent_acquaintance


In [45]:
labels = set(smalltalk_df['Intent'])
print(labels)
greetings_labels = set(smalltalk_df[smalltalk_df['Intent']=='appraisal']['Output'])
print(greetings_labels)



{'agent', 'greetings', 'confirmation', 'emotions', 'user', 'dialog', 'appraisal'}
{'welcome', 'well_done', 'no_problem', 'thank_you', 'good', 'bad'}


In [7]:
df_dict = {}
for label in labels:
    df_dict[label] = smalltalk_df[smalltalk_df['Subject']==label]

KeyError: 'Subject'

In [36]:
from sklearn.model_selection import train_test_split

inputs = smalltalk_df['Input'].values
labels = smalltalk_df['Output'].values

X_train, X_test, y_train, y_test = train_test_split(inputs, labels, stratify=labels, test_size=0.3, random_state=1)


In [37]:
from sklearn . linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np, matplotlib.pyplot as plt
from sklearn . metrics import accuracy_score , f1_score , confusion_matrix

vect = TfidfVectorizer(sublinear_tf=True, use_idf=True)
X_train_tf = vect.fit_transform(X_train)
scores = []
cs = []
#clf = SVC(C=1.2, kernel='rbf').fit(X_train_tf, y_train)
clf = LogisticRegression(penalty='l2', C=1.0, random_state=3, solver='lbfgs').fit(X_train_tf, y_train)

X_test_tf = vect.transform(X_test)
predicted = clf.predict(X_test_tf)
scores.append(accuracy_score(y_test, predicted))


In [38]:

X_test_tf = vect.transform(X_test)
predicted = clf.predict(X_test_tf)
print(confusion_matrix(y_test, predicted))
print(accuracy_score(y_test, predicted))
X_test_tf = vect.transform(['what are you thinking?'])
clf.predict(X_test_tf)

[[7 0 0 ... 0 0 0]
 [0 3 0 ... 0 0 0]
 [0 0 2 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 4 0]
 [0 0 0 ... 0 0 4]]
0.7377483443708609


array(['greetings_whatsup'], dtype=object)

In [39]:
new_data = ["I don't agree with that.", "how are you?","do you like me?"]
processed_new_data = vect.transform(new_data)
clf.predict(processed_new_data)

array(['confirmation_no', 'greetings_how_are_you', 'user_likes_agent'],
      dtype=object)

In [13]:
import numpy as np
import statistics
l=np.array([1,2,3,5,2,8,4,2,7,3,7,9,4,2])
matches=l.argsort()[-3:][::-1]

print(smalltalk_df.iloc[[5,100,400]]['Output'].values)
statistics.mode(smalltalk_df.iloc[[5,100,400]]['Output'].values)


['agent_acquaintance' 'agent_beautiful' 'agent_real']


'agent_acquaintance'

In [155]:
import json

with open('../data/intents.json','r') as file:
    data = json.load(file)['intents']

norm_data=[]
for tag in data:
    patterns = tag['patterns']
    responses = tag['responses']
    for pattern, response in zip(patterns, responses):
        norm_data.append({'Input':pattern, 'Intent':tag['tag'], 'Output':response})
intents_df = pd.DataFrame(norm_data)
intents_df[intents_df['Intent']=='shakespeare_poems']

Unnamed: 0,Input,Intent,Output
61,Read me a poem by Shakespeare.,shakespeare_poems,Certainly! Here's a beautiful poem by William ...


In [None]:
import pandas as pd
import joblib, nltk, re, pprint, string, numpy as np, statistics, json
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity

def stemmer(doc):
    stem = PorterStemmer()
    analyzer = TfidfVectorizer().build_analyzer()
    return [stem.stem(w) for w in analyzer(doc)]


def create_df(filename):
    df = pd.read_csv(f'data/{filename}.csv')
    joblib.dump(df, f'dfs/{filename}.joblib')
    return True



def create_dt_matrix(filename):
    df = joblib.load(f'dfs/{filename}.joblib')
    vect = TfidfVectorizer(use_idf=True, sublinear_tf=True, analyzer=stemmer, lowercase=True)
    matrix = vect.fit_transform(df['Utterance'].values)
    joblib.dump(matrix,f'dtm/{filename}.joblib')
    joblib.dump(vect,f'vects/{filename}.joblib')
    return True

def cosine_sim(input, filename):
    dtm = joblib.load(f'dtm/{filename}.joblib')
    df = joblib.load(f'dfs/{filename}.joblib')
    vect=joblib.load(f'vects/{filename}.joblib')
    similarities = cosine_similarity(vect.transform([input]), dtm).flatten()
    match = similarities.argsort()[-1:]
    return (df.iloc[match]['Intent'].values[0])

def intent_init():
    for filename in ['confirmations','greetings','user']:
        create_df(filename)
        create_dt_matrix(filename)

intent_init()

cosine_sim('what are you doing','greetings')


'do'

In [None]:
def stemmer_sw(doc):
    stem = PorterStemmer()
    analyzer = TfidfVectorizer(stop_words=stopwords.words('english')).build_analyzer()
    return [stem.stem(w) for w in analyzer(doc)]

def create_dt_matrix(filename):
    df = joblib.load(f'dfs/{filename}.joblib')
    vect = TfidfVectorizer(use_idf=True, sublinear_tf=True, analyzer=stemmer_sw, lowercase=True)
    matrix = vect.fit_transform(df['Answer'].values)
    joblib.dump(matrix,f'dtm/{filename}_a.joblib')
    joblib.dump(vect,f'vects/{filename}_a.joblib')
    matrix = vect.fit_transform(df['Question'].values)
    joblib.dump(matrix,f'dtm/{filename}_q.joblib')
    joblib.dump(vect,f'vects/{filename}_q.joblib')

def cosine_sim_answer(input, filename):
    vect = joblib.load(f'vects/{filename}_a.joblib')
    dtm = joblib.load(f'dtm/{filename}_a.joblib')
    df = joblib.load(f'dfs/{filename}.joblib')
    similarities = cosine_similarity(vect.transform([input]), dtm).flatten()
    match = similarities.argsort()[-1:]
    return df.iloc[match]['Answer'].values[0]

def cosine_sim_question(input,filename):
    vect = joblib.load(f'vects/{filename}_q.joblib')
    dtm = joblib.load(f'dtm/{filename}_q.joblib')
    df = joblib.load(f'dfs/{filename}.joblib')
    similarities = cosine_similarity(vect.transform([input]), dtm).flatten()
    match = similarities.argsort()[-1:]
    return df.iloc[match]['Question'].values[0]


'Bonds and stocks are both securities , but the major difference between the two is that (capital) stockholders have an equity stake in the company (i.e. they are owners), whereas bondholders have a creditor stake in the company (i.e. they are lenders).'

In [65]:
c=0
while c<5:
    print(c)
    c+=1
    print(c)

0
1
1
2
2
3
3
4
4
5
