## **Import Libraries**



In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import csv
import math
import heapq
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
import spacy

## **Load the data**

In [5]:
data = pd.read_csv('newdata_2.csv')
# print(tfidf['Name'])
# print(data)
# print(data.shape)

corpus = list(data['Corpus'])
# print(corpus)

## **Build Up the Language Processor**

In [8]:
def my_preprocessing(raw_sentence):
    nlp_tool = spacy.load('en_core_web_sm')
    token_sentence = nlp_tool(raw_sentence.lower())
    with open('./irrelevant_words.txt') as file:
        irrelevantlist = [stopword.replace('\n', '').lower() for stopword in file.readlines()]
#     new_sentence = [word for word in token_sentence if word not in irrelevantlist]
    
    preprocessed_sentence = []
    
    for token in token_sentence:
        if token.pos_ == "PUNCT" or token.is_stop == True or token.is_alpha == False or token.pos_ == "SYM":
            continue

        elif token.lemma_ in irrelevantlist or len(token)==1:
            continue
            
        else:
            preprocessed_sentence.append(token.lemma_)
            
    return preprocessed_sentence


## Basic functions

In [13]:
# Filter the dogs according to the hard constraints (gender, age and HDB approval)
def filter_dog(data,gender,age,hdb):
    filtered = []
    for i in range(335):
        if data.loc[i,'Gender'] in gender:
            if data.loc[i,'Age'] in age:
                if data.loc[i,'HDB'] in hdb:
                    filtered.append(i)
    return filtered

# Test
filtered = filter_dog(data,[1],[2],[0])
print('Number of dogs after filterring: ' + str(len(filtered)))
print('Index of dogs after filtering:')
print(filtered)
# index = []
# for dog in filtered:
#     index.append(dog['Index'])
# print(index)

nofilter = filter_dog(data,[0,1],[0,1,2,3],[0,1])

Number of dogs after filterring: 11
Index of dogs after filtering:
[110, 160, 178, 179, 181, 205, 219, 233, 275, 296, 297]


In [3]:
# Make a list of values of dogs to calculate cosine similarity
def make_list(dog_index,hard_constraints,tfidf,soft_constraints):
    value_list = []
    if hard_constraints == True: 
        value_list.append(data.loc[dog_index]['Gender'])
        value_list.append(data.loc[dog_index]['Age'])
        value_list.append(data.loc[dog_index]['HDB'])
    if tfidf == True:
        for i in range(1770):
            value_list.append(data.loc[dog_index]['tfidf'+str(i)])
    if soft_constraints == True:
        for i in range(6):
            value_list.append(data.loc[dog_index]['Tag'+str(i+3)])
#     print(len(value_list))
    return value_list

dogs_value_list = []
for i in range(335):
    dogs_value_list.append(make_list(i,True,True,True))

KeyboardInterrupt: 

In [17]:
# Calculation of the cosine similarity given two lists of vlaues
def cosine_similarity(listx,listy):
    numerator = 0
    denominator_x = 0
    denominator_y = 0
#     if len(listx) != len(listy):
#         raise Exception('The two input lists of the cosine similarity calculator are not in the same length!')
#     if hard_constraint == True:
#         numerator += data.loc[indexx]['Gender']
    for i in range(len(listx)):
        numerator += listx[i] * listy[i]
        denominator_x += listx[i]**2
        denominator_y += listy[i]**2
    denominator_x = math.sqrt(denominator_x)
    denominator_y = math.sqrt(denominator_y)  
    return (numerator/(denominator_x*denominator_y))

# Test
# print(cosine_similarity(make_list(data.loc[0]),make_list(data.loc[70])))
cosine_similarity([2,3,4],[2,4,6])

0.9925833339709303

In [109]:
# Calculation of cosine similarity between two dogs
# For the soft constraints, if the value of one certain aspect for either dog is 0, it won't be taken into calculation.
def cossim_dog_dog(dogx_index,dogy_index):
    listx = make_list(dogx_index,hard_constraints=True,tfidf=True,soft_constraints=True)
    listy = make_list(dogy_index,hard_constraints=True,tfidf=True,soft_constraints=True)
    for i in range(1773,1779):
        if listx[i] == 0 or listy[i] == 0:
            listx[i] = 0
            listy[i] = 0
    cossim = cosine_similarity(listx,listy)
    return cossim

In [110]:
# Calculation of cosine similarity between one dog and the user's profile
# Used in recommendation for a new user
# The hard constraints are filtered before the calcularion, thus not taken into account
# For the soft constraints, if the value of one certain aspect for the dog is 0, 
# or that of the user's profile is 1, it won't be taken into calculation.
def cossim_dog_userprofile(user,dog_index):
    listx = make_list(user,False,True,True)
    listy = make_list(dog,False,True,True)
    for i in range(1773-3,1779-3):
        if listx[i] == 1 or listy[i] == 0:
            listx[i] = 0
            listy[i] = 0
    cossim = cosine_similarity(listx,listy)
    return cossim

# Test
# cossim_dog_dog(data.loc[0],data.loc[100])

In [111]:
# Calculation of cosine similarity between one dog and the user's history selection
# Used in recommendation for an old user after he/she has selected some dogs
# Only consider the descriptions of the dogs
def cossim_dog_userhistory(history,dog_index):
#     listy = make_list(dog_index,hard_constraints=False,tfidf=True,soft_constraints=False,data)
    listy = make_list(dog_index,False,True,False)
    cossim = cosine_similarity(history,listy)
    return cossim

In [8]:
# Based on user's history data, build a user's prefernce.
# Only consider the tfidf
def user_history(dogs_index):
    history = []
    history.append(999)
    history.append('History')
    history.append(0)
    history.append(0)
    history.append(0)
    history.append('courpus')
    for i in range(6):
        history.append(0)
    for i in range(1770):
        summ = 0
        average = 0
        for index in dogs_index:
            summ += data.loc[index]['tfidf'+str(i)]
        average = summ/len(dogs_index)
        history.append(average)
#     data.loc[335] = history
    return history

Unnamed: 0,Index,Name,Gender,Age,HDB,Corpus,Tag3,Tag4,Tag5,Tag6,...,tfidf1760,tfidf1761,tfidf1762,tfidf1763,tfidf1764,tfidf1765,tfidf1766,tfidf1767,tfidf1768,tfidf1769
0,0,Adora,0,3,1,absolute darling food take medicine food aggr...,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,Adore Blessing,0,3,1,tip tiny bit smart unfamiliar pretend calm fi...,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,Ah Boy,1,3,0,sweet absolutely companion hesitate affection...,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,Ah Leng,0,3,0,anxious afraid affection trust wriggle seek,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,Alaska,1,3,0,recall post baby display change temperament b...,1,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331,331,Vicki,0,3,0,low energy adorable sweetheart chew furniture...,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
332,332,Watson,1,3,1,wander gleefully carry crate different bite p...,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
333,333,Wiley,1,3,0,affectionate excitable hyper fearful tuggish ...,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
334,334,Zane,1,3,0,handsome bad aggression deep crave puppy driv...,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Recommender for old users based on the user's history prefernece

In [115]:
# Find the similar dogs according to cosine similarity
def recommend_userhistory(dogs_filter_index,history_index,target_num):
    similarity = []
    history = user_history(history_index)
#     print(len(history))
    num = 1
    for index in dogs_filter_index:
        cos_sim = cossim_dog_userhistory(history,index)
        print(str(num) + '/' + str(len(dogs_filter_index)) + ' : ' + str(cos_sim))
        similarity.append(cos_sim)
        num += 1
#     print(similarity)
    max_index_ = list(map(similarity.index, heapq.nlargest(target_num,similarity)))
    max_index = []
    for i in range(len(max_index_)):
        max_index.append(dogs_filter_index[max_index_[i]])
    max_value = heapq.nlargest(target_num,similarity)
    return [[max_index],[max_value]]

In [127]:
filtered_index = filter_dog([0,1],[3],[0])
# print(filtered_index)
history_index = [0,20,42,25]
recommend_dogs = recommend_userhistory(nofilter,history_index,5)
print(recommend_dogs)

1/335 : 0.5273493333322952
2/335 : 0.02349846609718914
3/335 : 0.14396210159463668
4/335 : 0.10900206350340742
5/335 : 0.05229946239031541
6/335 : 0.10718501103979583
7/335 : 0.06679619231839563
8/335 : 0.06580937181830881
9/335 : 0.011807634328146476
10/335 : 0.04005512091193892
11/335 : 0.0
12/335 : 0.056824401172797495
13/335 : 0.17268415543140814
14/335 : 0.08312462133693337
15/335 : 0.06911183112637968
16/335 : 0.08731604662277424
17/335 : 0.06353440733884716
18/335 : 0.01475314018192458
19/335 : 0.06697547130806672
20/335 : 0.1150197117999076
21/335 : 0.48418817050636465
22/335 : 0.03418436478626665
23/335 : 0.07322404297391855
24/335 : 0.020954147433664312
25/335 : 0.03732223605520729
26/335 : 0.5180140458040923
27/335 : 0.07282935831839055
28/335 : 0.01938373470928547
29/335 : 0.15294005021534832
30/335 : 0.08612786457493765
31/335 : 0.07308474391656578
32/335 : 0.10780590817106742
33/335 : 0.011912637925654371
34/335 : 0.09749775620943132
35/335 : 0.12795530076274902
36/335 : 

KeyboardInterrupt: 

## **Recommender for new users**

In [None]:
def filter(gender,age,hdb):
  filtered = []
  for i in range(335):
    if data.loc[i,'Gender'] in gender:
      if data.loc[i,'Age'] in age:
        if data.loc[i,'HDB'] in hdb:
          filtered.append(data.loc[i])
  return filtered

# dog_filter = filter([0],[1,3],[1])
# print(len(dog_filter))
# # print(dog_filter[3])

# user_dog_des = 'I want a vjhgycfuyfytf.'
# user_dog_des = my_preprocessing(user_dog_des)
# print(user_dog_des)
# user_dog_des_str = ''
# for word in user_dog_des:
#   user_dog_des_str += word
#   user_dog_des_str += ' '
# corpus_new = []
# corpus_new.append(user_dog_des_str)
# for dog in dog_filter:
#   corpus_new.append(dog['Corpus'])
# print(len(corpus_new))
# # print(len(corpus))
# print(corpus_new)
def generateNewCorpus(dog_filter, dog_description):
  print(len(dog_filter))
  # print(dog_filter[3])

  user_dog_des = my_preprocessing(dog_description)
  print(user_dog_des)
  user_dog_des_str = ''
  for word in user_dog_des:
    user_dog_des_str += word
    user_dog_des_str += ' '
  corpus_new = []
  corpus_new.append(user_dog_des_str)
  for dog in dog_filter:
    corpus_new.append(dog['Corpus'])
  print(len(corpus_new))
  # print(len(corpus))
  print(corpus_new)
  return corpus_new

71
['vjhgycfuyfytf']
72
['vjhgycfuyfytf ', ' absolute darling food take medicine food aggression starve point confident trust independent separation anxiety company story able smiley face', ' tip tiny bit smart unfamiliar pretend calm fine cool pup mesmerize pup puppy inside', ' eat handfeed meal eat fearful take trust cheer confidence', ' ash young mum hard gentle nurture soul shine quiet thoughtful smooch happily trick caring compassionate confidence understand bit shy ideal child pet perfect companion quiet friendly pet good manner ash pup bail pound save euthanize petite size perfect ash great compassion shy young mum proud irresistibly adorable pup nervous doggie timid nature encounter warm low maintenance gentle opportunity explore perfect companion furry chill petite size sharp feature narrow bridge large pointy ash shiny beauty throw curveball shyness noticed easily forget long special true affectionate beauty lie beauty lie', ' long aggressive threaten attack fear change enjoy

In [None]:
# vectorizer = CountVectorizer()
# word_vec = vectorizer.fit_transform(corpus_new)
# # print(word_vec.toarray())
# # print(vectorizer.get_feature_names())
# transformer = TfidfTransformer()
# tfidf = transformer.fit_transform(word_vec)
# tfidf_matrix = tfidf.toarray()
# print(tfidf_matrix.shape)
# # print(tfidf_matrix)
# # print(tfidf_matrix[71][1])
# cosim = []
# denominator_user = 0
# for j in range(tfidf_matrix.shape[1]):
#   denominator_user += tfidf_matrix[0][j]**2
# denominator_user = math.sqrt(denominator_user)
# for i in range(1,tfidf_matrix.shape[0]):
#   numerator = 0
#   denominator_target = 0
#   for j in range(tfidf_matrix.shape[1]):
#     numerator += tfidf_matrix[0][j]*tfidf_matrix[i][j]
#     denominator_target += tfidf_matrix[i][j]**2
#   denominator_target = math.sqrt(denominator_target)
#   cosim.append(numerator/(denominator_user*denominator_target))

# target_num = 5
# max_index = list(map(cosim.index, heapq.nlargest(target_num,cosim)))
# max_value = heapq.nlargest(target_num,cosim)
# print(max_index)
# print(max_value)

# for index in max_index:
#   print(dog_filter[index]['Corpus'])

def getTargetDogs(dog_description):  
  dog_filter = filter([0],[1,3],[1])
  vectorizer = CountVectorizer()
  corpus_new = generateNewCorpus(dog_filter, dog_description)
  word_vec = vectorizer.fit_transform(corpus_new)
  # print(word_vec.toarray())
  # print(vectorizer.get_feature_names())
  transformer = TfidfTransformer()
  tfidf = transformer.fit_transform(word_vec)
  tfidf_matrix = tfidf.toarray()
  print(tfidf_matrix.shape)
  # print(tfidf_matrix)
  # print(tfidf_matrix[71][1])
  cosim = []
  denominator_user = 0
  for j in range(tfidf_matrix.shape[1]):
    denominator_user += tfidf_matrix[0][j]**2
  denominator_user = math.sqrt(denominator_user)
  for i in range(1,tfidf_matrix.shape[0]):
    numerator = 0
    denominator_target = 0
    for j in range(tfidf_matrix.shape[1]):
      numerator += tfidf_matrix[0][j]*tfidf_matrix[i][j]
      denominator_target += tfidf_matrix[i][j]**2
    denominator_target = math.sqrt(denominator_target)
    cosim.append(numerator/(denominator_user*denominator_target))

  target_num = 5
  max_index = list(map(cosim.index, heapq.nlargest(target_num,cosim)))
  max_value = heapq.nlargest(target_num,cosim)
  print(max_index)
  print(max_value)

  for index in max_index:
    print(dog_filter[index]['Corpus'])
  return max_index

dog_description = "I like a smart dog"
target = getTargetDogs(dog_description)
print('target:', target)


(72, 714)
[0, 0, 0, 0, 0]
[0.0, 0.0, 0.0, 0.0, 0.0]
 absolute darling food take medicine food aggression starve point confident trust independent separation anxiety company story able smiley face
 absolute darling food take medicine food aggression starve point confident trust independent separation anxiety company story able smiley face
 absolute darling food take medicine food aggression starve point confident trust independent separation anxiety company story able smiley face
 absolute darling food take medicine food aggression starve point confident trust independent separation anxiety company story able smiley face
 absolute darling food take medicine food aggression starve point confident trust independent separation anxiety company story able smiley face
