In [1]:
import math
import os
import re

import numpy as np
import pandas as pd
from pythainlp import word_tokenize
from pythainlp.corpus import thai_stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
#________________________________________________________________________
# Matches everything that must be stripped before tokenising: any character
# that is not in the Thai block (U+0E00-U+0E7F), an ASCII letter, an
# apostrophe or a space, plus leading / trailing / doubled apostrophes.
# Compiled once instead of on every call.
_BOW_STRIP_RE = re.compile(u"[^\u0E00-\u0E7Fa-zA-Z' ]|^'|'$|''")

# Extra tokens dropped after tokenisation, on top of the PyThaiNLP stop words:
# punctuation, ASCII and Thai digits, single Thai consonants and
# recipe-specific filler tokens. Built once instead of on every call.
_BOW_STOP_SIGNS = frozenset(
           (' ', ',', '.','..','...', ':', ';','%','+','-','*','/','ๆ','0','1','2','3','4','5','6','7','8','9','\n','\n\n','-','g','Kg','เสิร์ฟ','๐','๑','๒','๓','๔','๕','๖','๗','๘','๙',
            'ก','ข','ฃ','ค','ฅ','ง','จ','ฉ','ช','ญ','ณ','ด','ต','ถ','ท','ธ','น','บ','ป','ผ','ฝ','พ','ฟ','ภ','ม','ย','ร','ล','ว','ศ','ษ','ส','ห','อ',
            'ผสม','คั้น','หั่น','วัตถุดิบ','มล','Shallot','แง่ง','TIP','มล','มล.','Seasoning','เส','ริ์ฟ','วิธีทำ','เด',
            'ช้อนโต๊ะ','ช้อนชา','ถ้วย','ทัพพี','เล็กน้อย','หัว','กรัม','ml','cc','ต้น','ลูก','ตามชอบ','ห่อ','ชต','ชช','ซีก','ชิ้น','แท่ง','ลิตร','มิลลิลิตร','กิโลกรัม','ขีด','ฟอง',
           'แช่น้ำเย็น','พักไว้', 'สำหรับทอด','ดอก','Shredded')
)


def bagofwords(text):
    """Turn a piece of text into a filtered list of tokens.

    Steps:
      1. Remove special characters (see ``_BOW_STRIP_RE``).
      2. Tokenise with PyThaiNLP's ``newmm`` engine, discarding whitespace.
      3. Drop PyThaiNLP Thai stop words and the tokens in ``_BOW_STOP_SIGNS``.

    Parameters
    ----------
    text : str
        Raw text. Anything that is not a non-empty ``str`` (for example the
        ``NaN`` pandas uses for a missing cell, or ``None``) yields an empty
        list instead of raising ``TypeError`` inside the regex substitution.

    Returns
    -------
    list of str
        Remaining tokens in their original order; duplicates are kept.
    """
    if not isinstance(text, str) or not text:
        return []

    stopwords = thai_stopwords()
    cleaned = _BOW_STRIP_RE.sub("", text)
    tokens = word_tokenize(cleaned, engine="newmm", keep_whitespace=False)
    return [
        token
        for token in tokens
        if token not in stopwords and token not in _BOW_STOP_SIGNS
    ]


#________________________________________________________________________
#Make word set
def makeWordSet(df):
    """Collect the vocabulary of the dataset.

    Every row's ``name`` value is passed through ``bagofwords`` and the
    resulting tokens are merged into a single set.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``name`` column.

    Returns
    -------
    set
        All distinct tokens found across the rows (empty for an empty frame).
    """
    vocabulary = set()
    for _, row in df.iterrows():
        vocabulary.update(bagofwords(row['name']))
    return vocabulary
#________________________________________________________________________
#Make WordDict
def makeWordDictListandTFBowList(df,wordSet):
    """Build per-row token counts and term frequencies.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``name`` column; each row is treated as one document.
    wordSet : iterable of str
        Vocabulary; every dictionary produced has exactly these keys.

    Returns
    -------
    tuple (list of dict, list of dict)
        First list: for each row, token -> raw count over the vocabulary.
        Second list: for each row, the matching ``computeTF`` result.
    """
    # All-zero counter over the vocabulary; copied for every row so each
    # document starts from the same keys.
    zero_counts = dict.fromkeys(wordSet, 0)

    count_dicts = []
    tf_dicts = []
    for _, row in df.iterrows():
        tokens = bagofwords(row['name'])
        counts = dict(zero_counts)
        for token in tokens:
            counts[token] += 1
        tf_dicts.append(computeTF(counts, tokens))
        count_dicts.append(counts)
    return (count_dicts, tf_dicts)
    
#________________________________________________________________________
# Collecting from various sites
# For finding TF
def computeTF(wordDict, bow):
    """Compute the term frequency of every vocabulary word in one document.

    Parameters
    ----------
    wordDict : dict
        word -> raw count of that word in the document.
    bow : sequence
        The document's token list; only its length is used (as the
        denominator).

    Returns
    -------
    dict
        word -> count / len(bow). When ``bow`` is empty (for example every
        token was filtered out as a stop word) every frequency is ``0.0``
        rather than raising ``ZeroDivisionError``.
    """
    total = len(bow)
    if total == 0:
        return {word: 0.0 for word in wordDict}
    return {word: count / float(total) for word, count in wordDict.items()}
#_______________________________________________________________________
# For finding IDF
def computeIDF(docList):
    """Compute the inverse document frequency of every word.

    Parameters
    ----------
    docList : list of dict
        One word -> count dictionary per document.

    Returns
    -------
    dict
        word -> log10(N / document_frequency), where N is the number of
        documents and document_frequency is how many documents have a count
        greater than zero for the word. A word that occurs in no document
        gets ``0.0`` instead of triggering a division by zero, and an empty
        ``docList`` yields an empty dict.
    """
    if not docList:
        return {}

    total_docs = len(docList)
    # Seed with the first document's keys so that words with zero document
    # frequency are still present in the result.
    doc_freq = dict.fromkeys(docList[0].keys(), 0)
    for doc in docList:
        for word, count in doc.items():
            if count > 0:
                # .get tolerates documents whose keys differ from the first one.
                doc_freq[word] = doc_freq.get(word, 0) + 1

    return {
        word: math.log10(total_docs / float(freq)) if freq else 0.0
        for word, freq in doc_freq.items()
    }
#________________________________________________________________________
# For finding TF-IDF
def computeTFIDF(tfBow, idf):
    """Weight one document's term frequencies by the IDF table.

    Parameters
    ----------
    tfBow : dict
        word -> term frequency for a single document.
    idf : dict
        word -> inverse document frequency; must contain every key of
        ``tfBow``.

    Returns
    -------
    dict
        word -> tf * idf for every word in ``tfBow``.
    """
    return {term: tf * idf[term] for term, tf in tfBow.items()}
#________________________________________________________________________
# Creating TF-IDF List
def  createTFIDFMatrix(TFBowList,idf):
    """Apply ``computeTFIDF`` to every document.

    Parameters
    ----------
    TFBowList : list of dict
        One term-frequency dictionary per document.
    idf : dict
        word -> inverse document frequency, shared by all documents.

    Returns
    -------
    list of dict
        One TF-IDF dictionary per document, in the input order.
    """
    return [computeTFIDF(doc_tf, idf) for doc_tf in TFBowList]
#________________________________________________________________________
# Creating Similarity between all docs of the dataset
def MakeSim(df):
    """Compute pairwise cosine similarity between all rows of a dataset.

    The rows' ``name`` values are tokenised, converted to TF-IDF vectors over
    the dataset's own vocabulary, and compared with cosine similarity.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset with a ``name`` column; each row is one document.

    Returns
    -------
    tuple
        ``(tfidfMatrix, pdMatrix, sim, s)`` where

        * ``tfidfMatrix`` is a list with one word -> TF-IDF dict per row,
        * ``pdMatrix`` is that list as a DataFrame (one column per word),
        * ``sim`` is the result of ``cosine_similarity(pdMatrix, pdMatrix)``,
        * ``s`` is ``sim`` wrapped in a DataFrame with default 0..n-1 labels.
    """
    vocabulary = makeWordSet(df)
    wordDictList, TFBowList = makeWordDictListandTFBowList(df, vocabulary)

    # IDF is derived from the raw counts, TF-IDF from the term frequencies.
    idf = computeIDF(wordDictList)
    tfidfMatrix = createTFIDFMatrix(TFBowList, idf)

    pdMatrix = pd.DataFrame(tfidfMatrix)
    sim = cosine_similarity(pdMatrix, pdMatrix)
    s = pd.DataFrame(sim)
    return tfidfMatrix, pdMatrix, sim, s
#________________________________________________________________________
#Making recommendations with top N
def RecFoodCulture(thedf, active, culture, s, topN):
    """Recommend the ``topN`` items most similar to the active item.

    Parameters
    ----------
    thedf : pandas.DataFrame
        The items; row *i* corresponds to row/column *i* of ``s``.
    active : int
        Row position of the active item in ``thedf`` / ``s``.
    culture : any
        Not used by this function; kept so existing callers keep working.
    s : pandas.DataFrame
        Square similarity matrix.
    topN : int
        Maximum number of recommendations to return.

    Returns
    -------
    tuple
        ``(theactive, rec, sel)``: the active item's row, the list of
        recommended row positions ordered by descending similarity, and the
        corresponding rows of ``thedf``.
    """
    # Re-label the similarity row by position so that the drop below and the
    # later ``thedf.iloc`` lookups all speak positions, whatever labels ``s``
    # carries. With default 0..n-1 labels this is a no-op.
    scores = s.iloc[active].reset_index(drop=True)
    if active < 0:
        # Mirror iloc's negative indexing for the label-based drop.
        active += len(scores)

    # Remove the active item itself, then rank the rest by similarity.
    ranked = scores.drop(active).sort_values(ascending=False)
    rec = ranked.index.tolist()[:topN]

    theactive = thedf.iloc[active]
    sel = thedf.iloc[rec]
    return theactive, rec, sel

In [3]:
# Location of the pipe-delimited food catalogue. Set the FOOD_CENTER_CSV
# environment variable to run the notebook on another machine; the default is
# the original author's local path.
FOOD_CSV_PATH = os.environ.get(
    'FOOD_CENTER_CSV',
    'C:/Users/Admin/Desktop/chawalsak/tipla/Projects/RecommendSystemContentBased/bin/Debug/Input/Foods/food_center.csv',
)
df = pd.read_csv(FOOD_CSV_PATH, sep='|', low_memory=False, index_col=None)

In [4]:
# Preview the first rows to check the file parsed into the expected columns.
df.head()

Unnamed: 0,id,country_id,country_name,culture_id,culture_name,name,description,cooking_food,dietetic_food,ingredient
0,1,10,ไทย,10,ไทยพุทธ,กะปิคั่ว,ความหอมของกะปิ ประสานกับเครื่องแกง กะทิ ท่านที...,เตรียมเครื่องซอยไว้หอมแดงกระเทียมตะไคร้ผิวมะกร...,โปรตีนคาร์โบไฮเดรตเกลือแร่วิตะมิน,กะปิหอมแดงgกระเทียมgตะไคร้gกระชายgพริกแห้งเม็ด...
1,2,10,ไทย,10,ไทยพุทธ,แกงคั่วหอยแมลงภู่ใบชะพลูและชะอม,แกงคั่วรสเข้มข้นอร่อยเด็ดอย่าบอกใครโดยเฉพาะแกง...,ใส่พริกแกงลงผัดในหัวกะทิสดๆแล้วใส่หางกะทิปล่อย...,ไขมันโปรตีนคาร์โบไฮเดรตเกลือแร่วิตะมิน,พริกแกงคั่วใต้หอยแมลงภู่ต้มสุกแกะกะทิชะอมใบมะก...
2,3,10,ไทย,10,ไทยพุทธ,กุ้งผัดสะตอพริกแกงใต้,ท่านที่ชื่นชอบสะตอต้องไม่พลาดกับเมนูนี้ สะตอผั...,ตั้งกระทะน้ำมันให้ร้อนนำกุ้งลงไปผัดให้สุกจากนั...,ไขมันโปรตีนคาร์โบไฮเดรตเกลือแร่วิตะมิน,สะตอช้อนโต้ะกุ้งสดตัวพริกแกงใต้ช้อนโต้ะน้ำตาลป...
3,4,10,ไทย,10,ไทยพุทธ,กุ้งหวาน,น้อยนักที่อาหารใต้จะมีรสหวานจัด จะมีก็มีเมนูนี...,ตั้งกระทะเปิดไฟปานกลางใส่น้ำตาลปี๊บลงไปตามด้วย...,โปรตีนคาร์โบไฮเดรตเกลือแร่วิตะมิน,น้ำตาลปี๊บกรัมน้ำถ้วยเกลือกุ้งขาวขนาดเล็กกรัม
4,5,10,ไทย,10,ไทยพุทธ,แกงคั่วกระดูกอ่อนลูกกล้วย,ลูกกล้วยดิบสามารถนำมาปรุงอาหารอร่อยได้ ไม่แพ้ผ...,หมักกระดูกอ่อนด้วยนมสดค้างคืนหรือขั้นต่ำชั่วโม...,ไขมันโปรตีนคาร์โบไฮเดรตเกลือแร่วิตะมิน,กระดูกอ่อนขีดกล้วยน้ำว้าห่ามลูกกะทิซีซีพริกแกง...


In [5]:
# (rows, columns) of the loaded catalogue.
df.shape

(152, 10)

In [6]:
# Keep only the columns the recommender needs: the item id, the dish name
# (the text that gets tokenised) and the culture used for grouping.
RECOMMENDER_COLUMNS = ['id','name','culture_id','culture_name']
data = df[RECOMMENDER_COLUMNS]

In [7]:
# Preview the reduced frame.
data.head()

Unnamed: 0,id,name,culture_id,culture_name
0,1,กะปิคั่ว,10,ไทยพุทธ
1,2,แกงคั่วหอยแมลงภู่ใบชะพลูและชะอม,10,ไทยพุทธ
2,3,กุ้งผัดสะตอพริกแกงใต้,10,ไทยพุทธ
3,4,กุ้งหวาน,10,ไทยพุทธ
4,5,แกงคั่วกระดูกอ่อนลูกกล้วย,10,ไทยพุทธ


In [29]:
# One group per culture; a separate similarity model is built for each.
grouping = data.groupby('culture_name')

In [30]:
# Number of distinct cultures, i.e. how many similarity models will be built.
print('group count : ' , len(grouping))

group count :  3


In [31]:
# culture_name -> per-culture model artefacts; filled in by the loop over `grouping`.
rec = {}

In [32]:
# Build an independent similarity model for every culture group.
for key,values in grouping:
    # Renumber the group's rows 0..n-1 so row positions line up with the
    # similarity matrix. reset_index keeps each column's dtype; rebuilding the
    # frame from `values.values` coerced every column to object.
    item = values.reset_index(drop=True)
    model = MakeSim(item)
    rec[key] = {
        "target": item,          # the group's rows
        "tfidMatrix": model[0],  # list of word -> TF-IDF dicts
        "pdMatrix": model[1],    # the same as a DataFrame
        "sim": model[2],         # cosine similarity result
        "df": model[3],          # cosine similarity as a DataFrame
    }

In [33]:
# Inspect the similarity matrix of one culture group (row/column i is the
# i-th item of that group's "target" frame).
rec["ไทยพุทธ"]["df"]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.403354,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.065136,0.055801,0.059769,0.073172,0.474579,0.0,...,0.0,0.0,0.0,0.251739,0.071087,0.073172,0.0,0.186193,0.0,0.0
2,0.0,0.0,1.0,0.242922,0.0,0.0,0.174231,0.0,0.0,0.192327,...,0.0,0.0,0.0,0.367973,0.0,0.0,0.0,0.0,0.154486,0.790267
3,0.0,0.0,0.242922,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.637774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.307393
4,0.0,0.065136,0.0,0.0,1.0,0.078536,0.420605,0.102985,0.068218,0.0,...,0.0,0.0,0.0,0.0,0.10005,0.102985,0.0,0.095797,0.0,0.0
5,0.0,0.055801,0.0,0.0,0.078536,1.0,0.072065,0.088225,0.058441,0.0,...,0.0,0.0,0.0,0.0,0.261147,0.088225,0.0,0.082067,0.0,0.0
6,0.0,0.059769,0.174231,0.0,0.420605,0.072065,1.0,0.0945,0.062597,0.0,...,0.0,0.0,0.0,0.138889,0.091806,0.0945,0.0,0.087904,0.0,0.220471
7,0.0,0.073172,0.0,0.0,0.102985,0.088225,0.0945,1.0,0.076634,0.0,...,0.269684,0.0,0.0,0.0,0.112394,0.316476,0.0,0.107616,0.0,0.0
8,0.0,0.474579,0.0,0.0,0.068218,0.058441,0.062597,0.076634,1.0,0.0,...,0.0,0.0,0.0,0.483758,0.07445,0.076634,0.0,0.195002,0.0,0.0
9,0.0,0.0,0.192327,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.153314,0.0,0.0,0.0,0.0,0.182379,0.24337


In [66]:
topN=20
# Input for the recommendation: the `id` of the item to recommend for.
active_id = 1
# The culture is looked up by id in the full catalogue.
culture_name = data.loc[data['id'] == active_id, 'culture_name'].iloc[0]
# `active` must be the row position inside that culture's own frame, because
# the per-culture similarity matrix is numbered 0..n-1 within the group. The
# position in the full `data` frame only matches when the group's rows happen
# to sit at the very start of `data`.
group_ids = rec[culture_name]["target"]['id']
active = int(np.flatnonzero((group_ids == active_id).to_numpy())[0])

topN,active,culture_name

(20, 0, 'ไทยพุทธ')

In [67]:
# Pull the active culture's items and its similarity DataFrame out of `rec`.
culture_model = rec[culture_name]
thedf, dataGroup = culture_model["target"], culture_model["df"]

In [68]:
theactive, result, sel = RecFoodCulture(thedf, active, culture_name, dataGroup, topN)

# Result: the recommendations, printed as a single pipe-separated line.
# NOTE(review): these values are row positions within the culture group, not
# food ids - confirm that is what the consumer of this output expects.
result = list(map(str, result))
print('|'.join(result))

17|42|39|21|37|22|12|20|19|18|16|15|14|13|49|11|23|9|8|7
