In [1]:
import nltk
import math

def generate_tficf(reviews, n_cluster = 4):
    """Score every term with TF-ICF (term frequency x inverse cluster frequency).

    Each element of ``reviews`` is the full text of one cluster. The text is
    split with ``nltk.sent_tokenize`` / ``nltk.word_tokenize`` and raw token
    counts are collected per cluster.

    For a term that occurs in ``c_i`` of the clusters the weight is
    ``icf = 1 + log(n_cluster / c_i)`` and the score in cluster ``n`` is
    ``count_in_cluster_n * icf``.

    Args:
        reviews: iterable of strings, one per cluster, at most ``n_cluster``
            of them.
        n_cluster: number of clusters used both for the width of the result
            rows and as the numerator of the ICF ratio.

    Returns:
        A list of tuples ``(term, score_0, ..., score_{n_cluster-1})`` in the
        order the terms were first encountered.

    Raises:
        ValueError: if ``reviews`` holds more than ``n_cluster`` texts.
    """
    # One "term -> raw count" table per cluster; clusters without a review
    # simply keep an empty table.
    cluster_counts = [{} for _ in range(n_cluster)]

    # `terms` keeps first-seen order, `seen` gives O(1) membership so that
    # building the vocabulary is not quadratic in its size.
    terms = []
    seen = set()

    for index, review in enumerate(reviews):
        if index >= n_cluster:
            raise ValueError(
                "generate_tficf got more reviews than n_cluster=%d" % n_cluster
            )
        counts = cluster_counts[index]
        for sentence in nltk.sent_tokenize(review):
            for word in nltk.word_tokenize(sentence):
                counts[word] = counts.get(word, 0) + 1
                if word not in seen:
                    seen.add(word)
                    terms.append(word)

    tf_icf = []
    for term in terms:
        per_cluster = [counts.get(term, 0) for counts in cluster_counts]
        # Number of clusters containing the term; always >= 1 because every
        # term in `terms` was counted in some cluster above.
        c_i = sum(1 for count in per_cluster if count)
        icf = 1 + math.log(n_cluster / c_i)
        tf_icf.append((term,) + tuple(count * icf for count in per_cluster))

    return tf_icf

In [2]:
import re
import nltk
import string
from nltk.stem import WordNetLemmatizer 

# Shared lemmatizer instance used by preprocess().
lemmatizer = WordNetLemmatizer()

# English stop words from NLTK with ASCII punctuation removed, so they compare
# equal to tokens in preprocess(), which strips punctuation before filtering.
nltk_words = set(nltk.corpus.stopwords.words('english'))
_strip_punctuation = str.maketrans('', '', string.punctuation)
stop_words = [word.translate(_strip_punctuation) for word in nltk_words]

def preprocess(sentence):
    """Normalise one sentence into a space-separated string of terms.

    Steps, applied in this order:
      1. lowercase the text and delete ASCII punctuation;
      2. tokenize with ``nltk.word_tokenize``;
      3. drop tokens found in the module-level ``stop_words``;
      4. lemmatize each remaining token with the module-level ``lemmatizer``;
      5. remove every character that is not an ASCII letter;
      6. keep only results longer than three characters.

    Returns the surviving terms joined by single spaces (possibly ``''``).
    """
    punctuation_free = sentence.lower().translate(
        str.maketrans('', '', string.punctuation)
    )

    kept = []
    for token in nltk.word_tokenize(punctuation_free):
        if token in stop_words:
            continue
        letters_only = re.sub(r"[^A-Za-z]+", '', lemmatizer.lemmatize(token))
        # Short leftovers (3 characters or fewer) are discarded.
        if len(letters_only) > 3:
            kept.append(letters_only)
    return ' '.join(kept)

In [3]:
def open_file(kategori):
    """Read one source text file and return it as a single preprocessed string.

    The file ``wiki/source/{kategori}-sentence.txt`` is read as UTF-8, one
    sentence per line. Each line has its newline removed and is passed through
    ``preprocess``; the results are joined with single spaces.

    Args:
        kategori: file-name stem; callers in this notebook pass article
            titles such as ``'Food'``.

    Returns:
        The concatenated preprocessed text of the whole file.
    """
    path = 'wiki/source/' + kategori + '-sentence.txt'
    # `with` guarantees the handle is closed even if preprocess() raises.
    with open(path, 'r', encoding='utf-8') as f:
        all_sentences = [preprocess(line.replace('\n', '')) for line in f]
    return ' '.join(all_sentences)

In [4]:
def save_daftar(name, daftar):
    """Pickle ``daftar`` to ``Term/daftar_{name}.pickle``.

    Args:
        name: stem used to build the file name.
        daftar: object to store; this notebook stores lists of article titles.

    The ``Term`` directory must already exist.
    """
    # Context manager closes the file even when pickling fails part-way.
    with open("Term/daftar_" + name + ".pickle", "wb") as pickle_out:
        pickle.dump(daftar, pickle_out)

def load_daftar(name):
    """Load and return the object stored in ``Term/daftar_{name}.pickle``.

    This is the counterpart of ``save_daftar`` and reads the same path.
    """
    # Context manager closes the handle; the original left it open.
    with open("Term/daftar_" + name + ".pickle", "rb") as pickle_in:
        # NOTE: unpickling can execute arbitrary code. Only load files that
        # this notebook wrote itself via save_daftar().
        return pickle.load(pickle_in)

def gabung_file(name):
    """Merge the preprocessed text of every file listed under ``name``.

    The list saved for ``name`` is fetched with ``load_daftar``; each entry is
    read through ``open_file`` and the resulting strings are joined with a
    single space, in list order.
    """
    return ' '.join(open_file(title) for title in load_daftar(name))


In [5]:
def get_keyword(index, top_n, data_tf):
    """Return the ``top_n`` highest-scoring terms for one score column.

    Args:
        index: position of the score inside each row of ``data_tf``
            (position 0 holds the term itself).
        top_n: maximum number of terms to return.
        data_tf: sequence of rows shaped ``(term, score_1, score_2, ...)``,
            as produced by ``generate_tficf``.

    Returns:
        A list of ``(term, weight)`` pairs ordered from highest to lowest
        score. ``weight`` is the score in column ``index`` min-max scaled over
        the selected rows, so the best term gets 1.0 and the last one 0.0.
        If all selected scores are equal every weight is 1.0. An empty list
        is returned when ``data_tf`` is empty or ``top_n`` is not positive.
    """
    if top_n <= 0:
        return []

    ranked = sorted(data_tf, key=lambda row: row[index], reverse=True)
    keyword = ranked[:top_n]
    if not keyword:
        return []

    max_k = keyword[0][index]
    # Use the last selected row rather than position top_n - 1, which does
    # not exist when fewer than top_n rows are available.
    min_k = keyword[-1][index]
    spread = max_k - min_k
    if spread == 0:
        # Degenerate range (single row or all ties): avoid dividing by zero.
        return [(row[0], 1.0) for row in keyword]

    # Scale the same column that was used for ranking; the original always
    # read column 1 here regardless of `index`.
    return [(row[0], (row[index] - min_k) / spread) for row in keyword]

In [6]:
import pickle
def run(name, index, top_n, data_tf):
    """Print and persist the top keywords of one score column.

    The terms returned by ``get_keyword(index, top_n, data_tf)`` are collected
    (their weights are dropped), printed, and pickled as a list to
    ``Term/{name}.pickle``.

    Args:
        name: stem of the output pickle file.
        index: score column forwarded to ``get_keyword``.
        top_n: number of keywords forwarded to ``get_keyword``.
        data_tf: score rows forwarded to ``get_keyword``.
    """
    all_keyword = [term for term, _weight in get_keyword(index, top_n, data_tf)]
    print(all_keyword)
    # Context manager closes the file even if pickling raises.
    with open("Term/" + name + ".pickle", "wb") as pickle_out:
        pickle.dump(all_keyword, pickle_out)

In [7]:
# (aspect name, score column) pairs. Column 0 of a generate_tficf row is the
# term, so column k belongs to the k-th text passed to generate_tficf.
data = [('ambience', 1), ('food', 2), ('service', 3), ('price', 4)]

# Article titles per aspect; open_file() reads each one from
# wiki/source/{title}-sentence.txt.
article_titles = [
    ('ambience', ['Theme restaurant',
                  'Atmosphere (architecture and spatial design)',
                  'Ambience (sound recording)']),
    ('food', ['Food', 'Drink', 'Meal']),
    ('service', ['Customer service', 'Waiting staff']),
    ('price', ['Price', 'Pricing']),
]

# Persist every title list first, then build one merged corpus per aspect.
for category, titles in article_titles:
    save_daftar(category, titles)

ambience, food, service, price = [
    gabung_file(category) for category, _ in article_titles
]

In [8]:
# Score all terms against the four aspect corpora, then print and pickle the
# ten best terms of each aspect.
corpora = [ambience, food, service, price]
data_tf = generate_tficf(corpora, 4)
for aspect_name, score_column in data:
    run(aspect_name, score_column, 10, data_tf)

['restaurant', 'atmosphere', 'theme', 'space', 'architecture', 'sound', 'location', 'located', 'light', 'building']
['food', 'drink', 'diet', 'fruit', 'health', 'meal', 'juice', 'breakfast', 'vegetable', 'cooking']
['customer', 'service', 'server', 'waiting', 'restaurant', 'staff', 'feedback', 'food', 'table', 'serving']
['price', 'pricing', 'product', 'premium', 'service', 'buyer', 'market', 'cost', 'consumer', 'value']


In [9]:
# NOTE(review): this cell repeats the previous one exactly - it recomputes the
# same scores and rewrites the same Term/{name}.pickle files. Consider
# removing it if the repetition is not intentional.
data_tf = generate_tficf([ambience, food, service, price], n_cluster=4)
for name, index in data:
    run(name=name, index=index, top_n=10, data_tf=data_tf)

['restaurant', 'atmosphere', 'theme', 'space', 'architecture', 'sound', 'location', 'located', 'light', 'building']
['food', 'drink', 'diet', 'fruit', 'health', 'meal', 'juice', 'breakfast', 'vegetable', 'cooking']
['customer', 'service', 'server', 'waiting', 'restaurant', 'staff', 'feedback', 'food', 'table', 'serving']
['price', 'pricing', 'product', 'premium', 'service', 'buyer', 'market', 'cost', 'consumer', 'value']
