In [1]:
import re
import nltk
import string
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

In [2]:
def pre_processing(sentence):
    """Turn a raw sentence into a list of lower-cased, filtered words.

    The steps run in a fixed order: punctuation is stripped from the raw
    string, the result is tokenized and lemmatized, and finally the stop
    words are filtered out.

    Args:
        sentence: The raw text to process.

    Returns:
        The list returned by ``remove_stopwords`` for the tokenized sentence.
    """
    without_punctuation = remove_punctuation(sentence)
    lemmas = tokenize_sentence(without_punctuation)
    return remove_stopwords(lemmas)

# Remove punctuation from a sentence (a single string)
def remove_punctuation(sentence):
    """Return ``sentence`` with every punctuation character deleted.

    A character survives only if the regex treats it as a word character or
    as whitespace; anything else is removed without a replacement.

    Args:
        sentence: The string to clean.

    Returns:
        The cleaned string.
    """
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', sentence)

# Remove stopwords from a list of words
def remove_stopwords(words_list, stopwords_path="stop_words_FULL.txt"):
    """Lower-case ``words_list`` and drop every word listed in the stop-word file.

    The stop-word file is read again on every call.

    Args:
        words_list: Iterable of word strings to filter.
        stopwords_path: Path of a text file holding one stop word per line.
            Defaults to the file name this function has always read.

    Returns:
        A list of the lower-cased words that are not stop words, in their
        original order.

    Raises:
        OSError: If the stop-word file cannot be opened.
    """
    # The ``with`` block closes the file even when reading fails part-way.
    with open(stopwords_path, "r") as stopwords_file:
        # Entries are stripped and lower-cased because they are compared with
        # lower-cased words below; a set makes each membership test O(1).
        stopwords = {line.strip().lower() for line in stopwords_file}

    lowered_words = (word.lower() for word in words_list)
    return [word for word in lowered_words if word not in stopwords]

# Tokenize the input sentence, keep only nouns, verbs, adverbs and adjectives, and lemmatize them
def tokenize_sentence(sentence):
    """Tokenize ``sentence`` and return the lemmas of selected tokens.

    Every token is POS-tagged. A token is kept only when its tag starts with
    ``NN``, ``VB``, ``RB`` or ``JJ``; it is then lemmatized with the matching
    WordNet part of speech. Tokens with any other tag are discarded.

    Args:
        sentence: The text to tokenize.

    Returns:
        The list of lemmas, in the order the tokens appear in the sentence.
    """
    # Two-letter tag prefix -> name of the WordNet POS constant. The constant
    # itself is looked up on ``wn`` only once a token actually matches.
    pos_attribute_by_prefix = {
        "NN": "NOUN",
        "VB": "VERB",
        "RB": "ADV",
        "JJ": "ADJ",
    }

    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, tag in nltk.pos_tag(word_tokenize(sentence)):
        pos_attribute = pos_attribute_by_prefix.get(tag[:2])
        if pos_attribute is None:
            continue
        lemmas.append(lemmatizer.lemmatize(token, pos=getattr(wn, pos_attribute)))
    return lemmas

def get_signature(sense):
    """Build the word signature of ``sense`` from its definition and examples.

    Args:
        sense: Object exposing ``definition()`` and ``examples()``
            (presumably a WordNet synset -- confirm against the caller).

    Returns:
        A single list holding the tokenized definition followed by the
        tokenized examples, in order; repeated words are kept.
    """
    # Start from the words of the definition ...
    signature = list(tokenize_sentence(sense.definition()))
    # ... then add the words of every usage example.
    for example in sense.examples():
        signature.extend(tokenize_sentence(example))
    return signature

In [3]:
# Topic modelling using gensim
from gensim import corpora, models

FILE = 'italian_cuisine_2.txt'  # file contains a set of documents divided by paragraphs
TOPIC_NUMBER = 10
WORDS_FOR_TOPIC = 5
SEED = 42  # fixed seed so that re-running the cell trains the same model


def _read_documents(path):
    """Read the corpus file and return one list of pre-processed words per document.

    A document starts on a line containing ``<doc`` and ends on the next line
    containing ``</doc>``. Every line in between is stripped of its paragraph
    tags and passed through ``pre_processing``.

    Args:
        path: Path of the UTF-8 encoded corpus file.

    Returns:
        A list of lists of words, one inner list per document. A document
        that is still open at end of file is returned with the words read
        so far.
    """
    documents = []
    with open(path, encoding='utf-8') as corpus_file:
        for line in corpus_file:
            if "<doc" not in line:  # tag for new doc
                continue
            document_words = []  # list of words that will be part of the document
            # The inner loop shares the file iterator with the outer one, so it
            # consumes the body of the document and stops at end of file
            # instead of spinning forever when ``</doc>`` is missing.
            for next_line in corpus_file:
                if "</doc>" in next_line:
                    break
                # remove the paragraph tags, which carry no content
                cleaned_line = next_line.replace("<p> ", "").replace("</p>\n", "").replace("/p", "")
                # pre-processing steps
                document_words.extend(pre_processing(cleaned_line))
            documents.append(document_words)
    return documents


# for each doc, the list of pre-processed words in that doc (list of lists)
list_document_words = _read_documents(FILE)

print("Documents number: ", len(list_document_words))
print()

# Create a dict with integer keys for all words
dictionary_LDA = corpora.Dictionary(list_document_words)

# Delete all terms that do NOT appear in at least 3 documents and all terms
# that appear in more than 50% of documents (see filter_extremes official doc).
dictionary_LDA.filter_extremes(no_below=3, no_above=0.5)

# Converts each document into a list of BoW (list of (id_term, term_frequency) for each term in doc)
corpus = [dictionary_LDA.doc2bow(document_words) for document_words in list_document_words]

lda_model = models.LdaModel(corpus, num_topics=TOPIC_NUMBER,
                            id2word=dictionary_LDA,
                            passes=4, alpha=[0.01] * TOPIC_NUMBER,
                            eta=[0.01] * len(dictionary_LDA.keys()),
                            random_state=SEED)

for i, topic in lda_model.show_topics(formatted=True, num_topics=TOPIC_NUMBER, num_words=WORDS_FOR_TOPIC):
    print(str(i) + ": " + topic)
    print()

print("probability distribution of topics in the document 5")
print(lda_model[corpus[5]])  # corpus[5] is the sixth document (indices start at 0)

Documents number:  310

0: 0.022*"pan" + 0.018*"cake" + 0.013*"wok" + 0.008*"potato" + 0.007*"chef"

1: 0.026*"italian" + 0.016*"blog" + 0.015*"wedding" + 0.014*"italy" + 0.010*"tradition"

2: 0.044*"pork" + 0.034*"chop" + 0.011*"subscribe" + 0.010*"email" + 0.010*"pan"

3: 0.031*"knife" + 0.019*"chef" + 0.012*"blade" + 0.011*"kitchen" + 0.009*"steel"

4: 0.013*"knife" + 0.012*"chop" + 0.012*"lamb" + 0.011*"meat" + 0.006*"italian"

5: 0.015*"bread" + 0.012*"italian" + 0.010*"cake" + 0.010*"episode" + 0.009*"easter"

6: 0.023*"italian" + 0.010*"dish" + 0.009*"chop" + 0.008*"pasta" + 0.008*"bread"

7: 0.033*"chop" + 0.030*"pork" + 0.016*"chicken" + 0.012*"sauce" + 0.011*"steak"

8: 0.013*"cuisine" + 0.013*"dish" + 0.012*"chef" + 0.011*"chop" + 0.009*"sausage"

9: 0.026*"sauce" + 0.015*"onion" + 0.015*"pork" + 0.014*"tomato" + 0.011*"italian"

probability distribution of topics in the document 5
[(4, 0.99908227)]


[(0, 1), (1, 2), (2, 2), (3, 1), (4, 1), (5, 2), (6, 1), (7, 1), (8, 1), (9, 4), (10, 1), (11, 3), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2), (20, 1), (21, 4), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 2), (31, 2), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 2), (40, 1), (41, 2), (42, 3), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 5), (49, 3), (50, 1), (51, 1), (52, 2), (53, 1), (54, 1), (55, 2), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 3), (62, 2), (63, 2), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 2), (79, 1), (80, 10), (81, 1)]

[(0, 1), (1, 2), (2, 2), (3, 1), (4, 1), (5, 2), (6, 1), (7, 1), (8, 1), (9, 4), (10, 1), (11, 3), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 2), (20, 1), (21, 4), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 2), (31, 2), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 2), (40, 1), (41, 2), (42, 3), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 5), (49, 3), (50, 1), (51, 1), (52, 2), (53, 1), (54, 1), (55, 2), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 3), (62, 2), (63, 2), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 2), (79, 1), (80, 10), (81, 1)]

{'Topic_0': ['cake', 'chef', 'pan', 'winner', 'italian', 'episode', 'oven', 'chopped', 'dish', 'chop'], 'Topic_1': ['subscribe', 'email', 'newsletter', 'feature', 'sign', 'unsubscribe', 'receive', 'hair', 'week', 'youve'], 'Topic_2': ['italian', 'cuisine', 'dish', 'italy', 'tradition', 'include', 'wedding', 'popular', 'region', 'day'], 'Topic_3': ['pan', 'sausage', 'chef', 'meat', 'italian', 'cooking', 'heat', 'steak', 'copper', 'grill'], 'Topic_4': ['chef', 'potato', 'sushi', 'italian', 'dish', 'french', 'style', 'cuisine', 'large', 'wine'], 'Topic_5': ['pork', 'chop', 'bread', 'christmas', 'italian', 'roast', 'flavor', 'meat', 'slice', 'salad'], 'Topic_6': ['chop', 'pork', 'pan', 'skillet', 'heat', 'side', 'oil', 'salt', 'chops', 'pepper'], 'Topic_7': ['knife', 'blog', 'blade', 'wok', 'chef', 'chefs', 'italian', 'steel', 'handle', 'kitchen'], 'Topic_8': ['italian', 'sauce', 'chop', 'pork', 'chicken', 'tomato', 'pasta', 'oil', 'meat', 'fresh'], 'Topic_9': ['roll', 'sushi', 'chop', 'meat', 'rice', 'steak', 'dish', 'pork', 'love', 'suey']}