In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import gensim
import string 
import nltk
import pickle

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from gensim import corpora, models  

In [2]:
# Rebuild the gensim vocabulary from the cleaned review corpus.
# Every line of the text file is treated as one document and split into
# tokens with NLTK; the resulting token lists feed the Dictionary.
# NOTE(review): the file is opened with the platform's default text encoding —
# confirm that matches how the corpus file was written.
with open('dictionary_bigram_cleaned_reviews_nverbs_only.txt', 'r') as corpus_file:
    text_data = [word_tokenize(document) for document in corpus_file]

dictionary = gensim.corpora.Dictionary(text_data)

In [3]:
# Instantiate lemmatizer
lemmatizer = WordNetLemmatizer()

# Tokens to discard before lemmatizing. Built once, as a frozenset, so that
# preprocess() neither re-allocates the collection on every call nor pays for
# a linear list scan on every token.
STOPWORDS = frozenset([
    # General English function words and contraction fragments
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
    "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
    'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her',
    'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them',
    'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this',
    'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were',
    'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
    'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because',
    'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about',
    'against', 'between', 'into', 'through', 'during', 'before', 'after',
    'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off',
    'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there',
    'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few',
    'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only',
    'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will',
    'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm',
    'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't",
    'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
    "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn',
    "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't",
    'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won',
    "won't", 'wouldn', "wouldn't",
    # Extra words filtered out for this wine-review corpus
    'wine', 'drink', 'shows', 'also', 'made', 'like', 'bit', 'give', 'opens',
    'alongside', 'along', 'ready', 'yet', 'one', 'feels', 'almost',
])

# Translation table that deletes every ASCII punctuation character; computed
# once instead of on each preprocess() call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess(x):
    """Clean one raw review string into a space-separated string of lemmas.

    Steps, in order:
      1. lower-case the text;
      2. split it into tokens with NLTK's ``word_tokenize``;
      3. drop tokens found in ``STOPWORDS`` and tokens that are not purely
         alphabetic (``str.isalpha``), which removes numbers, punctuation
         tokens and any token containing a non-letter character;
      4. lemmatize each remaining token with ``lemmatizer`` (no
         part-of-speech hint is passed);
      5. join the lemmas with single spaces and strip ASCII punctuation.

    Parameters
    ----------
    x : str
        Raw text of a single document.

    Returns
    -------
    str
        The cleaned text; an empty string when no token survives filtering.
    """
    # 1-2. Lower case, then tokenize
    tokens = word_tokenize(x.lower())

    # 3. Remove stopwords and anything that is not purely alphabetic
    kept_words = [word for word in tokens if word not in STOPWORDS and word.isalpha()]

    # 4. Lemmatize
    lemma_text = ' '.join(lemmatizer.lemmatize(word) for word in kept_words)

    # 5. Remove punctuation. Tokens that passed isalpha() contain none, so
    #    this only guards against characters introduced by the lemmatizer.
    return lemma_text.translate(_PUNCTUATION_TABLE)

In [4]:
# Sample tasting note used to exercise the preprocessing and topic model.
test_data = (
    "Delivering profound notes of black and red currants, blackberry fruit, "
    "blood orange citrus, and dried raspberries underscored by baking spices "
    "and dried red florals, this pinot noir is also a textural masterpiece "
    "with mouthwatering acidity and grippy cedar-like tannins"
)

In [5]:
# Push the sample through the cleaning function, then split the cleaned
# string back into tokens.
test_data_cleaned = preprocess(test_data)
test_data_cleaned_tokens = word_tokenize(test_data_cleaned)

# Convert the tokens to a bag-of-words vector with the dictionary, and wrap it
# in a list so the sample forms a one-document corpus.
sample_bow = dictionary.doc2bow(test_data_cleaned_tokens)
test_data_cleaned_tokens_bow = [sample_bow]

In [6]:
# Reload LDA BIGRAM TF-IDF Model
# NOTE: unpickling can execute code stored in the file, so only load model
# files that come from a trusted source.
lda_model_path = './LDA Gensim (Initial Model)/lda_tfidf_bigram_full_model.pk'
with open(lda_model_path, 'rb') as model_file:
    lda_model = pickle.load(model_file)

In [7]:
# Topics assigned to the sample document, highest score first.
# The scores are the document's topic weights from the LDA model (they are
# not coherence scores).
document_topics = lda_model[test_data_cleaned_tokens_bow][0]

for index, score in sorted(document_topics, key=lambda pair: pair[1], reverse=True):
    # `index` already is the topic number. It is printed directly rather than
    # read back through show_topics()[index]: that call rebuilds every topic
    # string on each iteration and returns only a limited selection of topics,
    # so its list positions are not guaranteed to line up with topic ids.
    print("Score: {}\n\nTopic Number: {}\nTopic Keywords: {}".format(score, index, lda_model.print_topic(index, 10)), "\n")

Score: 0.38397103548049927

Topic Number: 0
Topic Keywords: 0.010*"cherry" + 0.009*"blackberry" + 0.008*"merlot" + 0.008*"cabernet_sauvignon" + 0.008*"blend" + 0.008*"tannin" + 0.007*"flavor" + 0.006*"fruit" + 0.006*"oak" + 0.006*"herb" 

Score: 0.21869176626205444

Topic Number: 4
Topic Keywords: 0.018*"fruit" + 0.016*"acidity" + 0.014*"tannin" + 0.014*"character" + 0.013*"ripe" + 0.013*"structure" + 0.011*"fruity" + 0.011*"texture" + 0.011*"wood" + 0.011*"age" 

Score: 0.19740286469459534

Topic Number: 5
Topic Keywords: 0.015*"apple" + 0.013*"peach" + 0.012*"citrus" + 0.010*"palate" + 0.009*"note" + 0.009*"acidity" + 0.008*"flavor" + 0.008*"finish" + 0.008*"aroma" + 0.008*"melon" 

Score: 0.10652409493923187

Topic Number: 3
Topic Keywords: 0.011*"cherry" + 0.007*"spice" + 0.007*"cola" + 0.006*"raspberry" + 0.006*"flavor" + 0.006*"blackberry" + 0.006*"tannin" + 0.006*"oak" + 0.006*"fruit" + 0.005*"note" 

Score: 0.08715669065713882

Topic Number: 2
Topic Keywords: 0.012*"crisp" + 0.

In [8]:
# Display the model's topics as (topic number, "weight*word + ..." string)
# pairs, using the library's default arguments.
lda_model.show_topics()

[(0,
  '0.010*"cherry" + 0.009*"blackberry" + 0.008*"merlot" + 0.008*"cabernet_sauvignon" + 0.008*"blend" + 0.008*"tannin" + 0.007*"flavor" + 0.006*"fruit" + 0.006*"oak" + 0.006*"herb"'),
 (1,
  '0.014*"aroma" + 0.013*"palate" + 0.013*"berry" + 0.012*"cherry" + 0.012*"plum" + 0.010*"tannin" + 0.010*"note" + 0.008*"offer" + 0.007*"finish" + 0.007*"flavor"'),
 (2,
  '0.012*"crisp" + 0.011*"acidity" + 0.010*"fruity" + 0.008*"fruit" + 0.008*"texture" + 0.008*"aftertaste" + 0.007*"apple" + 0.007*"citrus" + 0.007*"character" + 0.006*"refreshing"'),
 (3,
  '0.011*"cherry" + 0.007*"spice" + 0.007*"cola" + 0.006*"raspberry" + 0.006*"flavor" + 0.006*"blackberry" + 0.006*"tannin" + 0.006*"oak" + 0.006*"fruit" + 0.005*"note"'),
 (4,
  '0.018*"fruit" + 0.016*"acidity" + 0.014*"tannin" + 0.014*"character" + 0.013*"ripe" + 0.013*"structure" + 0.011*"fruity" + 0.011*"texture" + 0.011*"wood" + 0.011*"age"'),
 (5,
  '0.015*"apple" + 0.013*"peach" + 0.012*"citrus" + 0.010*"palate" + 0.009*"note" + 0.009*

In [9]:
# Map an LDA topic number to a human-readable wine label

# Labels ordered by topic number: position 0 is topic 0, position 1 is topic 1, ...
TOPIC_LABELS = (
    "Red Wine",
    "Cherry Plum Wine",
    "Champagne",
    "Berries Wine",
    "White Wine",
    "Peach Wine",
)

def assign_topic(x):
    """Return the label for topic number ``x``.

    ``x`` is compared with ``==`` against the topic numbers 0 through 5 in
    ascending order and the label of the first match is returned. Any value
    that matches none of them yields ``None``.
    """
    for topic_number, label in enumerate(TOPIC_LABELS):
        if x == topic_number:
            return label
    return None

In [10]:
# Topic assignments for the single sample document, as (topic number, score) pairs.
topics = lda_model.get_document_topics(test_data_cleaned_tokens_bow)[0]

# Rank the pairs from highest to lowest score and keep the number of the winner.
sorted_topics = sorted(topics, key=lambda pair: pair[1], reverse=True)
sorted_topic_top, _top_score = sorted_topics[0]

print(assign_topic(sorted_topic_top))

Red Wine
