# LDA analysis on Red Wine

In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction import text
from gensim import corpora, models, matutils
import nltk

In [5]:
red_clean = pd.read_csv('/Users/divyasusarla/Desktop/DSI-SF-2-divyasusarla/Capstone/Clean_Data/red_reviews_clean.csv', encoding='utf-8')

In [6]:
del red_clean['Unnamed: 0']
red_clean.dropna(inplace=True)

In [7]:
review = red_clean['red_review']

In [8]:
stop_words = text.ENGLISH_STOP_WORDS.union([u'00',
 u'000',
 u'00002',
 u'00s',
 u'01',
 u'02',
 u'02s',
 u'03',
 u'04',
 u'04s',
 u'05',
 u'05914',
 u'05s',
 u'06',
 u'06s',
 u'07',
 u'07s',
 u'08',
 u'08s',
 u'09',
 u'09s',
 u'10',
 u'100',
 u'1000',
 u'1000th',
 u'100pointers',
 u'100points',
 u'100pt',
 u'100pts',
 u'100rp',
 u'100score',
 u'100takes',
 u'100th',
 u'100year',
 u'100yo',
 u'100yrs',
 u'101',
 u'105',
 u'106',
 u'10am',
 u'10ha',
 u'10hrs',
 u'10th',
 u'10years',
 u'10yrs',
 u'11',
 u'110',
 u'1100',
 u'1111',
 u'1148',
 u'115',
 u'117',
 u'11s',
 u'11th',
 u'11yo',
 u'12',
 u'120',
 u'1200',
 u'12000',
 u'120m',
 u'122',
 u'1234',
 u'125',
 u'127',
 u'12daysofvino',
 u'12s',
 u'12th',
 u'12years',
 u'12yrs',
 u'13',
 u'130',
 u'1305',
 u'134',
 u'135',
 u'13s',
 u'13th',
 u'13years',
 u'14',
 u'140',
 u'1400',
 u'145',
 u'1450',
 u'147',
 u'1475',
 u'1499',
 u'14pc',
 u'14th',
 u'15',
 u'150',
 u'1500',
 u'151',
 u'1511',
 u'155',
 u'1556',
 u'1569',
 u'157',
 u'15day',
 u'15min',
 u'15mins',
 u'15pc',
 u'15th',
 u'15years',
 u'15yrs',
 u'16',
 u'160',
 u'165',
 u'1685',
 u'16th',
 u'16yr',
 u'17',
 u'170',
 u'1700',
 u'175',
 u'17c',
 u'17ha',
 u'17th',
 u'17years',
 u'17yo',
 u'18',
 u'180',
 u'1800',
 u'1800s',
 u'181',
 u'1826',
 u'1833',
 u'1840',
 u'1842',
 u'1844',
 u'185',
 u'1850',
 u'1853',
 u'1855',
 u'1859',
 u'187',
 u'1877',
 u'1879',
 u'1882',
 u'1893',
 u'1894',
 u'1899',
 u'18c',
 u'18mnths',
 u'18months',
 u'18mth',
 u'18mths',
 u'18th',
 u'18yr',
 u'19',
 u'190',
 u'1900',
 u'1901',
 u'1907',
 u'1920s',
 u'1924',
 u'1929',
 u'1935',
 u'1945',
 u'1946',
 u'1949',
 u'1950',
 u'1951',
 u'1953',
 u'1955',
 u'1956',
 u'1958',
 u'1961',
 u'1962',
 u'1963',
 u'1964',
 u'1965',
 u'1967',
 u'1970',
 u'1971',
 u'1972',
 u'1973',
 u'1974',
 u'1975',
 u'1976',
 u'1978',
 u'1979',
 u'1980',
 u'1980s',
 u'1981',
 u'1982',
 u'1983',
 u'1984',
 u'1985',
 u'1985s',
 u'1986',
 u'1987',
 u'1988',
 u'1989',
 u'1990',
 u'1990s',
 u'1991',
 u'1992',
 u'1993',
 u'1994',
 u'1995',
 u'1996',
 u'1997',
 u'1998',
 u'1999',
 u'19th',
 u'1er',
 u'1h',
 u'1hour',
 u'1hr',
 u'1of',
 u'1of6',
 u'1pc',
 u'1st',
 u'1vixxbo',
 u'1w7j7yi',
 u'1wk',
 u'1yr',
 u'20',
 u'200',
 u'2000',
 u'20000',
 u'2000s',
 u'2001',
 u'2001s',
 u'2002',
 u'2003',
 u'2003s',
 u'2004',
 u'2005',
 u'2005s',
 u'2006',
 u'2006s',
 u'2007',
 u'2007s',
 u'2008',
 u'2009',
 u'2009s',
 u'200metres',
 u'200th',
 u'2010',
 u'2010s',
 u'2011',
 u'2011s',
 u'2012',
 u'2012s',
 u'2013',
 u'2014',
 u'2015',
 u'2016',
 u'2017',
 u'2018',
 u'2019',
 u'2020',
 u'2021',
 u'2022',
 u'2023',
 u'2024',
 u'2024ish',
 u'2025',
 u'2026',
 u'2027',
 u'2028',
 u'2030',
 u'2035',
 u'2038',
 u'2052',
 u'2055',
 u'20k',
 u'20months',
 u'20s',
 u'20th',
 u'20years',
 u'20yo',
 u'20yr',
 u'20yrs',
 u'21',
 u'210',
 u'2102',
 u'2106',
 u'21st',
 u'22',
 u'220',
 u'225',
 u'225l',
 u'22mths',
 u'22nd',
 u'22pc',
 u'23',
 u'230',
 u'23rd',
 u'24',
 u'240',
 u'2400',
 u'242',
 u'245',
 u'24mth',
 u'25',
 u'250',
 u'2500',
 u'25day',
 u'25plots',
 u'25th',
 u'25yo',
 u'25yrs',
 u'26',
 u'260',
 u'2600',
 u'26th',
 u'27',
 u'2700th',
 u'278',
 u'28',
 u'280',
 u'284',
 u'285th',
 u'28day',
 u'28hrs',
 u'28y',
 u'29',
 u'29th',
 u'29yo',
 u'2days',
 u'2e',
 u'2h',
 u'2hours',
 u'2hrs',
 u'2k',
 u'2kg',
 u'2nd',
 u'2ndary',
 u'2pc',
 u'2pm',
 u'2x',
 u'2y',
 u'2yr',
 u'2yrs',
 u'2\xe8me',
 u'30',
 u'300',
 u'3000',
 u'300l',
 u'300th',
 u'30am',
 u'30hrs',
 u'30ish',
 u'30min',
 u'30mins',
 u'30minutes',
 u'30months',
 u'30pc',
 u'30pm',
 u'30seconds',
 u'30th',
 u'30years',
 u'30yr',
 u'30yrs',
 u'31',
 u'32',
 u'321',
 u'33',
 u'333',
 u'34',
 u'3444',
 u'35',
 u'350',
 u'350th',
 u'36',
 u'365daysofvino',
 u'36months',
 u'37',
 u'375',
 u'375ml',
 u'37th',
 u'38',
 u'380',
 u'38years',
 u'39',
 u'3h',
 u'3hours',
 u'3hrs',
 u'3l',
 u'3malbec',
 u'3o',
 u'3rd',
 u'3week',
 u'3x',
 u'3years',
 u'3yrs',
 u'40',
 u'400',
 u'4000',
 u'4000k',
 u'400s',
 u'400th',
 u'407',
 u'40ha',
 u'40min',
 u'40mins',
 u'40th',
 u'41',
 u'42',
 u'43',
 u'44',
 u'45',
 u'450',
 u'4568',
 u'45min',
 u'45mins',
 u'45sec',
 u'45th',
 u'46',
 u'462',
 u'47',
 u'48',
 u'48hrs',
 u'49',
 u'4cl',
 u'4h',
 u'4hrs',
 u'4k',
 u'4th',
 u'50',
 u'500',
 u'5000',
 u'500l',
 u'500m',
 u'500th',
 u'50ha',
 u'50ish',
 u'50ml',
 u'50s',
 u'50th',
 u'51',
 u'52',
 u'52pc',
 u'53',
 u'54',
 u'54ratings',
 u'55',
 u'56',
 u'57',
 u'570',
 u'5765',
 u'58',
 u'59',
 u'5abv',
 u'5hr',
 u'5hrs',
 u'5k',
 u'5l',
 u'5pc',
 u'5stars',
 u'5th',
 u'5x',
 u'5y',
 u'5years',
 u'5yrs',
 u'60',
 u'600',
 u'6000',
 u'600th',
 u'60ish',
 u'60s',
 u'60th',
 u'61',
 u'6100',
 u'61days',
 u'62',
 u'63',
 u'64',
 u'65',
 u'66',
 u'667',
 u'67',
 u'68',
 u'68pc',
 u'69',
 u'6l',
 u'6pm',
 u'6th',
 u'70',
 u'700',
 u'700th',
 u'707',
 u'70pc',
 u'70s',
 u'70yrs',
 u'71',
 u'72',
 u'73',
 u'74',
 u'75',
 u'750',
 u'750ml',
 u'750s',
 u'75ml',
 u'76',
 u'77',
 u'777',
 u'78',
 u'79',
 u'7hrs',
 u'7pm',
 u'7th',
 u'7years',
 u'7yrs',
 u'80',
 u'800',
 u'8000',
 u'800l',
 u'801',
 u'80s',
 u'80th',
 u'80usd',
 u'81',
 u'82',
 u'82s',
 u'83',
 u'83pts',
 u'83s',
 u'84',
 u'85',
 u'85pc',
 u'85pts',
 u'86',
 u'86pts',
 u'87',
 u'87pts',
 u'88',
 u'88ag',
 u'88pt',
 u'88pts',
 u'88rp',
 u'88st',
 u'89',
 u'890',
 u'89pts',
 u'89st',
 u'89ws',
 u'8hrs',
 u'8th',
 u'8years',
 u'8yr',
 u'8yrs',
 u'90',
 u'900',
 u'904',
 u'90daysoulmate',
 u'90ish',
 u'90min',
 u'90pt',
 u'90pts',
 u'90s',
 u'90wa',
 u'90ws',
 u'91',
 u'91pts',
 u'91ws',
 u'92',
 u'92pts',
 u'92rp',
 u'92we',
 u'92ws',
 u'93',
 u'93pt',
 u'93pts',
 u'93tanzer',
 u'93w',
 u'94',
 u'94pt',
 u'94pts',
 u'94suckling',
 u'94ws',
 u'95',
 u'95pt',
 u'95pts',
 u'95ws',
 u'96',
 u'96pts',
 u'96we',
 u'96ws',
 u'97',
 u'97pts',
 u'98',
 u'98pts',
 u'98s',
 u'99',
 u'994',
 u'999',
 u'99ha',
 u'99pts',
 u'9am',
 u'9pv',
 u'9th',
 u'a1',
 u'aab', 'pinot', 'noir', 'cabernet', 'sauvignon', 'great', 'wine', 'nice', 'needs', 'time', 'cab', 'franc', 
'red', 'ruby', 'dark', 'good', 'nose', 'bottle', 'palate', 'little', 'deep', 'purple', 'best', 'wines', 'years', 
'colour', 'excellent'])

In [9]:
review

0        Great Syrah. Inky purple. I prefer a bit dryer...
1        Jonata is referred to as"blood Syrah" for its ...
2        I wasn't searching for THE Syrah, but boom, I ...
3                    Paired with filet mignon. Incredible.
4                                  Very smooth and mellow.
5        Very few syrahs are made better and more opule...
6        Dark color and dark fruit.   It has aged well ...
7        I savored every single drop of this wine (and ...
8        awesome fruit, excellent structure, great tann...
9                                                    94pts
10       Took a little time to open up but once it did,...
11          Great wine, bold, fragrant, and strong flavor.
12       Hmm.  I have to agree with winery as to the io...
13                                                 Smooth!
14       A super Cab Franc. Big in the mouth, subtle in...
15                          Jonata does the trick tonight.
16       Big enough to need a good decant. Exceptionall.

In [154]:
# stop_words = list(stop_words)
# review = [x.lower() for x in review]

# from gensim import corpora, models, matutils
# from collections import defaultdict

# # remove words that appear only once
# frequency = defaultdict(int)

# for text in review:
#     for token in text.split():
#         frequency[token] += 1

# texts = [[token for token in text.split() if frequency[token] > 1 and token not in stop_words]
#           for text in review]

# # Create gensim dictionary object
# dictionary = corpora.Dictionary(texts)

# # Create corpus matrix
# corpus = [dictionary.doc2bow(text) for text in texts]

In [45]:
vectorizer = CountVectorizer(stop_words= stop_words, ngram_range=(2,2))
X = vectorizer.fit_transform(review)
X.todense()

# use the counter option to get the most common words. 
from collections import Counter
# Count the most common tokens 
summaries = "".join(red_clean['red_review'])
vocab_summaries = vectorizer.build_analyzer()(summaries)
red_vocab = Counter(vocab_summaries).most_common(100)
red_vocab = dict(red_vocab)

vectorizer2 = CountVectorizer(vocabulary=red_vocab.keys(), stop_words= stop_words, ngram_range=(2,2))
X2 = vectorizer2.fit_transform(review)
X2.shape
# X2.todense()

# vocab2 = {v: k for k, v in vectorizer2.vocabulary_.iteritems()}

# lda2 = models.LdaModel(
#    matutils.Sparse2Corpus(X2, documents_columns=False),
# #     corpus,
#    num_topics  =  10,
#    passes      =  2,
#    id2word     =  vocab2
# #     id2word     =  dictionary
# )

# lda2.print_topics(num_topics=10, num_words=10)

(87063, 100)

In [41]:
bow = []
blank_documents = 0
for document in X2.toarray():
    
    single_document = []
    
    for token_id, token_count in enumerate(document):

        if token_count > 0:
            single_document.append((token_id, token_count))
        # print single_document
    if len(single_document):
        bow.append(single_document)
    else:
        blank_documents += 1
        bow.append([])

In [42]:
len(bow), blank_documents

(87063, 60986)

In [43]:
blank_docs = 0

for document in X2.toarray():
    
    if sum(document) == 0:
        blank_docs += 1
        
blank_docs

60986

In [24]:
lda2.get_document_topics(bow[0])

[(0, 0.10000000000000002),
 (1, 0.10000000000000002),
 (2, 0.10000000000000002),
 (3, 0.10000000000000002),
 (4, 0.10000000000000002),
 (5, 0.10000000000000002),
 (6, 0.10000000000000002),
 (7, 0.10000000000000002),
 (8, 0.10000000000000002),
 (9, 0.10000000000000002)]

In [14]:
topic_proba =lda2.get_document_topics(bow)

In [18]:
X2.shape

(87063, 100)

In [16]:
review.shape

(87063,)

In [15]:
simplelist =[]
i=0
for x in topic_proba:
    i+=1
    if i % 250 == 0:
        print i
    simplelist.append(x)

# list of lists , inner list, each one should be only 10 (topic numbers) values (non-tuples)
valuelist = [[y[1] for y in line] for line in simplelist]

pd.DataFrame(valuelist, columns = ['t1','t2','t3','t4','t5','t6','t7','t8','t9','t10'])

250
500
750
1000
1250
1500
1750
2000
2250
2500
2750
3000
3250
3500
3750
4000
4250
4500
4750
5000
5250
5500
5750
6000
6250
6500
6750
7000
7250
7500
7750
8000
8250
8500
8750
9000
9250
9500
9750
10000
10250
10500
10750
11000
11250
11500
11750
12000
12250
12500
12750
13000
13250
13500
13750
14000
14250
14500
14750
15000
15250
15500
15750
16000
16250
16500
16750
17000
17250
17500
17750
18000
18250
18500
18750
19000
19250
19500
19750
20000
20250
20500
20750
21000
21250
21500
21750
22000
22250
22500
22750
23000
23250
23500
23750
24000
24250
24500
24750
25000
25250
25500
25750
26000


Unnamed: 0,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10
0,0.366667,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.366667
1,0.050000,0.050000,0.050000,0.050000,0.050000,0.050005,0.050000,0.050000,0.549995,0.050000
2,0.050000,0.050000,0.050000,0.050000,0.050000,0.050005,0.050000,0.050000,0.549995,0.050000
3,0.050000,0.050000,0.050000,0.050000,0.050000,0.550000,0.050000,0.050000,0.050000,0.050000
4,0.050000,0.050000,0.050000,0.050000,0.550000,0.050000,0.050000,0.050000,0.050000,0.050000
5,0.025000,0.525000,0.025000,0.025000,0.025000,0.025000,0.025000,0.275000,0.025000,0.025000
6,0.366667,0.033333,0.033333,0.366667,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333
7,0.050000,0.050000,0.050000,0.050000,0.050000,0.050005,0.050000,0.050000,0.549995,0.050000
8,0.033333,0.033333,0.366667,0.033333,0.366667,0.033333,0.033333,0.033333,0.033333,0.033333
9,0.550000,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000


In [201]:
topics_labels = {
   0: "Topic 1",
   1: "Topic 2",
   2: "Topic 3", 
    3: "Topic 4", 
    4: "Topic 5", 
    5: "Topic 6",
    6: "Topic 7",
    7: "Topic 8",
    8: "Topic 9",
    9: "Topic 10"
}

In [202]:
doc_topics = [lda2.get_document_topics(doc) for doc in corpus]

topic_data = []

for document_id, topics in enumerate(doc_topics):
    
    document_topics = []
    
    for topic, probability in topics:
       
        topic_data.append({
            'document_id':  document_id,
            'topic_id':     topic,
            'topic':        topics_labels[topic],
            'probability':  probability
        })

topics_df = pd.DataFrame(topic_data)

In [203]:
topics_df

Unnamed: 0,document_id,probability,topic,topic_id
0,0,0.100000,Topic 1,0
1,0,0.100000,Topic 2,1
2,0,0.100000,Topic 3,2
3,0,0.100000,Topic 4,3
4,0,0.100000,Topic 5,4
5,0,0.100000,Topic 6,5
6,0,0.100000,Topic 7,6
7,0,0.100000,Topic 8,7
8,0,0.100000,Topic 9,8
9,0,0.100000,Topic 10,9


In [142]:
topics_df = topics_df.pivot_table(values="probability", index=["document_id", "topic"]).T

In [143]:
topics_df

document_id  topic   
0            Topic 10    0.362121
             Topic 8     0.565142
1            Topic 1     0.112341
             Topic 10    0.426172
             Topic 2     0.350706
             Topic 7     0.090779
2            Topic 10    0.409382
             Topic 2     0.160495
             Topic 3     0.376262
3            Topic 1     0.819957
             Topic 10    0.020001
             Topic 2     0.020003
             Topic 3     0.020002
             Topic 4     0.020028
             Topic 5     0.020000
             Topic 6     0.020000
             Topic 7     0.020006
             Topic 8     0.020002
             Topic 9     0.020001
4            Topic 1     0.033333
             Topic 10    0.033358
             Topic 2     0.033333
             Topic 3     0.033333
             Topic 4     0.699959
             Topic 5     0.033333
             Topic 6     0.033333
             Topic 7     0.033333
             Topic 8     0.033339
             Topic 9     0

# Next Steps:

I need to do some more anaylsis on the topics that are going into my dataframe and check the amount of information I am getting. 

Once I clean that piece up:

1. Heat map of topic correlations - what topics are dominant
2. group by topics - see distrubtion of words within docs given the topic - plot - this will tell you what the key words mean