In [3]:
import MySQLdb
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk

In [None]:
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer
from fuzzywuzzy import fuzz

In [4]:
CONNECTION = MySQLdb.connect("localhost", "eodonnell", "IrishThunder", "equity")
CURSOR = CONNECTION.cursor()
GETDATA = """select concat(t.abbrv,'-S',e.season,'-E',e.episode_number,' ',e.title) as X, 
REPLACE(synopsis,'\n','') as synopsis 
from episode e 
inner join tv_show t on e.show_id = t.show_id 
where synopsis is not null and t.show_id = 1
order by t.abbrv,e.season,e.episode_number limit 100000"""


In [5]:
CURSOR.execute(GETDATA)
rows=CURSOR.fetchall()
episodes=[]
ids=[]
for row in rows:
    ids.append(row[0])
    episodes.append(row[1])

In [6]:
documents = pd.DataFrame(episodes, columns = ['synopsis']) 
documents['index'] = documents.index
documents['episode_len'] = documents['synopsis'].apply(len)

In [7]:
documents.head()

Unnamed: 0,synopsis,index,episode_len
0,Thirteen years before James T. Kirk takes comm...,0,2969
1,The U.S.S. Enterprise arrives at planet M-113 ...,1,2492
2,"As Charles Evans, the lone survivor of a crash...",2,1500
3,"Some years before this second pilot, the S.S. ...",3,1994
4,Spock and Joe Tormolen beam down to planet Psi...,4,1432


### Data Preprocessing

#### Lemmatize example

In [40]:
print(WordNetLemmatizer().lemmatize('went', pos='v'))

go


#### Stemmer Example

In [41]:
stemmer = SnowballStemmer('english')
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [79]:
doc_sample = documents[documents['index'] == 4].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)

original document: 
['Spock', 'and', 'Joe', 'Tormolen', 'beam', 'down', 'to', 'planet', 'Psi', '2000', 'to', 'pick', 'up', 'a', 'research', 'party', 'before', 'the', 'planet', 'disintegrates.', 'They', 'find', 'everyone', 'has', 'died,', 'frozen', 'when', 'life', 'support', 'was', 'turned', 'off.', 'Even', 'stranger,', 'the', 'positions', 'of', 'the', "researcher's", 'bodies', 'show', 'they', 'were', 'out', 'of', 'their', 'minds', 'when', 'they', 'perished', 'and', 'some', 'died', 'by', 'suicide.Unknowingly,', 'Tormolen', 'carried', 'what', 'is', 'later', 'termed', 'the', 'Psi', '2000', 'virus,', 'back', 'to', 'the', 'U.S.S.', 'Enterprise', 'and', 'it', 'spreads', 'among', 'the', 'crew', 'at', 'an', 'alarming', 'rate,', 'reaching', 'into', 'their', 'souls', 'and', 'pulling', 'out', 'their', 'deepest', 'desires', 'for', 'public', 'display.', 'The', 'virus', 'is', 'water', 'borne', 'and', 'spread', 'by', 'perspiration', 'in', 'a', 'touch.', 'While', 'trying', 'to', 'stop', 'Tormolen', 'f

In [80]:
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))



 tokenized and lemmatized document: 
['spock', 'tormolen', 'beam', 'planet', 'pick', 'research', 'parti', 'planet', 'disintegr', 'die', 'freez', 'life', 'support', 'turn', 'stranger', 'posit', 'research', 'bodi', 'mind', 'perish', 'die', 'suicid', 'unknow', 'tormolen', 'carri', 'later', 'term', 'virus', 'enterpris', 'spread', 'crew', 'alarm', 'rate', 'reach', 'soul', 'pull', 'deepest', 'desir', 'public', 'display', 'virus', 'water', 'bear', 'spread', 'perspir', 'touch', 'tri', 'stop', 'tormolen', 'kill', 'sulu', 'kevin', 'riley', 'infect', 'virus', 'crew', 'begin', 'display', 'action', 'humor', 'exagger', 'desir', 'like', 'sulu', 'threaten', 'bridg', 'fenc', 'foil', 'kirk', 'over', 'romant', 'crew', 'heart', 'wrench', 'like', 'christin', 'chapel', 'admit', 'love', 'spock', 'vulcan', 'weep', 'uncontrol', 'deadliest', 'result', 'riley', 'declar', 'captain', 'enterpris', 'lock', 'engin', 'unfortun', 'lock', 'command', 'ship', 'engin', 'riley', 'shut', 'enterpris', 'pull', 'errat', 'grav

In [81]:
processed_docs = documents['synopsis'].map(preprocess)

In [82]:
processed_docs[:10]

0    [thirteen, year, jam, kirk, take, command, ent...
1    [enterpris, arriv, planet, deliv, suppli, robe...
2    [charl, evan, lone, survivor, crash, colon, ex...
3    [year, second, pilot, valiant, encount, unknow...
4    [spock, tormolen, beam, planet, pick, research...
5    [orbit, planet, alfa, enterpris, experi, trans...
6    [enterpris, pursu, unknown, ship, asteroid, be...
7    [enterpris, arriv, orbit, search, exobiologist...
8    [enterpris, answer, distress, signal, unnam, p...
9    [simon, gelder, psychiatr, staff, tantalus, pe...
Name: synopsis, dtype: object

### Bag of words on the dataset

In [83]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [84]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

442 amok
2017 jail
1134 sound
608 sever
810 actor
787 meaningless
2183 tend
2158 sole
2061 util
738 larg
407 humor


In [85]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [86]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [87]:
bow_doc_4 = bow_corpus[4]
for i in range(len(bow_doc_4)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4[i][0], 
                                                     dictionary[bow_doc_4[i][0]], 
                                                     bow_doc_4[i][1]))

Word 2 ("command") appears 1 time.
Word 7 ("kill") appears 1 time.
Word 12 ("parti") appears 1 time.
Word 25 ("begin") appears 1 time.
Word 27 ("bodi") appears 1 time.
Word 31 ("find") appears 1 time.
Word 41 ("like") appears 2 time.
Word 44 ("tri") appears 1 time.
Word 45 ("vulcan") appears 1 time.
Word 49 ("life") appears 1 time.
Word 52 ("manag") appears 1 time.
Word 53 ("stop") appears 1 time.


### TF-IDF

In [88]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)

In [89]:
corpus_tfidf = tfidf[bow_corpus]

In [90]:
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.13261549420372357),
 (1, 0.25609843272337907),
 (2, 0.15322925836603835),
 (3, 0.23763489685649733),
 (4, 0.13261549420372357),
 (5, 0.17195811942271177),
 (6, 0.569624676963601),
 (7, 0.07647215428295387),
 (8, 0.1698400168137915),
 (9, 0.1698400168137915),
 (10, 0.1653284426228654),
 (11, 0.24735461636845363),
 (12, 0.07120308462045012),
 (13, 0.3841476490850686),
 (14, 0.10075503480226164),
 (15, 0.1591008185361725),
 (16, 0.14240616924090024),
 (17, 0.08202617374558821),
 (18, 0.12367730818422681),
 (19, 0.1698400168137915),
 (20, 0.17579546783144476),
 (21, 0.07380383966190238),
 (22, 0.13261549420372357)]


### Running LDA using Bag of Words

In [108]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=100, id2word=dictionary, passes=2, workers=2)

In [109]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.149*"return" + 0.081*"kill" + 0.081*"abl" + 0.080*"find" + 0.079*"parti" + 0.079*"transport" + 0.044*"life" + 0.036*"realiz" + 0.034*"take" + 0.032*"forc"
Topic: 1 
Words: 0.149*"life" + 0.111*"space" + 0.076*"earth" + 0.039*"arriv" + 0.039*"aboard" + 0.039*"attempt" + 0.038*"inhabit" + 0.038*"attack" + 0.038*"destroy" + 0.038*"convinc"
Topic: 2 
Words: 0.015*"like" + 0.015*"destroy" + 0.015*"attempt" + 0.015*"vulcan" + 0.015*"tri" + 0.015*"peopl" + 0.015*"make" + 0.015*"help" + 0.015*"come" + 0.015*"tell"
Topic: 3 
Words: 0.085*"space" + 0.082*"transport" + 0.067*"land" + 0.065*"parti" + 0.055*"aboard" + 0.048*"take" + 0.047*"time" + 0.045*"soon" + 0.041*"scotti" + 0.040*"leav"
Topic: 4 
Words: 0.117*"life" + 0.096*"creatur" + 0.066*"discov" + 0.066*"like" + 0.066*"human" + 0.065*"parti" + 0.065*"year" + 0.033*"remain" + 0.033*"destroy" + 0.033*"land"
Topic: 5 
Words: 0.063*"discov" + 0.063*"convinc" + 0.061*"arriv" + 0.061*"like" + 0.042*"realiz" + 0.041*"parti" + 

Cool! Can you distinguish different topics using the words in each topic and their corresponding weights?

### Running LDA using TF-IDF

In [110]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=100, id2word=dictionary, passes=2, workers=4)

In [111]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.015*"like" + 0.015*"destroy" + 0.015*"attempt" + 0.015*"vulcan" + 0.015*"tri" + 0.015*"peopl" + 0.015*"make" + 0.015*"help" + 0.015*"come" + 0.015*"tell"
Topic: 1 Word: 0.015*"like" + 0.015*"destroy" + 0.015*"attempt" + 0.015*"vulcan" + 0.015*"tri" + 0.015*"peopl" + 0.015*"make" + 0.015*"help" + 0.015*"come" + 0.015*"tell"
Topic: 2 Word: 0.015*"like" + 0.015*"destroy" + 0.015*"attempt" + 0.015*"vulcan" + 0.015*"tri" + 0.015*"peopl" + 0.015*"make" + 0.015*"help" + 0.015*"come" + 0.015*"tell"
Topic: 3 Word: 0.015*"like" + 0.015*"destroy" + 0.015*"attempt" + 0.015*"vulcan" + 0.015*"tri" + 0.015*"peopl" + 0.015*"make" + 0.015*"help" + 0.015*"come" + 0.015*"tell"
Topic: 4 Word: 0.015*"like" + 0.015*"destroy" + 0.015*"attempt" + 0.015*"vulcan" + 0.015*"tri" + 0.015*"peopl" + 0.015*"make" + 0.015*"help" + 0.015*"come" + 0.015*"tell"
Topic: 5 Word: 0.015*"like" + 0.015*"destroy" + 0.015*"attempt" + 0.015*"vulcan" + 0.015*"tri" + 0.015*"peopl" + 0.015*"make" + 0.015*"help" + 0.

### Classification of the topics

### Performance evaluation by classifying sample document using LDA Bag of Words model

In [112]:
processed_docs[4]

['spock',
 'tormolen',
 'beam',
 'planet',
 'pick',
 'research',
 'parti',
 'planet',
 'disintegr',
 'die',
 'freez',
 'life',
 'support',
 'turn',
 'stranger',
 'posit',
 'research',
 'bodi',
 'mind',
 'perish',
 'die',
 'suicid',
 'unknow',
 'tormolen',
 'carri',
 'later',
 'term',
 'virus',
 'enterpris',
 'spread',
 'crew',
 'alarm',
 'rate',
 'reach',
 'soul',
 'pull',
 'deepest',
 'desir',
 'public',
 'display',
 'virus',
 'water',
 'bear',
 'spread',
 'perspir',
 'touch',
 'tri',
 'stop',
 'tormolen',
 'kill',
 'sulu',
 'kevin',
 'riley',
 'infect',
 'virus',
 'crew',
 'begin',
 'display',
 'action',
 'humor',
 'exagger',
 'desir',
 'like',
 'sulu',
 'threaten',
 'bridg',
 'fenc',
 'foil',
 'kirk',
 'over',
 'romant',
 'crew',
 'heart',
 'wrench',
 'like',
 'christin',
 'chapel',
 'admit',
 'love',
 'spock',
 'vulcan',
 'weep',
 'uncontrol',
 'deadliest',
 'result',
 'riley',
 'declar',
 'captain',
 'enterpris',
 'lock',
 'engin',
 'unfortun',
 'lock',
 'command',
 'ship',
 'engi

In [113]:
for index, score in sorted(lda_model[bow_corpus[4]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.9292855858802795	 
Topic: 0.147*"like" + 0.074*"parti" + 0.074*"command" + 0.074*"tri" + 0.074*"vulcan" + 0.074*"kill" + 0.074*"begin" + 0.074*"life" + 0.074*"manag" + 0.074*"stop"


Our test document has the highest probability to be part of the topic on the top.

### Performance evaluation by classifying sample document using LDA TF-IDF model

In [114]:
for index, score in sorted(lda_model_tfidf[bow_corpus[4]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.41271159052848816	 
Topic: 0.088*"parti" + 0.085*"kill" + 0.080*"life" + 0.073*"control" + 0.066*"dead" + 0.064*"feder" + 0.063*"land" + 0.059*"escap" + 0.053*"peopl" + 0.049*"find"

Score: 0.20200461149215698	 
Topic: 0.170*"bodi" + 0.119*"time" + 0.112*"offic" + 0.108*"command" + 0.063*"soon" + 0.061*"begin" + 0.036*"take" + 0.035*"remain" + 0.028*"orbit" + 0.025*"fall"

Score: 0.15605366230010986	 
Topic: 0.150*"human" + 0.129*"like" + 0.109*"final" + 0.105*"realiz" + 0.095*"discov" + 0.059*"know" + 0.048*"convinc" + 0.043*"arriv" + 0.035*"fall" + 0.024*"remain"

Score: 0.07846108824014664	 
Topic: 0.142*"power" + 0.113*"space" + 0.104*"appear" + 0.090*"manag" + 0.085*"earth" + 0.076*"learn" + 0.059*"destroy" + 0.058*"orbit" + 0.056*"leav" + 0.054*"explain"

Score: 0.07769134640693665	 
Topic: 0.171*"creatur" + 0.127*"vulcan" + 0.084*"year" + 0.083*"tell" + 0.074*"attack" + 0.067*"peopl" + 0.063*"forc" + 0.061*"know" + 0.058*"order" + 0.040*"make"


Our test document has the highest probability to be part of the topic on the top.

### Testing model on unseen document

In [116]:
##### get random synopsis for testing
GETDATA = """select REPLACE(e.synopsis,'\n','') as synopsis 
from episode e
inner join tv_show t on e.show_id = t.show_id 
where synopsis is not null and t.show_id = 2
order by rand() limit 1"""
CURSOR.execute(GETDATA)
rows=CURSOR.fetchall()
test_synopsis=[]
for row in rows:
    test_synopsis.append(row[0])
test_synopsis

["When Wesley's experiment with a warp field goes awry, Dr.                     Crusher is unknowingly catapulted into a universe created by                     her own mind.                                         After escorting her friend Dr. Dalen Quaice aboard the U.S.S.                     Enterprise, Beverly is deeply moved by his sadness over                     losing his wife and other people he has loved. She seeks out                     her son Wesley, and finds him in engineering, working with a                     warp bubble on a computer screen. While she watches, his                     experiment inadvertently causes a strange flash of light.                     Soon afterward, Dr. Crusher is unable to locate Dr. Quaice.                     The ship's computer reports that he has never been on board,                     and Picard and Worf inform her that they had no knowledge of                     his presence on the ship. Additionally, O'Brien does not            

In [118]:
unseen_document = test_synopsis[0]
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 12)))

Score: 0.37104618549346924	 Topic: 0.065*"vulcan" + 0.065*"go" + 0.062*"control" + 0.041*"surfac" + 0.039*"live" + 0.035*"explain" + 0.033*"final" + 0.031*"bodi" + 0.031*"tell" + 0.030*"find" + 0.029*"parti" + 0.029*"peopl"
Score: 0.3435274064540863	 Topic: 0.074*"starship" + 0.074*"return" + 0.074*"send" + 0.050*"scotti" + 0.050*"refus" + 0.050*"tell" + 0.050*"aboard" + 0.049*"transport" + 0.025*"land" + 0.025*"convinc" + 0.025*"begin" + 0.025*"order"
Score: 0.2519777715206146	 Topic: 0.080*"return" + 0.066*"help" + 0.066*"peopl" + 0.064*"surfac" + 0.054*"inhabit" + 0.051*"earth" + 0.049*"soon" + 0.048*"find" + 0.037*"bodi" + 0.034*"control" + 0.033*"time" + 0.033*"transport"
