<a href="https://colab.research.google.com/github/dhairyaostwal/CSE4020-Codes/blob/main/DA2/ML_Theory_DA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [113]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.feature_extraction import text
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
import re
import string

In [114]:
file = 'inaug_speeches.csv'
import chardet
with open(file, 'rb') as rawdata:
    result = chardet.detect(rawdata.read(100000))
result

{'confidence': 0.73, 'encoding': 'ISO-8859-1', 'language': ''}

In [115]:
# Using the dataset of inaugural speeches by US President obtained from here
# https://www.kaggle.com/datasets/adhok93/presidentialaddress

# expand pandas df column display width to enable easy inspection
pd.set_option('max_colwidth', 150)

# read in csv to dataframe
df = pd.read_csv(file, encoding='ISO-8859-1')

# visually inspect dataframe
df.head()

Unnamed: 0.1,Unnamed: 0,Name,Inaugural Address,Date,text
0,4,George Washington,First Inaugural Address,"Thursday, April 30, 1789",Fellow-Citizens of the Senate and of the House of Representatives: AMONG the vicissitudes incident to life no event could have fille...
1,5,George Washington,Second Inaugural Address,"Monday, March 4, 1793",Fellow Citizens: I AM again called upon by the voice of my country to execute the functions of its Chief Magistrate. When the occas...
2,6,John Adams,Inaugural Address,"Saturday, March 4, 1797","WHEN it was first perceived, in early times, that no middle course for America remained between unlimited submission to a foreign le..."
3,7,Thomas Jefferson,First Inaugural Address,"Wednesday, March 4, 1801","Friends and Fellow-Citizens: CALLED upon to undertake the duties of the first executive office of our country, I avail myself of th..."
4,8,Thomas Jefferson,Second Inaugural Address,"Monday, March 4, 1805","PROCEEDING, fellow-citizens, to that qualification which the Constitution requires before my entrance on the charge again conferred ..."


## Isolate Data to Topic Model

In [116]:
# Select Rows that are first term inaugural addresses
df = df.drop_duplicates(subset=['Name'], keep='first')

# Clean Up Index
df = df.reset_index()

# Select only President's Names and their Speeches
df = df[['Name', 'text']]

# Set Index to President's Names
df = df.set_index('Name')

# Visually Inspect
df.head()

Unnamed: 0_level_0,text
Name,Unnamed: 1_level_1
George Washington,Fellow-Citizens of the Senate and of the House of Representatives: AMONG the vicissitudes incident to life no event could have fille...
John Adams,"WHEN it was first perceived, in early times, that no middle course for America remained between unlimited submission to a foreign le..."
Thomas Jefferson,"Friends and Fellow-Citizens: CALLED upon to undertake the duties of the first executive office of our country, I avail myself of th..."
James Madison,"UNWILLING to depart from examples of the most revered authority, I avail myself of the occasion now presented to express the profoun..."
James Monroe,I SHOULD be destitute of feeling if I was not deeply affected by the strong proof which my fellow-citizens have given me of their co...


## Cleaning Data

In [117]:
def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, 
    remove punctuation, remove read errors,
    and remove words containing numbers.'''    
    text = text.lower()
    text = re.sub('\[.*?\]', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('\w*\d\w*', ' ', text)
    text = re.sub('�', ' ', text)    
    return text

round1 = lambda x: clean_text_round1(x)

# Clean Speech Text
df["text"] = df["text"].apply(round1)

# Visually Inspect
df.head()

Unnamed: 0_level_0,text
Name,Unnamed: 1_level_1
George Washington,fellow citizens of the senate and of the house of representatives among the vicissitudes incident to life no event could have fille...
John Adams,when it was first perceived in early times that no middle course for america remained between unlimited submission to a foreign le...
Thomas Jefferson,friends and fellow citizens called upon to undertake the duties of the first executive office of our country i avail myself of th...
James Madison,unwilling to depart from examples of the most revered authority i avail myself of the occasion now presented to express the profoun...
James Monroe,i should be destitute of feeling if i was not deeply affected by the strong proof which my fellow citizens have given me of their co...


## Preprocessing/Lemmatize

In [118]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
  
# Noun extract and lemmatize function

def nouns(text):
    # Given a string of text, tokenize the text 
    # and pull out only the nouns.
    # create mask to isolate words that are nouns
    is_noun = lambda pos: pos[:2] == 'NN'    
    
    # store function to split string of words 
    # into a list of words (tokens)
    tokenized = word_tokenize(text)
    # print(tokenized)    
    
    # store function to lemmatize each word
    wordnet_lemmatizer = WordNetLemmatizer()    
    
    # use list comprehension to lemmatize all words 
    # and create a list of all nouns
    all_nouns = [wordnet_lemmatizer.lemmatize(word) \
    for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    
    #return string of joined list of nouns
    return ' '.join(all_nouns)
    
# Create dataframe of only nouns from speeches
data_nouns = pd.DataFrame(df.text.apply(nouns))

# Visually Inspect
data_nouns.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0_level_0,text
Name,Unnamed: 1_level_1
George Washington,citizen senate house representative vicissitude incident life event anxiety notification order day month hand i country voice i veneration love re...
John Adams,time course america submission legislature independence claim men reflection danger power fleet army contest dissension form government part count...
Thomas Jefferson,friend citizen duty executive office country i myself presence portion fellow citizen thanks favor consciousness task talent presentiment greatnes...
James Madison,example authority i myself occasion profound impression call country station duty i sanction mark confidence proceeding deliberate suffrage nation...
James Monroe,i destitute i proof fellow citizen confidence office function expression opinion conduct service gratification sensibility estimate importance tru...


In [119]:
type(data_nouns)

pandas.core.frame.DataFrame

In [120]:
data_nouns['text'].head()

Name
George Washington    citizen senate house representative vicissitude incident life event anxiety notification order day month hand i country voice i veneration love re...
John Adams           time course america submission legislature independence claim men reflection danger power fleet army contest dissension form government part count...
Thomas Jefferson     friend citizen duty executive office country i myself presence portion fellow citizen thanks favor consciousness task talent presentiment greatnes...
James Madison        example authority i myself occasion profound impression call country station duty i sanction mark confidence proceeding deliberate suffrage nation...
James Monroe         i destitute i proof fellow citizen confidence office function expression opinion conduct service gratification sensibility estimate importance tru...
Name: text, dtype: object

In [121]:
text_for_word2vec = data_nouns['text'];

f = open("word2vec_file.txt", "w")
f.write(str(text_for_word2vec))
f.close()

#open and read the file after the appending:
f = open("word2vec_file.txt", "r")
text = f.read()
print(text)

Name
George Washington         citizen senate house representative vicissitude incident life event anxiety notification order day month hand i country voice i veneration love re...
John Adams                time course america submission legislature independence claim men reflection danger power fleet army contest dissension form government part count...
Thomas Jefferson          friend citizen duty executive office country i myself presence portion fellow citizen thanks favor consciousness task talent presentiment greatnes...
James Madison             example authority i myself occasion profound impression call country station duty i sanction mark confidence proceeding deliberate suffrage nation...
James Monroe              i destitute i proof fellow citizen confidence office function expression opinion conduct service gratification sensibility estimate importance tru...
John Quincy Adams         compliance coeval existence constitution example predecessor career i citizen presence he

## TF-IDF Vectorizer

In [129]:
# Add additional stop words since we are recreating the document-term matrix
stop_noun = ["america", 'today', 'thing']
stop_words_noun_agg = text.ENGLISH_STOP_WORDS.union(stop_noun)

# Create a document-term matrix with only nouns# Store TF-IDF Vectorizer
tv_noun = TfidfVectorizer(stop_words=stop_words_noun_agg, ngram_range = (1,1), max_df = .8, min_df = .01)

# Fit and Transform speech noun text to a TF-IDF Doc-Term Matrix
data_tv_noun = tv_noun.fit_transform(data_nouns.text)

# Create data-frame of Doc-Term Matrix with nouns as column names
data_dtm_noun = pd.DataFrame(data_tv_noun.toarray(), columns=tv_noun.get_feature_names())

# Set President's Names as Index
data_dtm_noun.index = df.index

# Visually inspect Document Term Matrix
data_dtm_noun.head()

AttributeError: ignored

## word2vec

In [124]:
# Python function to generate word vectors using Word2Vec

def generate_word2vec():
	# importing all necessary modules
	from nltk.tokenize import sent_tokenize, word_tokenize
	import warnings

	warnings.filterwarnings(action = 'ignore')

	import gensim
	from gensim.models import Word2Vec

	# Reads ‘alice.txt’ file
	sample = open("word2vec_file.txt", "r")
	s = sample.read()

	# Replaces escape character with space
	f = s.replace("\n", " ")

	data = []

	# iterate through each sentence in the file
	for i in sent_tokenize(f):
		temp = []
		
		# tokenize the sentence into words
		for j in word_tokenize(i):
			temp.append(j.lower())

		data.append(temp)

	# Create CBOW model
	model1 = gensim.models.Word2Vec(data, min_count = 1,
								size = 100, window = 5)

	# Print results
	print("Cosine similarity between 'citizen' " +
				"and 'duty' - CBOW : ",
		model1.similarity('citizen', 'duty'))
		
	print("Cosine similarity between 'countryman' " +
					"and 'oath' - CBOW : ",
		model1.similarity('countryman', 'oath'))

	print("Cosine similarity between 'citizen' " +
					"and 'world' - CBOW : ",
		model1.similarity('citizen', 'world'))

	# Create Skip Gram model
	model2 = gensim.models.Word2Vec(data, min_count = 1, size = 100,
												window = 5, sg = 1)

	# Print results
	print("Cosine similarity between 'citizen' " +
				"and 'duty' - Skip Gram : ",
		model1.similarity('citizen', 'duty'))
		
	print("Cosine similarity between 'countryman' " +
					"and 'oath' - Skip Gram : ",
		model1.similarity('countryman', 'oath'))

	print("Cosine similarity between 'citizen' " +
					"and 'world' - Skip Gram : ",
		model1.similarity('citizen', 'world'))
 

generate_word2vec()

Cosine similarity between 'citizen' and 'duty' - CBOW :  0.057460777
Cosine similarity between 'countryman' and 'oath' - CBOW :  0.037355673
Cosine similarity between 'citizen' and 'world' - CBOW :  0.16717038
Cosine similarity between 'citizen' and 'duty' - Skip Gram :  0.057460777
Cosine similarity between 'countryman' and 'oath' - Skip Gram :  0.037355673
Cosine similarity between 'citizen' and 'world' - Skip Gram :  0.16717038


## doc2vec

In [144]:
# Doc2Vec vectorised representation of a group of words taken collectively as a single unit

def generate_doc2vec():
  import gensim
  import gensim.downloader as api

  # preprocessing and cleaning data
  # f = open("word2vec_file.txt", "r", encoding='UTF-8')
  dataset = text
  print(dataset)
  data = dataset.split(' ')
  for i in data:
    if i =='':
      data.remove(i)

  def tagged_document(list_of_list_of_words):
    for i, list_of_words in enumerate(list_of_list_of_words):
        yield gensim.models.doc2vec.TaggedDocument(list_of_words, [i])

  data_for_training = list(tagged_document(data))
  print(data_for_training)
  print("\n")
  print(data_for_training[:1])
  model = gensim.models.doc2vec.Doc2Vec(vector_size=40, min_count=2, epochs=30)
  model.build_vocab(data_for_training)
  model.train(data_for_training, total_examples=model.corpus_count, epochs=model.epochs)
  print(model.infer_vector(['citizen', 'oath', 'countryman', 'world', 'duty','president']))


generate_doc2vec()

Name
George Washington         citizen senate house representative vicissitude incident life event anxiety notification order day month hand i country voice i veneration love re...
John Adams                time course america submission legislature independence claim men reflection danger power fleet army contest dissension form government part count...
Thomas Jefferson          friend citizen duty executive office country i myself presence portion fellow citizen thanks favor consciousness task talent presentiment greatnes...
James Madison             example authority i myself occasion profound impression call country station duty i sanction mark confidence proceeding deliberate suffrage nation...
James Monroe              i destitute i proof fellow citizen confidence office function expression opinion conduct service gratification sensibility estimate importance tru...
John Quincy Adams         compliance coeval existence constitution example predecessor career i citizen presence he

In [130]:
data_dtm_noun

Unnamed: 0_level_0,abandonment,abeyance,ability,abode,abraham,absence,absent,absolute,abstraction,abundance,...,wrongdoing,wrought,year,yes,yesterday,york,yorktown,youth,zeal,zone
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
George Washington,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.027063,0.0,0.0,0.0,0.0,0.0,0.0,0.0
John Adams,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.089073,0.0,0.0,0.0,0.0,0.0,0.038005,0.0
Thomas Jefferson,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.049602,0.0
James Madison,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
James Monroe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.03122,0.0,0.0,0.0,0.0,0.0,0.033302,0.050994
John Quincy Adams,0.0,0.0,0.022947,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.117175,0.0,0.0,0.0,0.0,0.0,0.031247,0.0
Andrew Jackson,0.0,0.0,0.048066,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.065452,0.0
Martin Van Buren,0.0,0.0,0.042237,0.0,0.0,0.0,0.044035,0.0,0.0,0.0,...,0.0,0.0,0.040439,0.0,0.0,0.0,0.0,0.0,0.0,0.0
William Henry Harrison,0.0,0.0,0.020825,0.019508,0.0,0.016733,0.0,0.0,0.0,0.0,...,0.0,0.0,0.039877,0.0,0.0,0.0,0.0,0.0,0.0,0.0
James Knox Polk,0.0,0.0,0.046367,0.028957,0.0,0.024837,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01973,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Separating Dataset into Testing and Training

Splitting into 30% Testing and 70% Training Dataset

In [33]:
from sklearn.model_selection import train_test_split

X = data_dtm_noun[:18]
y = data_dtm_noun.iloc[19:37]
print(X,"\n\n", "y:\n", y)
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.30, random_state=42)

                        abandonment  abeyance   ability     abode  abraham  \
Name                                                                         
George Washington          0.000000  0.000000  0.000000  0.000000      0.0   
John Adams                 0.000000  0.000000  0.000000  0.000000      0.0   
Thomas Jefferson           0.000000  0.000000  0.000000  0.000000      0.0   
James Madison              0.000000  0.000000  0.000000  0.000000      0.0   
James Monroe               0.000000  0.000000  0.000000  0.000000      0.0   
John Quincy Adams          0.000000  0.000000  0.022947  0.000000      0.0   
Andrew Jackson             0.000000  0.000000  0.048066  0.000000      0.0   
Martin Van Buren           0.000000  0.000000  0.042237  0.000000      0.0   
William Henry Harrison     0.000000  0.000000  0.020825  0.019508      0.0   
James Knox Polk            0.000000  0.000000  0.046367  0.028957      0.0   
Zachary Taylor             0.000000  0.000000  0.051987  0.00000

## NNMF

In [126]:
def generate_NMF():
  def display_topics(model, feature_names, num_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '", topic_names[ix], "'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-num_top_words - 1:-1]]))
        
  nmf_model = NMF(2)
  # Learn an NMF model for given Document Term Matrix 'V'
  # Extract the document-topic matrix 'W'
  doc_topic = nmf_model.fit_transform(Xtrain, ytrain)
  # Extract top words from the topic-term matrix 'H' display_topics(nmf_model, tv_noun.get_feature_names(), 5)
  display_topics(nmf_model, tv_noun.get_feature_names(), 10)

generate_NMF()


Topic  0
constitution, union, principle, institution, opinion, peace, territory, object, administration, protection

Topic  1
dollar, debt, payment, question, determination, ability, office, year, method, pride


## LDA

In [143]:
var = list(data_dtm_noun.columns)
var

['abandonment',
 'abeyance',
 'ability',
 'abode',
 'abraham',
 'absence',
 'absent',
 'absolute',
 'abstraction',
 'abundance',
 'abuse',
 'academy',
 'accept',
 'acceptance',
 'access',
 'accession',
 'accident',
 'accommodation',
 'accomplishment',
 'accord',
 'accordance',
 'account',
 'accountability',
 'accumulation',
 'achievement',
 'acknowledgment',
 'acquiescence',
 'acquisition',
 'act',
 'action',
 'activism',
 'activity',
 'addiction',
 'addition',
 'address',
 'adequate',
 'adherence',
 'adheres',
 'adjunct',
 'adjustment',
 'administration',
 'administrator',
 'admiration',
 'admission',
 'admonition',
 'adoption',
 'adorns',
 'advance',
 'advancement',
 'advantage',
 'adventure',
 'adversary',
 'adverse',
 'advice',
 'adviser',
 'advocacy',
 'advocate',
 'aegis',
 'affair',
 'affection',
 'affectionate',
 'affiliation',
 'affirmation',
 'affirms',
 'affliction',
 'afford',
 'afghanistan',
 'afield',
 'afloat',
 'aftermath',
 'age',
 'agency',
 'agent',
 'aggrandizement'

In [138]:
var

[Empty DataFrame
 Columns: [abandonment, abeyance, ability, abode, abraham, absence, absent, absolute, abstraction, abundance, abuse, academy, accept, acceptance, access, accession, accident, accommodation, accomplishment, accord, accordance, account, accountability, accumulation, achievement, acknowledgment, acquiescence, acquisition, act, action, activism, activity, addiction, addition, address, adequate, adherence, adheres, adjunct, adjustment, administration, administrator, admiration, admission, admonition, adoption, adorns, advance, advancement, advantage, adventure, adversary, adverse, advice, adviser, advocacy, advocate, aegis, affair, affection, affectionate, affiliation, affirmation, affirms, affliction, afford, afghanistan, afield, afloat, aftermath, age, agency, agent, aggrandizement, aggravation, aggregate, aggression, aggressor, agitate, agitation, agony, agreement, agriculture, aid, aim, air, airport, alacrity, alarm, alien, alienation, aliment, ality, allegation, allegh

In [142]:
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary

print(common_texts)

# Create a corpus from a list of texts
dictionary = Dictionary(common_texts)
corpus = [common_dictionary.doc2bow(text) for text in common_texts]

print(dictionary, "\n")
print(common_dictionary, "\n")
print(common_texts, "\n")
print(corpus)

# Train the model on the corpus.
lda = LdaModel(corpus, num_topics=10)

[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']]
Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) 

Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) 

[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']] 

[[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (5, 1), (7, 1), (8, 1)], [(1, 1), (5, 2), (8, 1)], [(3, 1), (6, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1

In [71]:
import numpy 

from gensim.test.utils import common_texts, common_dictionary
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

print(numpy.array(common_texts).shape)
print(numpy.array(common_dictionary).shape)

ct = text.split(' ')
cd = ['citizen', 'oath', 'countryman', 'world', 'duty','president']

# Create a corpus from a list of texts
dictionary = Dictionary(common_texts)
corpus = [cd for text in common_texts]

# Train the model on the corpus.
lda = LdaModel(corpus, num_topics=10)

lda.fit(X_train, y_train)

from sklearn.metrics import classification_report
print(classification_report(ytest, ytrain[:6]))


(9,)
(12,)


ValueError: ignored

In [73]:
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary

# Create a corpus from a list of texts
dictionary = Dictionary(common_texts)
corpus = [common_dictionary.doc2bow(text) for text in common_texts]

# Train the model on the corpus.
lda = LdaModel(corpus, num_topics=10)

In [None]:
# confusion matrix in sklearn
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# actual values
actual = [1,0,0,1,0,0,1,0,0,1]
# predicted values
predicted = [1,0,0,1,0,0,0,1,0,0]

# confusion matrix
matrix = confusion_matrix(actual,predicted, labels=[1,0])
print('Confusion matrix : \n',matrix)

# outcome values order in sklearn
tp, fn, fp, tn = confusion_matrix(actual,predicted,labels=[1,0]).reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy
matrix = classification_report(actual,predicted,labels=[1,0])
print('Classification report : \n',matrix)

## lda2vec

In [78]:
model = LDA2Vec(n_words, max_length, n_hidden, counts)
model.add_component(n_docs, n_topics, name='document id')
model.fit(clean, components=[doc_ids])
topics = model.prepare_topics('document_id', vocab)
prepared = pyLDAvis.prepare(topics)
pyLDAvis.display(prepared)

NameError: ignored

## LSA

In [145]:
from gensim.test.utils import common_dictionary, common_corpus
from gensim.models import LsiModel

model = LsiModel(common_corpus, id2word=common_dictionary)
vectorized_corpus = model[common_corpus] 

In [146]:
print(type(model))

<class 'gensim.models.lsimodel.LsiModel'>


In [None]:
model = LsiModel(common_corpus, id2word=common_dictionary)
W = model.fit_transform(X)
# print("W:\n", W)
H = model.fit_transform(ytest)

matrix = classification_report(W[:6], H, labels=[1,0])
print('Classification report : \n',matrix)