In [1]:
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import gensim
from gensim.corpora import Dictionary
from gensim.models import LsiModel
from gensim.matutils import corpus2csc


df = pd.read_csv("QP7.csv")
data = df.sample(n=1000, axis=0)
data = data['Question']


nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')


lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess(text):
    text = text.lower()
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    import re
    special_chars = r'[,.:;?\(\'"\s]'
    words = [re.sub(special_chars, '', word) for word in words]
    return words


data = data.apply(preprocess)


dictionary = Dictionary(data)


dictionary.filter_extremes(no_below=5, no_above=0.5)


bow_corpus = [dictionary.doc2bow(text) for text in data]


num_topics = 5
lsamodel = LsiModel(bow_corpus, num_topics=num_topics, id2word=dictionary)


topics = lsamodel.show_topics(num_topics=num_topics, num_words=10)


top_topics = []
for topic in topics:
    top_topics.append(topic[1])


print("Top 5 LSA Topics:")
for i, topic in enumerate(top_topics, start=1):
    print("Topic {}: {}".format(i, topic))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\RVCCF021\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\RVCCF021\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RVCCF021\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Top 5 LSA Topics:
Topic 1: -0.886*"best" + -0.196*"way" + -0.169*"s" + -0.149*"get" + -0.130*")" + -0.100*"book" + -0.097*"thing" + -0.090*"learn" + -0.082*"``" + -0.080*"one"
Topic 2: 0.569*"``" + -0.318*"best" + 0.315*"s" + 0.314*"one" + 0.268*"get" + 0.200*")" + 0.183*"nt" + 0.155*"like" + 0.130*"need" + 0.115*"good"
Topic 3: -0.721*"get" + 0.519*"``" + -0.177*"university" + -0.133*")" + -0.119*"like" + 0.111*"best" + 0.108*"s" + -0.093*"job" + -0.090*"engineering" + -0.089*"back"
Topic 4: 0.448*")" + 0.444*"like" + -0.430*"get" + 0.356*"india" + -0.302*"``" + 0.201*"time" + 0.148*"people" + 0.112*"much" + 0.111*"country" + 0.078*"think"
Topic 5: 0.501*"s" + 0.444*"one" + -0.428*"``" + -0.420*")" + -0.155*"university" + -0.118*"get" + 0.109*"trump" + 0.106*"life" + 0.099*"need" + 0.095*"donald"
