# TOPIC MODELING

### Importing libraries

In [238]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
import string
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import gensim
from gensim.corpora import Dictionary

In [211]:
df = pd.read_excel('topics.xlsx')

In [298]:
df.head(22)

Unnamed: 0,topic,content
0,covid,Coronaviruses are a large family of viruses th...
1,ai,"Artificial intelligence (AI), sometimes called..."
2,neural link,Neuralink is a device that will be surgically ...
3,gdp,\nWhat is Gross Domestic Product (GDP)?\nBy: F...
4,ipl,The Indian Premier League (IPL) is a professio...
5,ssr,The AIIMS medical board will hold a meeting wi...
6,jee,"The Ministry of Human Resource Development, Go..."
7,global warming,Global Warming is a term almost everyone is fa...
8,space,Indian space programme encompasses research in...
9,football,Football is a family of team sports that invol...


In [297]:
len(df['content'])

20

### Splitting data for training and testing

In [304]:
df_train = df[0:15]
df_test = df.iloc[15:21,-1:]

In [305]:
print("Training data: ",df_train.shape," Testing data: ",df_test.shape)

Training data:  (15, 2)  Testing data:  (5, 1)


### Data Preprocessing

In [306]:

def data_processing(text):
    
    #Lower case the text
    text = text.lower()
    
    #normalizing URLs
    text = re.sub(r'http\S+', '', text)
    
    #normalizing email ids
    text = re.sub('([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})', '', text)
    
    # replace dollar with dollar
    text = re.sub(r'£|\$', ' ', text)
    
    #replace mobile number with strig mobilenumber
    text = re.sub('(?:\+ *)?\d[\d\- ]{7,}\d', '', text)
    
    #replace any numbers with string 'numbr'
    text = re.sub(r'\d+(\.\d+)?', ' ',text)
    
    # Remove punctuation
    text = text.replace(r'[^\w\d\s]', ' ')
    text = text.replace('\\n', " ") 
    text = text.replace('\\\n', " ") 
    text = text.replace('\n', " ")
    text = text.replace(r'“', " ")
    text = text.replace(r'”', " ")
    
    
    # define punctuation
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    no_punct = ""
    for char in text:
        if char not in punctuations:
            no_punct = no_punct + char

    

    # Replace whitespace between terms with a single space
    text = no_punct.replace(r'\s+', ' ')

    # Remove leading and trailing whitespace
    text = text.replace(r'^\s+|\s+?$', '')
    
    #Tokenize text and remove puncutation
    text = [word.strip(string.punctuation) for word in text.split(" ")]
    
    #Removing stopwords
    stopword = gensim.parsing.preprocessing.STOPWORDS
    review_text = [word for word in text if word not in stopword if(len(word)>3)]
    
    #Lemmatize words
    
    clean_text = []
    stemmer = SnowballStemmer("english")
    for word in review_text:
        if(len(word)>3):
            clean = stemmer.stem(WordNetLemmatizer().lemmatize(word,pos='v'))
            clean_text.append(word)
    
    #review_text = ([wnl.lemmatize(word,pos = "v") for word in text if(len(word)>3)])
    
    
   
    return review_text

In [331]:
#Testing on dummy data

In [307]:
doc_sample = 'After completion, ()\n\n upload your solutions in the worksheets repository of your GitHub profile and share that link with us. '

print("Original Document: ")
words=[]
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print("\n\nTokenized and lemmatized document: ")
print(data_processing(doc_sample))

Original Document: 
['After', 'completion,', '()\n\n', 'upload', 'your', 'solutions', 'in', 'the', 'worksheets', 'repository', 'of', 'your', 'GitHub', 'profile', 'and', 'share', 'that', 'link', 'with', 'us.', '']


Tokenized and lemmatized document: 
['completion', 'upload', 'solutions', 'worksheets', 'repository', 'github', 'profile', 'share', 'link']


In [308]:
processed_text = []

for doc in df_train.content:
    doc1 = data_processing(doc)
    doc1 = ' '.join(doc1)
    processed_text.append(data_processing(doc1))
    

In [309]:
print(processed_text[:2])

[['coronaviruses', 'large', 'family', 'viruses', 'actually', 'common', 'world', 'cause', 'respiratory', 'illness', 'people', 'animals', 'known', 'coronaviruses', 'infect', 'people', 'usually', 'cause', 'mild', 'respiratory', 'disease', 'common', 'cold', 'previously', 'identified', 'coronaviruses', 'caused', 'severe', 'illness', 'severe', 'acute', 'respiratory', 'syndrome', 'sars', 'coronavirus', 'middle', 'east', 'respiratory', 'syndrome', 'mers', 'coronavirus', 'whats', 'different', 'coronavirus', 'covid', 'coronaviruses', 'common', 'coronavirus', 'covid', 'strain', 'coronavirus', 'previously', 'identified', 'humans', 'features', 'covid', 'respiratory', 'symptoms', 'fever', 'cough', 'like', 'infections', 'understanding', 'covid', 'important', 'changes', 'rapidly', 'proactively', 'monitoring', 'virus', 'taking', 'measures', 'like', 'providing', 'guidance', 'health', 'care', 'workers', 'issuing', 'travel', 'recommendations', 'coronaviruses', 'spread', 'investigation', 'covid', 'ongoing'

So far we have preprocessed our data by cleaning, removing stopwords, removin puncuations, and by lemmatizing words

### Creating bag of words

In [310]:
#bag of words on the data set
dictionary = Dictionary(processed_text)

In [311]:
print (dictionary)

Dictionary(1526 unique tokens: ['actually', 'acute', 'affected', 'animals', 'area']...)


In [312]:
#Create the Bag-of-words model for each document i.e for each document we create a dictionary reporting how many
#words and how many times those words appear. Save this to 'bow_corpus'
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_text]

In [313]:
bow_corpus[1]

[(3, 1),
 (35, 3),
 (36, 2),
 (40, 3),
 (41, 1),
 (47, 2),
 (57, 1),
 (88, 1),
 (95, 1),
 (96, 1),
 (97, 1),
 (98, 1),
 (99, 1),
 (100, 1),
 (101, 2),
 (102, 1),
 (103, 6),
 (104, 1),
 (105, 1),
 (106, 1),
 (107, 3),
 (108, 1),
 (109, 1),
 (110, 1),
 (111, 1),
 (112, 1),
 (113, 1),
 (114, 1),
 (115, 1),
 (116, 1),
 (117, 1),
 (118, 1),
 (119, 1),
 (120, 1),
 (121, 1),
 (122, 1),
 (123, 1),
 (124, 1),
 (125, 1),
 (126, 2),
 (127, 1),
 (128, 1),
 (129, 1),
 (130, 1),
 (131, 1),
 (132, 1),
 (133, 1),
 (134, 1),
 (135, 1),
 (136, 1),
 (137, 1),
 (138, 1),
 (139, 1),
 (140, 1),
 (141, 1),
 (142, 1),
 (143, 1),
 (144, 1),
 (145, 1),
 (146, 1),
 (147, 1),
 (148, 1),
 (149, 1),
 (150, 1),
 (151, 1),
 (152, 1),
 (153, 1),
 (154, 1),
 (155, 1),
 (156, 3),
 (157, 2),
 (158, 2),
 (159, 2),
 (160, 1),
 (161, 1),
 (162, 2),
 (163, 1),
 (164, 1),
 (165, 1),
 (166, 4),
 (167, 1),
 (168, 1),
 (169, 1),
 (170, 1),
 (171, 1),
 (172, 1),
 (173, 1),
 (174, 1),
 (175, 1),
 (176, 11),
 (177, 2),
 (178, 1),
 

In [314]:
#preview 
document_num = 10
bow_doc_x = bow_corpus[document_num]

for i in range(len(bow_doc_x)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_x[i][0], 
                                                     dictionary[bow_doc_x[i][0]],
                                                     bow_doc_x[i][1]))

Word 80 ("sure") appears 1 time.
Word 97 ("achieving") appears 1 time.
Word 153 ("factors") appears 1 time.
Word 157 ("fields") appears 1 time.
Word 161 ("functions") appears 1 time.
Word 163 ("game") appears 8 time.
Word 201 ("natural") appears 1 time.
Word 345 ("place") appears 1 time.
Word 348 ("played") appears 1 time.
Word 357 ("right") appears 1 time.
Word 377 ("team") appears 5 time.
Word 482 ("number") appears 1 time.
Word 519 ("time") appears 2 time.
Word 545 ("committee") appears 1 time.
Word 549 ("cricket") appears 10 time.
Word 564 ("international") appears 2 time.
Word 580 ("players") appears 3 time.
Word 583 ("prevent") appears 1 time.
Word 598 ("teams") appears 1 time.
Word 717 ("talent") appears 2 time.
Word 756 ("especially") appears 1 time.
Word 783 ("national") appears 1 time.
Word 808 ("score") appears 3 time.
Word 809 ("scores") appears 1 time.
Word 821 ("test") appears 2 time.
Word 825 ("twice") appears 1 time.
Word 996 ("unique") appears 1 time.
Word 1005 ("ball"

### Modeling Using LDA

In [317]:
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 15, 
                                   id2word = dictionary,                                    
                                   passes = 5,
                                   workers = 2)

In [318]:
for idx, topic in lda_model.print_topics():
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.045*"mission" + 0.018*"mars" + 0.015*"chandrayaan" + 0.015*"orbiter" + 0.015*"astrosat" + 0.012*"orbit" + 0.012*"space" + 0.012*"spacecraft" + 0.009*"xray" + 0.009*"module"


Topic: 1 
Words: 0.002*"politics" + 0.002*"team" + 0.001*"batsmen" + 0.001*"aiims" + 0.001*"cricket" + 0.001*"medical" + 0.001*"data" + 0.001*"singh" + 0.001*"game" + 0.001*"political"


Topic: 2 
Words: 0.016*"country" + 0.016*"economic" + 0.016*"product" + 0.013*"domestic" + 0.013*"data" + 0.013*"gross" + 0.013*"services" + 0.011*"transmission" + 0.011*"factor" + 0.011*"numbers"


Topic: 3 
Words: 0.002*"entrance" + 0.001*"main" + 0.001*"exam" + 0.001*"examination" + 0.001*"examinations" + 0.001*"year" + 0.001*"conduct" + 0.001*"india" + 0.001*"april" + 0.001*"mission"


Topic: 4 
Words: 0.002*"brain" + 0.002*"control" + 0.002*"device" + 0.002*"data" + 0.002*"wires" + 0.002*"neuralink" + 0.002*"warming" + 0.002*"place" + 0.001*"global" + 0.001*"size"


Topic: 5 
Words: 0.002*"politics" + 0.001

Here are have successfully created a topic modeling on our train data

Let's check it on our test data

#### Testing on unknown data

In [328]:
test_data = df_test.content[16]
print(test_data)

In telecommunications, 5G is the fifth generation technology standard for cellular networks, which cellular phone companies began deploying worldwide in 2019, the planned successor to the 4G networks which provide connectivity to most current cellphones.[1] Like its predecessors, 5G networks are cellular networks, in which the service area is divided into small geographical areas called cells. All 5G wireless devices in a cell are connected to the Internet and telephone network by radio waves through a local antenna in the cell. The main advantage of the new networks is that they will have greater bandwidth, giving higher download speeds,[1] eventually up to 10 gigabits per second (Gbit/s).[2] Due to the increased bandwidth, it is expected that the new networks will not just serve cellphones like existing cellular networks, but also be used as general internet service providers for laptops and desktop computers, competing with existing ISPs such as cable internet, and also will make po

In [329]:
bow_vector = dictionary.doc2bow(data_processing(test_data))

for index, score in sorted(lda_model[bow_vector]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 10)))

Score: 0.08485377579927444	 Topic: 0.045*"mission" + 0.018*"mars" + 0.015*"chandrayaan" + 0.015*"orbiter" + 0.015*"astrosat" + 0.012*"orbit" + 0.012*"space" + 0.012*"spacecraft" + 0.009*"xray" + 0.009*"module"
Score: 0.09912040829658508	 Topic: 0.016*"country" + 0.016*"economic" + 0.016*"product" + 0.013*"domestic" + 0.013*"data" + 0.013*"gross" + 0.013*"services" + 0.011*"transmission" + 0.011*"factor" + 0.011*"numbers"
Score: 0.020709555596113205	 Topic: 0.040*"batsmen" + 0.037*"cricket" + 0.029*"game" + 0.018*"team" + 0.015*"runs" + 0.011*"score" + 0.011*"players" + 0.011*"bowler" + 0.011*"balls" + 0.008*"bowls"
Score: 0.05884911119937897	 Topic: 0.027*"covid" + 0.027*"coronaviruses" + 0.023*"respiratory" + 0.019*"coronavirus" + 0.019*"spread" + 0.019*"virus" + 0.016*"people" + 0.016*"infected" + 0.012*"community" + 0.012*"symptoms"
Score: 0.06291854381561279	 Topic: 0.029*"religious" + 0.021*"religions" + 0.021*"religion" + 0.013*"belief" + 0.009*"world" + 0.009*"people" + 0.009*"m

Here we took data related to technology, hence it is showing a score 0.3277151584625244 for a topic