<a href="https://colab.research.google.com/github/eriksali/Text-Analytics_LDA/blob/master/lda_skearn_k_10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# for text preprocessing
import re
import spacy

from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string

# import vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# import numpy for matrix operation
import numpy as np

# import LDA from sklearn
from sklearn.decomposition import LatentDirichletAllocation


In [2]:
# to suppress warnings
from warnings import filterwarnings
filterwarnings('ignore')

In [3]:
nlp = spacy.load('en_core_web_sm')

In [4]:
D1 =  'prepar execut eval client eval client client list client proxi fail client success client arr array arr serial ndarray byte arr evalu evalu paramet tensor arr serial tensor type client client result failur evalu client client len result len failur result loss result num exampl'
D2 =  'img draw creat imag origin img creat imag output img origin img copi input1 output img draw run input1 origin img shape output img shape rais error equal origin img output img equal output img array equal output img array'
D3 =  'cli logger init src custom node init default parent dir cwd case log captur result cli runner invok cli result exit code captur record get messag cwd parent dir exist parent dir exist cwd exist open cwd infil case dict equal yaml safe load infil'
D4 =  'record collect record copi deepcopi record record copi get record record record sum copi record len record sum len record'
D5 =  'rmse correct pred torch tensor repeat targ torch tensor repeat loss equal pred targ' 

In [5]:
# combining all the documents into a list:

corpus = [D1, D2, D3, D4, D5]

In [6]:
# the complete corpus as below:

corpus

['prepar execut eval client eval client client list client proxi fail client success client arr array arr serial ndarray byte arr evalu evalu paramet tensor arr serial tensor type client client result failur evalu client client len result len failur result loss result num exampl',
 'img draw creat imag origin img creat imag output img origin img copi input1 output img draw run input1 origin img shape output img shape rais error equal origin img output img equal output img array equal output img array',
 'cli logger init src custom node init default parent dir cwd case log captur result cli runner invok cli result exit code captur record get messag cwd parent dir exist parent dir exist cwd exist open cwd infil case dict equal yaml safe load infil',
 'record collect record copi deepcopi record record copi get record record record sum copi record len record sum len record',
 'rmse correct pred torch tensor repeat targ torch tensor repeat loss equal pred targ']

In [8]:
# Apply Preprocessing on the Corpus
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
# stop loss words 
stop = set(stopwords.words('english'))

# punctuation 
exclude = set(string.punctuation) 

# lemmatization
lemma = WordNetLemmatizer() 

# One function for all the steps:
def clean(doc):
    
    # convert text into lower case + split into words
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    
    # remove any stop words present
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)  
    
    # remove punctuations + normalize the text
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())  
    return normalized

# clean data stored in a new list
clean_corpus = [clean(doc).split() for doc in corpus]   

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [9]:
clean_corpus

[['prepar',
  'execut',
  'eval',
  'client',
  'eval',
  'client',
  'client',
  'list',
  'client',
  'proxi',
  'fail',
  'client',
  'success',
  'client',
  'arr',
  'array',
  'arr',
  'serial',
  'ndarray',
  'byte',
  'arr',
  'evalu',
  'evalu',
  'paramet',
  'tensor',
  'arr',
  'serial',
  'tensor',
  'type',
  'client',
  'client',
  'result',
  'failur',
  'evalu',
  'client',
  'client',
  'len',
  'result',
  'len',
  'failur',
  'result',
  'loss',
  'result',
  'num',
  'exampl'],
 ['img',
  'draw',
  'creat',
  'imag',
  'origin',
  'img',
  'creat',
  'imag',
  'output',
  'img',
  'origin',
  'img',
  'copi',
  'input1',
  'output',
  'img',
  'draw',
  'run',
  'input1',
  'origin',
  'img',
  'shape',
  'output',
  'img',
  'shape',
  'rais',
  'error',
  'equal',
  'origin',
  'img',
  'output',
  'img',
  'equal',
  'output',
  'img',
  'array',
  'equal',
  'output',
  'img',
  'array'],
 ['cli',
  'logger',
  'init',
  'src',
  'custom',
  'node',
  'init',
 

In [10]:
# Converting text into numerical representation
tf_idf_vectorizer = TfidfVectorizer(tokenizer=lambda doc: doc, lowercase=False)

# Converting text into numerical representation
cv_vectorizer = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)
# Array from TF-IDF Vectorizer 
tf_idf_arr = tf_idf_vectorizer.fit_transform(clean_corpus)

# Array from Count Vectorizer 
cv_arr = cv_vectorizer.fit_transform(clean_corpus)
# this is our converted text to numerical representation from the Tf-IDF vectorizer

tf_idf_arr

<5x72 sparse matrix of type '<class 'numpy.float64'>'
	with 82 stored elements in Compressed Sparse Row format>

In [11]:
# this is our converted text to numerical representation from the Count vectorizer
cv_arr

<5x72 sparse matrix of type '<class 'numpy.int64'>'
	with 82 stored elements in Compressed Sparse Row format>

In [12]:
# Creating vocabulary array which will represent all the corpus 
vocab_tf_idf = tf_idf_vectorizer.get_feature_names()

# get the vocb list
vocab_tf_idf

['arr',
 'array',
 'byte',
 'captur',
 'case',
 'cli',
 'client',
 'code',
 'collect',
 'copi',
 'correct',
 'creat',
 'custom',
 'cwd',
 'deepcopi',
 'default',
 'dict',
 'dir',
 'draw',
 'equal',
 'error',
 'eval',
 'evalu',
 'exampl',
 'execut',
 'exist',
 'exit',
 'fail',
 'failur',
 'get',
 'imag',
 'img',
 'infil',
 'init',
 'input1',
 'invok',
 'len',
 'list',
 'load',
 'log',
 'logger',
 'loss',
 'messag',
 'ndarray',
 'node',
 'num',
 'open',
 'origin',
 'output',
 'paramet',
 'parent',
 'pred',
 'prepar',
 'proxi',
 'rais',
 'record',
 'repeat',
 'result',
 'rmse',
 'run',
 'runner',
 'safe',
 'serial',
 'shape',
 'src',
 'success',
 'sum',
 'targ',
 'tensor',
 'torch',
 'type',
 'yaml']

In [13]:
# Creating vocabulary array which will represent all the corpus 
vocab_cv = cv_vectorizer.get_feature_names()

# get the vocb list
vocab_cv

['arr',
 'array',
 'byte',
 'captur',
 'case',
 'cli',
 'client',
 'code',
 'collect',
 'copi',
 'correct',
 'creat',
 'custom',
 'cwd',
 'deepcopi',
 'default',
 'dict',
 'dir',
 'draw',
 'equal',
 'error',
 'eval',
 'evalu',
 'exampl',
 'execut',
 'exist',
 'exit',
 'fail',
 'failur',
 'get',
 'imag',
 'img',
 'infil',
 'init',
 'input1',
 'invok',
 'len',
 'list',
 'load',
 'log',
 'logger',
 'loss',
 'messag',
 'ndarray',
 'node',
 'num',
 'open',
 'origin',
 'output',
 'paramet',
 'parent',
 'pred',
 'prepar',
 'proxi',
 'rais',
 'record',
 'repeat',
 'result',
 'rmse',
 'run',
 'runner',
 'safe',
 'serial',
 'shape',
 'src',
 'success',
 'sum',
 'targ',
 'tensor',
 'torch',
 'type',
 'yaml']

In [14]:
display(len(vocab_tf_idf))
display(len(vocab_cv))

72

72

In [21]:
 # Implementation of LDA:
    
# Create object for the LDA class 
# Inside this class LDA: define the components:
lda_model = LatentDirichletAllocation(n_components = 10, max_iter = 20, random_state = 20)

# fit transform on model on our count_vectorizer : running this will return our topics 
X_topics = lda_model.fit_transform(tf_idf_arr)

# .components_ gives us our topic distribution 
topic_words = lda_model.components_

In [22]:
#  Define the number of Words that we want to print in every topic : n_top_words
n_top_words = 10

for i, topic_dist in enumerate(topic_words):
    
    # np.argsort to sorting an array or a list or the matrix acc to their values
    sorted_topic_dist = np.argsort(topic_dist)
    
    # Next, to view the actual words present in those indexes we can make the use of the vocab created earlier
    topic_words = np.array(vocab_tf_idf)[sorted_topic_dist]
    
    # so using the sorted_topic_indexes we ar extracting the words from the vocabulary
    # obtaining topics + words
    # this topic_words variable contains the Topics  as well as the respective words present in those Topics
    topic_words = topic_words[:-n_top_words:-1]
    print ("Topic", str(i+1), topic_words)

Topic 1 ['record' 'copi' 'sum' 'len' 'collect' 'deepcopi' 'get' 'array' 'equal']
Topic 2 ['cwd' 'exist' 'cli' 'dir' 'parent' 'infil' 'init' 'case' 'captur']
Topic 3 ['copi' 'array' 'get' 'equal' 'loss' 'rais' 'error' 'run' 'list']
Topic 4 ['copi' 'array' 'get' 'equal' 'loss' 'rais' 'error' 'run' 'list']
Topic 5 ['img' 'output' 'origin' 'equal' 'shape' 'draw' 'creat' 'imag' 'input1']
Topic 6 ['client' 'arr' 'result' 'evalu' 'failur' 'serial' 'eval' 'len' 'tensor']
Topic 7 ['copi' 'array' 'get' 'equal' 'loss' 'rais' 'error' 'run' 'list']
Topic 8 ['torch' 'targ' 'pred' 'repeat' 'tensor' 'correct' 'rmse' 'loss' 'equal']
Topic 9 ['copi' 'array' 'get' 'equal' 'loss' 'rais' 'error' 'run' 'list']
Topic 10 ['copi' 'array' 'get' 'equal' 'loss' 'rais' 'error' 'run' 'list']


In [23]:
# To view what topics are assigned to the douments:

doc_topic = lda_model.transform(tf_idf_arr)  

# iterating over ever value till the end value
for n in range(doc_topic.shape[0]):
    
    # argmax() gives maximum index value
    topic_doc = doc_topic[n].argmax()
    
    # document is n+1  
    print ("Document", n+1, " -- Topic:" ,topic_doc)

Document 1  -- Topic: 5
Document 2  -- Topic: 4
Document 3  -- Topic: 1
Document 4  -- Topic: 0
Document 5  -- Topic: 7
