# Week 4 Assignment

This notebook needs the <b>AUTO_BIOS</b> parameter updated BEFORE it can be run using "Run All".

<b>Note</b> - Since I intend to use scikit-learn, NLTK and other available libraries for my projects, I need to learn how to run Pipelines, so I'm going to follow the examples from the 'Applied Text Analysis with Python' (ATAP) textbook.

In [None]:
# Path to the saved HTML page that holds the Class Introductions discussion.
# Update this value to point at your own local copy BEFORE running the notebook.
# The file I used was captured from the web page on February 5th 2019.

AUTO_BIOS = 'C:\\Users\\camer\\Downloads\\20190205 Topic_ Class Introductions.html'

#### Created a custom Class to parse the HTML file and tokenize the document entries

In [None]:
from bs4 import BeautifulSoup
from nltk import pos_tag, sent_tokenize, wordpunct_tokenize

class readClassIntroductions():
    """
    Custom class to read the HTML file containing the class introductions, and tokenize the entries into
    lists of lists of token/tag tuples.

    The constructor locates the <ul class="discussion-entries"> element and, for each of its direct
    children, stores the text of the first element with class "message" (after scrubbing unwanted
    characters) in self.entries.
    """

    def __init__(self, fileToRead, encoding=None):
        """
        fileToRead: path of the HTML file to parse
        encoding: text encoding handed to open(); the default (None) keeps the platform default,
                  which is the behaviour this class had before the parameter existed

        Raises ValueError when the file has no <ul class="discussion-entries"> element.
        """
        # Characters replaced by a space before an entry is stored.
        # NOTE(review): "â", "€" and "™" look like the residue of a UTF-8 punctuation mark decoded
        # with a Windows code page -- confirm before reading the file with a different encoding.
        self.char_scrub_list = ["\n","â","€","™","\xa0","/",'"',"(",")",";",":"]

        # Context manager so the file handle is closed as soon as parsing is finished
        with open(fileToRead, encoding=encoding) as html_file:
            soup = BeautifulSoup(html_file, 'html.parser')

        soup_entries = soup.find('ul', class_='discussion-entries')
        if soup_entries is None:
            raise ValueError(
                f"No <ul class='discussion-entries'> element found in {fileToRead!r}"
            )

        self.entries = []
        for child in soup_entries.children:
            # Bare text nodes (e.g. whitespace between list items) are str subclasses in
            # BeautifulSoup and cannot contain a message element, so skip them
            if isinstance(child, str):
                continue

            message = child.find(class_='message')
            if message is None:
                # Child element without a message body: nothing to collect
                continue

            self.entries.append(self.replace_list(message.text, " "))

    def replace_list(self, string, replacement=" "):
        """
        Return `string` with every item of self.char_scrub_list replaced by `replacement`.
        """
        for item in self.char_scrub_list:
            string = string.replace(item, replacement)
        return string

    def tokenize(self):
        """
        Taken and Modified from p. 49 of ATAP textbook

        Generator: for each stored entry, yields a list of sentences, where every sentence is the
        POS-tagged output of wordpunct_tokenize.
        """
        for entry in self.entries:
            yield [
                pos_tag(wordpunct_tokenize(sent))
                for sent in sent_tokenize(entry)
            ]
            
            

#### Two additional Classes are needed to construct the Sklearn Pipeline

In [None]:
import unicodedata
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn

class TextNormalizer(BaseEstimator, TransformerMixin):
    """
    Taken and Modified from pp. 72-73 of ATAP textbook

    Transformer that turns POS-tagged entries (lists of sentences, each a list of (token, tag)
    tuples) into flat lists of lowercased lemmas, dropping punctuation and stopwords.
    """
    def __init__(self, language='english'):
        """
        language: name of the NLTK stopword list to load
        """
        # Keep the constructor argument under the same attribute name: BaseEstimator's
        # get_params()/clone()/repr read every __init__ parameter back from the instance
        self.language = language
        self.stopwords = set(stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer()

    def is_punctuation(self, token):
        """True when every character of the token is in a Unicode punctuation category."""
        return all(unicodedata.category(char).startswith('P') for char in token)

    def is_stopword(self, token):
        """True when the lowercased token is in the loaded stopword set."""
        return token.lower() in self.stopwords

    def normalize(self, entry):
        """
        Flatten one entry into a list of lemmas.
        The lemmatizer is given the token as written; the result is lowercased afterwards.
        """
        return [
            self.lemmatize(token, tag).lower()
            for sentence in entry
            for (token, tag) in sentence
            if not self.is_punctuation(token) and not self.is_stopword(token)
        ]

    def lemmatize(self, token, pos_tag):
        """
        Lemmatize `token`, mapping the first letter of its tag to a WordNet part of speech.
        Tags that do not start with N, V, R or J (including an empty tag) are treated as nouns.
        """
        # Slice instead of index so an empty tag falls through to the noun default
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(pos_tag[:1], wn.NOUN)
        return self.lemmatizer.lemmatize(token, tag)

    def fit(self, entries, y=None):
        """Nothing to learn; `y` is accepted and ignored to match the scikit-learn fit signature."""
        return self

    def transform(self, entries):
        """Generator yielding the normalized token list of each entry, in order."""
        for entry in entries:
            yield self.normalize(entry)
            
            
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD

def identity(words):
    """
    Taken from p. 90 of ATAP textbook
    Pass-through function: hands back exactly the object it receives.
    Given to CountVectorizer as its tokenizer so that input which is already a list of tokens
    is left untouched.
    """
    return words


class SklearnTopicModels(object):
    """
    Taken and Modified from pp. 112, 113 and 120 of ATAP textbook

    Wraps a three-step Pipeline (TextNormalizer -> CountVectorizer -> LDA or LSA estimator)
    and exposes the top terms of each fitted topic.
    """

    def __init__(self, n_topics=5, estimator='LDA', random_state=None):
        """
        n_topics: the number of topics to identify
        estimator: this class will support LDA or LSA analysis (default to LDA)
        random_state: optional seed forwarded to the estimator so repeated runs give the same
                      topics; the default (None) leaves the estimator unseeded
        """

        self.n_topics = n_topics
        self.random_state = random_state

        if estimator == 'LSA':
            self.estimator = TruncatedSVD(
                n_components=self.n_topics,
                random_state=random_state
            )
        else:
            self.estimator = LatentDirichletAllocation(
                n_components=self.n_topics,
                learning_method='batch',
                random_state=random_state
            )

        self.model = Pipeline([
            ('normalize', TextNormalizer()),
            # The input is already tokenized, so the tokenizer is the identity function;
            # token_pattern=None avoids scikit-learn's warning that the default pattern is unused
            ('vectorizer', CountVectorizer(tokenizer=identity, preprocessor=None,
                                           lowercase=False, token_pattern=None)),
            ('model', self.estimator)
        ])

    def fit_transform(self, entries):
        """
        Fit the whole pipeline on `entries`.
        The transformed matrix is discarded; the fitted Pipeline itself is returned.
        """
        self.model.fit_transform(entries)

        return self.model

    def get_topics(self, n=10):
        """
        n is the number of top terms to show for each topic

        Returns a dict mapping the topic index to its `n` highest-weighted terms, strongest first
        (fewer when the vocabulary is smaller than `n`).
        """

        vectorizer = self.model.named_steps['vectorizer']
        model = self.model.steps[-1][1]

        # get_feature_names() no longer exists in newer scikit-learn releases;
        # fall back to it only when get_feature_names_out() is unavailable
        if hasattr(vectorizer, 'get_feature_names_out'):
            names = vectorizer.get_feature_names_out()
        else:
            names = vectorizer.get_feature_names()

        top_n = max(n, 0)
        topics = dict()

        for idx, topic in enumerate(model.components_):
            # Descending order of weight, then keep exactly the first `n` indices
            features = topic.argsort()[::-1][:top_n]
            # str() keeps the result a list of plain strings even when names is a NumPy array
            topics[idx] = [str(names[i]) for i in features]

        return topics
    
    

#### The below program logic calls the Classes above

It mimics the:

`if __name__ == '__main__':`

found in a .py file

In [None]:
from time import time

# Tokenize and POS-tag every introduction once; both models are fitted on the same input
entries = list(readClassIntroductions(AUTO_BIOS).tokenize())

lda = SklearnTopicModels(n_topics=5, estimator='LDA')
lsa = SklearnTopicModels(n_topics=5, estimator='LSA')

# Fit LDA first and then LSA, recording the wall-clock duration of each fit
fit_seconds = []
for topic_model in (lda, lsa):
    started = time()
    topic_model.fit_transform(entries)
    fit_seconds.append(time() - started)
lda_time, lsa_time = fit_seconds

# Report the LDA run
print(f"Time to fit and transform using LDA: {lda_time:06.6f} seconds\n")
lda_topics = lda.get_topics()
for topic in lda_topics:
    terms = lda_topics[topic]
    print(f"LDA Topic #{topic+1}::")
    print(terms)

print("\n\n")

# Report the LSA run
print(f"Time to fit and transform using LSA: {lsa_time:06.6f} seconds\n")
lsa_topics = lsa.get_topics()
for topic in lsa_topics:
    terms = lsa_topics[topic]
    print(f"LSA Topic #{topic+1}::")
    print(terms)
    

#### Results obtained when re-running the last code cell after initial run of the notebook

Time to fit and transform using LDA: 0.045001 seconds

LDA Topic #1::  
['new', 'work', 'use', 'hello', 'python', 'forward', 'area', 'look']  
LDA Topic #2::  
['certificate', 'work', 'course', 'take', 'everyone', 'hello', 'name', 'since']  
LDA Topic #3::  
['machine', 'learning', 'currently', 'data', 'work', 'complete', 'would', 'course']  
LDA Topic #4::  
['work', 'take', 'live', 'currently', 'hi', 'year', 'class', 'background']  
LDA Topic #5::  
['learn', 'learning', 'machine', 'data', 'deep', 'certification', 'year', 'class']  



Time to fit and transform using LSA: 0.008966 seconds

LSA Topic #1::  
['work', 'take', 'machine', 'course', 'learning', 'currently', 'data', 'everyone']  
LSA Topic #2::  
['linguistics', 'background', 'take', 'academic', 'language', 'productivity', 'empower', 'coaching']  
LSA Topic #3::  
['year', 'learn', 'business', 'enrol', 'space', 'interest', 'intelligence', 'analytics']  
LSA Topic #4::  
['sentiment', 'data', 'context', 'everyone', 'course', 'complete', 'project', 'work']  
LSA Topic #5::  
['medium', 'social', 'mention', 'carolina', 'would', 'apply', 'background', 'part']  

#### Results obtained when running this notebook with Run All command

Time to fit and transform using LDA: 1.415000 seconds

LDA Topic #1::  
['field', 'text', 'new', 'hello', 'complete', 'interest', 'intelligence', 'take']  
LDA Topic #2::  
['analytics', 'data', 'work', 'learn', 'class', 'intelligence', 'interest', 'would']  
LDA Topic #3::  
['work', 'machine', 'learning', 'learn', 'class', 'year', 'hi', 'name']  
LDA Topic #4::  
['take', 'course', 'certificate', 'machine', 'learning', 'work', 'background', 'everyone']  
LDA Topic #5::  
['data', 'course', 'certification', 'machine', 'work', 'learning', 'complete', 'hello']  



Time to fit and transform using LSA: 0.009966 seconds

LSA Topic #1::  
['work', 'take', 'machine', 'course', 'learning', 'currently', 'data', 'everyone']  
LSA Topic #2::  
['linguistics', 'background', 'take', 'language', 'productivity', 'empower', 'academic', 'coaching']  
LSA Topic #3::  
['year', 'learn', 'business', 'enrol', 'space', 'intelligence', 'interest', 'analytics']  
LSA Topic #4::  
['sentiment', 'data', 'context', 'everyone', 'course', 'complete', 'project', 'work']  
LSA Topic #5::  
['mention', 'medium', 'carolina', 'social', 'would', 'apply', 'background', 'part']  