In [1]:
import spacy
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
# Load the English pipeline by its package name: the 'en' shortcut link
# was removed in spaCy v3, while the full package name works in v2 and v3.
nlp = spacy.load('en_core_web_sm')
# Transcript to analyse, relative to the notebook's working directory.
DOC_PATH = './data/2011_Q2.txt'

In [3]:
def load_sentense(path, nlp_model=None):
    """
        load the paragraphs and tokenize them into sentences

        path: text file to read, one paragraph per line (read as UTF-8)
        nlp_model: callable that turns a line into an object exposing
            `.sents`; defaults to the notebook-level `nlp` pipeline
        returns: list of stripped, non-empty sentence strings
    """
    # Resolve the pipeline lazily so the global is only needed when no
    # explicit model is supplied.
    pipeline = nlp if nlp_model is None else nlp_model

    print("Loading the text...")
    res = []
    # Explicit encoding keeps the result independent of the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Blank lines carry no sentences; skip the pipeline call.
                continue
            for sentence in pipeline(line).sents:
                # `.text` replaces the deprecated `Span.string` attribute.
                cleaned = sentence.text.strip()
                if cleaned:
                    res.append(cleaned)
    print("Text loading successfully...")
    return res

In [4]:
# Read the document and split it into a flat list of sentence strings.
text = load_sentense(path=DOC_PATH)

Loading the text...
Text loading successfully...


In [5]:
def tfidf_processing(sentenses):
    """
        build TF-IDF features for the given sentences

        Sentences with five or fewer whitespace-separated tokens are
        dropped before fitting. Returns the vectorizer together with the
        result of its fit_transform call on the kept sentences.
    """
    # Keep only sentences long enough to carry some meaning.
    kept = []
    for candidate in sentenses:
        if len(candidate.split()) > 5:
            kept.append(candidate)

    print("TFIDF calculation...")
    vectorizer_options = dict(
        max_features=200,
        min_df=2,
        max_df=.5,
        stop_words='english',
        ngram_range=(1, 3),
    )
    vectorizer = TfidfVectorizer(**vectorizer_options)
    matrix = vectorizer.fit_transform(kept)
    print("TFIDF is done...")

    return vectorizer, matrix

In [6]:
# Vectorize the loaded sentences; keep both the vectorizer and the matrix.
tf_vectorizer, tfidf_sent = tfidf_processing(sentenses=text)

TFIDF calculation...


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


TFIDF is done...


In [7]:
def lda_model_training(tfidf, topic_num=5, max_iter=5, random_state=7):
    """
        train the LDA model

        tfidf: document-term matrix to fit the model on
        topic_num: number of topics to extract (default 5)
        max_iter: maximum number of training iterations (default 5)
        random_state: seed that makes the fit reproducible (default 7)
        returns: the fitted LatentDirichletAllocation model
    """
    print("LDA training...")
    # `n_components` replaces the `n_topics` argument, which scikit-learn
    # deprecated in 0.19 and later removed.
    lda = LatentDirichletAllocation(
        n_components=topic_num,
        max_iter=max_iter,
        random_state=random_state,
    ).fit(tfidf)
    print("LDA has been trained...")
    return lda

In [8]:
lda = lda_model_training(tfidf_sent)
# get_feature_names() was removed in scikit-learn 1.2; use the replacement
# when it is available and fall back for older releases.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

LDA training...




LDA has been trained...


In [9]:
def print_top_words(model, feature_names, n_top_words):
    """
        print each topic followed by its highest-weighted words

        model: object whose `components_` holds one weight row per topic
        feature_names: sequence mapping a column index to its term
        n_top_words: how many terms to list for every topic
    """
    for topic_no, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading indices.
        best_first = weights.argsort()[::-1][:n_top_words]
        words = [feature_names[col] for col in best_first]
        print("Topic #%d:" % topic_no)
        print(" ".join(words), '\n')

In [10]:
# Number of top-weighted terms to print for each topic.
N_WORDS = 20
print_top_words(model=lda, feature_names=tf_feature_names, n_top_words=N_WORDS)

Topic #0:
mean paper say right good really better little ph market pipeline like kind probably yields said indiscernible big note guidance 

Topic #1:
year earnings rate ve lending quarter core book kind continue roe core earnings short years fixed second environment hard second quarter rates 

Topic #2:
going loans forward business going forward portfolio look did coming special lnr gaap opportunities single single family family income financial team servicing 

Topic #3:
think don ll like ve want maybe just market value question guys term long rates doing shareholders don want 25 talk 

Topic #4:
quarter company price million 100 servicer additional basis lnr purchase special largest purchase price financial scale acquisition actually basis points value points 

