In [1]:
from sklearn.externals.joblib import load
import pandas as pd
import re

In [2]:
# Restore the persisted keyword-extraction artifacts from disk.
# Judging by the file names and how they are used further down, these are
# presumably a fitted TF-IDF transformer and a fitted count vectorizer —
# TODO confirm against the notebook that trained and saved them.
# NOTE(review): joblib files are pickle-based; only load trusted artifacts.
tfidf_transformer, cv = (
    load("./models/keyword/tfidf_transformer.joblib"),
    load("./models/keyword/CountVector.joblib"),
)

In [3]:
def pre_process(text):
    """Normalise raw document text before vectorisation.

    Steps, applied in order:
      1. Lowercase the whole string.
      2. Replace every tag-like span (``<...>`` or ``</...>`` that opens and
         closes on the same line) with the placeholder " <> ".
      3. Collapse every run of digits and/or non-word characters into a
         single space. This also wipes the placeholder from step 2, since
         "<", ">" and spaces are all non-word characters.

    Args:
        text: The string to clean. Must support ``.lower()``.

    Returns:
        The cleaned string. Leading/trailing runs of removed characters are
        left behind as a single space; the result is not stripped.
    """
    lowered = text.lower()

    # Tags are swapped for a spaced placeholder so the words on either side
    # of a tag cannot be glued together.
    without_tags = re.sub("</?.*?>", " <> ", lowered)

    # Underscores count as word characters and therefore survive this step.
    return re.sub("(\\d|\\W)+", " ", without_tags)

In [5]:
# Read the test documents into a dataframe and build one cleaned text field
# per row from the title and the body.
df_test = pd.read_csv("./dataset/meta.csv")

# Join title and body with an explicit space: plain `title + content` fuses
# the last word of the title with the first word of the body into one bogus
# token. Missing values are replaced with "" first, because NaN + str yields
# NaN and pre_process() would then fail on `.lower()`.
df_test['text'] = (
    df_test['title'].fillna("").astype(str)
    + " "
    + df_test['content'].fillna("").astype(str)
).apply(pre_process)

# Plain Python lists for positional access in the cells below. Title and body
# are kept raw (unprocessed, missing values untouched) for display purposes.
docs_test = df_test['text'].tolist()
docs_title = df_test['title'].tolist()
docs_body = df_test['content'].tolist()

In [6]:
def sort_coo(coo_matrix):
    """Return (column, value) pairs of a COO-style matrix, highest value first.

    Args:
        coo_matrix: Any object exposing parallel ``col`` and ``data``
            iterables (as the result of ``.tocoo()`` does in this notebook).

    Returns:
        A list of ``(col, data)`` tuples sorted in descending order of value;
        ties on value are broken by descending column index.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first ``topn`` scored items to their feature names.

    Args:
        feature_names: Sequence indexable by the integer indices found in
            ``sorted_items``.
        sorted_items: Sliceable sequence of ``(index, score)`` pairs, expected
            to be ordered best-first already (this function does not sort).
        topn: Number of leading items to keep. Defaults to 10.

    Returns:
        A dict ``{feature_name: score}`` with each score rounded to three
        decimals, in the same order as the input pairs. If two kept items
        resolve to the same feature name, the later score overwrites the
        earlier one.
    """
    return {
        feature_names[position]: round(weight, 3)
        for position, weight in sorted_items[:topn]
    }

In [7]:
# The feature names do not depend on the document, so this only needs to be
# done once.
feature_names = cv.get_feature_names()

# Pick the document to extract keywords from.
doc = docs_test[2]

# Generate the tf-idf vector for that single document: vectorize first,
# then apply the tf-idf transformer to the result.
doc_term_vector = cv.transform([doc])
tf_idf_vector = tfidf_transformer.transform(doc_term_vector)

# Sort the tf-idf entries by descending score.
sorted_items = sort_coo(tf_idf_vector.tocoo())

# Keep only the top n entries; n is 10 here.
keywords = extract_topn_from_vector(feature_names, sorted_items, 10)

# Show the raw title and body next to the extracted keywords.
print("\n=====Title=====")
print(docs_title[2])
print("\n=====Body=====")
print(docs_body[2])
print("\n===Keywords===")
for keyword, score in keywords.items():
    print(keyword, score)


=====Title=====
Video sparks fears Hong Kong protesters being... | Taiwan News

=====Body=====

 TAIPEI (Taiwan News) - A video that surfaced on Monday (Nov. 18) appearing to show Hong Kong protesters being loaded onto a train near the border with China has sparked fears online that they are being sent to a detention center in the communist country.
As clashes continued on a university campus between riot police and pro-democracy protesters, a video was posted on Twitter spurring concerns that arrested demonstrators were being illegally extradited to China. At 2:33 p.m. on Monday, Twitter user @Woppa1Woppa posted a video showing handcuffed protesters being forced onto a train and wrote that it was unknown where they were being sent.
Below the Tweet, several netizens identified the train as belonging to the East Rail Line of the Mass Transit Railway (MTR) system. Although the detainees' destination is unknown, netizens pointed out that the last two stations are Lo Wu and Lok Ma Chau, w

  'stop_words.' % sorted(inconsistent))
