In [1]:
from sklearn.externals.joblib import load
import pandas as pd
import re

In [2]:
tfidf_transformer = load("./models/keyword/tfidf_transformer.joblib")
cv = load("./models/keyword/CountVector.joblib")

In [3]:
def pre_process(text):
    
    # lowercase
    text=text.lower()
    
    #remove tags
    text=re.sub("</?.*?>"," <> ",text)
    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    return text

In [4]:
# read test docs into a dataframe and concatenate title and body
df_test=pd.read_csv("./dataset/meta.csv")

df_test['text'] = df_test['title'] + df_test['content']
df_test['text'] =df_test['text'].apply(lambda x:pre_process(x))

# get test docs into a list
docs_test=df_test['text'].tolist()
docs_title=df_test['title'].tolist()
docs_body=df_test['content'].tolist()

In [5]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []

    for idx, score in sorted_items:
        fname = feature_names[idx]
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results

In [6]:
# you only needs to do this once
feature_names=cv.get_feature_names()

# get the document that we want to extract keywords from
doc=docs_test[16]

#generate tf-idf for the given document
tf_idf_vector=tfidf_transformer.transform(cv.transform([doc]))

#sort the tf-idf vectors by descending order of scores
sorted_items=sort_coo(tf_idf_vector.tocoo())

#extract only the top n; n here is 10
keywords=extract_topn_from_vector(feature_names,sorted_items,10)

# now print the results
print("\n=====Title=====")
print(docs_title[16])
print("\n=====Body=====")
print(docs_body[16])
print("\n===Keywords===")
for k in keywords:
    print(k,keywords[k])


=====Title=====
Seattle police captain arrested in undercover vice operation

=====Body=====

A 31-year veteran Seattle Police Department captain was arrested by his fellow officers during a vice sting late Wednesday, in an investigation of sexual exploitation. 
According to several sources, 53-year-old veteran Seattle police captain Randal Woolery offered $40 to an undercover officer who was posing as a prostitute, and he was arrested on the spot. 
Woolery was one of five men arrested in the Aurora sting, and according to jail records, he was booked and released in less than 30 minutes. 
Woolery was also elected as a Fire Commissioner in Snohomish County Fire District 7, which manages eight fire departments. A spokesperson with the District directed questions to Seattle police. 
More news from KIRO 7 DOWNLOAD OUR FREE NEWS APP    



===Keywords===
police 0.62
fire 0.326
district 0.305
vice 0.244
news 0.235
year 0.191
according 0.183
county 0.149
departments 0.147
late 0.146


  'stop_words.' % sorted(inconsistent))


In [21]:
from pprint import pprint as pp
pp(docs_test)

['video sparks fears hong kong protesters being taiwan newsvideo of hong kong '
 'protesters being loaded onto train spurs concerns they are being sent to '
 'china a video surfacing on nov appears to show hong kong protesters being '
 'loaded onto a train near the border with china sparking fears online that '
 'they are being sent to a detention center in the communist country ',
 'woppa on twitterarrested protesters are getting transported out on a train '
 'unknown at this time where they will be sent residents and press are heard '
 'asking for their names via on 谷 standwithhongkong hongkongprotests https t '
 'co n i mmxu r',
 ' intolerably offensive boy s nazi costume at elementary school halloween '
 'parade sparks outragethe student wore a dark brown dress shirt tucked into '
 'tan pants with a black tie and matching dress shoes had that been the extent '
 'of the boy s outfit he probably wouldn t have drawn much attention as he '
 'walked alongside his costumed peers during a

In [7]:
'seattl polic captain arrest undercov vice oper'.split(" ")

['seattl', 'polic', 'captain', 'arrest', 'undercov', 'vice', 'oper']