In [1]:
import os
import re
import nltk
import string
import operator
from sklearn.feature_extraction.text import TfidfVectorizer
import sys 
sys.path.append(os.path.abspath("./Code"))
from library import *

nltk.download('punkt') # for tokenization
nltk.download('maxent_treebank_pos_tagger') # for POS tagging
nltk.download('stopwords')

stpwds = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.PorterStemmer()
punct = string.punctuation.replace("-", "")

[nltk_data] Downloading package punkt to /home/zapfack/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_treebank_pos_tagger to
[nltk_data]     /home/zapfack/nltk_data...
[nltk_data]   Package maxent_treebank_pos_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/zapfack/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# nltk.download()

In [2]:
##################################
# read and pre-process abstracts #
##################################
print "reading and pre-processing abstracts\n"

root_dir = os.getcwd()
path = os.getcwd() + "/Data/Hulth2003testing/abstracts"
# path = "./Data/Hulth2003testing/abstracts"
os.chdir(path)

# os.listdir(path)
abstracts = []
counter = 0

for filename in sorted(os.listdir(path)):
   f = open(filename, 'r')
   content = f.read()
   # remove formatting
   content =  re.sub("\s+", " ", content)
   # convert to lower case
   content = content.lower()
   # remove punctuation (preserving intra-word dashes)
   content = "".join(letter for letter in content if letter not in punct)
   # remove dashes attached to words but that are not intra-word
   content = re.sub("[^[:alnum:]['-]", " ", content)
   content = re.sub("[^[:alnum:][-']", " ", content)
   # remove extra white space
   content = re.sub(" +"," ", content)
   # remove leading and trailing white space
   content = content.strip()
   # tokenize
   tokens = content.split(" ")
   # remove stopwords
   tokens_keep = [token for token in tokens if token not in stpwds]
   # POS-tag 
   tagged_tokens = nltk.pos_tag(tokens_keep)
   # keep only nouns and adjectives    
   tokens_keep = [pair[0] for pair in tagged_tokens if (pair[1] in ["NN","NNS","NNP","NNPS","JJ","JJS","JJR"])]
   # apply Porter stemmer
   tokens_keep = [stemmer.stem(token) for token in tokens_keep]
   # store list of tokens
   abstracts.append(tokens_keep)
   
   counter += 1
   if counter % 100 == 0:
 		print counter, 'abstracts have been processed'

reading and pre-processing abstracts

100 abstracts have been processed
200 abstracts have been processed
300 abstracts have been processed
400 abstracts have been processed
500 abstracts have been processed


In [3]:
os.chdir(root_dir)

print "reading and pre-processing humman assigned keywords\n"   
   
path = os.getcwd() + "/Data/Hulth2003testing/uncontr"
os.chdir(path)

golden_keywords = []

for filename in sorted(os.listdir(path)):
   f = open(filename, 'r')
   content = f.read()
   # remove formatting
   content =  re.sub("\s+", " ", content)
   # convert to lower case
   content = content.lower()
   # turn string into list of keywords, preserving intra-word dashes 
   # but breaking n-grams into unigrams to easily compute precision and recall
   content = content.split(";")
   content = [keyword.split(" ") for keyword in content]
   # flatten list
   content = [keyword for sublist in content for keyword in sublist]
   # remove empty elements
   content = [keyword for keyword in content if len(keyword)>0]
   # remove stopwords (rare but can happen due to n-gram breaking)
   content = [keyword for keyword in content if keyword not in stpwds]
   # apply Porter's stemmer
   content = [stemmer.stem(keyword) for keyword in content]
   # remove leading and trailing whitespace
   content = [keyword.strip() for keyword in content]
   # remove duplicates (can happen due to n-gram breaking)
   content = list(set(content))
   # save final list
   golden_keywords.append(content)

reading and pre-processing humman assigned keywords



In [4]:
print "creating a gow for each abstract \n"
all_graphs = []
window = 3
for abstract in abstracts:
    all_graphs.append(terms_to_graph(abstract, window))



creating a gow for each abstract 

extracting main cores from each graph 



In [9]:
print "extracting main cores from each graph \n"
mcs_weighted = []
mcs_unweighted = []
counter = 0
for graph in all_graphs:
    mcs_weighted.append(core_dec(graph, weighted = True)["main_core"])
    mcs_unweighted.append(core_dec(graph, weighted = False)["main_core"])

extracting main cores from each graph 



In [15]:
# mcs_weighted[0].vs["name"]
# mcs_unweighted[0].vs["name"]
# len(mcs_weighted[0])
print "average size of weighted main cores:", sum([len(main_core.vs["name"]) 
                                                   for main_core in mcs_weighted])/len([len(main_core.vs["name"]) for main_core in mcs_weighted])
print "average size of unweighted main cores:",sum([len(main_core.vs["name"]) 
                                                    for main_core in mcs_unweighted])/len([len(main_core.vs["name"]) for main_core in mcs_unweighted])


average size of weighted main cores: 16
average size of unweighted main cores: 26


In [16]:
# TF_IDF
print "baseline tf-idf \n"

corpus = [str(" ".join(abstract)) for abstract in abstracts]

# (optional) find out the number of unique words in abstracts
temp = [list(set(abstract)) for abstract in abstracts]
temptemp = [item for sublist in temp for item in sublist]
print len(set(temptemp))

vectorizer = TfidfVectorizer()

# transforms the input data into bow feature vectors with tf-idf weights
features = vectorizer.fit_transform(corpus)

colnames = vectorizer.get_feature_names()

features_dense_list = features.todense().tolist()

tf_idf = []

for element in features_dense_list:
    
    # bow feature vector as list of tuples
    row = zip(colnames,element)
    
    # keep only non zero values
    # (the final length of the list should match exactly the number of unique terms in the document)
    nonzero = [tuple for tuple in row if tuple[1]!=0]
    
    # rank in decreasing order
    nonzero = sorted(nonzero, key=operator.itemgetter(1), reverse=True)
    
    # retain top 33% words as keywords
    numb_to_retain = int(round(len(nonzero)/3))
    
    keywords = [tuple[0] for tuple in nonzero[:numb_to_retain]]
    
    tf_idf.append(keywords)

baseline tf-idf 

4286


In [20]:
import numpy as np
np.array(features_dense_list).shape


(500, 4192)

In [26]:
# PageRank
print "baseline PageRank"

PageRank = []
counter = 0

for graph in all_graphs:
    
    pr_scores = graph.pagerank()
    pr_scores = zip(graph.vs["name"],pr_scores)
    
    # rank in decreasing order
    pr_scores = sorted(pr_scores, key=operator.itemgetter(1), reverse=True)
    
    # retain top 33% words as keywords
    numb_to_retain = int(round(len(pr_scores)/3))
    
    keywords = [tuple[0] for tuple in pr_scores[:numb_to_retain]]
    
    PageRank.append(keywords)    
    
    counter+=1
    if counter % 100 == 0:
        print counter, "graphs have been processed"

baseline PageRank
100 graphs have been processed
200 graphs have been processed
300 graphs have been processed
400 graphs have been processed
500 graphs have been processed


In [45]:
accuracy_mcs_weighted = []
accuracy_mcs_unweighted = []
accuracy_PageRank = []
accuracy_tf_idf = []

for i in range(len(golden_keywords)):
    truth = golden_keywords[i]
    
    accuracy_mcs_weighted.append(accuracy_metrics(candidate = mcs_weighted[i].vs["name"], truth = truth))
    accuracy_mcs_unweighted.append(accuracy_metrics(candidate = mcs_unweighted[i].vs["name"], truth = truth))
    accuracy_PageRank.append(accuracy_metrics(candidate = PageRank[i], truth = truth))
    accuracy_tf_idf.append(accuracy_metrics(candidate = tf_idf[i], truth = truth))

# macro-averaged results (collection level)
print "macro-averaged precision:", sum([tuple[0] for tuple in accuracy_mcs_weighted])/len(golden_keywords)
print "macro-averaged recall:", sum([tuple[1] for tuple in accuracy_mcs_weighted])/len(golden_keywords)
print "macro-averaged F-1 score:", sum([tuple[2] for tuple in accuracy_mcs_weighted])/len(golden_keywords)


macro-averaged precision: 0.60335486
macro-averaged recall: 0.43577744
macro-averaged F-1 score: 0.40802114


In [47]:
print sum([tuple[0] for tuple in accuracy_mcs_unweighted])/len(golden_keywords)
print sum([tuple[1] for tuple in accuracy_mcs_unweighted])/len(golden_keywords)
print sum([tuple[2] for tuple in accuracy_mcs_unweighted])/len(golden_keywords)



0.44342218
0.60200324
0.47536026


In [49]:
print sum([tuple[0] for tuple in accuracy_PageRank])/len(golden_keywords)
print sum([tuple[1] for tuple in accuracy_PageRank])/len(golden_keywords)
print sum([tuple[2] for tuple in accuracy_PageRank])/len(golden_keywords)



0.54944962
0.36183812
0.41912078


In [50]:
print sum([tuple[0] for tuple in accuracy_tf_idf])/len(golden_keywords)
print sum([tuple[1] for tuple in accuracy_tf_idf])/len(golden_keywords)
print sum([tuple[2] for tuple in accuracy_tf_idf])/len(golden_keywords)

0.58528554
0.38595718
0.44686596
