In [6]:
import pandas as pd
from sklearn import svm
from sklearn import metrics
import sys 
import os
sys.path.append(os.path.abspath("./Code"))
from library import *

##################################
# data loading and preprocessing #
##################################

train = pd.read_csv("./Data/Classification/webkb-train-stemmed.txt", header=None, delimiter="\t")
print train.shape

test = pd.read_csv("./Data/Classification/webkb-test-stemmed.txt", header=None, delimiter="\t")
print test.shape

(2803, 2)
(1396, 2)


In [7]:
# inspect head of data frames
print "first five rows of training data:"
print train.ix[:5,:]

print "first five rows of testing data:"
print test.ix[:5,:]

first five rows of training data:
         0                                                  1
0  student  brian comput scienc depart univers wisconsin d...
1  student  denni swanson web page mail pop uki offic hour...
2  faculty  russel impagliazzo depart comput scienc engin ...
3  student  dave phd student depart comput scienc univers ...
4  project  center lifelong learn design univers colorado ...
5  faculty  steve liu associ professor depart comput scien...
first five rows of testing data:
         0                                                  1
0  student  eric homepag eric wei tsinghua physic fudan genet
1   course  comput system perform evalu model new sept ass...
2  student  home page comput scienc grad student ucsd work...
3  student  toni web page toni face thing call toni studen...
4   course  ec advanc comput architectur credit parallel a...
5  faculty  faculti member ci depart research interest par...


In [10]:
# get index of empty (nan) and less than four words documents
index_remove = [i for i in range(len(train.ix[:,1])) if (train.ix[i,1]!=train.ix[i,1]) or ((train.ix[i,1]==train.ix[i,1])and(len(train.ix[i,1].split(" "))<4))]

# remove those documents
print "removing", len(index_remove), "documents from training set"
train = train.drop(train.index[index_remove])
print train.shape

# repeat above steps for testing set
index_remove = [i for i in range(len(test.ix[:,1])) if (test.ix[i,1]!=test.ix[i,1]) or ((test.ix[i,1]==test.ix[i,1])and(len(test.ix[i,1].split(" "))<4))]
print "removing", len(index_remove), "documents from testing set"
test = test.drop(test.index[index_remove])
print test.shape

removing 29 documents from training set
(2774, 2)
removing 20 documents from testing set
(1376, 2)


In [12]:
labels = train.ix[:,0]
unique_labels = list(set(labels))

truth = test.ix[:,0]
unique_truth = list(set(truth))
print "number of observations per class:"
for label in unique_labels:
    print label, ":", len([temp for temp in labels if temp==label])

number of observations per class:
project : 335
course : 620
student : 1075
faculty : 744


In [13]:
print "storing terms from training documents as list of lists"
terms_by_doc = [document.split(" ") for document in train.ix[:,1]]
n_terms_per_doc = [len(terms) for terms in terms_by_doc]

print "storing terms from testing documents as list of lists"
terms_by_doc_test = [document.split(" ") for document in test.ix[:,1]]

print "min, max and average number of terms per document:", min(n_terms_per_doc), max(n_terms_per_doc), sum(n_terms_per_doc)/len(n_terms_per_doc)


storing terms from training documents as list of lists
storing terms from testing documents as list of lists
min, max and average number of terms per document: 4 20628 134


In [14]:
# flat list with every term occurrence of the training corpus
all_terms = [term for doc_terms in terms_by_doc for term in doc_terms]

# average number of terms per training document.
# float(...) forces true division: under Python 2, int/int truncates, which
# silently rounded the average down before it was used for length normalisation.
avg_len = float(sum(n_terms_per_doc))/len(n_terms_per_doc)

# vocabulary: each distinct training term exactly once (order is arbitrary but
# fixed for the session; it defines the feature column positions)
all_unique_terms = list(set(all_terms))

# number of training documents
n_doc = len(labels)

# IDF dictionary, one entry per vocabulary term, initialised to 0 as a placeholder
idf = dict.fromkeys(all_unique_terms, 0)

In [16]:
import math
counter = 0
for element in idf.keys():
    # number of documents in which each term appears
    df = sum([element in terms for terms in terms_by_doc])
    # idf
    idf[element] = math.log10(float(n_doc+1)/df)
    
    counter+=1
    if counter % 200 == 0:
        print counter, "terms have been processed"

200 terms have been processed
400 terms have been processed
600 terms have been processed
800 terms have been processed
1000 terms have been processed
1200 terms have been processed
1400 terms have been processed
1600 terms have been processed
1800 terms have been processed
2000 terms have been processed
2200 terms have been processed
2400 terms have been processed
2600 terms have been processed
2800 terms have been processed
3000 terms have been processed
3200 terms have been processed
3400 terms have been processed
3600 terms have been processed
3800 terms have been processed
4000 terms have been processed
4200 terms have been processed
4400 terms have been processed
4600 terms have been processed
4800 terms have been processed
5000 terms have been processed
5200 terms have been processed
5400 terms have been processed
5600 terms have been processed
5800 terms have been processed
6000 terms have been processed
6200 terms have been processed
6400 terms have been processed
6600 terms h

In [18]:
print "creating a graph-of-words for each training document \n"
all_graphs = []
window = 4
for abstract in terms_by_doc:
    all_graphs.append(terms_to_graph(abstract, window))
    
# sanity checks (should return True)
print len(terms_by_doc)==len(all_graphs)
print len(set(terms_by_doc[0]))==len(all_graphs[0].vs)

creating a graph-of-words for each training document 

True
True


In [19]:
# sanity checks (should return True)
print len(terms_by_doc)==len(all_graphs)
print len(set(terms_by_doc[0]))==len(all_graphs[0].vs)

print "computing vector representations of each training document"

b = 0.003
features_degree = []
features_w_degree = []
features_closeness = []
features_w_closeness = []
features_tfidf = []

len_all = len(all_unique_terms)

counter = 0

idf_keys = idf.keys()

for i in xrange(len(all_graphs)):
    
    graph = all_graphs[i]
    terms_in_doc = terms_by_doc[i]
    doc_len = len(terms_in_doc)
    
    # returns node (1) name, (2) degree, (3) weighted degree, (4) closeness, (5) weighted closeness
    metrics = compute_node_centrality(graph)
    
    feature_row_degree = [0]*len_all
    feature_row_w_degree = [0]*len_all
    feature_row_closeness = [0]*len_all
    feature_row_w_closeness = [0]*len_all
    feature_row_tfidf = [0]*len_all
    
    for term in list(set(terms_in_doc)):
        index = all_unique_terms.index(term)
        idf_term = idf[term]
        denominator = (1-b+(b*(float(doc_len)/avg_len)))
        metrics_term = [tuple[1:5] for tuple in metrics if tuple[0]==term][0]
        
        # store TW-IDF values
        feature_row_degree[index] = (float(metrics_term[0])/denominator) * idf_term
        feature_row_w_degree[index] = (float(metrics_term[1])/denominator) * idf_term
        feature_row_closeness[index] = (float(metrics_term[2])/denominator) * idf_term
        feature_row_w_closeness[index] = (float(metrics_term[3])/denominator) * idf_term
        
        # number of occurences of word in document
        tf = terms_in_doc.count(term)
        
        # store TF-IDF value
        feature_row_tfidf[index] = ((1+math.log1p(1+math.log1p(tf)))/(1-0.2+(0.2*(float(doc_len)/avg_len)))) * idf_term
    
    features_degree.append(feature_row_degree)
    features_w_degree.append(feature_row_w_degree)
    features_closeness.append(feature_row_closeness)
    features_w_closeness.append(feature_row_w_closeness)
    features_tfidf.append(feature_row_tfidf)

    counter += 1
    if counter % 100 == 0:
        print counter, "documents have been processed"

True
True
computing vector representations of each training document
100 documents have been processed
200 documents have been processed
300 documents have been processed
400 documents have been processed
500 documents have been processed
600 documents have been processed
700 documents have been processed
800 documents have been processed
900 documents have been processed
1000 documents have been processed
1100 documents have been processed
1200 documents have been processed
1300 documents have been processed
1400 documents have been processed
1500 documents have been processed
1600 documents have been processed
1700 documents have been processed
1800 documents have been processed
1900 documents have been processed
2000 documents have been processed
2100 documents have been processed
2200 documents have been processed
2300 documents have been processed
2400 documents have been processed
2500 documents have been processed
2600 documents have been processed
2700 documents have been proce