In [2]:
%load_ext autoreload
%autoreload 2
import itertools
import os
import numpy as np
from collections import defaultdict, Counter
import operator
import pandas as pd
import json
import math
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
import copy

http://scikit-learn.org/stable/modules/multiclass.html
http://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html
https://stats.stackexchange.com/questions/260754/better-performace-using-random-forest-one-vs-all-than-random-forest-multiclass
https://github.com/scikit-learn/scikit-learn/issues/9602
http://scikit-learn.org/stable/modules/model_evaluation.html

http://scikit-learn.org/stable/supervised_learning.html

# Define features

    1. term idf
    2. unigram frequency - query in OR
    3. bigram frequency - the query in AND
    4. query length
    5. number of stopwords
    -6. query type: faceted, ambiguous-
    7. is anchor ???
    8. proximity information from Matteo's file - proximity models (e.g., [Peng et al. 2007])
        that act on pairs of query terms
    9. link analysis approaches that consider the linkage patterns within the top-ranked
        documents (e.g., SALSA [Lempel and Moran 2001])
    10. features from Wikipedia title articles
    11. google n-grams
    12. query frequency in the query log (for the whole query and also for its individual terms)


In [3]:
def parseLine(line):
    """Parse one tab-separated match record.

    Newline characters are removed and the record is split on tabs. Fields 0,
    4 and 6 are converted with ``int`` and fields 2 and 7 are decoded as JSON.
    Only three of the parsed values are returned.

    Parameters
    ----------
    line : str
        A single record with at least eight tab-separated fields.

    Returns
    -------
    tuple
        ``(df_list, tf_list, doclen)``: field 2 decoded from JSON, field 7
        decoded from JSON and wrapped in a NumPy array, and field 6 as ``int``.
    """
    fields = line.replace("\n", "").split("\t")
    # Fields 0 and 4 are not returned, but they are still converted (in the
    # same position as before) so that a malformed record keeps raising
    # ValueError at the same point.
    _qid = int(fields[0])
    document_frequencies = json.loads(fields[2])
    _docid = int(fields[4])
    document_length = int(fields[6])
    term_frequencies = np.array(json.loads(fields[7]))
    return document_frequencies, term_frequencies, document_length

In [4]:
def getTermFeatures(qid_str, directory="/home/muntean/terrier-passage/tfs-per-qid/all-matches-top-10000/"):
    """Aggregate the per-line term statistics stored for one query.

    Reads ``all-matches-fields-tfs-qid-<qid_str>-filtered.txt`` from
    ``directory``, parses every line with ``parseLine`` and averages the tf
    arrays and the document lengths over all lines of the file.

    Parameters
    ----------
    qid_str : str
        Query id as a string; it is spliced into the file name.
    directory : str, optional
        Folder that holds the per-query match files. The default is the
        location that used to be hard-coded here, so existing calls behave
        as before.

    Returns
    -------
    tuple
        ``(df_list, tf_avg, avg_doclen)``: the ``df_list`` parsed from the
        last line of the file, the per-line average of the tf arrays summed
        over axis 0, and the average document length.

    Raises
    ------
    ValueError
        If the file contains no lines at all.
    """
    filename = os.path.join(directory, "all-matches-fields-tfs-qid-" + qid_str + "-filtered.txt")
    with open(filename, "r") as inputFile:
        first_line = inputFile.readline()
        if not first_line:
            # Fail with a readable message instead of an int('') conversion error.
            raise ValueError("no match records found in " + filename)

        # The first record seeds the running sums.
        df_list, sum_tf_list, sum_doclen = parseLine(first_line)
        countLines = 1
        for line in inputFile:
            # NOTE(review): df_list is overwritten on every line, so the value
            # returned is the one from the last line; this assumes all lines of
            # a query file carry the same df_list - TODO confirm.
            df_list, tf_list, doclen = parseLine(line)
            sum_tf_list = sum_tf_list + tf_list
            sum_doclen = sum_doclen + doclen
            countLines += 1

    avg_tf_list = sum_tf_list / countLines
    avg_doclen = sum_doclen / countLines

    return df_list, np.sum(avg_tf_list, axis=0), avg_doclen

In [5]:
# Sanity check: display the aggregated term features of the query with id "1".
getTermFeatures(qid_str="1")

([1036360, 1688512, 9909812],
 array([  1.8164,   1.5978,   5.1386,   4.2499,   7.1304,  32.1869]),
 1730.221)

In [6]:
def idf(N, df):
    """Return the inverse document frequency ``log((N - df + 0.5) / (df + 0.5))``.

    Parameters
    ----------
    N : number
        Collection-size term of the formula.
    df : number
        Document frequency of the term.

    Returns
    -------
    float
        Natural logarithm of the smoothed ratio; it is negative when ``df``
        is larger than ``N / 2``.
    """
    without_term = N - df + 0.5
    with_term = df + 0.5
    return math.log(without_term / with_term)

# Get the features

In [7]:
"""
We have 2 query types: faceted, ambiguous
"""
# Passed as the first (collection-size) argument of idf() when the IDF
# features are computed; presumably the number of documents in the
# collection - TODO confirm.
N = 50220423
# Not referenced in any of the visible cells; presumably the average document
# length of the collection - TODO confirm.
AVG_LEN = 963.90334

def getQueryFeatures(info):
    """Build the feature mapping for a single query.

    Parameters
    ----------
    info : sequence
        ``info[0]`` is the query id (converted with ``str`` to locate the
        term-statistics file), ``info[1]`` the query type (``"faceted"``;
        any other value is treated as ambiguous) and ``info[2]`` the query
        text, which is tokenised on single spaces.

    Returns
    -------
    collections.defaultdict
        Feature name -> value. Insertion order is: query-type flags, query
        length, stopword count, DF statistics, IDF statistics, one ``avgTF<i>``
        per averaged tf value, ``avgDoclen``, then one ``deltaTF<i>`` per pair
        of consecutive averaged tf values.
    """
    features = defaultdict(float)

    # Query type, encoded as two complementary indicator flags.
    isFaceted = info[1] == "faceted"
    features["faceted"] = 1 if isFaceted else 0
    features["ambiguous"] = 0 if isFaceted else 1

    # Query length and number of stopwords, both counted over the same tokens.
    tokens = info[2].split(" ")
    features["qlen"] = len(tokens)
    features["num_stopwords"] = sum(1 for token in tokens if token in ENGLISH_STOP_WORDS)

    # Statistics read from the per-query match file.
    df_list, tf_list_avg, doclen_avg = getTermFeatures(str(info[0]))

    features["maxDF"] = max(df_list)
    features["minDF"] = min(df_list)
    features["avgDF"] = np.mean(df_list)

    # IDF values are computed once and summarised three ways.
    idf_values = [idf(N, df) for df in df_list]
    features["maxIDF"] = max(idf_values)
    features["minIDF"] = min(idf_values)
    features["avgIDF"] = np.mean(idf_values)

    for position, tf_avg in enumerate(tf_list_avg, start=1):
        features["avgTF" + str(position)] = tf_avg
    features["avgDoclen"] = doclen_avg

    # deltaTF<i> is the difference between the (i+1)-th and the i-th avgTF value.
    for position in range(1, len(tf_list_avg)):
        features["deltaTF" + str(position)] = tf_list_avg[position] - tf_list_avg[position - 1]

    return features

# Classification Setup

In [8]:
# Restore variables persisted with IPython's %store magic. None of them is
# created in the visible cells, so they must have been stored beforehand
# (presumably by another notebook - TODO confirm which one).
%store -r resultListAltered
%store -r resultDict
%store -r qidInfo

In [9]:
def reverseMappingClusterQid(resultDict):
    """Invert a ``key -> iterable of values`` mapping into ``value -> key``.

    Parameters
    ----------
    resultDict : dict
        Maps each key to an iterable of values.

    Returns
    -------
    dict
        Maps every value to the key it was listed under. When a value is
        listed under several keys, the key iterated last wins.
    """
    return {
        member: label
        for label, members in resultDict.items()
        for member in members
    }

In [10]:
def getXy(qidInfo, queryClassDict):
    """Build the feature matrix and the class vector for the labelled queries.

    Parameters
    ----------
    qidInfo : dict
        Maps a query id to the ``info`` record accepted by ``getQueryFeatures``.
    queryClassDict : dict
        Maps a query id to its class; queries without an entry are skipped.

    Returns
    -------
    tuple
        ``(X, y, feature_name)``: ``X`` is a NumPy array with one row of
        feature values per labelled query, ``y`` a NumPy array with the
        matching classes, and ``feature_name`` the list of feature names in
        column order. When no query is labelled, both arrays are empty and
        ``feature_name`` is ``[]``.

    Raises
    ------
    ValueError
        If two queries yield different feature names, because the columns of
        ``X`` would then not line up.
    """
    X_as_list = []
    y = []
    # Initialised up front so that an empty selection no longer fails on an
    # unbound variable.
    feature_name = []
    for qid, info in qidInfo.items():
        if qid not in queryClassDict:
            continue
        featureDict = getQueryFeatures(info)
        names = list(featureDict.keys())
        if not X_as_list:
            feature_name = names
        elif names != feature_name:
            raise ValueError(
                "query %r produced features %r, expected %r" % (qid, names, feature_name)
            )
        X_as_list.append(list(featureDict.values()))
        y.append(queryClassDict[qid])

    # X is the matrix
    # y are the classes
    X = np.array(X_as_list)
    y = np.array(y)
    return X, y, feature_name

In [11]:
# Invert the first element of resultListAltered into a qid -> class lookup,
# then build the feature matrix, the class vector and the column names.
queryClassDict = reverseMappingClusterQid(resultDict=resultListAltered[0])
X, y, feature_name = getXy(qidInfo=qidInfo, queryClassDict=queryClassDict)

In [12]:
## Cluster distribution: (class, number of queries) pairs, most frequent class first
classCounts = Counter(y)
classCounts.most_common()

[(1, 179), (0, 15)]

# Classification attempts

## 1. OneVsRestClassifier - LinearSVC

In [13]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
# StratifiedKFold is imported from sklearn.model_selection, the same module the
# K-Fold cell further down uses; the older sklearn.cross_validation module is
# deprecated and missing from newer scikit-learn releases.
from sklearn.model_selection import StratifiedKFold

# One-vs-rest linear SVM fitted on the complete data set, so the score printed
# here is accuracy on the training data, not a held-out estimate.
clf1 = OneVsRestClassifier(LinearSVC(random_state=0))
clf1.fit(X, y)
print(clf1)
print(clf1.score(X, y))



OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
     verbose=0),
          n_jobs=1)
0.5


## 2. Random forest classifier

In [14]:
## Random forest classifier - inherently multiclass (not one vs all!)
#### http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

# Shallow forest fitted on the complete data set; the score is training accuracy.
clf2 = RandomForestClassifier(max_depth=2, random_state=0)
clf2.fit(X, y)
print(clf2)
print(clf2.score(X, y))

# (importance, feature name) pairs, largest importance first.
rankedImportances = sorted(zip(clf2.feature_importances_, feature_name), reverse=True)
print(rankedImportances)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)
0.927835051546
[(0.16608284018470368, 'avgDoclen'), (0.15477483350888507, 'deltaTF2'), (0.13366828351118978, 'qlen'), (0.11543202102715948, 'minDF'), (0.10685713482858081, 'avgDF'), (0.1008059292759466, 'avgTF6'), (0.075220897839400946, 'avgTF3'), (0.042207901366331027, 'maxIDF'), (0.040712645268819077, 'avgTF1'), (0.021937965070843772, 'avgTF5'), (0.021505948496861729, 'deltaTF4'), (0.016491715966748415, 'deltaTF3'), (0.0043018836545296217, 'minIDF'), (0.0, 'num_stopwords'), (0.0, 'maxDF'), (0.0, 'faceted'), (0.0, 'deltaTF5'), (0.0, 'deltaTF1'), (0.0, 'avgTF4'), (0.0, 'avgTF2'), (0.0, 'avg

## 3. OneVsRestClassifier - Gradient Boosting classifier

In [15]:
## Gradient Boosting classifier - one vs all
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Depth-1 boosted trees wrapped in a one-vs-rest scheme and fitted on the
# complete data set; the score is training accuracy.
baseBooster = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
clf3 = OneVsRestClassifier(baseBooster)
clf3.fit(X, y)
print(clf3)
print(clf3.score(X, y))
# Feature importances are not printed for this model; the earlier attempt to
# read clf3.feature_importances_ was left disabled.

OneVsRestClassifier(estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=1.0, loss='deviance', max_depth=1,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=0, subsample=1.0, verbose=0,
              warm_start=False),
          n_jobs=1)
1.0


In [16]:
# train multiple classifiers -> soft voting
# http://scikit-learn.org/stable/modules/ensemble.html#gradient-boosting

## K-Fold

In [17]:
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, LeaveOneOut

k_fold = KFold(n_splits=5)
strat_k_fold = StratifiedKFold(n_splits=5)


def _foldScores(clf, X, y, splits):
    """Refit ``clf`` on every train split and return the test-split scores as an array.

    ``clf`` is fitted in place, so after the call it holds the model of the
    last split.
    """
    return np.array([clf.fit(X[train], y[train]).score(X[test], y[test]) for train, test in splits])


def _printAccuracy(scores):
    """Print the per-fold scores, then their mean and twice their standard deviation."""
    print(scores)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


# Evaluate each classifier with plain and with stratified 5-fold splits.
# `a` holds the plain K-fold scores and `b` the stratified ones; after the
# loop both refer to the results of the last classifier.
for number, clf in enumerate((clf1, clf2, clf3), start=1):
    if number > 1:
        print("--------------")
    print("Classifier %d" % number)

    a = _foldScores(clf, X, y, k_fold.split(X))
    _printAccuracy(a)

    b = _foldScores(clf, X, y, strat_k_fold.split(X, y))
    _printAccuracy(b)



Classifier 1
[ 0.94871795  0.92307692  0.92307692  0.76923077  0.86842105]
Accuracy: 0.89 (+/- 0.13)
[ 0.92307692  0.92307692  0.12820513  0.8974359   0.86842105]
Accuracy: 0.75 (+/- 0.62)
--------------
Classifier 2
[ 1.          0.97435897  0.94871795  0.76923077  0.92105263]
Accuracy: 0.92 (+/- 0.16)
[ 0.92307692  0.8974359   0.92307692  0.92307692  0.92105263]
Accuracy: 0.92 (+/- 0.02)
--------------
Classifier 3
[ 1.          0.94871795  0.8974359   0.76923077  0.89473684]
Accuracy: 0.90 (+/- 0.15)
[ 0.8974359   0.92307692  0.92307692  0.84615385  0.89473684]
Accuracy: 0.90 (+/- 0.06)


In [27]:
# Same plain 5-fold evaluation of the random forest, this time delegated to
# cross_val_score and run with n_jobs=-1.
# A different metric could be requested with e.g. scoring='precision_macro'.
scores = cross_val_score(clf2, X, y, cv=k_fold, n_jobs=-1)
meanScore = scores.mean()
doubleStd = scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (meanScore, doubleStd))

Accuracy: 0.92 (+/- 0.16)


In [28]:
# Leave-one-out: every split tests on a single sample, so each individual
# score is either 0.0 or 1.0 and the standard deviation of the scores says
# little about the stability of the estimate.
loo = LeaveOneOut()
c = []
for train, test in loo.split(X, y):
    clf2.fit(X[train], y[train])
    c.append(clf2.score(X[test], y[test]))
print(c)
looScores = np.array(c)
print("Accuracy: %0.2f (+/- %0.2f)" % (looScores.mean(), looScores.std() * 2))

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0]
Accuracy: 0.92 (+/- 0.53)
