In [1]:
# import
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import numpy as np
import sys 
#http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

In [2]:
# prepare corpus
corpus = []
for d in range(1400):
    f = open("./d/"+str(d+1)+".txt")
    corpus.append(f.read())
query_corp = []
# add query to corpus
for q in range(1,225):
    f = open("./q/"+str(q)+".txt")
    query_corp.append(f.read())

In [3]:
relevant = []
for r in range(1,225):
    f = open("./r/"+str(r)+".txt")
    relevant.append(f.read().split())

In [4]:
# top relevant documents 
top = 25

In [5]:
# Next part just a test for small data

In [6]:
small_data = []
for d in range(10):
    f = open("./d/"+str(d+1)+".txt")
    small_data.append(f.read())

In [7]:
# remove stop words, punctuation.
# token_pattern was used to delete numbers 

In [8]:
vectorizer = CountVectorizer(binary=True, stop_words= 'english', analyzer= 'word', token_pattern = r'[^\W\d]+')

In [9]:
X = vectorizer.fit_transform(small_data)
X.toarray()

array([[0, 0, 1, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [10]:
vectorizer.get_feature_names()

['additional',
 'aerodynamic',
 'aerodynamics',
 'againbe',
 'agreement',
 'agreewell',
 'air',
 'airinjection',
 'alsobeen',
 'analysis',
 'analytic',
 'analyticsolutions',
 'andfree',
 'angle',
 'angles',
 'aninvestigation',
 'aninviscid',
 'appeared',
 'appears',
 'appreciably',
 'approximate',
 'approximately',
 'approximation',
 'areincomplete',
 'arises',
 'ashear',
 'atained',
 'atriangular',
 'attack',
 'auniform',
 'available',
 'awing',
 'based',
 'basis',
 'bedesirable',
 'beginswhen',
 'behave',
 'body',
 'boundary',
 'boundarylayer',
 'breakdown',
 'breaks',
 'bureau',
 'californiainstitute',
 'cases',
 'caused',
 'classical',
 'comparative',
 'compared',
 'comparison',
 'complete',
 'composite',
 'conducted',
 'conduction',
 'configuration',
 'consequently',
 'consider',
 'considered',
 'constant',
 'contamination',
 'continuum',
 'contribution',
 'control',
 'controlled',
 'course',
 'curved',
 'curves',
 'data',
 'degree',
 'densities',
 'destalling',
 'determine',
 'de

In [11]:
transformer = TfidfTransformer(use_idf= False)
tf = transformer.fit_transform(X)
tf.toarray()

array([[ 0.        ,  0.        ,  0.13130643, ...,  0.13130643,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.13245324,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.09950372],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [12]:
def VSP (data, query, representation):
    data.append(query)
    vectorizer = CountVectorizer(binary=True, stop_words= 'english', analyzer= 'word', token_pattern = r'[^\W\d]+')
    X = vectorizer.fit_transform(data)
    if representation == "binary":
        return X.toarray()
    elif representation == "TF":
        transformer = TfidfTransformer(use_idf= False)
        tf = transformer.fit_transform(X)
        return tf.toarray()
    elif representation == "TF-IDF":
        transformer = TfidfTransformer(use_idf= True)
        tf = transformer.fit_transform(X)
        return tf.toarray()
    else:
        print("Posible representations: 'binary', 'TF', 'TF-IDF'")    

In [13]:
def distance_euklid(matrix):
    M_ED = euclidean_distances(matrix[-1,:].reshape(1, -1), matrix[:len(matrix) - 1])[0]
    M_ED_sort = M_ED.argsort() + 1
    return M_ED_sort[:top]

In [14]:
def distance_cos(matrix):
    M_ED = cosine_similarity(matrix[-1,:].reshape(1, -1), matrix[:len(matrix) - 1])[0]
    M_ED_sort = M_ED.argsort()[::-1] +1
    return M_ED_sort[:top]

In [15]:
def precision(retrieved, relevant):
    intersection=list(set(retrieved) & set(relevant))
    return len(intersection) / len(retrieved)

In [16]:
def recall(retrieved, relevant):
    intersection=list(set(retrieved) & set(relevant))
    return len(intersection) / len(relevant)

In [17]:
def F_measure(retrieved, relevant):
    intersection=list(set(retrieved) & set(relevant))
    precision = len(intersection) / len(retrieved)
    recall = len(intersection) / len(relevant)
    if precision == 0 and recall == 0:
        return 0
    else:
        return 2* precision*recall/(precision + recall)

In [29]:
# to white in log_file
#sys.stdout = open("log_all_query.txt", "w+")

In [18]:
for i, query in enumerate(query_corp[:5]): 
    print("\n\nQuery number : ", i+1)
    print("Limit to elevant documents : ", top, '\n')
    represent = ['binary', 'TF', 'TF-IDF']
    for rep in represent:
        print("Use ", rep, " representation. \n")
        matrix_tfidf = VSP(corpus, query, rep)
        euklid = distance_euklid(matrix_tfidf)
        cosinus = distance_cos(matrix_tfidf)
        relevant[i] = list(map(int, relevant[i]))
        print("Use Euclidean distance:")
        print('Precision : %8.4f' % precision(euklid, relevant[i]))
        print('Recall : %8.4f' % recall(euklid, relevant[i]))
        print('F-measure : %8.4f \n' % F_measure(euklid, relevant[i]))
        print("Use Cosine similarity measure:")
        print('Precision : %8.4f' % precision(cosinus, relevant[i]))
        print('Recall : %8.4f' % recall(cosinus, relevant[i]))
        print('F-measure : %8.4f \n' % F_measure(cosinus, relevant[i]))



Query number :  1
Limit to elevant documents :  25 

Use  binary  representation. 

Use Euclidean distance:
Precision :   0.0800
Recall :   0.0690
F-measure :   0.0741 

Use Cosine similarity measure:
Precision :   0.2400
Recall :   0.2069
F-measure :   0.2222 

Use  TF  representation. 

Use Euclidean distance:
Precision :   0.2400
Recall :   0.2069
F-measure :   0.2222 

Use Cosine similarity measure:
Precision :   0.2400
Recall :   0.2069
F-measure :   0.2222 

Use  TF-IDF  representation. 

Use Euclidean distance:
Precision :   0.2400
Recall :   0.2069
F-measure :   0.2222 

Use Cosine similarity measure:
Precision :   0.2400
Recall :   0.2069
F-measure :   0.2222 



Query number :  2
Limit to elevant documents :  25 

Use  binary  representation. 

Use Euclidean distance:
Precision :   0.0000
Recall :   0.0000
F-measure :   0.0000 

Use Cosine similarity measure:
Precision :   0.1200
Recall :   0.1200
F-measure :   0.1200 

Use  TF  representation. 

Use Euclidean distance:
Pre