In [1]:
# Load the labeled question corpus (tab-separated, header in the first row)
# into a pandas DataFrame.
import csv

import pandas as pd

# csv.QUOTE_NONNUMERIC is the named constant for the literal 2 previously
# passed as `quoting`. No line-continuation backslash is needed inside the
# parentheses of a call.
train = pd.read_csv(
    "ICHI-corpus3-MOD.tsv",
    header=0,
    delimiter="\t",
    quoting=csv.QUOTE_NONNUMERIC,
)

In [2]:
# Report how many questions were read (the recorded run below shows 105 rows).
# A single parenthesised argument prints identically under Python 2 and 3.
print("Read %d  train and test " % train["question"].size)

Read 105  train and test 


In [3]:
# Show the raw text of the first question.
print(train["question"][0])

Sugar free My 90 year old Daddy just got diagnosed, the one thing he loves is Ice cream, can he eat sugar free ice cream?


In [4]:
# Show the id stored in the row labelled 0.
print(train.loc[0, "id"])

11


In [5]:
from bs4 import BeautifulSoup             
import re
import nltk
#nltk.download()  # Download text data sets, including stop words
from nltk.corpus import stopwords # Import the stop word list
#print (stopwords.words("english") )


In [6]:
def question_to_words( raw_review ):
    """Convert one raw question into a cleaned, space-separated string of words.

    Processing steps:
      1. Strip HTML markup with BeautifulSoup's "html.parser".
      2. Replace every character that is not an ASCII letter with a space.
      3. Lower-case the text and split it on whitespace.
      4. Drop English stop words (NLTK's "english" list).
      5. Join the remaining words with single spaces.

    Parameters:
        raw_review: the raw question text (a single string).

    Returns:
        A single string holding the remaining words separated by spaces;
        empty if no word survives.
    """
    # The stop-word set is built once and cached on the function object, so
    # calling this function in a loop does not re-read the NLTK word list
    # on every call. A set is used because membership tests are much faster
    # than on a list.
    stops = getattr(question_to_words, "_english_stops", None)
    if stops is None:
        stops = frozenset(stopwords.words("english"))
        question_to_words._english_stops = stops

    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()

    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()

    # 4. Remove stop words
    meaningful_words = [w for w in words if w not in stops]

    # 5. Join the words back into one string separated by space
    return " ".join(meaningful_words)


In [7]:
# Number of questions, taken from the size of the "question" column.
num_questions_train = train["question"].size
print(num_questions_train)


105


In [8]:
print("Cleaning and parsing  questions...\n")
num_questions_train = train["question"].size
clean_train_questions = []

# Clean every question in order. Progress is reported after every 10th
# question (counting from 1).
for position, raw_question in enumerate(train["question"], 1):
    if position % 10 == 0:
        print("Question from train %d of %d\n" % (position, num_questions_train))
    clean_train_questions.append(question_to_words(raw_question))



Cleaning and parsing  questions...

Question from train 10 of 105

Question from train 20 of 105

Question from train 30 of 105

Question from train 40 of 105

Question from train 50 of 105

Question from train 60 of 105

Question from train 70 of 105

Question from train 80 of 105

Question from train 90 of 105

Question from train 100 of 105



In [9]:
# Show the first cleaned question on the line after the caption. Formatting
# into one string gives the same text as the old two-item print statement
# (no space is inserted after the newline) and is valid in Python 2 and 3.
print("The first  clean question is \n%s" % (clean_train_questions[0],))

The first  clean question is 
sugar free year old daddy got diagnosed one thing loves ice cream eat sugar free ice cream


In [10]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors


# Load the pretrained word2vec vectors (since intermediate data is not
# included, the model cannot be refined with additional data).
# The file location can be overridden with the WORD2VEC_PATH environment
# variable; when it is unset, the original hard-coded path is used.
import os

WORD2VEC_PATH = os.environ.get(
    "WORD2VEC_PATH",
    'c:/users/gachet/downloads/GoogleNews-vectors-negative300.bin',
)
model = KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)




In [11]:
# Sanity-check the pretrained word2vec model: nearest neighbours of "metformin".
model.most_similar(positive=["metformin"])

[(u'pioglitazone', 0.7217838168144226),
 (u'statin', 0.7187392711639404),
 (u'statins', 0.7124181985855103),
 (u'sitagliptin', 0.7081882953643799),
 (u'allopurinol', 0.7003565430641174),
 (u'insulin_glargine', 0.6983603239059448),
 (u'Metformin', 0.6977676153182983),
 (u'glimepiride', 0.6926617622375488),
 (u'methotrexate', 0.6917198896408081),
 (u'sulfonylurea', 0.6905825734138489)]

In [12]:
 def expand( word, items):
        """Expand a word with its nearest neighbours in the word2vec model.

        Parameters:
            word: the word to look up in the global ``model``.
            items: how many neighbours to request.

        Returns:
            A list with the neighbour words only (similarity scores are
            dropped). The list is empty when ``items`` is smaller than 1 or
            when the model raises KeyError for ``word`` (e.g. the word is
            not in its vocabulary).
        """
        # Nothing to request: avoid querying the model at all.
        if items < 1:
            return []

        try:
            result = model.most_similar(positive=[word], negative=[], topn=items)
        except KeyError:
            # Only the "unknown word" case is treated as "no expansion";
            # any other error (e.g. the model was never loaded) now surfaces
            # instead of being silently swallowed.
            result = []
        return [pair[0] for pair in result]

In [13]:
# Try the expand() function: ask for 3 expansion terms for "sugar".
expand(word="sugar", items=3)

[u'refined_sugar', u'cane_sugar', u'turbinado']

In [14]:
# Expand the first 10 cleaned questions (the test questions): every word is
# followed by up to 3 expansion terms returned by expand().
N_TEST_QUESTIONS = 10
N_EXPANSION_TERMS = 3

# Expansions are cached per word so that a word repeated within or across
# questions does not trigger another model query. This assumes `model` is
# not modified while the loop runs.
expansion_cache = {}
lista = []
for i in range(N_TEST_QUESTIONS):
    word_groups = []
    for palabra in clean_train_questions[i].split():
        if palabra not in expansion_cache:
            expansion_cache[palabra] = expand(palabra, N_EXPANSION_TERMS)
        # The word itself, then its expansion terms, separated by single spaces.
        word_groups.append(" ".join([palabra] + expansion_cache[palabra]))
    # Every word group is followed by one space, so a non-empty expanded
    # question ends with a trailing space (same text as the former "+=" loop).
    lista.append("".join(group + " " for group in word_groups))
    


In [15]:
# Show the first expanded question.
print(lista[0])

sugar refined_sugar cane_sugar turbinado free Free Six_Flags_website_http://www.sixflags.com FREE year month week months old yearold boy 0ld daddy momma mama dad got get gotten getting diagnosed Diagnosed misdiagnosed rediagnosed one only two three thing things something stuff loves adores hates likes ice Ice Francies_tossed Melting_polar cream crème creams Maximum_Moisture eat eating ate eaten sugar refined_sugar cane_sugar turbinado free Free Six_Flags_website_http://www.sixflags.com FREE ice Ice Francies_tossed Melting_polar cream crème creams Maximum_Moisture 


In [16]:
# Overwrite the first 10 cleaned questions with their expanded versions
# in a single slice assignment.
clean_train_questions[:10] = lista[:10]
   

In [17]:
# Show the first entry of clean_train_questions.
print(clean_train_questions[0])

sugar refined_sugar cane_sugar turbinado free Free Six_Flags_website_http://www.sixflags.com FREE year month week months old yearold boy 0ld daddy momma mama dad got get gotten getting diagnosed Diagnosed misdiagnosed rediagnosed one only two three thing things something stuff loves adores hates likes ice Ice Francies_tossed Melting_polar cream crème creams Maximum_Moisture eat eating ate eaten sugar refined_sugar cane_sugar turbinado free Free Six_Flags_website_http://www.sixflags.com FREE ice Ice Francies_tossed Melting_polar cream crème creams Maximum_Moisture 


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
print("Creating the tf/idf...\n")

# Build the tf-idf representation with scikit-learn's TfidfVectorizer,
# keeping at most 900 vocabulary terms.
#
# With the vectorizer's default settings each weight is computed as
#     norm( tf * (ln((N + 1) / (df + 1)) + 1) )
# where N is the number of documents and df is the number of documents that
# contain the term (see the scikit-learn TfidfTransformer documentation).
#
# An earlier, more explicit configuration, kept for reference only:
#   TfidfVectorizer(max_df=1, max_features=20, min_df=1, stop_words=None,
#                   use_idf=True, tokenizer=None, ngram_range=(1,1))
tfidf_vectorizer = TfidfVectorizer(max_features=900)

# Tf-idf-weighted term-document sparse matrix
tfidf_train_data_features = tfidf_vectorizer.fit_transform(clean_train_questions)

# Dense NumPy array copy of the sparse matrix
tfidf_train_data_features_array = tfidf_train_data_features.toarray()

print(tfidf_train_data_features_array.shape)

# Vocabulary terms, one per matrix column. Newer scikit-learn releases
# replaced get_feature_names() with get_feature_names_out(); use whichever
# the installed version provides and keep `vocab` a plain list.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    vocab = list(tfidf_vectorizer.get_feature_names_out())
else:
    vocab = tfidf_vectorizer.get_feature_names()
#print(vocab)

Creating the tf/idf...

(105L, 900L)


In [30]:
# Sanity check: score the first question against every row of the tf-idf
# matrix (all questions, the first one included).
from sklearn.metrics.pairwise import linear_kernel

first_question_vector = tfidf_train_data_features[:1]
cosine_similarities = linear_kernel(first_question_vector, tfidf_train_data_features).flatten()
cosine_similarities

array([ 1.        ,  0.        ,  0.0317715 ,  0.07730862,  0.11417995,
        0.066547  ,  0.00398448,  0.00446716,  0.        ,  0.01152385,
        0.01205096,  0.        ,  0.02942527,  0.01861577,  0.03034839,
        0.0112765 ,  0.02558099,  0.02221115,  0.01471993,  0.01322647,
        0.02146023,  0.00722879,  0.0153696 ,  0.        ,  0.03149349,
        0.03243212,  0.        ,  0.01382045,  0.01809368,  0.03949409,
        0.02707776,  0.04003335,  0.01927105,  0.02581786,  0.        ,
        0.        ,  0.01421368,  0.02777035,  0.01582768,  0.        ,
        0.05236545,  0.28181401,  0.00791609,  0.03404059,  0.        ,
        0.01837193,  0.04532673,  0.        ,  0.08087924,  0.        ,
        0.        ,  0.01862787,  0.02413602,  0.05421281,  0.26925085,
        0.        ,  0.        ,  0.00949573,  0.        ,  0.        ,
        0.02887804,  0.        ,  0.02683942,  0.03122145,  0.02875108,
        0.03023074,  0.21367708,  0.        ,  0.01714192,  0.  

In [31]:
# Positions of the 4 highest scores, best match first: sort ascending,
# reverse, then keep the first four.
related_indices = cosine_similarities.argsort()[::-1][:4]
related_indices

array([ 0, 41, 54, 94], dtype=int64)

In [32]:
# Print the question id found at each of the best-matching rows.
for row in related_indices:
    print(train["id"][row])

11
137
152
193


In [22]:
# The best match is the first question compared with itself, so its score is 1.
cosine_similarities.take(related_indices)

array([ 1.        ,  0.28181401,  0.26925085,  0.23580676])

In [40]:
# Compute all: disabled variant of the similarity loop that scores each of the
# first 10 questions against the whole tf-idf matrix (query rows included).
# The code is wrapped in a string literal, so none of it is executed; the
# notebook only echoes the string as this cell's output.
"""
for i in range(0,10):
    cosine_similarities = linear_kernel(tfidf_train_data_features[i:i+1], tfidf_train_data_features).flatten()
#cosine_similarities
    related_indices = cosine_similarities.argsort()[:-5:-1]
    print ("Question  %d id %d most similar to \n" % ( i, train["id"][i])  )  
    #related_indices
    for i in related_indices:
        print (train["id"][i])
    #print (train["id"][related_indices])
    print
    print "Cosine vector "
    print
    print cosine_similarities[related_indices]
    print
    """

'\nfor i in range(0,10):\n    cosine_similarities = linear_kernel(tfidf_train_data_features[i:i+1], tfidf_train_data_features).flatten()\n#cosine_similarities\n    related_indices = cosine_similarities.argsort()[:-5:-1]\n    print ("Question  %d id %d most similar to \n" % ( i, train["id"][i])  )  \n    #related_indices\n    for i in related_indices:\n        print (train["id"][i])\n    #print (train["id"][related_indices])\n    print\n    print "Cosine vector "\n    print\n    print cosine_similarities[related_indices]\n    print\n    '

In [41]:
# For each of the first 10 questions (the test questions), report the 4 most
# similar questions among the remaining rows, i.e. excluding the 10 test
# questions themselves, together with their similarity scores.
N_TEST_QUESTIONS = 10
N_MATCHES = 4

# Open-ended slice instead of the hard-coded row count 105, so the candidate
# set follows the size of the matrix.
candidate_features = tfidf_train_data_features[N_TEST_QUESTIONS:]

for i in range(N_TEST_QUESTIONS):
    cosine_similarities = linear_kernel(tfidf_train_data_features[i:i+1], candidate_features).flatten()
    # Highest scores first.
    related_indices = cosine_similarities.argsort()[::-1][:N_MATCHES]
    print("Question  %d id %d most similar to \n" % (i, train["id"][i]))
    # related_indices are positions inside candidate_features; shift them back
    # to row positions of `train`. A separate loop variable keeps the outer
    # index `i` intact.
    for row in related_indices + N_TEST_QUESTIONS:
        print(train["id"][row])
    # print("") emits one empty line under both Python 2 and Python 3.
    print("")
    print("Cosine vector ")
    print("")
    print(cosine_similarities[related_indices])
    print("")

Question  0 id 11 most similar to 

137
152
193
165

Cosine vector 

[ 0.28181401  0.26925085  0.23580676  0.21367708]

Question  1 id 12 most similar to 

127
203
167
177

Cosine vector 

[ 0.4655647   0.16698895  0.14794502  0.13020878]

Question  2 id 13 most similar to 

125
200
204
154

Cosine vector 

[ 0.11571468  0.11552708  0.09170331  0.08086858]

Question  3 id 14 most similar to 

103
113
179
164

Cosine vector 

[ 0.17622609  0.10791909  0.10778491  0.1026144 ]

Question  4 id 15 most similar to 

162
113
149
179

Cosine vector 

[ 0.24260646  0.21810148  0.20816221  0.20060492]

Question  5 id 16 most similar to 

192
142
113
183

Cosine vector 

[ 0.19987297  0.15302599  0.1286492   0.12121819]

Question  6 id 17 most similar to 

141
194
106
204

Cosine vector 

[ 0.29029673  0.12869883  0.10625505  0.09289083]

Question  7 id 18 most similar to 

142
156
151
145

Cosine vector 

[ 0.14382795  0.11282771  0.08260155  0.0817832 ]

Question  8 id 19 most similar to 

129
