# Task
Given M documents, compute the term-term relevance and return the term pairs together with their similarity scores, sorted in descending order of score.

To calculate term-term relevance:
1.  Calculate tfidf of every term
2.  Compute and sort term-term relevance between a term and other terms


# Term Frequency Inverse Document Frequency
![Term Frequency functions](https://upload.wikimedia.org/wikipedia/commons/0/05/Plot_IDF_functions.png)
TF-IDF is a numerical statistic that reflects how important a word is to a document within
a collection of documents. In our context, we first use it to weight each term within our collection of documents, and then use those weights to compute similarity scores between term pairs.

In [1]:
# Need to install pyspark bc it is not available
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/88/01/a37e827c2d80c6a754e40e99b9826d978b55254cc6c6672b5b08f2e18a7f/pyspark-2.4.0.tar.gz (213.4MB)
[K    100% |████████████████████████████████| 213.4MB 81kB/s  eta 0:00:01   30% |██████████                      | 66.1MB 53.1MB/s eta 0:00:03
[?25hCollecting py4j==0.10.7 (from pyspark)
[?25l  Downloading https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl (197kB)
[K    100% |████████████████████████████████| 204kB 43.9MB/s ta 0:00:01
[?25hBuilding wheels for collected packages: pyspark
  Running setup.py bdist_wheel for pyspark ... [?25ldone
[?25h  Stored in directory: /tmp/.cache/pip/wheels/cd/54/c2/abfcc942eddeaa7101228ebd6127a30dbdf903c72db4235b23
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.7 pyspark-2.4.0


In [2]:
# Libraries
import pandas as pd
import math
from pyspark import SparkContext, SparkConf
#import os
#print(os.listdir("../input"))

In [3]:
def perform_check(result,print_limit):
    """
    Print the first ``print_limit`` records of an RDD, one per line.

    result      -- an RDD (anything exposing ``take`` and ``collect``)
    print_limit -- number of records to print

    For a positive integer limit only that many records are fetched with
    ``take``, instead of collecting the whole RDD to the driver just to
    show a handful of rows. Any other limit (zero, negative, non-integer)
    falls back to ``collect`` and keeps the previous behaviour: printing
    stops only when the running count equals ``print_limit``.
    """
    if isinstance(print_limit, int) and print_limit > 0:
        rows = result.take(print_limit)
    else:
        rows = result.collect()
    for shown, row in enumerate(rows, start=1):
        print(row)
        if shown == print_limit:
            break

In [4]:
def get_length(result):
    """
    Count the records of an RDD, print the count and return it.

    result -- an RDD (anything exposing ``count``)

    Uses the RDD's own ``count`` action so the records are never
    shipped to the driver just to be tallied.
    """
    count = result.count()
    print(count)
    return count

In [5]:
# Preview the raw corpus with pandas: every line is split into tokens,
# the first token is kept as the document id and the rest as its terms.
source = pd.read_table("../input/project2_data.txt", header=None)
source[0] = source[0].str.split()
source["doc"] = source[0].map(lambda tokens: tokens[0])
source["terms"] = source[0].map(lambda tokens: tokens[1:])
source.head()

Unnamed: 0,0,doc,terms
0,"[doc1, chem_n-methyl-d-aspartate_chem, recepto...",doc1,"[chem_n-methyl-d-aspartate_chem, receptor, gen..."
1,"[doc2, low, dose, chem_n-methyl-d-aspartate_ch...",doc2,"[low, dose, chem_n-methyl-d-aspartate_chem, re..."
2,"[doc3, chem_nmda_chem, receptor, gene_nmdars_g...",doc3,"[chem_nmda_chem, receptor, gene_nmdars_gene, i..."
3,"[doc4, rodent, primary, somatosensory, cortex,...",doc4,"[rodent, primary, somatosensory, cortex, confi..."
4,"[doc5, short-sleep, ss, mouse, exhibit, high, ...",doc5,"[short-sleep, ss, mouse, exhibit, high, locomo..."


In [6]:
# Set app name and master for spark
appName = "TFIDF"
master = "local"
conf = SparkConf().setAppName(appName).setMaster(master)
# getOrCreate returns the already-running context when there is one, so
# re-executing this cell in the same kernel no longer fails with
# "Cannot run multiple SparkContexts at once". Note that in that case the
# existing context is returned as-is and `conf` is not re-applied.
sc = SparkContext.getOrCreate(conf=conf)

In [7]:
# Load the given text file into Spark as an RDD of raw lines.
# This is the same file previewed with pandas earlier in the notebook.
data_file_path = "../input/project2_data.txt"
project2_data = sc.textFile(data_file_path)

In [8]:
# Tokenise every line on whitespace: [doc_id, term, term, ...]
key_value = project2_data.map(lambda line: line.split())

In [9]:
# Get the total number of documents in corpus
# (the number of tokenised lines in key_value).
# Necessary for our TF-IDF calculation: it is passed to TFIDF as total_docs,
# the numerator of the inverse-document-frequency ratio.
TOTAL_DOCS = get_length(key_value)

8357


In [10]:
# Peek at the first tokenised line: the doc id followed by that document's terms
perform_check(key_value,1)

['doc1', 'chem_n-methyl-d-aspartate_chem', 'receptor', 'gene_nmdars_gene', 'represent', 'subclass', 'chem_glutamate_chem', 'receptor', 'play', 'critical', 'role', 'neuronal', 'development', 'physiology', 'report', 'here', 'generation', 'mouse', 'express', 'normal', 'level', 'essential', 'nmdar1', 'nr1', 'subunit', 'unlike', 'nr1', 'null', 'mouse', 'mouse', 'survive', 'adulthood', 'display', 'behavioral', 'abnormality', 'include', 'increase', 'motor', 'activity', 'stereotypy', 'deficit', 'social', 'sexual', 'interaction', 'behavioral', 'alteration', 'similar', 'those', 'observe', 'pharmacological', 'induce', 'animal', 'model', 'disease_schizophrenia_disease', 'ameliorate', 'treatment', 'chem_haloperidol_chem', 'chem_clozapine_chem', 'antipsychotic', 'drug', 'antagonize', 'dopaminergic', 'serotonergic', 'receptor', 'finding', 'support', 'model', 'reduce', 'chem_nmda_chem', 'receptor', 'activity', 'result', 'schizophrenic-like', 'disease_behavior_disease', 'reveal', 'pharmacological', 'ma

In [11]:
def Filter_Terms(x):
    """
    Keep only the gene and disease terms of one tokenised document.

    x -- list whose first item is the document id and whose remaining
         items are that document's terms

    A term is kept when it has one of these shapes:
    (1) gene_word_gene
    (2) disease_word_disease

    Returns a list of (term, doc_id) tuples, one per kept occurrence,
    in the order the terms appear in the document.
    """
    def is_tagged(term, tag):
        # True when the term is wrapped as <tag>_..._<tag>
        return term.startswith(tag + "_") and term.endswith("_" + tag)

    return [
        (term, x[0])
        for term in x[1:]
        if is_tagged(term, "gene") or is_tagged(term, "disease")
    ]

In [12]:
# Count how often each (term, doc_id) pair occurs
word_count = (
    key_value
    .flatMap(lambda tokens: Filter_Terms(tokens))
    .map(lambda term_doc: (term_doc, 1))
    .reduceByKey(lambda left, right: left + right)
)

In [13]:
# Peek at the ((term, doc_id), occurrences) records
perform_check(word_count,5)

(('gene_nmdars_gene', 'doc1'), 1)
(('disease_schizophrenia_disease', 'doc1'), 1)
(('disease_behavior_disease', 'doc1'), 1)
(('disease_ls_disease', 'doc2'), 3)
(('disease_ss_disease', 'doc2'), 3)


In [14]:
def Get_All_Pairs(x):
    """
    Re-key one ((word, doc_id), count) record by its document.

    Returns (doc_id, [word, count]). The value is a two-item list so
    that records of the same document can be merged by list
    concatenation.
    """
    word_and_doc, occurrences = x[0], x[1]
    return (word_and_doc[1], [word_and_doc[0], occurrences])

In [15]:
# Merge all [word, count] lists of a document into one flat list per doc id
doc_word_counts = (
    word_count
    .map(lambda record: Get_All_Pairs(record))
    .cache()
    .reduceByKey(lambda left, right: left + right)
)

In [16]:
# Peek at the flat [word, count, word, count, ...] list built for each document
perform_check(doc_word_counts,5)

('doc1', ['gene_nmdars_gene', 1, 'disease_schizophrenia_disease', 1, 'disease_behavior_disease', 1])
('doc2', ['disease_ls_disease', 3, 'disease_ss_disease', 3, 'gene_nmdars_gene', 3])
('doc3', ['gene_nmdars_gene', 3])
('doc4', ['gene_nmdars_gene', 2])
('doc5', ['gene_nmdars_gene', 5])


In [17]:
def CreateTuple(x):
    """
    Turn a document's flat [word, count, word, count, ...] list into
    explicit (word, count) tuples.

    x -- (doc_id, flat_list)

    Returns (doc_id, [(word, count), ...]) with the pairs in their
    original order.
    """
    flat = x[1]
    pairs = [(flat[pos], flat[pos + 1]) for pos in range(0, len(flat), 2)]
    return (x[0], pairs)

In [18]:
# Regroup each document's flat list into (word, count) tuples
tuple_result = doc_word_counts.map(lambda doc_record: CreateTuple(doc_record))

In [19]:
# Peek at one document with its list of (word, count) tuples
perform_check(tuple_result,1)

('doc1', [('gene_nmdars_gene', 1), ('disease_schizophrenia_disease', 1), ('disease_behavior_disease', 1)])


In [20]:
def WordCountPerDoc(x):
    """
    Attach the document's total term count to each of its words.

    x -- (doc_id, [(word, occurrences), ...])

    The total is the sum of the occurrences listed for the document.
    Returns one record per listed word in the format:
    ((word, doc_id), (word occurrences, total # of words in doc))
    """
    doc_id, counted_words = x[0], x[1]
    doc_total = sum(entry[1] for entry in counted_words)
    return [
        ((entry[0], doc_id), (entry[1], doc_total))
        for entry in counted_words
    ]

In [21]:
# One ((word, doc_id), (occurrences, doc_total)) record per word of every document
word_count_per_doc = tuple_result.flatMap(lambda doc_record: WordCountPerDoc(doc_record))

In [22]:
# Peek at the ((word, doc_id), (occurrences, total counted terms in doc)) records
perform_check(word_count_per_doc,5)

(('gene_nmdars_gene', 'doc1'), (1, 3))
(('disease_schizophrenia_disease', 'doc1'), (1, 3))
(('disease_behavior_disease', 'doc1'), (1, 3))
(('disease_ls_disease', 'doc2'), (3, 9))
(('disease_ss_disease', 'doc2'), (3, 9))


In [23]:
def All_Doc_Word_Count_Pairs(x):
    """
    Re-key one ((word, doc_id), (word_count, doc_total)) record by word.

    Returns (word, (doc_id, word_count, doc_total)). The value is a
    flat 3-tuple so that records of the same word can be merged by
    tuple concatenation.
    """
    word_and_doc, counts = x[0], x[1]
    return (word_and_doc[0], (word_and_doc[1], counts[0], counts[1]))

In [24]:
# Merge all (doc_id, count, doc_total) triples of a word into one flat tuple per word
word_per_doc = (
    word_count_per_doc
    .map(lambda record: All_Doc_Word_Count_Pairs(record))
    .cache()
    .reduceByKey(lambda left, right: left + right)
)

In [25]:
# Peek at one word with its flat (doc_id, count, doc_total, doc_id, ...) tuple
perform_check(word_per_doc,1)

('gene_nmdars_gene', ('doc1', 1, 3, 'doc2', 3, 9, 'doc3', 3, 3, 'doc4', 2, 2, 'doc5', 5, 5, 'doc6', 1, 1, 'doc7', 2, 2, 'doc8', 2, 2, 'doc9', 3, 5, 'doc10', 2, 3, 'doc11', 4, 4, 'doc12', 5, 14, 'doc13', 1, 2, 'doc14', 1, 6, 'doc15', 2, 3, 'doc16', 3, 7, 'doc17', 1, 1, 'doc18', 4, 4, 'doc19', 1, 4, 'doc20', 2, 8, 'doc21', 2, 8, 'doc22', 1, 1, 'doc23', 1, 1, 'doc24', 1, 1, 'doc25', 1, 11, 'doc26', 1, 8, 'doc27', 4, 11, 'doc28', 2, 3, 'doc29', 1, 1, 'doc30', 7, 7, 'doc31', 2, 2, 'doc32', 5, 5, 'doc33', 1, 17, 'doc34', 2, 2, 'doc35', 1, 8, 'doc36', 2, 2, 'doc37', 1, 4, 'doc38', 3, 3, 'doc39', 2, 2, 'doc40', 1, 6, 'doc41', 3, 3, 'doc42', 2, 9, 'doc43', 1, 1, 'doc44', 1, 2, 'doc45', 3, 5, 'doc46', 1, 1, 'doc47', 1, 1, 'doc48', 3, 6, 'doc49', 1, 1, 'doc50', 3, 3, 'doc51', 2, 2, 'doc52', 2, 9, 'doc53', 1, 6, 'doc54', 2, 13, 'doc55', 2, 9, 'doc56', 1, 2, 'doc57', 2, 2, 'doc58', 3, 4, 'doc59', 3, 16, 'doc60', 2, 2, 'doc61', 2, 2, 'doc62', 2, 11, 'doc63', 2, 2, 'doc64', 2, 6, 'doc65', 1, 2, 'doc6

In [26]:
def All_Word_Count_Pairs(x):
    """
    Regroup a word's flat sequence into one triple per document.

    x -- (word, flat_sequence) where the sequence repeats
         doc_id, word_count, doc_total

    Returns (word, [(doc_id, word_count, doc_total), ...]) in the
    original order.
    """
    flat = x[1]
    triples = [
        (flat[pos], flat[pos + 1], flat[pos + 2])
        for pos in range(0, len(flat), 3)
    ]
    return (x[0], triples)

In [27]:
# Regroup every word's flat tuple into (doc_id, count, doc_total) triples
all_doc_word_counts = word_per_doc.map(lambda word_record: All_Word_Count_Pairs(word_record))

In [28]:
# Peek at one word with its list of (doc_id, count, doc_total) triples
perform_check(all_doc_word_counts,1)

('gene_nmdars_gene', [('doc1', 1, 3), ('doc2', 3, 9), ('doc3', 3, 3), ('doc4', 2, 2), ('doc5', 5, 5), ('doc6', 1, 1), ('doc7', 2, 2), ('doc8', 2, 2), ('doc9', 3, 5), ('doc10', 2, 3), ('doc11', 4, 4), ('doc12', 5, 14), ('doc13', 1, 2), ('doc14', 1, 6), ('doc15', 2, 3), ('doc16', 3, 7), ('doc17', 1, 1), ('doc18', 4, 4), ('doc19', 1, 4), ('doc20', 2, 8), ('doc21', 2, 8), ('doc22', 1, 1), ('doc23', 1, 1), ('doc24', 1, 1), ('doc25', 1, 11), ('doc26', 1, 8), ('doc27', 4, 11), ('doc28', 2, 3), ('doc29', 1, 1), ('doc30', 7, 7), ('doc31', 2, 2), ('doc32', 5, 5), ('doc33', 1, 17), ('doc34', 2, 2), ('doc35', 1, 8), ('doc36', 2, 2), ('doc37', 1, 4), ('doc38', 3, 3), ('doc39', 2, 2), ('doc40', 1, 6), ('doc41', 3, 3), ('doc42', 2, 9), ('doc43', 1, 1), ('doc44', 1, 2), ('doc45', 3, 5), ('doc46', 1, 1), ('doc47', 1, 1), ('doc48', 3, 6), ('doc49', 1, 1), ('doc50', 3, 3), ('doc51', 2, 2), ('doc52', 2, 9), ('doc53', 1, 6), ('doc54', 2, 13), ('doc55', 2, 9), ('doc56', 1, 2), ('doc57', 2, 2), ('doc58', 3, 

In [29]:
def CountDocsPerWord(x):
    """
    Attach to every (word, doc) record the number of documents the
    word appears in.

    x -- (word, [(doc_id, word_count, doc_total), ...])

    The document count is the number of triples listed for the word.
    Returns one record per triple:
    ((word, doc_id), (word_count, doc_total, docs containing word))
    """
    word, postings = x[0], x[1]
    docs_with_word = len(postings)
    return [
        ((word, posting[0]), (posting[1], posting[2], docs_with_word))
        for posting in postings
    ]

In [30]:
# One ((word, doc_id), (count, doc_total, docs containing word)) record per word/doc pair
docs_per_word_result = all_doc_word_counts.flatMap(lambda word_record: CountDocsPerWord(word_record))

In [31]:
# Peek at the ((word, doc_id), (count, doc_total, docs containing word)) records
perform_check(docs_per_word_result,5)

(('gene_nmdars_gene', 'doc1'), (1, 3, 108))
(('gene_nmdars_gene', 'doc2'), (3, 9, 108))
(('gene_nmdars_gene', 'doc3'), (3, 3, 108))
(('gene_nmdars_gene', 'doc4'), (2, 2, 108))
(('gene_nmdars_gene', 'doc5'), (5, 5, 108))


In [32]:
def TFIDF(x,total_docs):
    """
    Compute the tf-idf weight of one (term, document) record.

    x          -- ((term, doc_id), (term_count, doc_total, docs_with_term))
    total_docs -- number of documents in the corpus

    tf  = term_count / doc_total
    idf = log(total_docs / docs_with_term)   (natural logarithm)

    Returns (term, tf * idf). The document id in x[0][1] is not
    carried into the result, so the returned weight is no longer
    tied to a particular document.
    """
    stats = x[1]
    tf = stats[0] / stats[1]
    idf = math.log(total_docs / stats[2])
    return (x[0][0], tf * idf)

In [33]:
# Weight every (term, doc) record, then gather each term's weights under its name
tfidf_result = (
    docs_per_word_result
    .map(lambda record: TFIDF(record, TOTAL_DOCS))
    .groupByKey()
)

In [34]:
# The grouped values print as ResultIterable objects here, so only the term names are readable
perform_check(tfidf_result,3)

('gene_nmdars_gene', <pyspark.resultiterable.ResultIterable object at 0x7f133f0303c8>)
('disease_schizophrenia_disease', <pyspark.resultiterable.ResultIterable object at 0x7f133f030438>)
('disease_behavior_disease', <pyspark.resultiterable.ResultIterable object at 0x7f133f0304a8>)


In [35]:
# Turn every term's grouped tf-idf weights into a plain list,
# giving (term, [weight, weight, ...]) records
tfidf = (
    tfidf_result
    .cache()
    .map(lambda term_weights: (term_weights[0], list(term_weights[1])))
)

In [36]:
# Peek at one term with its tf-idf weights as a list
perform_check(tfidf,1)

('gene_nmdars_gene', [1.449574520959072, 1.449574520959072, 4.3487235628772165, 4.3487235628772165, 4.3487235628772165, 4.3487235628772165, 4.3487235628772165, 4.3487235628772165, 2.6092341377263297, 2.899149041918144, 4.3487235628772165, 1.5531155581704346, 2.1743617814386083, 0.724787260479536, 2.899149041918144, 1.8637386698045213, 4.3487235628772165, 4.3487235628772165, 1.0871808907193041, 1.0871808907193041, 1.0871808907193041, 4.3487235628772165, 4.3487235628772165, 4.3487235628772165, 0.3953385057161106, 0.5435904453596521, 1.5813540228644425, 2.899149041918144, 4.3487235628772165, 4.3487235628772165, 4.3487235628772165, 4.3487235628772165, 0.25580726840454215, 4.3487235628772165, 0.5435904453596521, 4.3487235628772165, 1.0871808907193041, 4.3487235628772165, 4.3487235628772165, 0.724787260479536, 4.3487235628772165, 0.9663830139727148, 4.3487235628772165, 2.1743617814386083, 2.6092341377263297, 4.3487235628772165, 4.3487235628772165, 2.1743617814386083, 4.3487235628772165, 4.34

In [37]:
def GetQueryVector(x, query_term):
    """
    Filter predicate that selects the record of the query term.

    x          -- (term, vector) record
    query_term -- the term to look for

    Returns True when the record's term equals query_term, otherwise
    False. Used with filter() to pull out the query term's vector.
    """
    return bool(x[0] == query_term)

In [38]:
# The term whose similarity to every other term is computed below
query_term = 'gene_nmdars_gene'

In [39]:
# Keep only the query term's (term, weights) record
query_vector = tfidf.filter(lambda term_weights: GetQueryVector(term_weights, query_term))

In [40]:
# Confirm the filter kept the query term's record
perform_check(query_vector,1)

('gene_nmdars_gene', [1.449574520959072, 1.449574520959072, 4.3487235628772165, 4.3487235628772165, 4.3487235628772165, 4.3487235628772165, 4.3487235628772165, 4.3487235628772165, 2.6092341377263297, 2.899149041918144, 4.3487235628772165, 1.5531155581704346, 2.1743617814386083, 0.724787260479536, 2.899149041918144, 1.8637386698045213, 4.3487235628772165, 4.3487235628772165, 1.0871808907193041, 1.0871808907193041, 1.0871808907193041, 4.3487235628772165, 4.3487235628772165, 4.3487235628772165, 0.3953385057161106, 0.5435904453596521, 1.5813540228644425, 2.899149041918144, 4.3487235628772165, 4.3487235628772165, 4.3487235628772165, 4.3487235628772165, 0.25580726840454215, 4.3487235628772165, 0.5435904453596521, 4.3487235628772165, 1.0871808907193041, 4.3487235628772165, 4.3487235628772165, 0.724787260479536, 4.3487235628772165, 0.9663830139727148, 4.3487235628772165, 2.1743617814386083, 2.6092341377263297, 4.3487235628772165, 4.3487235628772165, 2.1743617814386083, 4.3487235628772165, 4.34

In [41]:
def FilterOutQuery(x, excluded_term=None):
    """
    Filter predicate that drops the record of the query term.

    x             -- (term, vector) record
    excluded_term -- term to drop; when omitted (None) the
                     notebook-level ``query_term`` is used, which is
                     what existing one-argument calls rely on

    Returns False when the record's term equals the excluded term,
    True otherwise.
    """
    if excluded_term is None:
        excluded_term = query_term
    return not (x[0] == excluded_term)

In [42]:
# Pair every non-query term's record with the query term's record.
# Written as a parenthesised chain: the previous version ended in a
# trailing backslash that continued the statement into an empty line.
cartesian_filter = (
    tfidf
    .filter(lambda x: FilterOutQuery(x))
    .cache()
    .cartesian(query_vector)
)

In [43]:
# Peek at one ((other term, weights), (query term, weights)) pair
perform_check(cartesian_filter,1)

(('disease_schizophrenia_disease', [1.6511057820319055, 1.1007371880212704, 2.4766586730478584, 1.2383293365239292, 0.8255528910159528, 1.2383293365239292, 0.8255528910159528, 2.4766586730478584, 2.4766586730478584, 2.830467054911838, 4.953317346095717, 0.5827432171877314, 1.6511057820319055, 2.4766586730478584, 0.9906634692191434, 4.127764455079764, 0.9906634692191434, 1.6511057820319055, 2.4766586730478584, 0.8255528910159528, 0.9906634692191434, 1.1430732337143963, 2.4766586730478584, 0.6191646682619646, 4.953317346095717, 4.953317346095717, 3.9626538768765736, 0.8255528910159528, 1.6511057820319055, 0.9906634692191434, 0.7076167637279595, 1.2383293365239292, 0.4953317346095717, 0.7076167637279595, 2.4766586730478584, 1.2383293365239292, 1.9813269384382868, 0.6191646682619646, 1.2383293365239292, 1.6511057820319055, 2.4766586730478584, 4.953317346095717, 1.2383293365239292, 0.9906634692191434, 1.6511057820319055, 0.5503685940106352, 2.4766586730478584, 1.6511057820319055, 0.99066346

In [44]:
def SemanticSimilarity(x):
    """
    Cosine similarity between the two weight vectors of a term pair.

    x -- ((term_a, vector_a), (term_b, vector_b)) where each vector is
         a sequence of numbers

    similarity = dot(vector_a, vector_b) / (|vector_a| * |vector_b|)

    When the vectors differ in length the shorter one is treated as if
    it were padded with zeros, i.e. only the overlapping positions
    contribute to the dot product. The input vectors are not modified.
    If either vector has zero length/norm the similarity is reported
    as 0.0 instead of raising ZeroDivisionError.

    Returns ((term_b, term_a), similarity).

    NOTE(review): the vectors are compared position by position;
    nothing in this function ties position i of both vectors to the
    same document -- confirm the caller builds aligned vectors.
    """
    term_a, A_vector = x[0][0], x[0][1]
    term_b, B_vector = x[1][0], x[1][1]

    # Euclidean norm of each vector
    A_norm = math.sqrt(sum(weight * weight for weight in A_vector))
    B_norm = math.sqrt(sum(weight * weight for weight in B_vector))

    # zip stops at the shorter vector, which is equivalent to zero-padding
    # it (the padded positions would add 0) without mutating the inputs
    dot_product = sum(a * b for a, b in zip(A_vector, B_vector))

    norm_product = A_norm * B_norm
    if norm_product == 0:
        # an all-zero (or empty) vector has no direction to compare
        return (term_b, term_a), 0.0

    #output is ((B-term, A-term), semantic similarity)
    return (term_b, term_a), dot_product / norm_product

In [45]:
# Score every (other term, query term) pair and order the
# ((query term, other term), similarity) records by similarity, highest first.
# sortBy on the score replaces the swap / sortByKey / swap-back sequence, and
# the parenthesised chain removes the trailing backslash that previously
# continued the statement into an empty line.
semantic_result = (
    cartesian_filter
    .map(lambda x: SemanticSimilarity(x))
    .sortBy(lambda pair_and_score: pair_and_score[1], ascending=False)
)

In [46]:
# Peek at the highest-scoring pair
perform_check(semantic_result,1)

(('gene_nmdars_gene', 'gene_bovine_rhodopsin_gene'), 0.8105520964898542)


In [47]:
# Bring all sorted ((query term, other term), similarity) records back to the driver as a list
final_output = semantic_result.collect()

In [48]:
# Number of scored term pairs
len(final_output)

3720

In [49]:
# The five highest-scoring pairs
final_output[:5]

[(('gene_nmdars_gene', 'gene_bovine_rhodopsin_gene'), 0.8105520964898542),
 (('gene_nmdars_gene', 'disease_pkc_disease'), 0.7596701980985593),
 (('gene_nmdars_gene', 'disease_rgs_disease'), 0.6819155880242336),
 (('gene_nmdars_gene', 'disease_obesity_disease'), 0.608462311067258),
 (('gene_nmdars_gene', 'disease_kaposi_sarcoma_disease'), 0.5877393062155959)]

In [50]:
def Write_To_File(x, path="Final_Output.txt"):
    """
    Write the final output to a text file, one entry per line.

    x    -- iterable of 2-item entries; each is formatted as
            '<first item> <second item>'
    path -- file to (over)write; defaults to "Final_Output.txt"

    Lines are separated by a newline and no trailing newline is
    written. The file is opened in a ``with`` block so it is flushed
    and closed even if formatting or writing fails.
    """
    lines = ['%s %s' % entry for entry in x]
    with open(path, "w") as out_file:
        out_file.write('\n'.join(lines))

In [51]:
# Save the ranked term pairs to Final_Output.txt in the working directory
Write_To_File(final_output)

Authors:
# [Gael Blanchard](https://github.com/gaelblanchard)
# [Lloyd Massiah](https://github.com/lazypassion)