#### Visualization example: cosine similarity between ANN-generated exoplanet title and actual exoplanet titles

In [26]:
import pandas as pd
import numpy as np
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# TF-IDF (term frequency - inverse document frequency) vectorizer, created with
# its default settings; it is fitted on the titles in a later cell.
tfidf_vectorizer = TfidfVectorizer()

In [9]:
# Load the test file, one title per line (trailing newline stripped).
# The first line of the file is a random ANN-generated text produced by a network
# trained on 1000 exoplanet titles; the title comparison below relies on that ordering.
# The `with` block guarantees the file handle is closed once the titles are read.
with open('exoplanet_1000_with_test.txt') as title_file:
    exoplanet_test1K = [line.rstrip('\n') for line in title_file]

# Single-column frame of titles. Built from exoplanet_test1K (the list read above);
# the previous reference to `exoplanet_test` was an undefined name on a fresh kernel.
df_exoplanet_test1K = pd.DataFrame(exoplanet_test1K, columns=['Title'])

In [10]:
# Freeze the 'Title' column into an immutable tuple, preserving row order
# (position 0 is therefore the first line of the input file).
title_tuple = tuple(df_exoplanet_test1K['Title'])

In [11]:
# Fit the vectorizer on all titles and build the TF-IDF matrix in one step
# (one row per title, one column per TF-IDF term).
tfidf_matrix = tfidf_vectorizer.fit_transform(title_tuple)
# Shown as the cell output: (number of titles, number of TF-IDF terms).
tfidf_matrix.shape

(1001, 2839)

In [12]:
# Cosine similarity of the first title (the test title) against every row of the
# TF-IDF matrix, itself included. Slicing with [:1] keeps the first row
# two-dimensional, so the result has a single row with one score per title.
cosine_sim_array = cosine_similarity(X=tfidf_matrix[:1], Y=tfidf_matrix)

In [27]:
# Angle, in degrees, between the test title and its most similar other title.
# Column 0 of cosine_sim_array is the test title compared with itself, so the best
# match is the largest score among the remaining columns.
cos_sim = cosine_sim_array[0, 1:].max()
# Floating-point rounding can leave a cosine marginally outside [-1, 1], which makes
# math.acos raise "math domain error"; clamp the value before taking the arccosine.
angle_in_radians = math.acos(min(1.0, max(-1.0, cos_sim)))
# Legacy name kept in case another cell reads it. NOTE: despite the name, this value
# has always been in radians (math.acos returns radians).
angle_in_degrees = angle_in_radians
most_similar1K = math.degrees(angle_in_radians)

In [18]:
# Index (into title_tuple) of the title most similar to the test title.
# Only columns 1..N are searched so that the test title (column 0) can never be
# picked as its own best match: an exact-equality lookup on cos_sim returns column 0
# whenever another title ties with the self-similarity score (e.g. an exact
# duplicate, or all scores being 0). The +1 maps the position in the sliced row
# back to a position in the full row.
itemindex = int(np.argmax(cosine_sim_array[0, 1:])) + 1

In [15]:
# Pick out the two titles to report: the reference title at position 0 and the
# title at the best-match position found earlier.
comparison_title, most_similar_title = title_tuple[0], title_tuple[itemindex]

In [28]:
# Report the generated title, its closest match and the angle between them.
# Every line is one pre-formatted string passed to print(...): this emits the same
# text under the Python 2 print statement and also runs under the Python 3 print
# function (the bare `print "..."` form is a SyntaxError on Python 3).
print("--------------------------------------------")
print("LSTM ANN Generated Title | %s" % (comparison_title,))
print("Most Similar Real Title  | %s" % (most_similar_title,))
print("Cosine Similarity Angle  | %s" % (most_similar1K,))
print("--------------------------------------------")

--------------------------------------------
LSTM ANN Generated Title | +pent, spectrling Wepulation and perture of HD 161068n microlensing the to circe S	
Most Similar Real Title  | "Stellar Companions to the Exoplanet Host Stars HD 2638 and HD 164509"
Cosine Similarity Angle  | 80.5912319964
--------------------------------------------
