In [1]:
import math
import string
import sys

# reading the text file
# This functio will return a
# list of the lines of text
# in the file.
def read_file(filename):
	
	try:
		with open(filename, 'r') as f:
			data = f.read()
		return data
	
	except IOError:
		print("Error opening or reading input file: ", filename)
		sys.exit()

# splitting the text lines into words
# translation table is a global variable
# mapping upper case to lower case and
# punctuation to spaces
translation_table = str.maketrans(string.punctuation+string.ascii_uppercase,
									" "*len(string.punctuation)+string.ascii_lowercase)
	
# returns a list of the words
# in the file
def get_words_from_char_list(text):
	
	text = text.translate(translation_table)
	word_list = text.split()
	
	return word_list


In [2]:
# counts frequency of each word
# returns a dictionary which maps
# the words to their frequency.
def count_frequency(word_list):
	
	D = {}
	
	for new_word in word_list:
		
		if new_word in D:
			D[new_word] = D[new_word] + 1
			
		else:
			D[new_word] = 1
			
	return D

# returns dictionary of (word, frequency)
# pairs from the previous dictionary.
def word_frequencies_for_file(filename):
	
	char_list = read_file(filename)
	word_list = get_words_from_char_list(char_list)
	freq_mapping = count_frequency(word_list)

	print("File", filename, ":", )
	print(len(char_list), "chars, ", )
	print(len(word_list), "words, ", )
	print(len(freq_mapping), "distinct words")

	return freq_mapping


In [3]:
# returns the dot product of two documents
def dotProduct(D1, D2):
	Sum = 0.0
	
	for key in D1:
		
		if key in D2:
			Sum += (D1[key] * D2[key])
			
	return Sum

# returns the angle in radians
# between document vectors
def vector_angle(D1, D2):
	numerator = dotProduct(D1, D2)
	denominator = math.sqrt(dotProduct(D1, D1)*dotProduct(D2, D2))
	
	return math.acos(numerator / denominator)


In [4]:
def documentSimilarity(filename_1, filename_2):
	
# filename_1 = sys.argv[1]
# filename_2 = sys.argv[2]
	sorted_word_list_1 = word_frequencies_for_file(filename_1)
	sorted_word_list_2 = word_frequencies_for_file(filename_2)
	distance = vector_angle(sorted_word_list_1, sorted_word_list_2)
	degrees = distance*180*7/22
	print("The distance between the documents is: % 0.6f (radians)"% distance)
	print("The distance between the documents is: % 0.2f (degrees)"% degrees)
	print("This value ranges from 0 to 90 degrees")    
    


In [5]:
def commonwords(filename1, filename2):
    
    char_list1 = read_file(filename1)
    word_list1 = get_words_from_char_list(char_list1)
    freq_mapping1 = count_frequency(word_list1)
    
    char_list2 = read_file(filename2)
    word_list2 = get_words_from_char_list(char_list2)
    freq_mapping2 = count_frequency(word_list2)
    
    str1_words = set(freq_mapping1)
    str2_words = set(freq_mapping2)
    
    common = str1_words & str2_words
    print("count of common words: ", len(common))
    print("doc1 percent common = {:.0%}".format(len(common)/len(freq_mapping1)))
    print("doc2 percent common = {:.0%}".format(len(common)/len(freq_mapping2)))
    return common

In [6]:
# Running code
a = 'reco1.txt'
b = 'reco2.txt'
documentSimilarity(a, b)
commonwords(a, b)

File reco1.txt :
2135 chars, 
355 words, 
222 distinct words
File reco2.txt :
2159 chars, 
357 words, 
213 distinct words
The distance between the documents is:  0.747052 (radians)
The distance between the documents is:  42.79 (degrees)
This value ranges from 0 to 90 degrees
count of common words:  65
doc1 percent common = 29%
doc2 percent common = 31%


{'a',
 'am',
 'an',
 'and',
 'as',
 'at',
 'be',
 'best',
 'beyond',
 'business',
 'could',
 'customer',
 'deepak',
 'designing',
 'different',
 'during',
 'feel',
 'financial',
 'for',
 'further',
 'good',
 'has',
 'he',
 'helped',
 'him',
 'his',
 'i',
 'if',
 'in',
 'into',
 'is',
 'it',
 'makes',
 'my',
 'not',
 'of',
 'on',
 'out',
 'program',
 'python',
 'quite',
 'recommend',
 'singh',
 'solutions',
 'student',
 'such',
 'teams',
 'technical',
 'technology',
 'that',
 'the',
 'things',
 'this',
 'time',
 'to',
 'using',
 'value',
 'was',
 'way',
 'when',
 'which',
 'will',
 'with',
 'working',
 'your'}