In [52]:
# Practical 11 - Cosine Similarity

import nltk
from nltk.corpus import stopwords
from collections import defaultdict
import math

def stopword_removal(filename):
    """Return the lowercase tokens of a text file with English stopwords removed.

    The file is read in full, every character that is neither alphanumeric
    nor whitespace is deleted (so "don't" becomes "dont"), the remainder is
    lowercased and split on whitespace, and tokens found in NLTK's English
    stopword list are dropped.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of the remaining tokens, in document order.
    """
    english_stopwords = set(stopwords.words('english'))
    with open(filename, 'r') as handle:
        raw_text = handle.read()

    # Keep only letters, digits and whitespace; punctuation is removed outright.
    kept_chars = [ch for ch in raw_text if ch.isalnum() or ch.isspace()]
    words = ''.join(kept_chars).lower().split()
    return [word for word in words if word not in english_stopwords]

def get_tf(tokens):
    """Count how many times each token occurs.

    Args:
        tokens: Iterable of hashable tokens.

    Returns:
        A ``defaultdict(int)`` mapping each token to its raw occurrence count.
    """
    counts = defaultdict(int)
    for word in tokens:
        counts[word] = counts[word] + 1
    return counts

def get_idf(doc_tokens):
    """Compute the inverse document frequency of every token in a corpus.

    Args:
        doc_tokens: Sequence of token lists, one list per document.

    Returns:
        A dict mapping each token to ``log(N / df)``, where ``N`` is the
        number of documents and ``df`` is the number of documents that
        contain the token. A token present in every document gets 0.0.
    """
    total_docs = len(doc_tokens)

    # Each document contributes at most once per token, hence the set().
    doc_freq = defaultdict(int)
    for term in (t for doc in doc_tokens for t in set(doc)):
        doc_freq[term] += 1

    return {
        term: math.log(total_docs / hits)
        for term, hits in doc_freq.items()
    }

def get_tfidf(tf, idf):
    """Weight term frequencies by their inverse document frequencies.

    Args:
        tf: Mapping of token to its frequency in one document.
        idf: Mapping of token to its inverse document frequency; it must
            contain every token present in ``tf``.

    Returns:
        A ``defaultdict(int)`` mapping each token of ``tf`` to
        ``tf[token] * idf[token]``.

    Raises:
        KeyError: If a token in ``tf`` has no entry in ``idf``.
    """
    weights = defaultdict(int)
    weights.update((term, count * idf[term]) for term, count in tf.items())
    return weights

def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two numeric vectors.

    Args:
        vec1: Sequence of numbers.
        vec2: Sequence of numbers, expected to have the same length as
            ``vec1``. The dot product is taken over ``zip(vec1, vec2)``, so
            extra trailing elements of the longer vector only affect its norm.

    Returns:
        A float in [-1.0, 1.0]. Returns 0.0 when the dot product is zero or
        when either vector has zero magnitude (including empty vectors).
    """
    dot = sum(qi * di for qi, di in zip(vec1, vec2))
    norm1 = math.sqrt(sum(qi * qi for qi in vec1))
    norm2 = math.sqrt(sum(di * di for di in vec2))
    denominator = norm1 * norm2
    if dot == 0 or denominator == 0:
        return 0.0

    similarity = dot / denominator
    # sqrt() rounding can push the ratio a hair past +/-1 (e.g. [1, 1, 1]
    # against itself), which would break callers that feed it to acos().
    # Explicit comparisons rather than min()/max() so a NaN passes through.
    if similarity > 1.0:
        return 1.0
    if similarity < -1.0:
        return -1.0
    return similarity
    
def main():
    """Print the pairwise TF-IDF cosine similarity of three text files.

    Reads ``file1.txt`` to ``file3.txt`` from the current directory, builds
    one TF-IDF vector per file over the shared vocabulary, and prints the
    similarity of every pair of files as a percentage with four decimals.
    """
    n = 3
    docs = []
    for index in range(1, n + 1):
        docs.append(f'file{index}.txt')

    all_tokens = list(map(stopword_removal, docs))
    tfs = list(map(get_tf, all_tokens))
    idfs = get_idf(all_tokens)
    tfidfs = [get_tfidf(term_freq, idfs) for term_freq in tfs]

    # Fix one ordering of the vocabulary so every vector shares its axes.
    vocabulary = set()
    for weights in tfidfs:
        vocabulary.update(weights)
    vocabulary = list(vocabulary)

    # Tokens absent from a document read as 0 from its defaultdict.
    vectors = [[weights[token] for token in vocabulary] for weights in tfidfs]

    for first in range(n):
        for second in range(first + 1, n):
            sim = cosine_similarity(vectors[first], vectors[second])
            print(f"{docs[first]} and {docs[second]}: {sim*100:.4f}%")

# Run the comparison only when this file is executed as a script, so that
# importing it for its helper functions does not read files or print output.
if __name__ == '__main__':
    main()

file1.txt and file2.txt: 7.2592%
file1.txt and file3.txt: 17.9957%
file2.txt and file3.txt: 95.9976%
