In [17]:
# Practical 11

# TF-IDF Weights

import nltk
from nltk.corpus import stopwords
from collections import defaultdict
import math

def display(vocabulary, tfidfs):
    """Print each term followed by its weight in every document.

    Args:
        vocabulary: iterable of terms; one output line is printed per term.
        tfidfs: sequence of per-document mappings from term to weight.

    A term that a document does not contain is printed as 0.
    """
    for term in vocabulary:
        # Use .get() instead of indexing: indexing a defaultdict with a
        # missing term would insert a zero entry and silently grow the
        # caller's mappings as a side effect of printing.
        weights = [tfidf.get(term, 0) for tfidf in tfidfs]
        print(f'{term}: {weights}')

def stopword_removal(filename, folder):
    """Read a document and return its lowercase tokens without stopwords.

    Characters that are neither alphanumeric nor whitespace are deleted
    (not replaced by a space); the remaining text is lowercased, split on
    whitespace, and filtered against NLTK's English stopword list.

    Args:
        filename: name of the file inside ``folder``.
        folder: directory that contains the file.

    Returns:
        List of the remaining tokens in document order.
    """
    english_stopwords = set(stopwords.words('english'))
    with open(f'{folder}/{filename}', 'r') as f:
        raw = f.read()
    # NOTE(review): punctuation is deleted before filtering, so "don't"
    # becomes "dont"; stoplist entries that contain an apostrophe presumably
    # never match such tokens -- confirm whether that is intended.
    kept_chars = [ch for ch in raw if ch.isalnum() or ch.isspace()]
    words = ''.join(kept_chars).lower().split()
    return [word for word in words if word not in english_stopwords]

def get_tf(tokens):
    """Count how many times each token occurs.

    Args:
        tokens: iterable of tokens for one document.

    Returns:
        defaultdict(int) mapping token -> raw occurrence count.
    """
    counts = defaultdict(int)
    for word in tokens:
        counts[word] = counts[word] + 1
    return counts

def get_idf(doc_tokens, n):
    """Compute the inverse document frequency of every token.

    Args:
        doc_tokens: sequence of token lists, one per document.
        n: corpus size used as the numerator of the IDF ratio.

    Returns:
        dict mapping token -> log10(n / document frequency), rounded to
        two decimals.
    """
    doc_freq = defaultdict(int)
    for tokens in doc_tokens:
        # set() so a token is counted once per document, however often it
        # occurs inside that document.
        for term in set(tokens):
            doc_freq[term] += 1
    return {
        term: round(math.log10(n / hits), 2)
        for term, hits in doc_freq.items()
    }

def get_tfidf(tf, idf):
    """Combine term counts and IDF values into TF-IDF weights.

    Args:
        tf: mapping token -> raw count for one document.
        idf: mapping token -> inverse document frequency; it must contain
            every token present in ``tf``.

    Returns:
        defaultdict(int) mapping token -> log10(1 + count) * idf, rounded
        to two decimals.
    """
    weights = defaultdict(int)
    for term, count in tf.items():
        damped_tf = math.log10(1 + count)
        weights[term] = round(damped_tf * idf[term], 2)
    return weights
    
def main(folder='corpus2', n=4, show_terms=False):
    """Compute and print the summed TF-IDF weight of each document.

    Args:
        folder: directory holding the documents ``file1.txt`` ..
            ``file<n>.txt``.
        n: number of documents to read.
        show_terms: when true, also print the per-term weights of every
            document via display() before the per-document totals.
    """
    docs = [f'file{i}.txt' for i in range(1, n + 1)]
    all_tokens = [stopword_removal(doc, folder) for doc in docs]
    tfs = [get_tf(tokens) for tokens in all_tokens]
    # The corpus size is taken from the documents actually loaded, so it
    # cannot drift out of sync with the number of files read.
    idfs = get_idf(all_tokens, len(all_tokens))
    tfidfs = [get_tfidf(tf, idfs) for tf in tfs]

    if show_terms:
        # Sorted so the term listing is the same on every run.
        vocabulary = sorted(set().union(*tfidfs))
        display(vocabulary, tfidfs)

    for doc, tfidf in zip(docs, tfidfs):
        weight = round(sum(tfidf.values()), 2)
        print(f'{doc}: {weight}')

# Run the TF-IDF practical only when this code executes as the top-level
# module (not when it is imported).
if __name__ == '__main__':
    main()

file1.txt: 0.27
file2.txt: 0.46
file3.txt: 0.15
file4.txt: 0.12


In [43]:
# Log-Frequency Weighting, Length Normalization and Cosine Similarity

import nltk
from nltk.corpus import stopwords
from collections import defaultdict
import math

def display(vocabulary, normalized):
    """Print each term followed by its normalized weight in every document.

    Args:
        vocabulary: iterable of terms; one output line is printed per term.
        normalized: sequence of per-document mappings from term to weight.

    A term that a document does not contain is printed as 0.
    """
    for term in vocabulary:
        # .get() avoids inserting a zero entry into a defaultdict when the
        # term is missing, so printing never modifies the caller's data.
        row = [normal.get(term, 0) for normal in normalized]
        print(f'{term}: {row}')

def stopword_removal(filename, folder):
    """Tokenize one document and drop English stopwords.

    The file's text has every character that is neither alphanumeric nor
    whitespace deleted, is lowercased and split on whitespace; tokens that
    appear in NLTK's English stopword list are then removed.

    Args:
        filename: name of the file inside ``folder``.
        folder: directory that contains the file.

    Returns:
        List of the surviving tokens in document order.
    """
    stop_set = set(stopwords.words('english'))
    with open(f'{folder}/{filename}', 'r') as f:
        contents = f.read()

    def keep(ch):
        # Letters, digits and whitespace survive; everything else is dropped.
        return ch.isalnum() or ch.isspace()

    # NOTE(review): deleting apostrophes turns "don't" into "dont", which
    # presumably is not in the stoplist -- confirm whether that is intended.
    cleaned = ''.join(filter(keep, contents))
    result = []
    for token in cleaned.lower().split():
        if token not in stop_set:
            result.append(token)
    return result

def get_tf(tokens):
    """Build a raw term-frequency table for one document.

    Args:
        tokens: iterable of tokens.

    Returns:
        defaultdict(int) mapping token -> number of occurrences.
    """
    frequencies = defaultdict(int)
    for term in tokens:
        frequencies[term] = frequencies.get(term, 0) + 1
    return frequencies

def get_logf(tf):
    """Convert raw term counts into log-frequency weights.

    The weight is ``1 + log10(count)`` for a positive count and 0 otherwise,
    rounded to two decimals.

    Args:
        tf: mapping token -> raw count.

    Returns:
        defaultdict(int) mapping token -> log-frequency weight.
    """
    logf = defaultdict(int)
    for token, freq in tf.items():
        if freq > 0:
            logf[token] = round((math.log10(freq) + 1), 2)
        else:
            # log10 is undefined for 0; a term that does not occur carries
            # no weight. Zero counts can appear when a defaultdict count
            # table was indexed with a missing term.
            logf[token] = 0.0
    return logf

def normalize(logf):
    """Scale a weight vector by its Euclidean length.

    The length is rounded to two decimals before dividing and every result
    is rounded to two decimals as well.

    Args:
        logf: mapping token -> weight.

    Returns:
        defaultdict(int) mapping token -> weight / length. An empty or
        all-zero input yields zeros instead of raising ZeroDivisionError.
    """
    normalized = defaultdict(int)
    squared_sum = 0
    for weight in logf.values():
        squared_sum += weight * weight
    length = math.sqrt(squared_sum)

    if length == 0:
        # A zero vector has no direction; keep its tokens with weight 0.
        for token in logf:
            normalized[token] = 0.0
        return normalized

    # If the rounded length collapses to 0 for a tiny but non-zero vector,
    # fall back to the unrounded length rather than dividing by zero.
    factor = round(length, 2) or length
    for token, weight in logf.items():
        normalized[token] = round((weight / factor), 2)
    return normalized

def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two vectors, to two decimals.

    Args:
        vec1: sequence of numbers.
        vec2: sequence of numbers, paired element-wise with ``vec1``.

    Returns:
        dot(vec1, vec2) / (|vec1| * |vec2|) rounded to two decimals, or 0.0
        when either vector has zero length.
    """
    products = (x * y for x, y in zip(vec1, vec2))
    dot = sum(products)
    length1 = math.sqrt(sum(x * x for x in vec1))
    length2 = math.sqrt(sum(y * y for y in vec2))
    # A zero-length vector has no direction, so similarity is defined as 0.
    if not length1 or not length2:
        return 0.0
    cosine = dot / (length1 * length2)
    return round(cosine, 2)
    
def main(folder='corpus1', n=3):
    """Print the pairwise cosine similarity of the documents.

    Each document is tokenized, converted to log-frequency weights,
    length-normalized, and then compared with every other document.

    Args:
        folder: directory holding the documents ``file1.txt`` ..
            ``file<n>.txt``.
        n: number of documents to read.
    """
    docs = [f'file{i}.txt' for i in range(1, n + 1)]
    all_tokens = [stopword_removal(doc, folder) for doc in docs]
    tfs = [get_tf(tokens) for tokens in all_tokens]
    logfs = [get_logf(tf) for tf in tfs]
    normalized = [normalize(logf) for logf in logfs]

    # Sorted so every run lays the vector components out in the same order.
    vocabulary = sorted(set().union(*logfs))

    # One dense vector per document over the shared vocabulary; a term the
    # document lacks contributes 0.0.
    vectors = [
        [normal.get(token, 0.0) for token in vocabulary]
        for normal in normalized
    ]

    for i in range(n):
        for j in range(i + 1, n):
            sim = cosine_similarity(vectors[i], vectors[j])
            print(f"{docs[i]} and {docs[j]}: {sim}")

# Run the cosine-similarity practical only when this code executes as the
# top-level module (not when it is imported).
if __name__ == '__main__':
    main()

file1.txt and file2.txt: 0.59
file1.txt and file3.txt: 0.59
file2.txt and file3.txt: 0.76
