Name: Dhruv Pithadia

Roll No: R013

Program: MBA Tech AI

Course: Natural Language Processing

Topic: Label Encoding, One hot Encoding, Bag Of Words, TF-IDF, Cosine Similarity

Contact: pithadia.dhruv@gmail.com

In [48]:
document1 = '''Artificial intelligence (AI) is a branch of computer science that aims to create machines capable of performing tasks that typically require human intelligence. These tasks include reasoning, learning, problem-solving, perception, and language understanding. AI systems are increasingly used in various applications such as speech recognition, image analysis, and autonomous vehicles. The field of AI encompasses machine learning, neural networks, and natural language processing.
'''

document2 = '''Machine learning is a subset of artificial intelligence that focuses on developing algorithms and statistical models that enable computers to learn from and make decisions based on data. It involves training models on large datasets to identify patterns and make predictions. Machine learning techniques include supervised learning, unsupervised learning, and reinforcement learning. Applications of machine learning can be found in areas such as recommendation systems, fraud detection, and predictive analytics.  '''

In [15]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def preprocess_document(document):
    """
    Tokenizes and removes stop words from a document.

    Parameters:
    document (str): The document text to be processed.

    Returns:
    list: A list of preprocessed tokens.
    """
    # Tokenize the document
    tokens = word_tokenize(document)
    
    # Convert tokens to lowercase
    tokens = [token.lower() for token in tokens]
    
    # Remove punctuation
    tokens = [token for token in tokens if token.isalnum()]
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    return tokens

In [46]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

def compute_cosine_similarity(matrix):
    """
    Computes cosine similarity for a given matrix.

    Parameters:
    matrix (numpy.ndarray or pandas.DataFrame): The feature matrix.

    Returns:
    float: Cosine similarity score between the first and second rows.
    """
    cosine_sim = cosine_similarity(matrix)
    return cosine_sim[0, 1]

def label_encode(tokens, all_tokens):
    """
    Encodes tokens into numerical labels.
    
    Parameters:
    tokens (list): A list of tokens to be encoded.
    all_tokens (list): List of all tokens for creating consistent label mapping.
    
    Returns:
    numpy.ndarray: An array of encoded labels.
    """
    label_encoder = LabelEncoder()
    label_encoder.fit(all_tokens)
    encoded_labels = label_encoder.transform(tokens)
    return encoded_labels

def one_hot_encode(tokens, all_tokens):
    """
    One-hot encodes a list of tokens.
    
    Parameters:
    tokens (list): A list of tokens to be one-hot encoded.
    all_tokens (list): List of all tokens for creating consistent one-hot encoding.
    
    Returns:
    numpy.ndarray: An array of one-hot encoded vectors.
    """
    one_hot_encoder = OneHotEncoder(sparse_output=False, categories=[sorted(set(all_tokens))])
    one_hot_encoded = one_hot_encoder.fit_transform(np.array(tokens).reshape(-1, 1))
    return one_hot_encoded

def bag_of_words(documents):
    """
    Computes the Bag of Words representation of documents.

    Parameters:
    documents (list): A list of document texts.

    Returns:
    pandas.DataFrame: A DataFrame with BoW features.
    """
    vectorizer = CountVectorizer()
    X_bow = vectorizer.fit_transform(documents)
    return X_bow

def tfidf(documents):
    """
    Computes the TF-IDF representation of documents.

    Parameters:
    documents (list): A list of document texts.

    Returns:
    pandas.DataFrame: A DataFrame with TF-IDF features.
    """
    vectorizer = TfidfVectorizer()
    X_tfidf = vectorizer.fit_transform(documents)
    return X_tfidf

def compare_similarities(doc1, doc2):
    """
    Compares cosine similarities using different feature representations.

    Parameters:
    doc1 (str): The first document.
    doc2 (str): The second document.

    Returns:
    dict: A dictionary with similarity scores for each representation.
    """
    # Preprocess documents
    tokens1 = preprocess_document(doc1)
    tokens2 = preprocess_document(doc2)
    
    # Combine tokens from both documents to create a vocabulary
    all_tokens = list(set(tokens1 + tokens2))
    
    # Label Encoding
    label_encoded_all = label_encode(all_tokens, all_tokens)
    label_encoded_doc1 = label_encode(tokens1, all_tokens)
    label_encoded_doc2 = label_encode(tokens2, all_tokens)
    
    # Create feature matrix for label encoding
    label_encoded_docs = np.zeros((2, len(all_tokens)))
    for i, doc_encoded in enumerate([label_encoded_doc1, label_encoded_doc2]):
        for token in doc_encoded:
            if token < len(label_encoded_docs[i]):
                label_encoded_docs[i, token] += 1
    
    # Compute cosine similarity for Label Encoding
    label_similarity = compute_cosine_similarity(label_encoded_docs)
    
    # One-Hot Encoding
    one_hot_encoded_all = one_hot_encode(all_tokens, all_tokens)
    one_hot_encoded_doc1 = one_hot_encode(tokens1, all_tokens)
    one_hot_encoded_doc2 = one_hot_encode(tokens2, all_tokens)
    
    # Aggregate one-hot encodings
    one_hot_encoded_docs = np.zeros((2, one_hot_encoded_all.shape[1]))
    for i, one_hot_encoded in enumerate([one_hot_encoded_doc1, one_hot_encoded_doc2]):
        one_hot_encoded_docs[i, :] = one_hot_encoded.mean(axis=0)
    
    # Compute cosine similarity for One-Hot Encoding
    one_hot_similarity = compute_cosine_similarity(one_hot_encoded_docs)
    
    # Bag of Words
    bow_df = bag_of_words([doc1, doc2])
    bow_similarity = compute_cosine_similarity(bow_df.toarray())
    
    # TF-IDF
    tfidf_df = tfidf([doc1, doc2])
    tfidf_similarity = compute_cosine_similarity(tfidf_df.toarray())
    
    # Compile results
    similarities = {
        'Label Encoding': label_similarity,
        'One-Hot Encoding': one_hot_similarity,
        'Bag of Words': bow_similarity,
        'TF-IDF': tfidf_similarity
    }
    
    return similarities

In [49]:
# Compare similarities
similarities = compare_similarities(document1, document2)

# Print similarity scores for each representation
for method, score in similarities.items():
    print(f"Cosine Similarity using {method}: {score}")

Cosine Similarity using Label Encoding: 0.29311204305478855
Cosine Similarity using One-Hot Encoding: 0.2931120430547886
Cosine Similarity using Bag of Words: 0.4590638463165382
Cosine Similarity using TF-IDF: 0.31818644767206233


In [53]:
d1 = 'Inflation has increased unemployement'
d2 = 'The company has increased it sales'
d3 = 'Fear increased his pulse'
d = []

d.append(d1)
d.append(d2)
d.append(d3)

In [54]:
d

['Inflation has increased unemployement',
 'The company has increased it sales',
 'Fear increased his pulse']

In [55]:
print(tfidf(d))

  (0, 10)	0.5844829010200651
  (0, 4)	0.34520501686496574
  (0, 2)	0.444514311537431
  (0, 5)	0.5844829010200651
  (1, 8)	0.4505040726431979
  (1, 6)	0.4505040726431979
  (1, 0)	0.4505040726431979
  (1, 9)	0.4505040726431979
  (1, 4)	0.2660749625405929
  (1, 2)	0.34261995919180055
  (2, 7)	0.546454011634009
  (2, 3)	0.546454011634009
  (2, 1)	0.546454011634009
  (2, 4)	0.3227445421804912


All the functions from scratch

In [61]:
def label_encode_scratch(tokens, all_tokens):
    token_to_label = {token: idx for idx, token in enumerate(all_tokens)}
    encoded_labels = [token_to_label[token] for token in tokens]
    return encoded_labels

def one_hot_encode_scratch(tokens, all_tokens):
    token_to_index = {token: idx for idx, token in enumerate(all_tokens)}
    one_hot_encoded = np.zeros(len(all_tokens))
    for token in tokens:
        if token in token_to_index:
            index = token_to_index[token]
            one_hot_encoded[index] = 1
    return one_hot_encoded

def bag_of_words_scratch(documents):
    tokenized_docs = [doc.split() for doc in documents]
    all_tokens = list(set(token for doc in tokenized_docs for token in doc))
    all_tokens.sort()
    bow_matrix = np.zeros((len(documents), len(all_tokens)))
    for i, tokens in enumerate(tokenized_docs):
        for token in tokens:
            if token in all_tokens:
                index = all_tokens.index(token)
                bow_matrix[i, index] += 1
    return bow_matrix

def tfidf_scratch(documents, output_csv='tfidf_output.csv'):
    tokenized_docs = [doc.split() for doc in documents]
    all_tokens = list(set(token for doc in tokenized_docs for token in doc))
    all_tokens.sort()
    tf_matrix = np.zeros((len(documents), len(all_tokens)))
    for i, tokens in enumerate(tokenized_docs):
        for token in tokens:
            if token in all_tokens:
                index = all_tokens.index(token)
                tf_matrix[i, index] += 1
    df = np.sum(tf_matrix > 0, axis=0)
    num_docs = len(documents)
    idf = np.log(num_docs / (df + 1)) + 1
    tfidf_matrix = tf_matrix * idf
    tfidf_df = pd.DataFrame(tfidf_matrix, columns=all_tokens)
    tfidf_df.to_csv(output_csv, index=False)
    return tfidf_df

In [58]:
t1 = preprocess_document(document1)
t2 = preprocess_document(document2)

t = t1+t2

document = [document1, document2]

print(t,'\n')
print(t1,'\n')
print(t2,'\n')

['artificial', 'intelligence', 'ai', 'branch', 'computer', 'science', 'aims', 'create', 'machines', 'capable', 'performing', 'tasks', 'typically', 'require', 'human', 'intelligence', 'tasks', 'include', 'reasoning', 'learning', 'perception', 'language', 'understanding', 'ai', 'systems', 'increasingly', 'used', 'various', 'applications', 'speech', 'recognition', 'image', 'analysis', 'autonomous', 'vehicles', 'field', 'ai', 'encompasses', 'machine', 'learning', 'neural', 'networks', 'natural', 'language', 'processing', 'machine', 'learning', 'subset', 'artificial', 'intelligence', 'focuses', 'developing', 'algorithms', 'statistical', 'models', 'enable', 'computers', 'learn', 'make', 'decisions', 'based', 'data', 'involves', 'training', 'models', 'large', 'datasets', 'identify', 'patterns', 'make', 'predictions', 'machine', 'learning', 'techniques', 'include', 'supervised', 'learning', 'unsupervised', 'learning', 'reinforcement', 'learning', 'applications', 'machine', 'learning', 'found',

In [63]:
print(f"After label encoding{label_encode_scratch(t1, t)}")
print(f"After one-hot encoding{one_hot_encode_scratch(t1, t)}")
print(f"After bag of words encoding{bag_of_words_scratch(document)}")
print(f"After tfidf encoding{tfidf_scratch(document)}")


After label encoding[48, 49, 36, 3, 4, 5, 6, 7, 8, 9, 10, 16, 12, 13, 14, 49, 16, 74, 18, 83, 20, 43, 22, 36, 87, 25, 26, 27, 81, 29, 30, 31, 32, 33, 34, 35, 36, 37, 82, 83, 40, 41, 42, 43, 44]
After one-hot encoding[0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1. 0. 1. 0. 1. 0. 1. 0.
 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0.
 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0.]
After bag of words encoding[[1. 2. 0. 1. 0. 0. 1. 1. 1. 1. 0. 1. 0. 3. 1. 1. 0. 0. 1. 1. 0. 0. 1. 0.
  1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 1. 1. 1. 1. 1. 1.
  0. 1. 2. 0. 0. 0. 2. 0. 1. 1. 0. 0. 1. 1. 1. 3. 0. 0. 1. 1. 0. 0. 1. 1.
  1. 1. 0. 0. 1. 1. 1. 0. 0. 1. 0. 1. 0. 2. 0. 2. 1. 0. 1. 1. 0. 1. 1. 1.]
 [0. 0. 1. 0. 1. 2. 0. 0. 1. 0. 1. 0. 1. 5. 0. 0. 1. 1. 1. 0. 1. 1. 0. 1.
  0. 0. 1. 0. 1. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 0. 1. 0. 1. 1. 0. 1. 0.
  1. 1. 0. 1. 1. 3. 2. 1. 1. 0.