In [31]:
from typing import Callable,Union,List,Tuple,Literal
import numpy as np
import pandas as pd
import os
import re
import math

import nltk
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [32]:
STOP_WORDS = set(stopwords.words('english'))

$TF(t) = \frac{\text{Number of times term t appears in a document}}{\text{Total number of terms in the document}}$



$IDF(t) = \log_{e}\frac{\text{Total number of documents}}
{\text{Number of documents with term t in it}+1} + 1$


In [81]:
class TFIDF:
    def __init__(self,
                 max_features:int=None,
                 stop_words:Union[str,List[str]]='english',
                 analyze:Literal["word", "char"] = 'word',
                 ngram_range:Tuple[int,int]=(1,1),
                 lowercase:bool=True,
                 binary:bool=False
                 ):
        
        """

        Args:
            
            max_features (int): If not None, build a vocabulary that only consider the top
                               `max_features` ordered by term frequency across the corpus.
                                Otherwise, all features are used.
                                Defaults to None.
                                
            stop_words (List[str]): list of words which will be considered as stop word and 
                                    will be removed from vocabulary.
                                    If `english`, a built-in stop word list for English is used from NLTK.
                                    Defaults to english.
            
            ngram_range (Tuple, optional): The lower and upper boundary of the range of n-values for different word
                                           n-grams or char n-grams to be extracted.
                                           All values of n such such that min_n <= n <= max_n will be used.
                                           For example an ngram_range of (1, 1) means only unigrams, 
                                           (1, 2) means unigrams and bigrams, and (2, 2) means only bigrams.
                                           Defaults to (1,1).
            
            analyzer (Literal["word", "char"]) : Whether the feature should be made of word n-gram or character n-grams.
                                                 Defaults to `word`.
                                                
            
            lowercase (bool, optional): Convert all characters to lowercase before tokenizing.
                                        Defaults to True.
            
            binary (bool, optional): if True calculate binary bagofword else count vectorizer.
                                        Defaults to False.
        """  
        self.ngram_range = ngram_range
        self.lowercase = lowercase
        self.stop_words = stop_words
        self.analyze = analyze 
        self.max_features = max_features
        self.binary = binary
        
        max_feature_validation = (isinstance(self.max_features,int) and self.max_features > 0) or self.max_features is None
        stop_words_validation = isinstance(stop_words,list) or stop_words=='english'
        ngram_range_validation = isinstance(self.ngram_range,tuple) or isinstance(ngram_range,list)
        analyze_validation = self.analyze in ['word','char']
        
        if not max_feature_validation: raise ValueError(f"max_features must be None or int and non zero, got {self.max_features}")
        if not analyze_validation: raise ValueError(f"analyze must word or char, got {self.analyze}")
        if not stop_words_validation: raise ValueError(f"stop_words must be `english` of list of words, got {self.stop_words}")
        if not ngram_range_validation: raise ValueError(f"ngram_range must be tupe of list of intger like (1,2), got {ngram_range}")
        if not ngram_range[0] <= ngram_range[1]: assert ValueError(f"ngram_range lower boundary must be lessthan or equal to upper boundary, got {self.ngram_range}")
                    
        if self.stop_words=='english':
            self.stop_words = STOP_WORDS
            
    def _extract_ngrams(self,tokens):
        
        # https://www.projectpro.io/recipes/find-ngrams-from-text
        for num in range(self.ngram_range[0] ,self.ngram_range[1]+1):
            
            # we already have onegram which is tokens it self
            # hence start with 2
            if num > 1:
                n_grams = ngrams(tokens, num)
                n_grams = [ ' '.join(grams) for grams in n_grams]
                tokens.extend(n_grams)
        
        return tokens
    
    def _generate_tokens(self,corpus:str):
        
        if self.lowercase:
            corpus = corpus.lower()

        if self.analyze == "word":
            # convert text/input into word tokens
            tokens = word_tokenize(corpus)
            
            tokens = self._extract_ngrams(tokens)
            
        else:
            
            input =  re.sub(' +','',corpus)
            
            input = input.strip()
            
            tokens = list(input)
            
            tokens = self._extract_ngrams(tokens)

        # remove stop words from tokens
        final_tokens = [w for w in tokens if not w.lower() in self.stop_words]
        
        return final_tokens
    
    def _verify_and_return_input(self,input):
        
        if isinstance(input,str):
            if not os.path.exists(input): raise FileNotFoundError(f"{input} was not found or is a directory")
            with open(input,'r') as f:
                input = f.read().splitlines()

        else:
            if not (isinstance(input,list) or isinstance(input,tuple)) : raise ValueError(f"input must be list or tuple or filepath, got {input}")
        
        return input
                
    def fit(self, input:Union[str,List[str]]):
        """_summary_

        Args:
            input (Union[str,List[str]]): input can be filepath or list of string

        """
        
        self.vocab_word_idf = pd.DataFrame()
        
        input = self._verify_and_return_input(input)
        
        total_document = len(input) 
        
        joined_input = " ".join(input)
                        
        tokens = self._generate_tokens(joined_input)
        
        for i,token in enumerate(set(tokens)):
            
            num_of_document_with_token = 1
            
            for doc in input:
                if token in doc:
                    num_of_document_with_token += 1
            
                    
            self.vocab_word_idf.loc[i,'word'] = token
            self.vocab_word_idf.loc[i,'idf'] = 1 + math.log( (total_document) / (num_of_document_with_token))
            
        
        self.vocab_word_idf  =  self.vocab_word_idf.sort_values(by=['idf'], ascending=False).reset_index(drop = True)
        
        if self.max_features is not None:
            self.vocab_word_idf = self.vocab_word_idf.loc[:self.max_features-1,:] 
            
            
    def transform(self,input:Union[str,List[str]])->np.array:
        """will calculate and return bag of word vector 

        Args:
            input (Union[str,List[str]]): input can be filepath or list of string

        Returns:
            np.array: bagofword representation
        """        """"""

        input = self._verify_and_return_input(input)

        tfidf_arr = np.zeros((len(input),self.vocab_word_idf.shape[0]))
        
        for i,text in enumerate(input):
            
            current_tokens = self._generate_tokens(text)

            num_current_tokens = len(word_tokenize(text))

            for ct in current_tokens:
                word_detail = self.vocab_word_idf.loc[self.vocab_word_idf['word']==ct]
                
                if word_detail.shape[0] > 0:
                    idf_of_word = word_detail['idf'].tolist()[0]
                    index_of_word = word_detail.index.tolist()[0]
                    
                    tf = current_tokens.count(ct) / num_current_tokens
                    tfidf_arr[i,index_of_word] =  idf_of_word * tf
        
        return tfidf_arr
                                

In [82]:
corpus = [
     "The cat in the hat",
     "The cat ate the mouse",
     "The mouse ran away from the cat"
]

In [83]:
# test based on list of string
tfidf_vectorizer = TFIDF(max_features=10,ngram_range=(1,1),analyze='word')
tfidf_vectorizer.fit(corpus)
tfidf_arr = tfidf_vectorizer.transform(corpus)

In [85]:
print(tfidf_arr)

[[0.         0.28109302 0.         0.         0.         0.14246359]
 [0.28109302 0.         0.         0.         0.2        0.14246359]
 [0.         0.         0.20078073 0.20078073 0.14285714 0.1017597 ]]


In [64]:
tfidf_vectorizer.vocab_word_idf

Unnamed: 0,word,idf
0,ate,1.405465
1,hat,1.405465
2,ran,1.405465
3,away,1.405465
4,mouse,1.0
5,cat,0.712318
