In [9]:
from typing import Callable,Union,List,Tuple,Literal
import numpy as np
import pandas as pd
import os
import re

import nltk
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
STOP_WORDS = set(stopwords.words('english'))

In [11]:
class BagOfWords:
    def __init__(self,
                 max_features:int=None,
                 stop_words:Union[str,List[str]]='english',
                 analyze:Literal["word", "char"] = 'word',
                 ngram_range:Tuple[int,int]=(1,1),
                 lowercase:bool=True,
                 binary:bool=False
                 ):
        
        """

        Args:
            
            max_features (int): If not None, build a vocabulary that only consider the top
                               `max_features` ordered by term frequency across the corpus.
                                Otherwise, all features are used.
                                Defaults to None.
                                
            stop_words (List[str]): list of words which will be considered as stop word and 
                                    will be removed from vocabulary.
                                    If `english`, a built-in stop word list for English is used from NLTK.
                                    Defaults to english.
            
            ngram_range (Tuple, optional): The lower and upper boundary of the range of n-values for different word
                                           n-grams or char n-grams to be extracted.
                                           All values of n such such that min_n <= n <= max_n will be used.
                                           For example an ngram_range of (1, 1) means only unigrams, 
                                           (1, 2) means unigrams and bigrams, and (2, 2) means only bigrams.
                                           Defaults to (1,1).
            
            analyzer (Literal["word", "char"]) : Whether the feature should be made of word n-gram or character n-grams.
                                                 Defaults to `word`.
                                                
            
            lowercase (bool, optional): Convert all characters to lowercase before tokenizing.
                                        Defaults to True.
            
            binary (bool, optional): if True calculate binary bagofword else count vectorizer.
                                        Defaults to False.
        """  
        self.ngram_range = ngram_range
        self.lowercase = lowercase
        self.stop_words = stop_words
        self.analyze = analyze 
        self.max_features = max_features
        self.binary = binary
        
        max_feature_validation = (isinstance(self.max_features,int) and self.max_features > 0) or self.max_features is None
        stop_words_validation = isinstance(stop_words,list) or stop_words=='english'
        ngram_range_validation = isinstance(self.ngram_range,tuple) or isinstance(ngram_range,list)
        analyze_validation = self.analyze in ['word','char']
        
        if not max_feature_validation: raise ValueError(f"max_features must be None or int and non zero, got {self.max_features}")
        if not analyze_validation: raise ValueError(f"analyze must word or char, got {self.analyze}")
        if not stop_words_validation: raise ValueError(f"stop_words must be `english` of list of words, got {self.stop_words}")
        if not ngram_range_validation: raise ValueError(f"ngram_range must be tupe of list of intger like (1,2), got {ngram_range}")
        if not ngram_range[0] <= ngram_range[1]: assert ValueError(f"ngram_range lower boundary must be lessthan or equal to upper boundary, got {self.ngram_range}")
                    
        if self.stop_words=='english':
            self.stop_words = STOP_WORDS
            
    def _extract_ngrams(self,tokens):
        
        # https://www.projectpro.io/recipes/find-ngrams-from-text
        for num in range(self.ngram_range[0] ,self.ngram_range[1]+1):
            
            # we already have onegram which is tokens it self
            # hence start with 2
            if num > 1:
                n_grams = ngrams(tokens, num)
                n_grams = [ ' '.join(grams) for grams in n_grams]
                tokens.extend(n_grams)
        
        return tokens
    
    def _generate_tokens(self,corpus:str):
        
        if self.lowercase:
            corpus = corpus.lower()

        if self.analyze == "word":
            # convert text/input into word tokens
            tokens = word_tokenize(corpus)
            
            tokens = self._extract_ngrams(tokens)
            
        else:
            
            input =  re.sub(' +','',corpus)
            
            input = input.strip()
            
            tokens = list(input)
            
            tokens = self._extract_ngrams(tokens)
            
        # remove stop words from tokens
        final_tokens = [w for w in tokens if not w.lower() in self.stop_words]
        
        return final_tokens
    
    def _verify_and_return_input(self,input):
        
        if isinstance(input,str):
            if not os.path.exists(input): raise FileNotFoundError(f"{input} was not found or is a directory")
            with open(input,'r') as f:
                input = f.read().splitlines()

        else:
            if not (isinstance(input,list) or isinstance(input,tuple)) : raise ValueError(f"input must be list or tuple or filepath, got {input}")
        
        return input
                
    def fit(self, input:Union[str,List[str]]):
        """_summary_

        Args:
            input (Union[str,List[str]]): input can be filepath or list of string

        """
        
        self.vocab_word_count = pd.DataFrame()
        
        input = self._verify_and_return_input(input)
        
        input = " ".join(input)
                        
        tokens = self._generate_tokens(input)
        
        for i,token in enumerate(set(tokens)):
            self.vocab_word_count.loc[i,'word'] = token
            self.vocab_word_count.loc[i,'count'] = tokens.count(token)
        
        self.vocab_word_count  =  self.vocab_word_count.sort_values(by=['count'], ascending=False).reset_index(drop = True)
        
        if self.max_features is not None:
            self.vocab_word_count = self.vocab_word_count.loc[:self.max_features-1,:] 
            
            
    def transform(self,input:Union[str,List[str]])->np.array:
        """will calculate and return bag of word vector 

        Args:
            input (Union[str,List[str]]): input can be filepath or list of string

        Returns:
            np.array: bagofword representation
        """        """"""

        input = self._verify_and_return_input(input)
        
        bag_of_word_arr = np.zeros((len(input),
                                         self.vocab_word_count.shape[0])
                                        )
        for i,text in enumerate(input):
            current_tokens = self._generate_tokens(text)
            for ct in current_tokens:
                index_of_word = self.vocab_word_count.index[self.vocab_word_count['word']==ct].tolist()
                if len(index_of_word) > 0:
                    bag_of_word_arr[i,index_of_word[0]] =  1 if self.binary else current_tokens.count(ct)
        
        return bag_of_word_arr
                                

In [12]:
corpus = [
     'This is the first document.',
     'This document is the second document.',
     'And this is the third one.',
     'Is this the first document?'
    ]

In [13]:
# test based on list of string
bow_vectorizer = BagOfWords(max_features=10,ngram_range=(1,1),analyze='word')
bow_vectorizer.fit(corpus)
bow_arr = bow_vectorizer.transform(corpus)

In [14]:
bow_arr

array([[1., 1., 1., 0., 0., 0., 0.],
       [2., 1., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 1., 1.],
       [1., 0., 1., 0., 1., 0., 0.]])

In [15]:
# test based on file
bow_vectorizer = BagOfWords(max_features=10,ngram_range=(1,1),analyze='word')
bow_vectorizer.fit('corpus.txt')
bow_arr = bow_vectorizer.transform(corpus)

In [16]:
bow_arr

array([[1., 1., 1., 0., 0., 0., 0.],
       [2., 1., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 1., 1.],
       [1., 0., 1., 0., 1., 0., 0.]])