In [68]:
import pandas as pd
import numpy as np
import numpy.linalg as linalg
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import unidecode
from typing import List
from word2number import w2n
import re
# nltk.download('punkt')
# nltk.download('wordnet')


In [6]:
corpus = ['data science is one of the most important fields of science',
          'this is one of the best data science courses',
          'data scientists analyze data' ]


In [7]:
text = corpus[0]
text

'data science is one of the most important fields of science'

In [17]:
text = "Hi SiÃ©, give me five hundred twenty five thousand six hundred"
print(w2n.word_to_num(text))

525600


In [8]:
text

'data science is one of the most important fields of science'

In [13]:
word_tokenize(text=text)

['data',
 'science',
 'is',
 'one',
 'of',
 'the',
 'most',
 'important',
 'fields',
 'of',
 'science']

In [6]:
tokens = text.lower().split()
tokens

['data',
 'science',
 'is',
 'one',
 'of',
 'the',
 'most',
 'important',
 'fields',
 'of',
 'science']

In [8]:
import string

test_str = 'Gfg, is best: for ! Geeks ;'

test_str = test_str.translate(str.maketrans('', '',
                                    string.punctuation))
print(test_str)

Gfg is best for  Geeks 


In [34]:
lem = WordNetLemmatizer()
# lem.lemmatize("getting into", pos=)

'getting into'

In [35]:
class TextNormalizerPipeline:

    def __init__(self, 
                lemmatize = True, 
                stop_words_use = True, 
                language = 'english',
                punctuation_removal = True,
                lowercase = True,
                unidecode_use = True,
                tokenize = True):
        
        self.lemmatizer = WordNetLemmatizer() if lemmatize else None
        self.punctuation_table = str.maketrans('','',string.punctuation) if punctuation_removal else None
        self.tokenizer = word_tokenize if tokenize else None
        self.stop_words = set(stopwords.words(language)) if stop_words_use else None
        self.unicode = unidecode.unidecode  if unidecode_use else None

        self.stop_words_use = stop_words_use
        self.lowercase = lowercase
        self.punctuation_removal = punctuation_removal
        self.tokenize = tokenize
        self.lemmatize = lemmatize
        self.unicode_use = unidecode
    
    def normalize_text(self,doc : str):
        
        text = re.sub(' +',' ', doc) # deleting extra white space

        if self.unicode_use :
            text = self.unicode(text)
        if self.lowercase :
            text = text.lower()
        if self.punctuation_removal:
            text = text.translate(self.punctuation_table)

        text = text.strip()

        words = self.tokenizer(text) if self.tokenize else text.split()
        
        if self.stop_words_use :
            words = [word for word in words if word not in self.stop_words]
        
        if self.lemmatize :
            words = [self.lemmatizer.lemmatize(word) for word in words]
      
        return " ".join(words)

In [102]:
class CustomTfidfVectorizer:


    def __init__(self):
        self.features: List[str] = []

    def tf(self,term: str, doc: str) -> float:
    
        tokens = word_tokenize(doc)
        return tokens.count(term)/len(tokens)

    def idf(self,term: str, docs: List[str]) -> float :
        
        frequence = 0
        frequence = sum([1 for doc in docs if term in word_tokenize(doc)])

        return np.log10((1+len(docs)) / (1+ frequence)) +1 

    def vocabulary(self,docs : List[str]) -> List[str]:
        set_words = set()
        for doc in docs:
            set_words = set_words.union(set(word_tokenize(doc)))
        return list(set_words)
    
    def get_feature_names_out(self):
        return self.features
    
    def tf_idf(self,docs: List[str]) -> pd.DataFrame:
        
        self.features = self.vocabulary(docs=docs)
        lines= len(docs)
        cols = len(self.features)

        df = pd.DataFrame(0, index=range(len(docs)), columns=self.features, dtype=float)
        for term in self.features: 
            for index,doc in enumerate(docs):
                df.at[index, term] = self.tf(term=term, doc=doc) * self.idf(term= term, docs=docs)
    
        return df
    
    def fit_tansform(self, docs: List[str]):
        
        return self.tf_idf(docs= docs)
    


In [103]:
tf_idf = CustomTfidfVectorizer()
dat = tf_idf.fit_tansform(docs= corpus)
dat

Unnamed: 0,the,analyze,important,courses,of,one,science,scientists,this,best,data,is,fields,most
0,0.102267,0.0,0.118275,0.0,0.204534,0.102267,0.204534,0.0,0.0,0.0,0.090909,0.102267,0.118275,0.118275
1,0.124993,0.0,0.0,0.144559,0.124993,0.124993,0.124993,0.0,0.144559,0.144559,0.111111,0.124993,0.0,0.0
2,0.0,0.325257,0.0,0.0,0.0,0.0,0.0,0.325257,0.0,0.0,0.5,0.0,0.0,0.0


In [54]:
corpus

['data science is one of the most important fields of science',
 'this is one of the best data science courses',
 'data scientists analyze data']

In [104]:
normalizer = TextNormalizerPipeline()

preprocessed_corpus =[normalizer.normalize_text(doc) for doc in corpus]
preprocessed_corpus

['data science one important field science',
 'one best data science course',
 'data scientist analyze data']

In [105]:
tf_idf1 = CustomTfidfVectorizer()
dat = tf_idf1.fit_tansform(docs= preprocessed_corpus)
dat

Unnamed: 0,analyze,important,field,one,science,course,best,data,scientist
0,0.0,0.216838,0.216838,0.18749,0.37498,0.0,0.0,0.166667,0.0
1,0.0,0.0,0.0,0.224988,0.224988,0.260206,0.260206,0.2,0.0
2,0.325257,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.325257
