#### TF-IDF

$n_{\mathbb{d}\mathbb{w}}$ - term frequency - the number of times that word/term $\mathbb{w}$ occurs in document $\mathbb{d}$ 
divided by the number of all words in the document;<br>
$N_{\mathbb{w}}$ - the number of documents containing the term $\mathbb{w}$;<br>
$N$ - total number of documents; <br><br>

$p(\mathbb{w}, \mathbb{d}) = N_{\mathbb{w}} / N$ - probability of appearance of the term $\mathbb{w}$ in any document $\mathbb{d}$ 
<br>
$P(\mathbb{w}, \mathbb{d}, n_{\mathbb{d}\mathbb{w}}) = (N_{\mathbb{w}} / N)^{n_{\mathbb{d}\mathbb{w}}}$ - probability that the given term $\mathbb{w}$ appears $n_{\mathbb{d}\mathbb{w}}$ times in document $\mathbb{d}$<br><br>

$-\log{P(\mathbb{w}, \mathbb{d}, n_{\mathbb{d}\mathbb{w}})} = n_{\mathbb{d}\mathbb{w}} \cdot \log{(N / N_{\mathbb{w}})} = TF(\mathbb{w}, \mathbb{d}) \cdot IDF(\mathbb{w})$<br><br>

$TF(\mathbb{w}, \mathbb{d}) = n_{\mathbb{d}\mathbb{w}}$ - term frequency;<br>
$IDF(\mathbb{w}) = \log{(N /N_{\mathbb{w}})}$ - inverse document frequency;

### Some parameters of TfidfVectorizer

##### input : string {‘filename’, ‘file’, ‘content’}
##### lowercase : boolean, default True
##### preprocessor : callable or None (default)
##### tokenizer : callable or None (default)
##### stop_words : string {‘english’}, list, or None (default)
##### ngram_range : tuple (min_n, max_n)
##### max_df : float in range [0.0, 1.0] or int, default=1.0
##### min_df : float in range [0.0, 1.0] or int, default=1
##### max_features : int or None, default=None


In [1]:
from sklearn.datasets import fetch_20newsgroups

# Restrict the 20 Newsgroups corpus to four topics and load its training split.
# The first call downloads the dataset (see the log output below).
categories = ['alt.atheism', 'talk.religion.misc',
              'comp.graphics', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train',
                                      categories=categories)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [None]:
# Peek at the first five raw documents of the training split.
newsgroups_train.data[:5]

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Baseline: TF-IDF with all default parameters.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)
# (number of documents, number of features)
vectors.shape

(2034, 34118)

In [4]:
# Keep the original casing: differently-cased spellings become separate features,
# so the feature count grows compared with the default (lowercase=True) run.
vectorizer = TfidfVectorizer(lowercase=False)
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape

(2034, 42307)

In [5]:
# Show the first ten feature names of the fitted vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() whenever the installed version has it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Convert to plain str so the display is a simple list on every version.
[str(name) for name in feature_names[:10]]

['00',
 '000',
 '0000',
 '00000',
 '000000',
 '000005102000',
 '000021',
 '000062David42',
 '0000VEC',
 '0001']

In [6]:
# A float min_df is a share of documents: only terms present in a large
# fraction of the corpus survive, which leaves just a handful of features.
vectorizer = TfidfVectorizer(min_df=0.8)
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape

(2034, 9)

In [7]:
# List every feature that survived the min_df filter.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() whenever the installed version has it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Convert to plain str so the display is a simple list on every version.
[str(name) for name in feature_names]

['and', 'from', 'in', 'lines', 'of', 'organization', 'subject', 'the', 'to']

In [8]:
# Drop both very rare terms (min_df) and very common terms (max_df).
vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.8)
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape

(2034, 2391)

In [9]:
# Use n-grams with n from 1 to 3 (ngram_range=(min_n, max_n)) together with
# document-frequency bounds to keep the feature space small.
vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=0.03, max_df=0.9)
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape

(2034, 1236)

In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used below: tokenizer models, stop-word lists and
# the WordNet data backing the lemmatizer.
# NOTE(review): newer NLTK releases may additionally require
# nltk.download('punkt_tab') for word_tokenize - confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# A set gives O(1) membership tests when filtering tokens.
stopWords = set(stopwords.words('english'))
wnl = nltk.WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [11]:
def preproc1(text):
    """Normalize a raw document into a space-joined string of lemmas.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens found
    in ``stopWords`` are dropped and the remaining ones are lemmatized with
    ``wnl``. Punctuation tokens are kept (they are not in the stop-word set).
    """
    lemmas = []
    for token in word_tokenize(text.lower()):
        if token in stopWords:
            continue
        lemmas.append(wnl.lemmatize(token))
    return ' '.join(lemmas)

# Sanity check of preproc1 on a single sentence.
st = "The saddest aspect of life right now is that science gathers knowledge faster than society gathers wisdom."
preproc1(st)

'saddest aspect life right science gather knowledge faster society gather wisdom .'

In [12]:
# Plug the custom NLTK-based preprocessing into the vectorizer.
vectorizer = TfidfVectorizer(preprocessor=preproc1)
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape

(2034, 31719)

In [13]:
# Final configuration used for the classifiers below: custom preprocessing
# plus document-frequency bounds. The commented-out line is an alternative
# setup (n-grams with a hard cap on the number of features).
#vectorizer = TfidfVectorizer(preprocessor=preproc1, ngram_range=(1, 3), max_df=0.5, max_features=1000)
vectorizer = TfidfVectorizer(preprocessor=preproc1, min_df = 0.001, max_df=0.9)
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape

(2034, 11371)

In [None]:
# Sample every 100th feature name to get a feel for the vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() whenever the installed version has it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Convert to plain str so the display is a simple list on every version.
[str(name) for name in feature_names[::100]]

In [15]:
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.linear_model import SGDClassifier

In [16]:
# Densify the sparse TF-IDF matrix and hold out 20% of the documents for testing.
# toarray() yields a plain ndarray; todense() would return np.matrix, which
# recent scikit-learn estimators refuse as input.
dense_vectors = vectors.toarray()
X_train, X_test, y_train, y_test = train_test_split(
    dense_vectors, newsgroups_train.target, test_size=0.2, random_state=0)
y_train.shape, y_test.shape

((1627,), (407,))

In [17]:
from sklearn.metrics import accuracy_score

# Train an SVM with default parameters on the training split and report
# accuracy on the held-out split.
svc = svm.SVC()
svc.fit(X_train, y_train)
accuracy_score(y_test, svc.predict(X_test))

0.9606879606879607

In [18]:
# Same evaluation with a default SGDClassifier. No random_state is set,
# so the score may differ slightly between runs.
sgd = SGDClassifier()
sgd.fit(X_train, y_train)
accuracy_score(y_test, sgd.predict(X_test))

0.9582309582309583

### Custom TF-IDF

In [42]:
# just for practicing with unigrams 
import math
from collections import Counter

class MyTfIdf():
    """Minimal unigram TF-IDF vectorizer over whitespace-separated tokens.

    For a term ``w`` in document ``d`` the weight is
    ``(count of w in d / number of tokens in d) * log(N / N_w)``, where ``N``
    is the number of fitted documents and ``N_w`` the number of fitted
    documents containing ``w``.

    Parameters:
        preprocessor: optional callable mapping a raw document (str) to a
            string; the result is split on whitespace. ``None`` splits the
            raw document directly.
        min_df: exclusive lower bound on the share of documents a term must
            appear in to be kept.
        max_df: exclusive upper bound on that share.

    Note that both bounds are strict here (a term whose document share equals
    a bound is dropped), unlike scikit-learn's inclusive thresholds.

    Attributes:
        vocab: sorted list of every term seen during ``fit`` (before the
            document-frequency filter).
        idf: mapping of kept term -> inverse document frequency.
        term2id: mapping of kept term -> column index in the output vectors.
    """

    def __init__(self, preprocessor=None, min_df = 0.001, max_df = 0.9):
        self.preproc_func = preprocessor
        self.vocab = []
        self.idf = {}
        self.term2id = {}
        self.min_df = min_df
        self.max_df = max_df

    def _tokenize(self, data):
        """Apply the optional preprocessor and split every document on whitespace."""
        if self.preproc_func is not None:
            data = map(self.preproc_func, data)
        return [doc.split() for doc in data]

    def _fit_tokens(self, docs):
        """Learn vocabulary, IDF weights and column indices from tokenized docs."""
        # Document frequency: each document contributes at most 1 per term.
        doc_freq = Counter()
        for doc in docs:
            doc_freq.update(set(doc))

        self.vocab = sorted(doc_freq)
        n_docs = len(docs)

        # Rebuild the learned state from scratch so that refitting on new data
        # never leaves stale terms (and shifted column indices) behind.
        self.idf = {}
        for term in self.vocab:
            share = doc_freq[term] / n_docs
            if self.min_df < share < self.max_df:
                self.idf[term] = math.log(1 / share)
        # Columns follow the sorted order of the kept terms.
        self.term2id = {term: col for col, term in enumerate(self.idf)}

    def _transform_tokens(self, docs):
        """Turn tokenized docs into dense TF-IDF rows (lists of numbers)."""
        width = len(self.term2id)
        rows = []
        for doc in docs:
            row = [0] * width
            # An empty document has an empty Counter, so len(doc) is never
            # used as a divisor when it is zero.
            for term, count in Counter(doc).items():
                col = self.term2id.get(term)
                if col is None:
                    continue  # term unseen during fit or filtered out
                tf = count / len(doc)
                row[col] = tf * self.idf[term]
            rows.append(row)
        return rows

    def fit(self, data):
        """Learn the vocabulary and IDF weights from an iterable of documents.

        Returns ``self`` so calls can be chained.
        """
        self._fit_tokens(self._tokenize(data))
        return self

    def transform(self, data):
        """Vectorize documents with the already fitted vocabulary.

        Returns a list with one list of weights per document; terms not kept
        during ``fit`` are ignored.
        """
        return self._transform_tokens(self._tokenize(data))

    def fit_transform(self, data):
        """Fit on ``data`` and return its TF-IDF vectors.

        The documents are preprocessed only once and reused for both steps.
        """
        docs = self._tokenize(data)
        self._fit_tokens(docs)
        return self._transform_tokens(docs)

In [None]:
# Vectorize the corpus with the hand-written TF-IDF, reusing the NLTK preprocessing.
my_vectorizer = MyTfIdf(preprocessor=preproc1)
my_vectors = my_vectorizer.fit_transform(newsgroups_train.data)
# Show only the non-zero weights of the first document.
[weight for weight in my_vectors[0] if weight != 0]

In [53]:
# Same 80/20 split (same random_state) as before, now on the custom vectors;
# this overwrites the X_*/y_* variables from the scikit-learn experiment.
X_train, X_test, y_train, y_test= train_test_split(my_vectors, newsgroups_train.target, test_size=0.2, random_state=0)
y_train.shape, y_test.shape

((1627,), (407,))

In [54]:
from sklearn.metrics import accuracy_score

# Default SVM trained and scored on the custom TF-IDF features.
svc = svm.SVC()
svc.fit(X_train, y_train)
accuracy_score(y_test, svc.predict(X_test))

0.9262899262899262

In [55]:
# Default SGDClassifier trained and scored on the custom TF-IDF features.
# No random_state is set, so the score may differ slightly between runs.
sgd = SGDClassifier()
sgd.fit(X_train, y_train)
accuracy_score(y_test, sgd.predict(X_test))

0.9361179361179361