In [1]:
!pip install nltk
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from math import *
import math

# Fetch the NLTK resources used below: the Punkt sentence-tokenizer models and
# the stop-word lists. Recent NLTK releases (3.9 and later) load Punkt from the
# separate "punkt_tab" package, so it is requested as well; without it
# sent_tokenize raises LookupError on a fresh environment.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Sample English text; every sentence is treated as one "document" below.
text = "Hello Mr. Smith, how are you doing today? The weather is great, and Python is awesome. The sky is pinkish-blue. You shouldn't eat cardboard."

sentences = sent_tokenize(text)  # split the text into sentences with NLTK
total_documents = len(sentences)  # number of sentences, passed to the IDF step

def _create_frequency_matrix(sentences,stopWords):
    frequency_matrix = {}
    #ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            #word = ps.stem(word)
            if word in stopWords:
                continue

            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix


def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix


def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table



def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix


def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix



[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# English text

### Word frequency per sentence

In [2]:
# Build the English stop-word set, then count the remaining tokens of each
# sentence (rows are keyed by the first 15 characters of the sentence).
stopWords = set(stopwords.words("english"))
freq=_create_frequency_matrix(sentences,stopWords)
print(freq)

{'Hello Mr. Smith': {'hello': 1, 'mr.': 1, 'smith': 1, ',': 1, 'today': 1, '?': 1}, 'The weather is ': {'weather': 1, 'great': 1, ',': 1, 'python': 1, 'awesome': 1, '.': 1}, 'The sky is pink': {'sky': 1, 'pinkish-blue': 1, '.': 1}, "You shouldn't e": {"n't": 1, 'eat': 1, 'cardboard': 1, '.': 1}}


### Term frequency (TF)

In [3]:
# Convert the per-sentence counts into term-frequency values.
tf=_create_tf_matrix(freq)
print(tf)

{'Hello Mr. Smith': {'hello': 0.16666666666666666, 'mr.': 0.16666666666666666, 'smith': 0.16666666666666666, ',': 0.16666666666666666, 'today': 0.16666666666666666, '?': 0.16666666666666666}, 'The weather is ': {'weather': 0.16666666666666666, 'great': 0.16666666666666666, ',': 0.16666666666666666, 'python': 0.16666666666666666, 'awesome': 0.16666666666666666, '.': 0.16666666666666666}, 'The sky is pink': {'sky': 0.3333333333333333, 'pinkish-blue': 0.3333333333333333, '.': 0.3333333333333333}, "You shouldn't e": {"n't": 0.25, 'eat': 0.25, 'cardboard': 0.25, '.': 0.25}}


### How many sentences contain each word

In [4]:
# For every token, count the number of sentences in which it appears.
count_doc_per_words=_create_documents_per_words(freq)
print(count_doc_per_words)

{'hello': 1, 'mr.': 1, 'smith': 1, ',': 2, 'today': 1, '?': 1, 'weather': 1, 'great': 1, 'python': 1, 'awesome': 1, '.': 3, 'sky': 1, 'pinkish-blue': 1, "n't": 1, 'eat': 1, 'cardboard': 1}


### Inverse document frequency (IDF)

In [5]:
# IDF per token: log10(total_documents / number of sentences containing it).
idf=_create_idf_matrix(freq, count_doc_per_words, total_documents)
print(idf)

{'Hello Mr. Smith': {'hello': 0.6020599913279624, 'mr.': 0.6020599913279624, 'smith': 0.6020599913279624, ',': 0.3010299956639812, 'today': 0.6020599913279624, '?': 0.6020599913279624}, 'The weather is ': {'weather': 0.6020599913279624, 'great': 0.6020599913279624, ',': 0.3010299956639812, 'python': 0.6020599913279624, 'awesome': 0.6020599913279624, '.': 0.12493873660829992}, 'The sky is pink': {'sky': 0.6020599913279624, 'pinkish-blue': 0.6020599913279624, '.': 0.12493873660829992}, "You shouldn't e": {"n't": 0.6020599913279624, 'eat': 0.6020599913279624, 'cardboard': 0.6020599913279624, '.': 0.12493873660829992}}


### TF-IDF

In [6]:
# Multiply each token's TF value by its IDF value, sentence by sentence.
tf_idf= _create_tf_idf_matrix(tf, idf)
print(tf_idf)

{'Hello Mr. Smith': {'hello': 0.10034333188799373, 'mr.': 0.10034333188799373, 'smith': 0.10034333188799373, ',': 0.050171665943996864, 'today': 0.10034333188799373, '?': 0.10034333188799373}, 'The weather is ': {'weather': 0.10034333188799373, 'great': 0.10034333188799373, ',': 0.050171665943996864, 'python': 0.10034333188799373, 'awesome': 0.10034333188799373, '.': 0.020823122768049984}, 'The sky is pink': {'sky': 0.20068666377598746, 'pinkish-blue': 0.20068666377598746, '.': 0.04164624553609997}, "You shouldn't e": {"n't": 0.1505149978319906, 'eat': 0.1505149978319906, 'cardboard': 0.1505149978319906, '.': 0.03123468415207498}}


# French text

In [7]:
# French sample text. sent_tokenize defaults to language="english", so the
# French Punkt model is requested explicitly to match the French stop-word
# list used in the next cell.
# Note: `sentences` and `total_documents` are rebound here and replace the
# values computed for the English text.
text_fr="bonjout monsieur lucas .j'espere que tu vas bien . aujourd'hui je vais vous presenter quelque chose tres importante."
sentences = sent_tokenize(text_fr, language="french") # NLTK function
total_documents = len(sentences)

### Word frequency per sentence

In [8]:
# Same counting step for the French sentences, now filtering with the French
# stop-word list. `stopWords` and `freq` are rebound to the French results.
stopWords = set(stopwords.words("french"))
freq=_create_frequency_matrix(sentences,stopWords)
print(freq)

{'bonjout monsieu': {'bonjout': 1, 'monsieur': 1, 'lucas': 1, ".j'espere": 1, 'vas': 1, 'bien': 1, '.': 1}, "aujourd'hui je ": {"aujourd'hui": 1, 'vais': 1, 'presenter': 1, 'quelque': 1, 'chose': 1, 'tres': 1, 'importante': 1, '.': 1}}


### Term frequency (TF)

In [9]:
# Convert the French per-sentence counts into term-frequency values.
tf=_create_tf_matrix(freq)
print(tf)

{'bonjout monsieu': {'bonjout': 0.14285714285714285, 'monsieur': 0.14285714285714285, 'lucas': 0.14285714285714285, ".j'espere": 0.14285714285714285, 'vas': 0.14285714285714285, 'bien': 0.14285714285714285, '.': 0.14285714285714285}, "aujourd'hui je ": {"aujourd'hui": 0.125, 'vais': 0.125, 'presenter': 0.125, 'quelque': 0.125, 'chose': 0.125, 'tres': 0.125, 'importante': 0.125, '.': 0.125}}


### How many sentences contain each word

In [10]:
# For every French token, count the number of sentences in which it appears.
count_doc_per_words=_create_documents_per_words(freq)
print(count_doc_per_words)

{'bonjout': 1, 'monsieur': 1, 'lucas': 1, ".j'espere": 1, 'vas': 1, 'bien': 1, '.': 2, "aujourd'hui": 1, 'vais': 1, 'presenter': 1, 'quelque': 1, 'chose': 1, 'tres': 1, 'importante': 1}


### Inverse document frequency (IDF)

In [11]:
# IDF per token: log10(total_documents / number of sentences containing it).
# A token found in every sentence therefore gets an IDF of 0.
idf=_create_idf_matrix(freq, count_doc_per_words, total_documents)
print(idf)

{'bonjout monsieu': {'bonjout': 0.3010299956639812, 'monsieur': 0.3010299956639812, 'lucas': 0.3010299956639812, ".j'espere": 0.3010299956639812, 'vas': 0.3010299956639812, 'bien': 0.3010299956639812, '.': 0.0}, "aujourd'hui je ": {"aujourd'hui": 0.3010299956639812, 'vais': 0.3010299956639812, 'presenter': 0.3010299956639812, 'quelque': 0.3010299956639812, 'chose': 0.3010299956639812, 'tres': 0.3010299956639812, 'importante': 0.3010299956639812, '.': 0.0}}


### TF-IDF

In [12]:
# Multiply each French token's TF value by its IDF value, sentence by sentence.
tf_idf= _create_tf_idf_matrix(tf, idf)
print(tf_idf)

{'bonjout monsieu': {'bonjout': 0.043004285094854454, 'monsieur': 0.043004285094854454, 'lucas': 0.043004285094854454, ".j'espere": 0.043004285094854454, 'vas': 0.043004285094854454, 'bien': 0.043004285094854454, '.': 0.0}, "aujourd'hui je ": {"aujourd'hui": 0.03762874945799765, 'vais': 0.03762874945799765, 'presenter': 0.03762874945799765, 'quelque': 0.03762874945799765, 'chose': 0.03762874945799765, 'tres': 0.03762874945799765, 'importante': 0.03762874945799765, '.': 0.0}}
