In [1]:
from sklearn.datasets import fetch_20newsgroups
# Load the 20 Newsgroups corpus: request the 'all' subset and ask the loader to
# remove headers, footers and quotes from each post before returning the texts.
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Fetch NLTK's stop-word corpus; when it is already present the call only logs
# an "already up-to-date" message.
nltk.download('stopwords')
# Hold the English stop words in a set so the membership test inside
# preprocess_text is a constant-time lookup.
stop_words = set(stopwords.words('english'))
# NOTE(review): word_tokenize (used below) presumably needs NLTK tokenizer data
# that this cell does not download — confirm it is installed on a fresh machine.

def preprocess_text(text):
    """Lower-case, tokenise and filter a raw document.

    Args:
        text: The document as a single string.

    Returns:
        A list of tokens in their original order, without English stop words
        and without tokens that are made up solely of punctuation characters.
    """
    tokens = []
    for token in word_tokenize(text.lower()):
        if token in stop_words:
            continue
        # `token in string.punctuation` would be a *substring* test against the
        # punctuation string, so multi-character punctuation tokens emitted by
        # the tokenizer (e.g. '...', '--', "''") would slip through. Check every
        # character instead; all() is also True for an empty token, which the
        # substring test rejected as well.
        if all(char in string.punctuation for char in token):
            continue
        tokens.append(token)
    return tokens

# Tokenise and filter every post in the corpus, keeping corpus order.
preprocessed_docs = list(map(preprocess_text, newsgroups.data))

# Peek at the token lists of the first ten posts.
print(preprocessed_docs[:10])


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/eminalizade/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['sure', 'bashers', 'pens', 'fans', 'pretty', 'confused', 'lack', 'kind', 'posts', 'recent', 'pens', 'massacre', 'devils', 'actually', 'bit', 'puzzled', 'bit', 'relieved', 'however', 'going', 'put', 'end', 'non-pittsburghers', 'relief', 'bit', 'praise', 'pens', 'man', 'killing', 'devils', 'worse', 'thought', 'jagr', 'showed', 'much', 'better', 'regular', 'season', 'stats', 'also', 'lot', 'fo', 'fun', 'watch', 'playoffs', 'bowman', 'let', 'jagr', 'lot', 'fun', 'next', 'couple', 'games', 'since', 'pens', 'going', 'beat', 'pulp', 'jersey', 'anyway', 'disappointed', 'see', 'islanders', 'lose', 'final', 'regular', 'season', 'game', 'pens', 'rule']


In [4]:
from collections import Counter

def compute_tf(doc):
    """Return the relative frequency of every token in a document.

    Args:
        doc: A sized collection of hashable tokens (e.g. a list of strings).

    Returns:
        A dict mapping each distinct token to ``count / len(doc)``. An empty
        document yields an empty dict.
    """
    occurrences = Counter(doc)
    doc_length = len(doc)
    # No division happens for an empty document because there is nothing to
    # iterate over.
    return {term: hits / doc_length for term, hits in occurrences.items()}

# Term-frequency dictionary for each preprocessed document, in corpus order.
tf_docs = list(map(compute_tf, preprocessed_docs))

# Show the TF dictionaries of the first ten documents.
print(tf_docs[:10])


[{'sure': 0.014285714285714285, 'bashers': 0.014285714285714285, 'pens': 0.07142857142857142, 'fans': 0.014285714285714285, 'pretty': 0.014285714285714285, 'confused': 0.014285714285714285, 'lack': 0.014285714285714285, 'kind': 0.014285714285714285, 'posts': 0.014285714285714285, 'recent': 0.014285714285714285, 'massacre': 0.014285714285714285, 'devils': 0.02857142857142857, 'actually': 0.014285714285714285, 'bit': 0.04285714285714286, 'puzzled': 0.014285714285714285, 'relieved': 0.014285714285714285, 'however': 0.014285714285714285, 'going': 0.02857142857142857, 'put': 0.014285714285714285, 'end': 0.014285714285714285, 'non-pittsburghers': 0.014285714285714285, 'relief': 0.014285714285714285, 'praise': 0.014285714285714285, 'man': 0.014285714285714285, 'killing': 0.014285714285714285, 'worse': 0.014285714285714285, 'thought': 0.014285714285714285, 'jagr': 0.02857142857142857, 'showed': 0.014285714285714285, 'much': 0.014285714285714285, 'better': 0.014285714285714285, 'regular': 0.028

In [None]:
import math

def compute_idf(docs):
    """Compute the inverse document frequency of every term in a corpus.

    Args:
        docs: A list of per-document dicts mapping term -> weight (for example
            the term-frequency dicts produced by ``compute_tf``).

    Returns:
        A dict mapping each term that has a positive weight in at least one
        document to ``log10(N / df)``, where ``N`` is the number of documents
        and ``df`` is the number of documents in which the term's weight is
        positive. An empty corpus yields an empty dict.
    """
    N = len(docs)

    # Count document frequencies over the vocabulary of *all* documents.
    # Seeding the counter from docs[0] alone raises KeyError for any term that
    # first appears in a later document, and IndexError on an empty corpus.
    doc_freq = {}
    for doc in docs:
        for word, val in doc.items():
            if val > 0:
                doc_freq[word] = doc_freq.get(word, 0) + 1

    # Every recorded count is >= 1, so the division below cannot hit zero.
    return {word: math.log10(N / float(count)) for word, count in doc_freq.items()}

# Build the corpus-wide IDF table from the per-document TF dictionaries.
idf = compute_idf(tf_docs)

def compute_tfidf(tf, idf):
    """Weight a document's term frequencies by inverse document frequency.

    Args:
        tf: Dict mapping term -> term frequency for one document.
        idf: Dict mapping term -> inverse document frequency; it must contain
            every term present in ``tf``.

    Returns:
        A new dict mapping each term of ``tf`` to ``tf[term] * idf[term]``.

    Raises:
        KeyError: If a term of ``tf`` is missing from ``idf``.
    """
    return {term: frequency * idf[term] for term, frequency in tf.items()}

# TF-IDF dictionary for each document, in corpus order. Written as a
# comprehension to match how preprocessed_docs and tf_docs are built, and so
# that no loop temporaries are left behind in the notebook namespace.
tfidf_docs = [compute_tfidf(doc_tf, idf) for doc_tf in tf_docs]
