### Sample program for TF-IDF  

#### Import libraries  

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

#### Parameters  

In [None]:
# Path of the input CSV file that is loaded with pd.read_csv below.
csv_in = "newsgroups5-1.csv"

#### Read CSV file  

In [None]:
# Load the dataset; the first line of the file is used as the header row.
df = pd.read_csv(csv_in, delimiter=',', skiprows=0, header=0)
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
df.info()
display(df.head())

#### Check the number of documents in each category  

In [None]:
# Tally how many rows carry each value of the 'target' column.
category_counts = df['target'].value_counts()
print(category_counts)

#### Assign docID according to its category  
- docID = 'd' + number, such as d0, d1, ..., d1000, d1001, ...
 - number = target * 1000 + j, where j is the 0-based position of the document within its category

In [None]:
# Build document IDs of the form 'd<number>', where
#   number = target * 1000 + (0-based position of the row within its target).
# So target 0 yields d0, d1, ...; target 1 yields d1000, d1001, ...; and so on.
# NOTE: IDs stay unique only while every category has at most 1000 documents.
#
# groupby().cumcount() numbers the rows of each category in row order.
# Unlike a counter array indexed by the label value, it does not require the
# labels to be exactly 0..k-1, nor the DataFrame to have a default RangeIndex.
position_in_category = df.groupby('target').cumcount()
doc_number = df['target'].astype(int) * 1000 + position_in_category
# Keep `docID` available as a plain list of strings, as before.
docID = ('d' + doc_number.astype(str)).tolist()
df['docID'] = docID
display(df.head())

#### TF-IDF  

In [None]:
%%time

# TF-IDF weighting with scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(
    stop_words='english',
    use_idf=True,
)
# Learn the vocabulary and IDF from the 'content' column and vectorize it
# in a single pass.
vecs = vectorizer.fit_transform(df['content'])

In [None]:
# Print, one after another: the matrix shape, the matrix as returned by
# fit_transform, and the same matrix converted to a dense array.
print(vecs.shape, vecs, vecs.toarray(), sep='\n')

#### Examples of words with highest TF-IDF for each document  

In [None]:
# Vocabulary terms, positioned to match the columns of the TF-IDF matrix.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# and fall back only on old releases that do not provide it yet.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = vectorizer.get_feature_names_out()
else:
    words = vectorizer.get_feature_names()
vecs_array = vecs.toarray()
print(type(vecs_array), vecs_array.shape)

# Preview the first 10 documents, or all of them when there are fewer.
n_preview = min(10, vecs_array.shape[0])
preview = df.head(n_preview)
# Iterating over the column values (instead of df.at[i, ...]) avoids relying
# on the DataFrame having a default 0..n-1 index.
for i, (doc_id, tgt_n) in enumerate(zip(preview['docID'], preview['target_names'])):
    # Negating the weights makes argsort list the largest TF-IDF values first.
    sorted_idx = np.argsort(-vecs_array[i])
    # The three highest-weighted terms, each preceded by a single space.
    top_words = ''.join(' ' + words[k] for k in sorted_idx[:3])
    print('{} {}: {};'.format(i, doc_id, tgt_n) + top_words)