### Sample program for TF-IDF  

#### Import libraries  

In [9]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

#### Parameters  

In [10]:
# Path of the input CSV; the loading cell passes this to pd.read_csv.
csv_in = "newsgroups5-2.csv"

#### Read CSV file  

In [11]:
# Load the corpus; the first row of the CSV supplies the column names.
df = pd.read_csv(csv_in, delimiter=',', skiprows=0, header=0)
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() appends a stray "None" line.
df.info()
display(df.head())

(2927, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2927 entries, 0 to 2926
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   content       2927 non-null   object
 1   target        2927 non-null   int64 
 2   target_names  2927 non-null   object
dtypes: int64(1), object(2)
memory usage: 68.7+ KB
None


Unnamed: 0,content,target,target_names
0,"From article <C5owCB.n3p@world.std.com>, by t...",1,sci.space
1,In article <1r1eu1$4t@transfer.stratus.com> c...,3,talk.politics.guns
2,In article <1993Apr23.184732.1105@aio.jsc.nas...,1,sci.space
3,What is the EXACT entry (parameter and synta...,2,comp.windows.x
4,In article <1993Apr20.151818.4319@samba.oit.u...,0,rec.sport.hockey


#### Check the number of documents in each category  

In [12]:
# Tally how many documents carry each target label.
docs_per_target = df['target'].value_counts()
print(docs_per_target)

0    600
4    595
1    593
2    593
3    546
Name: target, dtype: int64


#### Assign docID according to its category  
- docID = 'd' + number, such as d0, d1, ..., d1000, d1001, ...
 - number = target * 1000 + j, where j is the 0-based count of documents with the same target that appear earlier in the file

In [13]:
# Build docID = 'd' + (target * 1000 + j), where j is the 0-based position of
# a document among the rows sharing its target (in row order):
#   0.. for documents of target 0, 1000.. for target 1, 2000.. for target 2, ...
# groupby().cumcount() yields that per-target running index directly, so the
# numbering does not require the labels to be exactly 0..k-1, nor the frame to
# have a default RangeIndex, as a counter array indexed by label would.
within_target_rank = df.groupby('target').cumcount()
docID = ('d' + (df['target'] * 1000 + within_target_rank).astype(str)).tolist()
# A target with more than 1000 documents would spill into the next target's
# ID range; fail loudly instead of silently emitting duplicate IDs.
assert len(set(docID)) == len(docID), \
    'docID collision: a target may have more than 1000 documents'
df['docID'] = docID
display(df.head())

Unnamed: 0,content,target,target_names,docID
0,"From article <C5owCB.n3p@world.std.com>, by t...",1,sci.space,d1000
1,In article <1r1eu1$4t@transfer.stratus.com> c...,3,talk.politics.guns,d3000
2,In article <1993Apr23.184732.1105@aio.jsc.nas...,1,sci.space,d1001
3,What is the EXACT entry (parameter and synta...,2,comp.windows.x,d2000
4,In article <1993Apr20.151818.4319@samba.oit.u...,0,rec.sport.hockey,d0


#### TF-IDF  

In [14]:
%%time

vectorizer = TfidfVectorizer(use_idf=True, stop_words='english')
vecs = vectorizer.fit_transform(df['content'])

CPU times: user 799 ms, sys: 578 µs, total: 799 ms
Wall time: 800 ms


In [15]:
# Show the matrix dimensions, then the matrix in its native printed form.
for view in (vecs.shape, vecs):
    print(view)
# Finally print the same matrix converted to a dense array.
print(vecs.toarray())

(2927, 43833)
  (0, 23235)	0.11433981419003146
  (0, 21565)	0.09828967368147341
  (0, 24871)	0.11731637011801296
  (0, 2746)	0.11179005513866408
  (0, 29093)	0.07834729760213531
  (0, 13391)	0.1013169207323394
  (0, 38380)	0.06253740231988479
  (0, 23402)	0.03793035163264413
  (0, 28155)	0.04844115606922844
  (0, 22513)	0.10757809935156604
  (0, 31171)	0.08512300373906294
  (0, 12093)	0.06692994665631972
  (0, 18130)	0.09431372025420827
  (0, 37501)	0.13136503845608435
  (0, 24435)	0.07746065112137435
  (0, 38906)	0.11901459905545898
  (0, 35219)	0.061099693941857075
  (0, 7967)	0.07358879488022269
  (0, 40772)	0.08998492524400967
  (0, 33783)	0.050149569584074405
  (0, 15515)	0.04058316764150822
  (0, 11505)	0.1019893747894061
  (0, 38714)	0.055065039141524365
  (0, 36213)	0.0682209452929941
  (0, 10286)	0.20993186143477519
  :	:
  (2925, 28190)	0.09831399752143957
  (2925, 19642)	0.15965779026890953
  (2925, 14049)	0.10351157607153053
  (2925, 40529)	0.0861009079050038
  (2925, 43458

#### Examples: the three words with the highest TF-IDF for each of the first 10 documents  

In [16]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# list() keeps `words` a plain list, as the old method returned.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()
vecs_array = vecs.toarray()
print(type(vecs_array), vecs_array.shape)
# Report the three highest-weighted terms for the first documents; the bound
# is clamped so a corpus with fewer than 10 rows does not raise.
for i in range(min(10, len(df))):
    row = vecs_array[i]
    # argsort of the negated weights lists column indices from the highest
    # TF-IDF value to the lowest.
    sorted_idx = np.argsort(-row)
    doc_id = df.at[i, 'docID']
    tgt_n = df.at[i, 'target_names']
    result = '{} {}: {};'.format(i, doc_id, tgt_n)
    # The generator's index stays local, so no notebook-level name such as
    # `j` is overwritten by this cell.
    result += ''.join(' ' + words[k] for k in sorted_idx[:3])
    print(result)



<class 'numpy.ndarray'> (2927, 43833)
1 d3000: talk.politics.guns; weapons destruction mass
2 d1001: sci.space; ssf option flights
3 d2000: comp.windows.x; ncd 3b2 boots
4 d0: rec.sport.hockey; captain traded leafs
5 d1: rec.sport.hockey; mask cherry upenn
6 d4000: sci.crypt; encryption clause weak
7 d3001: talk.politics.guns; revolver semi autos
8 d1002: sci.space; environment canopies windscreen
9 d2001: comp.windows.x; failed segment shared
