In [21]:
import nltk
import numpy as np
import string
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Read the whole source text. The context manager closes the file even if
# read() raises, which the manual open()/close() pair did not guarantee.
with open('space_invaders.txt', encoding='utf-8') as file:
    doc = file.read()

stop_words = stopwords.words('english')
# Set copy used only for fast membership tests in the loop below;
# `stop_words` itself is left as returned by NLTK.
stop_word_set = set(stop_words)

porter = nltk.stem.PorterStemmer()
sentences = nltk.tokenize.sent_tokenize(doc)

# Translation table that deletes every character in string.punctuation.
# NOTE(review): string.punctuation is ASCII-only, so non-ASCII marks such as
# the em dash that appears in this text are not stripped here - confirm that
# is intended.
punc = str.maketrans('', '', string.punctuation)

# treat each sentence as a document
docs = []
for sent in sentences:
    sent_no_punc = sent.translate(punc)
    # Lower-case, split on whitespace, drop stop words, then stem what is left.
    words_stemmed = [porter.stem(w) for w in sent_no_punc.lower().split()
                     if w not in stop_word_set]
    docs.append(' '.join(words_stemmed))

In [22]:
# Fit the vectorizer on the sentence-documents, then convert the resulting
# matrix to a dense array (one row per entry in `docs`).
tfidf = TfidfVectorizer()
tfidf_sparse = tfidf.fit_transform(docs)
tfidf_docs = tfidf_sparse.toarray()
tfidf_docs

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

Show TF-IDF feature-vectors for each document.

In [23]:

# Row labels: 'doc0', 'doc1', ... - one per entry in `docs`.
df_index = ['doc' + str(i) for i in range(len(docs))]

# Column labels: the terms of the fitted vectorizer's vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    df_columns = list(tfidf.get_feature_names_out())
else:
    df_columns = tfidf.get_feature_names()

tfidf_df = pd.DataFrame(data=tfidf_docs, index=df_index, columns=df_columns)
tfidf_df

Unnamed: 0,100yen,1972,1974,1975,1978,1979,8080,abil,abl,achiev,...,wave,way,websit,well,western,whole,world,would,yamato,year
doc0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
doc1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
doc2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
doc3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
doc4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
doc5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.350539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
doc6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
doc7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
doc8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
doc9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285168


For each document, sum up the values in its TFIDF feature-vector.

In [24]:
# Row-wise total: add up every value in each document's TF-IDF vector.
tfidf_sum_by_docs = tfidf_df.sum(axis=1)
tfidf_sum_by_docs

doc0     3.912255
doc1     4.525808
doc2     4.628859
doc3     2.781502
doc4     2.376074
doc5     3.123966
doc6     3.430478
doc7     2.952512
doc8     3.908374
doc9     3.604040
doc10    4.821202
doc11    2.570495
doc12    3.536538
doc13    2.616532
doc14    3.561863
doc15    2.917672
doc16    3.280913
doc17    2.799709
doc18    3.834832
doc19    3.574759
doc20    2.171319
doc21    3.419013
doc22    4.235573
doc23    2.803102
doc24    4.816107
doc25    4.300563
doc26    4.149037
doc27    3.564487
doc28    5.007711
doc29    3.649207
doc30    3.424394
doc31    3.138539
doc32    4.040074
doc33    1.655060
doc34    1.000000
doc35    2.646406
doc36    4.399896
doc37    4.715729
doc38    2.615499
doc39    4.402610
doc40    3.408534
doc41    3.241754
doc42    3.156477
doc43    3.595558
doc44    3.241754
doc45    3.591621
doc46    4.154909
doc47    3.179854
doc48    5.948904
doc49    4.171724
doc50    2.587288
doc51    3.668989
dtype: float64

Sort the documents based on the sum of their TFIDF feature-vectors.

In [25]:
# Rank the documents from the largest TF-IDF sum to the smallest.
# The result is bound to a new name; `tfidf_sum_by_docs` keeps its original
# document order for the cells that follow.
top_tfidf = tfidf_sum_by_docs.sort_values(ascending=False)
top_tfidf

doc48    5.948904
doc28    5.007711
doc10    4.821202
doc24    4.816107
doc37    4.715729
doc2     4.628859
doc1     4.525808
doc39    4.402610
doc36    4.399896
doc25    4.300563
doc22    4.235573
doc49    4.171724
doc46    4.154909
doc26    4.149037
doc32    4.040074
doc0     3.912255
doc8     3.908374
doc18    3.834832
doc51    3.668989
doc29    3.649207
doc9     3.604040
doc43    3.595558
doc45    3.591621
doc19    3.574759
doc27    3.564487
doc14    3.561863
doc12    3.536538
doc6     3.430478
doc30    3.424394
doc21    3.419013
doc40    3.408534
doc16    3.280913
doc44    3.241754
doc41    3.241754
doc47    3.179854
doc42    3.156477
doc31    3.138539
doc5     3.123966
doc7     2.952512
doc15    2.917672
doc23    2.803102
doc17    2.799709
doc3     2.781502
doc35    2.646406
doc13    2.616532
doc38    2.615499
doc50    2.587288
doc11    2.570495
doc4     2.376074
doc20    2.171319
doc33    1.655060
doc34    1.000000
dtype: float64

Get the top 10 rows with the highest TF-IDF sums.

In [26]:
# `top_tfidf` is already in descending order, so its first ten entries are
# the ten highest-scoring documents.
top_10 = top_tfidf.iloc[:10]
top_10

doc48    5.948904
doc28    5.007711
doc10    4.821202
doc24    4.816107
doc37    4.715729
doc2     4.628859
doc1     4.525808
doc39    4.402610
doc36    4.399896
doc25    4.300563
dtype: float64

Extract the integer row numbers of the selected documents.

In [27]:
# Index labels look like 'doc48'; strip the 'doc' prefix and parse what is
# left as the integer row number.
rows = [int(label[len('doc'):]) for label in top_10.index]
rows

[48, 28, 10, 24, 37, 2, 1, 39, 36, 25]

Sort the selected rows.

In [28]:
# Ascending copy of the row numbers; `rows` keeps its score-based order.
sorted_rows = list(rows)
sorted_rows.sort()
sorted_rows

[1, 2, 10, 24, 25, 28, 36, 37, 39, 48]

Display the original sentences that correspond to the selected rows.

In [29]:
# For each selected row, print a '[Line N]' header, then the original
# (unprocessed) sentence followed by a blank line.
for i in sorted_rows:
    header = '[Line {}]'.format(i)
    print(header, sentences[i] + "\n", sep="\n")

[Line 1]
The aim is to defeat five rows of eleven aliens—although some versions feature different numbers—that move horizontally back and forth across the screen as they advance toward the bottom of the screen.

[Line 2]
The player's laser cannon is partially protected by several stationary defense bunkers—the number also varies by version—that are gradually destroyed from the top and bottom by blasts from either the aliens or the player.

[Line 10]
The game's inspiration is reported to have come from varying sources, including an adaptation of the mechanical game Space Monsters released by Taito in 1972, and a dream about Japanese school children who are waiting for Santa Claus when they are attacked by invading aliens.

[Line 24]
The game uses an Intel 8080 central processing unit (CPU), displays raster graphics on a CRT monitor using a bitmapped framebuffer, and uses monaural sound hosted by a combination of analog circuitry and a Texas Instruments SN76477 sound chip.

[Line 25]
The