In [78]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Use file object to input the details

In [79]:
file = open('space_invaders.txt', encoding='utf-8')
doc = file.read()

In [81]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Prepare Stemmer and stopwords and tokenize sentence

In [82]:
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')
sentences = nltk.tokenize.sent_tokenize(doc)

### Perform data cleansing

In [87]:
sentences[1]

'The aim is to defeat five rows of eleven aliens—although some versions feature different numbers—that move horizontally back and forth across the screen as they advance toward the bottom of the screen.'

In [88]:
def preprocess(docs):
    docs_clean = []
    punc = str.maketrans('', '', string.punctuation)
    for doc in docs:
        docs_no_punc = doc.translate(punc)
        words = docs_no_punc.lower().split()
        words = [lemmatizer.lemmatize(word, 'v')
                for word in words if words not in stop_words]
        docs_clean.append(' '.join(words))
    return docs_clean

docs_clean = preprocess(sentences)

### After getting the cleansed words

#### Input the words into Tfidf

In [89]:
tfidf = TfidfVectorizer()

feature_vectors = tfidf.fit_transform(docs_clean).toarray()

In [90]:
feature_vectors.shape

(52, 492)

In [91]:
sentences_tfidf = []
for i in range(len(feature_vectors)):
    sentences_tfidf.append(feature_vectors[i].sum())
    

#### Order the values

In [92]:
import numpy as np
sentences_tfidf = np.array(sentences_tfidf)

In [94]:
import pandas as pd

In [95]:
sentences_df = pd.DataFrame(data=sentences_tfidf, columns=['sum'])

In [96]:
sentences_df['rownum'] = sentences_df.index

In [98]:
sorted_series = sentences_df.sort_values('sum', ascending=False)

In [99]:
top10 = sorted_series[:10]

In [101]:
top10 = top10.sort_values('rownum')

In [102]:
top10 

Unnamed: 0,sum,rownum
1,5.214,1
2,5.091562,2
10,6.052339,10
22,5.395277,22
24,5.272312,24
28,5.756173,28
36,5.116616,36
37,5.415283,37
39,5.197512,39
48,7.057334,48


In [104]:
for i in range(len(top10)):
    index_of_sorted = top10.index[i]
    print(f"[Line {top10.index[i]}]: \n{sentences[index_of_sorted]}\n")

[Line 1]: 
The aim is to defeat five rows of eleven aliens—although some versions feature different numbers—that move horizontally back and forth across the screen as they advance toward the bottom of the screen.

[Line 2]: 
The player's laser cannon is partially protected by several stationary defense bunkers—the number also varies by version—that are gradually destroyed from the top and bottom by blasts from either the aliens or the player.

[Line 10]: 
The game's inspiration is reported to have come from varying sources, including an adaptation of the mechanical game Space Monsters released by Taito in 1972, and a dream about Japanese school children who are waiting for Santa Claus when they are attacked by invading aliens.

[Line 22]: 
Because microcomputers in Japan were not powerful enough at the time to perform the complex tasks involved in designing and programming Space Invaders, Nishikado had to design his own custom hardware and development tools for the game.

[Line 24]: 
T