In [30]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from num2words import num2words
from sklearn.feature_extraction.text import TfidfVectorizer

import os
import numpy as np
import pandas as pd
import re


In [4]:
path = str(os.getcwd()) + '\\stories\\'

# list all folders in the root
folders = [x[0] for x in os.walk(path)]
folders

['D:\\TIP\\courses\\CS 405\\module_5\\stories\\',
 'D:\\TIP\\courses\\CS 405\\module_5\\stories\\FARNON',
 'D:\\TIP\\courses\\CS 405\\module_5\\stories\\SRE']

In [7]:
file = open(folders[0] + "index.html", 'r')
file.name

text = file.read()
print(text)

<HTML>
<TITLE>T E X T F I L E S</TITLE>
<BODY BGCOLOR="#000000" TEXT="#00FF00" LINK="#00FF00" ALINK="#00AA00" VLINK="#00AA00">
<H1>Stories and Fiction</H1>
<P>
Given than BBSes were an unlimited potential of cross-country publishing,
it's no wonder that hundreds of would-be authors took it upon themselves to
fill file directories with fiction, prose, and poetry. In the first few
years, you normally had stories that were being told with a specific purpose,
such as to bring awareness to an important issue of self-expression or
computers. But as time went on, and there was more space available, you
could see all sorts of literary work make itself known. There were even
a few BBSes dedicated to nothing but fiction and writing.
<P>
Also buried here are transcriptions of old fables or stories, dating back who
knows how many decades.
<P>
Direct transcriptions of classic novels and stories are generally located in
the <A HREF="/etext">Electronic Text</A> section.
<P>
<TABLE WIDTH=100%>
<TD BGC

In [9]:
# perform regex to get the file names from the index.html
file_name = re.findall('><A HREF="(.*)">', text)
file_name = file_name[2:]
file_title = re.findall('<BR><TD> (.*)\n', text)
file_name

['100west.txt',
 '13chil.txt',
 '14.lws',
 '16.lws',
 '17.lws',
 '18.lws',
 '19.lws',
 '20.lws',
 '3gables.txt',
 '3lpigs.txt',
 '3sonnets.vrs',
 '3student.txt',
 '3wishes.txt',
 '4moons.txt',
 '5orange.txt',
 '6ablemen.txt',
 '6napolen.txt',
 '7oldsamr.txt',
 '7voysinb.txt',
 'ab40thv.txt',
 'abbey.txt',
 'abyss.txt',
 'adler.txt',
 'adv_alad.txt',
 'advsayed.txt',
 'advtthum.txt',
 'aesop11.txt',
 'aesopa10.txt',
 'aircon.txt',
 'aisle.six',
 'aislesix.txt',
 'alad10.txt',
 'alissadl.txt',
 'altside.hum',
 'aluminum.hum',
 'aminegg.txt',
 'angelfur.hum',
 'angry_ca.txt',
 'antcrick.txt',
 'aquith.txt',
 'arcadia.sty',
 'archive',
 'arctic.txt',
 'asop',
 'assorted.txt',
 'bagel.man',
 'bagelman.txt',
 'batlslau.txt',
 'beast.asc',
 'beautbst.txt',
 'beggars.txt',
 'bern',
 'berternie.txt',
 'bestwish',
 'beyond.hum',
 'bgb.txt',
 'bgcspoof.txt',
 'bigred.hum',
 'bishop00.txt',
 'blabnove.hum',
 'blabnove.txt',
 'blackp.txt',
 'blackrdr',
 'blak',
 'blasters.fic',
 'blh.txt',
 'blind.

In [12]:
dataset = []
for j in range(len(file_name)):
    dataset.append((str(folders[0]) + str(file_name[j]), file_title[j]))
dataset

[('D:\\TIP\\courses\\CS 405\\module_5\\stories\\100west.txt',
  'Going 100 West by 53 North by Jim Prentice (1990)'),
 ('D:\\TIP\\courses\\CS 405\\module_5\\stories\\13chil.txt',
  'The Story of the Sly Fox'),
 ('D:\\TIP\\courses\\CS 405\\module_5\\stories\\14.lws',
  'A Smart Bomb with a Language Parser'),
 ('D:\\TIP\\courses\\CS 405\\module_5\\stories\\16.lws',
  'Two Guys in a Garage, by M. Pshota'),
 ('D:\\TIP\\courses\\CS 405\\module_5\\stories\\17.lws',
  'The Early Days of a High-Tech Start-up are Magic (November 18, 1991) by M. Peshota'),
 ('D:\\TIP\\courses\\CS 405\\module_5\\stories\\18.lws',
  'The Couch, the File Cabinet, and the Calendar, by M. Peshota (December 9, 1991)'),
 ('D:\\TIP\\courses\\CS 405\\module_5\\stories\\19.lws',
  'Engineering the Future of American Technology by M. Peshota (January 5, 1992)'),
 ('D:\\TIP\\courses\\CS 405\\module_5\\stories\\20.lws',
  'What Research and Development Was Always Meant to Be, by M. Peshota'),
 ('D:\\TIP\\courses\\CS 405\\mod

In [18]:
# print specific document
def print_doc(id):
    print(dataset[id])
    file = open(dataset[id][0], 'r', encoding='cp1250')
    text = file.read().strip()
    file.close()
    print(text)


In [16]:
def convert_lower_case(data):
    return np.char.lower(data)

In [28]:
def remove_stop_words(data):
    stop_words = stopwords.words('english')
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if w not in stop_words and len(w) > 1:
            new_text = new_text + " " + w
    return new_text

In [20]:
def remove_punctuation(data):
    symbols = "!\"$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], " ")
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ",", "")
    return data

In [21]:
def remove_apostrophe(data):
    return np.char.replace(data, "'", "")


In [22]:
def stemming(data):
    stemmer = PorterStemmer()
    
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return new_text

In [41]:
def convert_numbers(data):
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        try:
            w = num2words(int(w))
        except:
            a = 0 
#             w = 0
        new_text = new_text + " " + w
    new_text = np.char.replace(new_text, "-", " ")
    return new_text

In [26]:
def preprocess(data):
    data = convert_lower_case(data)
    data = remove_punctuation(data)
    data = remove_apostrophe(data)
    data = remove_stop_words(data)
    data = convert_numbers(data)
    data = stemming(data)
    data = remove_punctuation(data)
    data = convert_numbers(data)
    data = stemming(data) # needed again as we need to stem the words
    data = remove_punctuation(data) # needed again as num2word is giving few ...
    data = remove_stop_words(data) # needed again as num2word is giving stopwords
    return data

In [43]:
processed_text = []
processed_title = []

for document in dataset[:10]:
    file = open(document[0], 'r', encoding="utf8", errors='ignore')
    text = file.read().strip()
    file.close()
    
    processed_text.append(str(preprocess(text)))


In [50]:
print(processed_text)

[' sharewar trial project freewar need support continu one hundr west fifti three north jim prentic copyright one thousand nine hundr nineti jim prentic brandon manitoba canada north fifti three magic phrase spoken mumbl thought inwardli thousand soul ventur northward imaginari line shown map label fifti three degr presenc indic highway travel road side sign divi territori distinct mind intern border north fifti three north travel writer poet pilot contribut lore north rigor life bush told tale man eat mosquito murder hord black fli lump flesh carri away giant bull dog fli stori record break trout walley pike legion tale sight sound heard deep spruce forest crash moo tear brush break tree drum grou incess hum insect cackl quackeri duck feed quiet pond placid bay heard intermitt song loon never forgotten voic signatur authent northern scene wildlif northern bush land seem differ found elsewh life man take special breed person live north farther one travel becom appar southern whether fi

In [51]:
# create the transform

vectorizer = TfidfVectorizer()

# tokenize and build vocabulary
vectorizer.fit(processed_text)

# summarize

# specific
# print(vectorizer.vocabulary_['sharewar'])
# print(vectorizer.idf_[0])

# all
print(vectorizer.vocabulary_)
print(vectorizer.idf_)


{'sharewar': 2821, 'trial': 3335, 'project': 2468, 'freewar': 1266, 'need': 2136, 'support': 3139, 'continu': 668, 'one': 2223, 'hundr': 1561, 'west': 3526, 'fifti': 1164, 'three': 3246, 'north': 2176, 'jim': 1718, 'prentic': 2424, 'copyright': 686, 'thousand': 3242, 'nine': 2164, 'nineti': 2166, 'brandon': 364, 'manitoba': 1947, 'canada': 461, 'magic': 1920, 'phrase': 2339, 'spoken': 3000, 'mumbl': 2104, 'thought': 3241, 'inwardli': 1691, 'soul': 2972, 'ventur': 3438, 'northward': 2178, 'imaginari': 1592, 'line': 1845, 'shown': 2849, 'map': 1953, 'label': 1773, 'degr': 808, 'presenc': 2428, 'indic': 1622, 'highway': 1502, 'travel': 3324, 'road': 2671, 'side': 2855, 'sign': 2859, 'divi': 903, 'territori': 3216, 'distinct': 899, 'mind': 2040, 'intern': 1672, 'border': 341, 'writer': 3622, 'poet': 2385, 'pilot': 2348, 'contribut': 673, 'lore': 1884, 'rigor': 2659, 'life': 1838, 'bush': 420, 'told': 3276, 'tale': 3177, 'man': 1940, 'eat': 974, 'mosquito': 2084, 'murder': 2106, 'hord': 153

In [55]:
# create table with list of entities
word = []
frequency = []
score = []

i = 0
for key, value in vectorizer.vocabulary_.items():
    word.append(key)
    frequency.append(value)
    score.append(vectorizer.idf_[i])
    i+=1
    
print(word[0])
print(frequency[0])
print(score[0])

sharewar
2821
2.7047480922384253


In [57]:
entities_labels = list(set(zip(word, frequency, score)))
entities_df = pd.DataFrame(entities_labels)
entities_df.columns = ["Word", "Freq", "Score"]
entities_df

Unnamed: 0,Word,Freq,Score
0,muskrat,2111,2.011601
1,episod,1035,2.704748
2,loiter,1870,2.704748
3,sniff,2935,2.704748
4,blowsi,320,2.011601
...,...,...,...
3650,proprietor,2479,2.704748
3651,protector,2483,2.704748
3652,skeptic,2888,2.704748
3653,scotch,2751,2.704748
