In [6]:
from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer

from numpy import genfromtxt
import csv
import numpy as np

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text


# turn a doc into clean tokens
def clean_doc(doc):
    # split into tokens by white space
    tokens = doc.split()
    # remove punctuation from each token
    table = str.maketrans('', '', punctuation)
    tokens = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    return tokens


# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
    # load doc
    doc = load_doc(filename)
    # clean doc
    tokens = clean_doc(doc)
    # update counts
    vocab.update(tokens)

    
# load all docs in a directory
def process_docs(directory, vocab):
    # walk through all files in the folder
    filenames = listdir(directory)
    filenames.sort()
    print("Total # of files:",len(filenames))
    for filename in filenames:
        # skip any reviews in the test set
#         if filename.startswith('cv9'):
#             continue
        # create the full path of the file to open
        path = directory + '/' + filename
        # add doc to vocab
        add_doc_to_vocab(path, vocab)


# save list to file
def save_list(lines, filename):
        # convert lines to a single blob of text
    data = '\n'.join(lines)
    # open file
    file = open(filename, 'w')
    # write text
    file.write(data)
    # close file
    file.close()


# load doc, clean and return line of tokens
def doc_to_line(filename, vocab):
    # load the doc
    doc = load_doc(filename)
    # clean doc
    tokens = clean_doc(doc)
    # filter by vocab
    tokens = [w for w in tokens if w in vocab]
    return ' '.join(tokens)


# load all docs in a directory and place data in a list
def read_docs(directory, vocab):
    lines = list()
    filenames = listdir(directory)
    filenames.sort()
    # walk through all files in the folder
    for filename in filenames:
        # create the full path of the file to open
        path = directory + '/' + filename
        # load and clean the doc
        line = doc_to_line(path, vocab)
        # add to list
        lines.append(line)
    return lines


# define vocab
vocab = Counter()
# add all docs to vocab
process_docs('MIREX-like_mood/dataset/Lyrics', vocab)
# print the size of the vocab
print("Length of Vocabulary", len(vocab))
# print the top words in the vocab
# print(vocab.most_common(50))
# keep tokens with a min occurrence
min_occurance = 2
# tokens = [k for k, c in vocab.items() if c >= min_occurance]
tokens = [k for k, c in vocab.items()]
print("# of Tokens:",len(tokens))
# save tokens to a vocabulary file
save_list(tokens, 'vocabulary.txt')

# load the vocabulary
vocab_filename = 'vocabulary.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)
# load all training reviews
lines = np.array(read_docs('MIREX-like_mood/dataset/Lyrics/', vocab))
# summarize what we have
print("# of Lyrics:",len(lines))

Total # of files: 764
Length of Vocabulary 12060
# of Tokens: 12060
# of Lyrics: 764


In [2]:
ifile = open('Ratings_Warriner_et_al.csv', 'r')
reader = csv.reader(ifile)
words = []
valence = []
arousal = []
for row in reader:
    if(row[0] != ""):
        words.append(row[1])
        valence.append(row[2])
        arousal.append(row[5])
data = [words[0],valence[0],arousal[0]]
for i in range(1,len(words)):
    data_row = [words[i],valence[i],arousal[i]]
    data = np.vstack((data, data_row))
print(data.shape)

np.save('rated_words.npy', data)

(13915, 3)


In [3]:
wva = np.load('rated_words.npy')
print(wva[0][0])

# load the vocabulary
vocab_filename = 'vocabulary.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab.sort()
print(vocab[0])

aardvark
ABOVE


In [4]:
rated_words = wva[:,0]
found_words = 0
for word in vocab:
    if(word in rated_words):
        found_words+=1
#         print("{} is in the rated words".format(word))

print("Found {} words out of {}".format(found_words, len(vocab)))

Found 3870 words out of 12060
