In [19]:
## Trying different feature extraction methods from sckitlearn and see if can be applied to HW4
## More note can be found in the trial and test note.docx

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

import pandas as pd
import numpy as np


# Initialize the "TfidVectorizer" object, which is scikit-learn's
# bag of words tool. http://scikit-learn.org/dev/modules/feature_extraction.html#text-feature-extraction
vectorizer = TfidfVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000)


## Initialize n-gram vectorizer
ngram2_vectorizer = CountVectorizer(ngram_range=(2, 2),token_pattern=r'\b\w+\b', min_df=1, max_features = 5000)
ngram4_vectorizer = CountVectorizer(ngram_range=(4, 4),token_pattern=r'\b\w+\b', min_df=1, max_features = 5000)


## These following two tools are also initialized, but can't be used report dtype error
## TO-DO: need to fix it
transformer = TfidfTransformer()
hashvect = HashingVectorizer()

#import file
df = pd.read_csv('Data/all_poems_tokenized.txt',sep='\t')

# Get all the poem names
poemnames = pd.unique(df.poem_name.ravel())

# Array to hold bag of words for each poem
poem_bagwords = []
# Arrays to hold labels
poem_label = []

# Vectorizer results are normalized, which makes KMeans behave as
# spherical k-means for better results. Since LSA/SVD results are
# not normalized, we have to redo the normalization.
# svd = TruncatedSVD()
# normalizer = Normalizer(copy=False)
# lsa = make_pipeline(svd, normalizer)

## loop through all poems
for i in range(len(poemnames)):
    
    ### Create bag of words for all plays
    p = df[df['poem_name'] == poemnames[i]] # Get the sub data frame of each play
    s = "" # Initiate empty string to hold bag of words for play
    # Iterate all the rows to append the poem words to a string
    for index,row in p.iterrows():
        s += str(row['poem'])

    # Append the bag of words to each poem
    poem_bagwords.append(s)
    # Append the label to each row
    poem_label.append(str(poemnames[i]))
    
f = open("poem_feature_labels.txt", "w")
f.write("\n".join(map(lambda x: str(x), poem_label)))
f.close()



# Vectorize analyze the similarities among poems
# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
poem_vect = vectorizer.fit_transform(poem_bagwords)
poem_ngram2= ngram2_vectorizer.fit_transform(poem_bagwords)
poem_ngram2 = poem_ngram2.toarray() 
poem_ngram4 = ngram4_vectorizer.fit_transform(poem_bagwords)
poem_ngram4 = poem_ngram4.toarray() 


# # Numpy arrays are easy to work with, so convert the result to an 
# # array
poem_feature = poem_vect.toarray() # Show the similarity array of plays
# poem_feature2 = lsa.fit_transform(play_feature) # Apply Latent Semantic Analysis
# explained_variance = svd.explained_variance_ratio_.sum()
# print(explained_variance)


# See the data arrary: print (play_feature.shape)

# Write the plays feature array to a csv file
np.savetxt("Poem_Features_Vectorize.txt", poem_feature, delimiter=",")
np.savetxt("Poem_Features_ngram2.txt", poem_ngram2, delimiter=",")
np.savetxt("Poem_Features_ngram4.txt", poem_ngram4, delimiter=",")
# np.savetxt("Poem_Features_Vectorize_LSA.txt", poem_feature2, delimiter=",")

# Take a look at the words in the vocabulary
# vocab = vectorizer.get_feature_names()

# import numpy as np
# print(vocab)

### Print the count of each word in vocab -- not working somehow
# # Sum up the counts of each vocabulary word
# dist = np.sum(play_feature, axis=0)
# # For each, print the vocabulary word and the number of times it 
# # appears in the training set
# for tag, count in zip(vocab, dist):
#     print (count, tag)



#Y = transformer.fit_transform(play_bagwords) -- To DO fix the error: no supported conversion for types: (dtype('<U98727')

In [20]:
ngram2_vectorizer.get_feature_names()

['1612 normal',
 'absent thi',
 'act thi',
 'adoni live',
 'age yet',
 'aid vers',
 'air let',
 'alon thou',
 'angri eye',
 'antiqu pen',
 'appear like',
 'appli love',
 'arm bound',
 'art beauti',
 'art dead',
 'art made',
 'art man',
 'art therefor',
 'art thi',
 'art thou',
 'ashi pale',
 'attend time',
 'author thi',
 'away face',
 'bare everi',
 'bark peeld',
 'bear dead',
 'bear thee',
 'bear thi',
 'beast bear',
 'beast know',
 'beauti dead',
 'beauti doth',
 'beauti hath',
 'beauti hold',
 'beauti lie',
 'beauti live',
 'beauti may',
 'beauti red',
 'beauti set',
 'beauti shall',
 'beauti still',
 'beauti success',
 'beauti thi',
 'beauti thou',
 'beauti use',
 'behold face',
 'behold thi',
 'believ love',
 'believ though',
 'belong thi',
 'besieg ardea',
 'best best',
 'best love',
 'better angel',
 'better life',
 'better part',
 'black face',
 'black night',
 'blood staind',
 'blood wateri',
 'blush shame',
 'boar whose',
 'brain full',
 'break day',
 'breast doth',
 'breast

In [18]:
ngram4_vectorizer.get_feature_names()

['1612 normal text ed',
 'henri wriothesli earl southampton',
 'honor henri wriothesli earl',
 'love thee sort thou',
 'mani nymph vowd chast',
 'mani one perus sighd',
 'mani ring posi gold',
 'mani see care carv',
 'mani seem one sing',
 'mani sever fair kind',
 'mani thine alon imag',
 'mani thing sought old',
 'mani thou none lovest',
 'mani trojan mother share',
 'mani vanishd sight griev',
 'manifold sever stone wit',
 'manli chivalri bruis arm',
 'manli hector faint troilu',
 'manli shame bid possess',
 'manner breed thenc come',
 'manner deal nwithal suddenli',
 'manner expressli told ajax',
 'manner hold still comment',
 'manner may sing thou',
 'manner piti want pain',
 'manner vile deed nbitter',
 'mansion batterd enemi sacr',
 'mansion spend shall worm',
 'mansion thought assignd labour',
 'mansion vice got habit',
 'mantl rude oer arm',
 'manual wax red lipthousand',
 'map day outworn beauti',
 'map death death dim',
 'map deep impress bear',
 'map doth natur store',
 'mar