In [5]:
import numpy as np
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

In [2]:
import string

from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.util import ngrams
from nltk.tokenize import RegexpTokenizer

from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer

class text_processor:

    def __init__(self, remover_function=None, tokenizer_function=None, 
                 cleaning_function=None, stemmer_function=None,
                     vectorizer_function = CountVectorizer()):
        self.remover = remover_function
        self.tokenizer = tokenizer_function
        self.cleaner = cleaning_function
        self.stemmer = stemmer_function
        self.vectorizer = vectorizer_function


        if remover_function == 'no_punctuation':
            self.remover = self.no_punctuation
        if tokenizer_function == 'tk_word':
            self.tokenizer = self.tk_word
        if not tokenizer_function:
            self.tokenizer = self.splitter
        if cleaning_function == 'lowstem':
            self.cleaner = self.lowstem
                
   # cleaning functions

    def lower(self,X):
        sentences = []
        for sentence in X:
            sentences.append(sentence.lower()) 
        return sentences


    def no_punctuation(self,X):
    # remove the punctuation
        pos = []
        for sentence in X:
            for punc in string.punctuation:
                sentence = sentence.replace(punc,'')
            pos.append(sentence)
        return pos
    
 # tokenizer functions   
    
    def tk_word(self,X):
        vocabulary = []
        for x in X:
            vocabulary.append(word_tokenize(x)) 
        return vocabulary        
    
    
    def splitter(self, text):
        """
        Default tokenizer that splits on spaces naively
        """
        return text.split(' ')

   # stemmer function
    

    def stem(self,X):
        stemmed = []
        for word in (X):
            stem_word = stemmer.stem(word)
            stemmed.append(stem_word)
        return stemmed


    
    
    # vectorizing function
    def vectorize(self, X):
        self.vectorizer.fit(X)
        self.columns=self.vectorizer.get_feature_names()
        return self.vectorizer.transform(X).toarray()
        
        
    def fit(self,X):
        clear_text = self.remover(X)
        clear_text = self.lower(clear_text)
#        clear_text = self.stem(clear_text)
        self.matrix = self.vectorize(clear_text)
   
 

In [3]:
nlp = text_processor(remover_function='no_punctuation',tokenizer_function = 'tk_word'
                    ,stemmer_function = PorterStemmer,
#                    vectorizer_function=TfidfVectorizer(min_df=0.3, max_df=0.8))
                    vectorizer_function=TfidfVectorizer(min_df=0, max_df=1))

In [7]:
df = pd.read_excel('./data/bukowski.xlsx')
X = df['verses']
pos = ['Cannot beat the real thing' ]
for x in X:
    for punc in string.punctuation:
        x = x.replace(punc,'')
    pos.append(x)

In [8]:
nlp.fit(pos)
pos_matrix = nlp.matrix
pos_columns = nlp.columns

In [9]:
lsa = TruncatedSVD(2, algorithm = 'arpack')
dtm_lsa = lsa.fit_transform(pos_matrix)
dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)

In [27]:
pd.DataFrame(lsa.components_.round(5),
             index = ["component_1","component_2"],columns = pos_columns)
df3 = pd.DataFrame(dtm_lsa.round(5), index = pos, columns = ["component_1","component_2" ])

In [28]:
df3.head()

Unnamed: 0,component_1,component_2
Cannot beat the real thing,-0.69747,0.71661
don’t ever get the idea I am a poet you can see me,0.90037,0.43512
at the racetrack any day half drunk,-0.96392,0.26621
betting quarters sidewheelers and straight thoroughs,0.84119,-0.54074
but let me tell you there are some women there,-0.95226,-0.30528


In [29]:
df4 = df3.nlargest(573, 'component_1')
df4.head()

Unnamed: 0,component_1,component_2
the impossibility the impossibility,1.0,-0.0
poison mushrooms its a bad time,1.0,-0.0
out of the arms of one love,1.0,-0.0
they run out the trash bins,1.0,-0.0
are they,1.0,-0.0


In [30]:
df4 = df4.reset_index()
mask1 = (df4['index'] == "Cannot beat the real thing")
df4[mask1].index[0]

413

In [33]:
df4.iloc[414]['index']

'be watched'

In [35]:
df4.iloc[412]['index']

'pages'

In [36]:
df4.iloc[413]['index']

'Cannot beat the real thing'

In [65]:
def generator(text):
    
    df = pd.read_excel('./data/bukowski1.xlsx')
    X = df['verses']
    pos = [text]
    for x in X:
        for punc in string.punctuation:
            x = x.replace(punc,'')
        pos.append(x)
    nlp.fit(pos)
    pos_matrix = nlp.matrix
    pos_columns = nlp.columns
    lsa = TruncatedSVD(2, algorithm = 'arpack')
    dtm_lsa = lsa.fit_transform(pos_matrix)
    dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)
    pd.DataFrame(lsa.components_.round(5),
                 index = ["component_1","component_2"],columns = pos_columns)
    df3 = pd.DataFrame(dtm_lsa.round(5), index = pos, columns = ["component_1","component_2" ])
    df4 = df3.nlargest(len(df3), 'component_1')
    df4 = df4.reset_index()
    mask1 = (df4['index'] == text)
    n = df4[mask1].index[0]
    df5 = df3.nlargest(len(df3), 'component_2')
    df5 = df5.reset_index()
    mask5 = (df5['index'] == text)
    m = df5[mask5].index[0]

    return (df4.iloc[n+1]['index'])

In [100]:
generator("Cannot beat the real thing")

' its the worst'

In [103]:
generator("People. Passion. Possibilities")

' avocados tomatoes cucumbers '

In [97]:
generator("just do it")

' out of there'

In [80]:
generator("don't be evil")

' just then the supervisor walked up and said'

In [84]:
generator("Cannot beat the real thing")

' I have died too many times'

In [85]:
input

<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x10eafd668>>

In [92]:
def verse():
    x = input()
    return generator(x)

In [105]:
verse()

don't leave home without it


' and a potbelly'