# This notebook creates the base code for an app to pick a bukowski poem verse to associate with a slogan 
## -1- creating a class for a pipeline
## -2- creating a function to call the class
## -3- enter the slogan --> return the associated verse 


In [1]:
# start -1- creating a class for a pipeline

In [2]:
# libraries needed in the notebook
import numpy as np
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

In [3]:
# libraries needed in the class
import string

from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.util import ngrams
from nltk.tokenize import RegexpTokenizer

from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer

class text_processor:

    def __init__(self, remover_function=None, tokenizer_function=None, 
                 cleaning_function=None, stemmer_function=None,
                     vectorizer_function = CountVectorizer()):
        self.remover = remover_function
        self.tokenizer = tokenizer_function
        self.cleaner = cleaning_function
        self.stemmer = stemmer_function
        self.vectorizer = vectorizer_function
# The variables in the init are to:
#        remove characters from the text, 
#        tokenize the text, 
#        clean the text(typically lowercase but could be other actions like pos)
#        stem the words
#        vectorize
#  the values go into variables to be used later in the class
#  below are some test to decided which function to use in the class
        if remover_function == 'no_punctuation':
            self.remover = self.no_punctuation
        if tokenizer_function == 'tk_word':
            self.tokenizer = self.tk_word
        if not tokenizer_function:
            self.tokenizer = self.splitter
        if cleaning_function == 'lowstem':
            self.cleaner = self.lowstem
                
   # cleaning functions

    def lower(self,X):
        sentences = []
        for sentence in X:
            sentences.append(sentence.lower()) 
        return sentences


    def no_punctuation(self,X):
    # remove the punctuation
        pos = []
        for sentence in X:
            for punc in string.punctuation:
                sentence = sentence.replace(punc,'')
            pos.append(sentence)
        return pos
    
 # tokenizer functions   
    
    def tk_word(self,X):
        vocabulary = []
        for x in X:
            vocabulary.append(word_tokenize(x)) 
        return vocabulary        
    
    
    def splitter(self, text):
        """
        Default tokenizer that splits on spaces naively
        """
        return text.split(' ')

   # stemmer function
    

    def stem(self,X):
        stemmed = []
        for word in (X):
            stem_word = stemmer.stem(word)
            stemmed.append(stem_word)
        return stemmed


    
    
    # vectorizing function. Decide to do the fit and the run of it
    def vectorize(self, X):
        self.vectorizer.fit(X)
        self.columns=self.vectorizer.get_feature_names()  # the names of all the words used in vectorization
        return self.vectorizer.transform(X).toarray()
        
     # the fit covers text changes to vectorization of it.   
    def fit(self,X):
        clear_text = self.remover(X)
        clear_text = self.lower(clear_text)
#        clear_text = self.stem(clear_text)
        self.matrix = self.vectorize(clear_text)
 # i create my matrix of 0 and 1  

In [4]:
# end -1- creating a class for a pipeline

In [5]:
# start -2- creating a function to use the class
# and to find the bukowski verse the closest to the slogan on principal component one of the LSA results

In [6]:
def generator(text):
    
    nlp = text_processor(remover_function='no_punctuation',tokenizer_function = 'tk_word'
                    ,stemmer_function = PorterStemmer,
# this removes most records !                    vectorizer_function=TfidfVectorizer(min_df=0.3, max_df=0.8))
                    vectorizer_function=TfidfVectorizer(min_df=0, max_df=1))
    
    df = pd.read_excel('./data/bukowski1.xlsx')
    X = df['verses']
# got data from around 30 Bukowski poems on line and paste them in the excel
 # got them in a dataframe    
    pos = [text]
    for x in X:
        for punc in string.punctuation:
            x = x.replace(punc,'')
        pos.append(x)
# put them all in one list
# create the class that vectorize everything and gives me the columns and the matrix of 0/1
    nlp.fit(pos)
    pos_matrix = nlp.matrix
    pos_columns = nlp.columns
# call LSA   
    lsa = TruncatedSVD(2, algorithm = 'arpack')
    dtm_lsa = lsa.fit_transform(pos_matrix)
    dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)
# Load results in dataframe
    pd.DataFrame(lsa.components_.round(5),
                 index = ["component_1","component_2"],columns = pos_columns)
    df3 = pd.DataFrame(dtm_lsa.round(5), index = pos, columns = ["component_1","component_2" ])
# create another dataframe ordered by component 1 len(df3) is keep all 
    df4 = df3.nlargest(len(df3), 'component_1')
#   df index is the verses and one slogan. we reindex to get all the line numbered be
#   cause we want to take the verse following the slogan in the list
    df4 = df4.reset_index()
    mask1 = (df4['index'] == text)
    n = df4[mask1].index[0]    # this is the position of the slogan ordered in component 1 list
    df5 = df3.nlargest(len(df3), 'component_2')
    df5 = df5.reset_index()
    mask5 = (df5['index'] == text)
    m = df5[mask5].index[0]   #  this is the position of the slogan ordered in component 2 list
# I chose only one. somehow component 1 looked the best
    return (df4.iloc[n+1]['index']) 
# this return the verse (column name is index at position)
#(n+1 is the position of the slogan+1 for the 1st item after the slogan

In [7]:
def verse():
    x = input()
    return generator(x)
# basic function to enter a slogan ... or whatever ... and return a verse

In [9]:
verse()  # calling the function

first in food


' I decided never to become an American'