In [1]:
import numpy as np
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
import string

from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.util import ngrams
from nltk.tokenize import RegexpTokenizer

from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer

class text_processor:

    def __init__(self, remover_function=None, tokenizer_function=None, 
                 cleaning_function=None, stemmer_function=None,
                     vectorizer_function = CountVectorizer()):
        self.remover = remover_function
        self.tokenizer = tokenizer_function
        self.cleaner = cleaning_function
        self.stemmer = stemmer_function
        self.vectorizer = vectorizer_function


        if remover_function == 'no_punctuation':
            self.remover = self.no_punctuation
        if tokenizer_function == 'tk_word':
            self.tokenizer = self.tk_word
        if not tokenizer_function:
            self.tokenizer = self.splitter
        if cleaning_function == 'lowstem':
            self.cleaner = self.lowstem
                
   # cleaning functions

    def lower(self,X):
        sentences = []
        for sentence in X:
            sentences.append(sentence.lower()) 
        return sentences


    def no_punctuation(self,X):
    # remove the punctuation
        pos = []
        for sentence in X:
            for punc in string.punctuation:
                sentence = sentence.replace(punc,'')
            pos.append(sentence)
        return pos
    
 # tokenizer functions   
    
    def tk_word(self,X):
        vocabulary = []
        for x in X:
            vocabulary.append(word_tokenize(x)) 
        return vocabulary        
    
    
    def splitter(self, text):
        """
        Default tokenizer that splits on spaces naively
        """
        return text.split(' ')

   # stemmer function
    

    def stem(self,X):
        stemmed = []
        for word in (X):
            stem_word = stemmer.stem(word)
            stemmed.append(stem_word)
        return stemmed


    
    
    # vectorizing function
    def vectorize(self, X):
        self.vectorizer.fit(X)
        self.columns=self.vectorizer.get_feature_names()
        return self.vectorizer.transform(X).toarray()
        
        
    def fit(self,X):
        clear_text = self.remover(X)
        clear_text = self.lower(clear_text)
#        clear_text = self.stem(clear_text)
        self.matrix = self.vectorize(clear_text)
   
 

In [3]:
nlp = text_processor(remover_function='no_punctuation',tokenizer_function = 'tk_word'
                    ,stemmer_function = PorterStemmer,
#                    vectorizer_function=TfidfVectorizer(min_df=0.3, max_df=0.8))
                    vectorizer_function=TfidfVectorizer(min_df=0, max_df=1))

In [4]:
slogans = ['extending and enhancing human life', 
           'We will Be There When The Light Goes On', 'Small Business, rejoice']
bukowski = ['the impossibility of being human','moving this little bit of light toward us']
ensemble = ['extending and enhancing human life', 
           'We will Be There When The Light Goes On', 'Small Business, rejoice',
            'the impossibility of being human','moving this little bit of light toward us']

In [5]:
nlp.fit(slogans)
slogans_matrix = nlp.matrix
slogans_columns = nlp.columns

In [6]:
nlp.fit(bukowski)
bukowski_matrix = nlp.matrix
bukowski_columns = nlp.columns

In [7]:
ensemble = bukowski + slogans
nlp.fit(ensemble)
ensemble_matrix = nlp.matrix
ensemble_columns = nlp.columns

In [8]:
ensemble_columns

['and',
 'be',
 'being',
 'bit',
 'business',
 'enhancing',
 'extending',
 'goes',
 'impossibility',
 'life',
 'little',
 'moving',
 'on',
 'rejoice',
 'small',
 'there',
 'this',
 'toward',
 'us',
 'we',
 'when',
 'will']

In [9]:
ensemble_matrix

array([[0.        , 0.        , 0.70710678, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.70710678, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.40824829, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.40824829, 0.40824829, 0.        , 0.        , 0.        ,
        0.        , 0.40824829, 0.40824829, 0.40824829, 0.        ,
        0.        , 0.        ],
       [0.5       , 0.        , 0.        , 0.        , 0.        ,
        0.5       , 0.5       , 0.        , 0.        , 0.5       ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.37796447, 0.        , 0.        , 0.        ,
        0.       

In [10]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
lsa = TruncatedSVD(2, algorithm = 'arpack')
dtm_lsa = lsa.fit_transform(ensemble_matrix)
dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)

In [11]:
lsa.explained_variance_ratio_

array([0.18098099, 0.20284924])

In [12]:
import pandas as pd
pd.DataFrame(lsa.components_.round(5),
             index = ["component_1","component_2"],columns = ensemble_columns)

Unnamed: 0,and,be,being,bit,business,enhancing,extending,goes,impossibility,life,...,on,rejoice,small,there,this,toward,us,we,when,will
component_1,0.11933,-0.03403,0.04327,4e-05,0.55713,0.11933,0.11933,-0.03403,0.04327,0.11933,...,-0.03403,0.55713,0.55713,-0.03403,4e-05,4e-05,4e-05,-0.03403,-0.03403,-0.03403
component_2,0.33811,0.25388,-0.20508,-0.00026,-0.04976,0.33811,0.33811,0.25388,-0.20508,0.33811,...,0.25388,-0.04976,-0.04976,0.25388,-0.00026,-0.00026,-0.00026,0.25388,0.25388,0.25388


In [13]:
df3 = pd.DataFrame(dtm_lsa.round(5), index = ensemble, columns = ["component_1","component_2" ])

In [29]:
df3 = df3.reset_index()

In [30]:
df3.head()

Unnamed: 0,index,component_1,component_2
0,the impossibility of being human,0.20644,-0.97846
1,moving this little bit of light toward us,0.1493,-0.98879
2,extending and enhancing human life,0.33281,0.943
3,We will Be There When The Light Goes On,-0.13284,0.99114
4,"Small Business, rejoice",0.99604,-0.08895


In [32]:
mask1 = (df3['index'] == "moving this little bit of light toward us")

In [35]:
df3[mask1].index[0]

1

In [31]:
df3.nlargest(2, 'component_1')

Unnamed: 0,index,component_1,component_2
4,"Small Business, rejoice",0.99604,-0.08895
2,extending and enhancing human life,0.33281,0.943


In [41]:
df3.iloc[1]['index']

'moving this little bit of light toward us'

In [16]:
df3.nlargest(2, 'component_2')

Unnamed: 0,component_1,component_2
We will Be There When The Light Goes On,-0.13284,0.99114
extending and enhancing human life,0.33281,0.943


In [17]:
df4=df3.nlargest(2, 'component_2')

In [18]:
df4.index[0]

'We will Be There When The Light Goes On'

In [19]:
df4.index[1]

'extending and enhancing human life'

In [20]:
int((len(df4.index[1].split())+len(df4.index[1].split()))/2)

5

In [21]:
import nltk

In [22]:
nltk.pos_tag(word_tokenize(df4.index[1]))

[('extending', 'VBG'),
 ('and', 'CC'),
 ('enhancing', 'VBG'),
 ('human', 'JJ'),
 ('life', 'NN')]

In [23]:
nltk.pos_tag(word_tokenize(df4.index[0]))

[('We', 'PRP'),
 ('will', 'MD'),
 ('Be', 'VB'),
 ('There', 'EX'),
 ('When', 'WRB'),
 ('The', 'DT'),
 ('Light', 'NNP'),
 ('Goes', 'NNP'),
 ('On', 'IN')]

In [24]:
df = pd.read_excel('./data/aug18slogan.xlsx')

In [25]:
X = df['SLOGAN']
pos = ['the impossibility of being human']
for x in X:
    for punc in string.punctuation:
        x = x.replace(punc,'')
    pos.append(x)

In [26]:
nlp.fit(pos)
pos_matrix = nlp.matrix
pos_columns = nlp.columns

In [27]:
pos_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [28]:
pos_columns

['130',
 '160',
 '1888',
 '1897',
 '30',
 '50',
 '99',
 'abc',
 'absolutely',
 'accelerate',
 'achieve',
 'acquisitions',
 'act',
 'action',
 'add',
 'advance',
 'advances',
 'advantage',
 'advertise',
 'advisor',
 'aep',
 'after',
 'agencies',
 'aggression',
 'aimco',
 'alleviating',
 'alone',
 'alongside',
 'ambition',
 'american',
 'animals',
 'anticipate',
 'ap',
 'apart',
 'applied',
 'apply',
 'applying',
 'appreciating',
 'arbys',
 'arent',
 'arizona',
 'assurant',
 'authoritative',
 'auto',
 'awake',
 'baby',
 'bad',
 'balance',
 'banking',
 'banks',
 'banquet',
 'bar',
 'bean',
 'beanz',
 'beautify',
 'beer',
 'begins',
 'behind',
 'believing',
 'benefits',
 'bestseller',
 'bettering',
 'between',
 'birth',
 'bond',
 'born',
 'boy',
 'breakfast',
 'breakthrough',
 'breath',
 'breathe',
 'brighter',
 'brilliance',
 'brilliant',
 'brings',
 'bubbles',
 'buid',
 'builder',
 'burger',
 'but',
 'button',
 'buying',
 'cabinets',
 'cable',
 'call',
 'camel',
 'canthe',
 'capture',
 '