In [1]:
import numpy as np
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
import string

from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.util import ngrams
from nltk.tokenize import RegexpTokenizer

from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer

class text_processor:
    """Minimal text pre-processing pipeline that ends in a document-term matrix.

    ``fit`` strips punctuation (when a remover is configured), lower-cases the
    documents and hands them to the vectorizer. The tokenizer, cleaner and
    stemmer are stored on the instance and exposed as helpers, but ``fit``
    does not call them.

    Attributes set by ``fit``:
        matrix: dense array returned by the vectorizer.
        columns: list of feature names, aligned with the matrix columns.
    """

    def __init__(self, remover_function=None, tokenizer_function=None,
                 cleaning_function=None, stemmer_function=None,
                 vectorizer_function=None):
        """Store the pipeline components.

        Args:
            remover_function: callable taking and returning a list of
                documents, or the shortcut string ``'no_punctuation'``.
            tokenizer_function: callable, or the shortcut ``'tk_word'``;
                a falsy value selects the naive ``splitter``.
            cleaning_function: callable, or the shortcut ``'lowstem'``.
            stemmer_function: object exposing ``stem(word)``, or a class
                that builds one when called without arguments.
            vectorizer_function: object exposing ``fit_transform`` and a
                feature-name accessor. ``None`` creates a fresh
                ``CountVectorizer`` for this instance.
        """
        self.remover = remover_function
        self.tokenizer = tokenizer_function
        self.cleaner = cleaning_function
        self.stemmer = stemmer_function

        # A default built in the signature would be created once and shared
        # (and refitted) by every instance; build it per instance instead.
        if vectorizer_function is None:
            vectorizer_function = CountVectorizer()
        self.vectorizer = vectorizer_function

        # Resolve the string shortcuts to the matching bound methods.
        if remover_function == 'no_punctuation':
            self.remover = self.no_punctuation
        if tokenizer_function == 'tk_word':
            self.tokenizer = self.tk_word
        if not tokenizer_function:
            self.tokenizer = self.splitter
        if cleaning_function == 'lowstem':
            self.cleaner = self.lowstem

    # stemmer function

    def stem(self, X):
        """Return the stem of every word in ``X`` (an iterable of words).

        Raises:
            ValueError: if no stemmer was configured.
        """
        if self.stemmer is None:
            raise ValueError("no stemmer configured; pass stemmer_function")
        if isinstance(self.stemmer, type):
            # A class was supplied instead of an instance: instantiate it
            # once and keep the instance for later calls.
            self.stemmer = self.stemmer()
        return [self.stemmer.stem(word) for word in X]

    # cleaning functions

    def lower(self, X):
        """Return the documents of ``X`` lower-cased."""
        return [sentence.lower() for sentence in X]

    def lowstem(self, X):
        """Lower-case each document and stem its space-separated words.

        Backs the ``'lowstem'`` shortcut accepted by ``__init__``.
        """
        return [' '.join(self.stem(sentence.split(' ')))
                for sentence in self.lower(X)]

    def no_punctuation(self, X):
        """Return the documents of ``X`` with ``string.punctuation`` removed."""
        # One translation table deletes every punctuation character in a
        # single pass per document.
        strip_table = str.maketrans('', '', string.punctuation)
        return [sentence.translate(strip_table) for sentence in X]

    # tokenizer functions

    def tk_word(self, X):
        """Tokenize every document of ``X`` with ``word_tokenize``."""
        return [word_tokenize(x) for x in X]

    def splitter(self, text):
        """
        Default tokenizer that splits on spaces naively
        """
        return text.split(' ')

    # vectorizing function

    def vectorize(self, X):
        """Fit the vectorizer on ``X`` and return the dense matrix.

        Also stores the feature names in ``self.columns`` as a list of str.
        """
        sparse = self.vectorizer.fit_transform(X)
        # Newer vectorizers expose get_feature_names_out; fall back to the
        # older accessor when it is the only one available.
        if hasattr(self.vectorizer, 'get_feature_names_out'):
            names = self.vectorizer.get_feature_names_out()
        else:
            names = self.vectorizer.get_feature_names()
        self.columns = [str(name) for name in names]
        return sparse.toarray()

    def fit(self, X):
        """Clean the documents in ``X`` and store the resulting matrix.

        Sets ``self.matrix`` and ``self.columns``; stemming is not applied.
        """
        # The remover is optional: without one the raw documents are used.
        clear_text = self.remover(X) if self.remover is not None else list(X)
        clear_text = self.lower(clear_text)
        self.matrix = self.vectorize(clear_text)
   
 

In [3]:
# Configure the pipeline used by the cells below.
nlp = text_processor(
    remover_function='no_punctuation',
    tokenizer_function='tk_word',
    # Pass a stemmer instance: PorterStemmer.stem is an instance method.
    stemmer_function=PorterStemmer(),
    vectorizer_function=TfidfVectorizer(min_df=0.1, max_df=0.8),
)

In [4]:
# Two small toy corpora, plus their concatenation (slogans first).
slogans = [
    'extending and enhancing human life',
    'We will Be There When The Light Goes On',
    'Small Business, rejoice',
]
bukowski = [
    'the impossibility of being human',
    'moving this little bit of light toward us',
]
ensemble = slogans + bukowski

In [5]:
# Fit on the slogans only and keep the resulting matrix and feature names.
nlp.fit(slogans)
slogans_matrix, slogans_columns = nlp.matrix, nlp.columns

In [6]:
# Refit the same pipeline on the bukowski lines and keep its results.
nlp.fit(bukowski)
bukowski_matrix, bukowski_columns = nlp.matrix, nlp.columns

In [7]:
# Combined corpus, bukowski lines first; the row order of ensemble_matrix follows it.
ensemble = [*bukowski, *slogans]
nlp.fit(ensemble)
ensemble_matrix, ensemble_columns = nlp.matrix, nlp.columns

In [8]:
# Show the feature names captured when the pipeline was fitted on `ensemble`.
ensemble_columns

['and',
 'be',
 'being',
 'bit',
 'business',
 'enhancing',
 'extending',
 'goes',
 'human',
 'impossibility',
 'life',
 'light',
 'little',
 'moving',
 'of',
 'on',
 'rejoice',
 'small',
 'the',
 'there',
 'this',
 'toward',
 'us',
 'we',
 'when',
 'will']

In [9]:
# Show the dense matrix from the `ensemble` fit: one row per document, one column per feature name.
ensemble_matrix

array([[0.        , 0.        , 0.50297966, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.40580082, 0.50297966,
        0.        , 0.        , 0.        , 0.        , 0.40580082,
        0.        , 0.        , 0.        , 0.40580082, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.37007017, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.29857028, 0.37007017, 0.37007017, 0.29857028,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.37007017, 0.37007017, 0.37007017, 0.        , 0.        ,
        0.        ],
       [0.46369322, 0.        , 0.        , 0.        , 0.        ,
        0.46369322, 0.46369322, 0.        , 0.37410477, 0.        ,
        0.46369322, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
      

In [10]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

# Reduce the document-term matrix to two components.
lsa = TruncatedSVD(n_components=2, algorithm='arpack')
dtm_lsa = lsa.fit_transform(ensemble_matrix)
# Normalise each document row; copy=False lets the normaliser work in place.
dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)

In [11]:
# Show the explained-variance ratio reported by the fitted TruncatedSVD, one value per component.
lsa.explained_variance_ratio_

array([0.07624557, 0.27197697])

In [12]:
import pandas as pd

# One row per component, one column per feature name, rounded to 5 decimals.
pd.DataFrame(
    data=lsa.components_.round(5),
    index=["component_1", "component_2"],
    columns=ensemble_columns,
)

Unnamed: 0,and,be,being,bit,business,enhancing,extending,goes,human,impossibility,...,rejoice,small,the,there,this,toward,us,we,when,will
component_1,0.16308,0.13859,0.29525,0.15263,0.0,0.16308,0.16308,0.13859,0.36978,0.29525,...,0.0,0.0,0.35002,0.13859,0.15263,0.15263,0.15263,0.13859,0.13859,0.13859
component_2,0.3394,-0.15658,0.08931,-0.16285,-0.0,0.3394,0.3394,-0.15658,0.34588,0.08931,...,-0.0,-0.0,-0.05428,-0.15658,-0.16285,-0.16285,-0.16285,-0.15658,-0.15658,-0.15658


In [13]:
# Document coordinates on the two components, indexed by the sentence text.
df3 = pd.DataFrame(
    data=dtm_lsa.round(5),
    index=ensemble,
    columns=["component_1", "component_2"],
)

In [14]:
# The two sentences with the highest component_1 value.
df3.nlargest(n=2, columns='component_1')

Unnamed: 0,component_1,component_2
the impossibility of being human,0.97009,0.24273
moving this little bit of light toward us,0.74975,-0.66173


In [15]:
# The two sentences with the highest component_2 value.
df3.nlargest(n=2, columns='component_2')

Unnamed: 0,component_1,component_2
extending and enhancing human life,0.50227,0.86471
the impossibility of being human,0.97009,0.24273


In [16]:
# Keep the two sentences ranked highest on component_2 for the cells below.
df4 = df3.nlargest(n=2, columns='component_2')

In [17]:
# Text of the first row of df4 (the index holds the sentences).
df4.index[0]

'extending and enhancing human life'

In [18]:
# Text of the second row of df4.
df4.index[1]

'the impossibility of being human'

In [19]:
# Mean whitespace-separated word count of the two sentences in df4
# (row 0 and row 1), truncated to an int.
int((len(df4.index[0].split()) + len(df4.index[1].split())) / 2)

5

In [20]:
import nltk

In [21]:
# Tokenize the second sentence in df4 and tag each token with its part of speech.
nltk.pos_tag(word_tokenize(df4.index[1]))

[('the', 'DT'),
 ('impossibility', 'NN'),
 ('of', 'IN'),
 ('being', 'VBG'),
 ('human', 'JJ')]

In [22]:
# Tokenize the first sentence in df4 and tag each token with its part of speech.
nltk.pos_tag(word_tokenize(df4.index[0]))

[('extending', 'VBG'),
 ('and', 'CC'),
 ('enhancing', 'VBG'),
 ('human', 'JJ'),
 ('life', 'NN')]

In [23]:
# Load the slogan spreadsheet; the path is relative to the working directory.
df = pd.read_excel('./data/aug18slogan.xlsx')

In [25]:
# Select the SLOGAN column as the documents to feed to the pipeline.
# NOTE(review): assumes every entry is a string — confirm there are no missing values.
X = df['SLOGAN']

In [26]:
# Refit the pipeline on the spreadsheet slogans and keep the matrix and feature names.
nlp.fit(X)
X_matrix, X_columns = nlp.matrix, nlp.columns