In [7]:
from tokenizer_Gilles import *
import pandas as pd
import string
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import PorterStemmer

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
import pickle


class nlp_preprocessor:
   
    def __init__(self, vectorizer=CountVectorizer(), tokenizer=None, 
                 cleaning_function=None, stemmer=None):
        """
        A class for pipelining our data in NLP problems. The user provides a series of 
        tools, and this class manages all of the training, transforming, and modification
        of the text data.
        ---
        Inputs:
        vectorizer: the model to use for vectorization of text data
        tokenizer: The tokenizer to use, if none defaults to split on spaces
        cleaning_function: how to clean the data, if None, defaults to the in built class
        """
        if not tokenizer:
            tokenizer = self.splitter
        if not cleaning_function:
            cleaning_function = self.clean_text
        self.stemmer = stemmer
        self.tokenizer = tokenizer
        #self.model = model
        self.cleaning_function = cleaning_function
        self.vectorizer = vectorizer
        self._is_fit = False
        
    def splitter(self, text):
        """
        Default tokenizer that splits on spaces naively
        """
        return text.split(' ')
        
    def clean_text(self, text, tokenizer, stemmer):
        """
        A naive function to lowercase all works can clean them quickly.
        This is the default behavior if no other cleaning function is specified
        """
        cleaned_text = []
        for post in text:
            cleaned_words = []
            for word in tokenizer(post):
                low_word = word.lower()
                if stemmer:
                    low_word = stemmer.stem(low_word)
                cleaned_words.append(low_word)
            cleaned_text.append(' '.join(cleaned_words))
        return cleaned_text
    
    def fit(self, text):
        """
        Cleans the data and then fits the vectorizer with
        the user provided text
        """
        clean_text = self.cleaning_function(text, self.tokenizer, self.stemmer)
        self.vectorizer.fit(clean_text)
        self._is_fit = True
        
    def transform(self, text):
        """
        Cleans any provided data and then transforms the data into
        a vectorized format based on the fit function. Returns the
        vectorized form of the data.
        """
        if not self._is_fit:
            raise ValueError("Must fit the models before transforming!")
        clean_text = self.cleaning_function(text, self.tokenizer, self.stemmer) # ????????? not defined here
        return self.vectorizer.transform(clean_text)
    
    def save_pipe(self, filename):
        """
        Writes the attributes of the pipeline to a file
        allowing a pipeline to be loaded later with the
        pre-trained pieces in place.
        """
        if type(filename) != str:
            raise TypeError("filename must be a string")
        pickle.dump(self.__dict__, open(filename+".mdl",'wb'))
        
    def load_pipe(self, filename):
        """
        Writes the attributes of the pipeline to a file
        allowing a pipeline to be loaded later with the
        pre-trained pieces in place.
        """
        if type(filename) != str:
            raise TypeError("filename must be a string")
        if filename[-4:] != '.mdl':
            filename += '.mdl'
        self.__dict__ = pickle.load(open(filename,'rb'))

In [23]:
from nltk.corpus import stopwords
from nltk.stem import porter
import string
stemmer = porter.PorterStemmer()

stopwords = stopwords.words()
def cleaning_text(text):
    cleaned_text = []
    for post in text:
        cleaned_words = []
        post = post.replace("’",'')
        for punc in string.punctuation:
            post = post.replace(punc,'')
#        print("post2",post)
        for word in post.split():
            low_word = stemmer.stem(word.lower())
#            print("low_word",low_word)
            if low_word not in stopwords:
                cleaned_words.append(low_word)
        print("cleaned_words",cleaned_words)
        cleaned_text.append(' '.join(cleaned_words))
    return cleaned_text

clean_text(['BOB the 123builder', 'is a strange', 'caRtoon type thing'])

cleaned_words ['bob', '123builder']
cleaned_words ['strang']
cleaned_words ['cartoon', 'type', 'thing']


['bob 123builder', 'strang', 'cartoon type thing']

In [24]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import PorterStemmer

nlp = nlp_preprocessor(vectorizer=TfidfVectorizer(min_df=0.3, max_df=0.8), 
                       cleaning_function=cleaning_text, 
                       tokenizer=TreebankWordTokenizer().tokenize, stemmer=PorterStemmer())

In [17]:
df = pd.read_excel('./data/aug18slogan.xlsx')

In [18]:
df.head()

Unnamed: 0,COMPANY,SLOGAN,SectorId,URL,Telecommunication Services,GICS_SubIndustry,WORDS,Unnamed: 7
0,3M Company,Science. Applied to life.,20,3M Company,Industrials,Industrial Conglomerates,4.0,
1,A&P,"At the A&P, we watch our P's and Q's.",30,A&P,Consumer Staples,,,
2,Abbott Laboratories,Life. To the fullest.,35,Abbott Laboratories,Health Care,Health Care Equipment,4.0,
3,AbbVie,People. Passion. Possibilities.,35,AbbVie Inc.,Health Care,Pharmaceuticals,3.0,
4,ABN AMRO Bank,Making More Possible,40,,Financials,,,


In [19]:
X = df['SLOGAN']
y = df['SectorId']

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [25]:
nlp.fit(X)

TypeError: cleaning_text() takes 1 positional argument but 3 were given

In [25]:
df['CSLOGANS'] = clean_text(X)

In [26]:
df.to_csv('toto.csv')