#### Use the conda `base` environment: NLTK is not installed in the `tensorflow` conda environment

In [6]:
import functools
import string

import nltk
import pandas as pd
from nltk import WordNetLemmatizer
from nltk import pos_tag
from nltk import sent_tokenize
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from sklearn.base import BaseEstimator, TransformerMixin

ImportError: No module named 'nltk'

In [20]:
import csv
import numpy as np
import gc
# Force a garbage-collection pass before loading the data set.
gc.collect()

# Read the whole CSV into a list of rows; row 0 is later used as the header.
# The `with` block closes the file handle once every row has been read, and
# newline="" lets the csv module handle line breaks inside quoted fields itself.
with open("./meetup_sane.csv", newline="") as csv_file:
    reader = csv.reader(csv_file, delimiter=",")
    meetup = list(reader)

#### Define the Text Computation Pre Processor Class
###### The transform method 
takes a list of documents (given as the variable, X) and returns a new list of tokenized documents, where each document is transformed into list of ordered tokens.
    
###### The tokenize method 
breaks raw strings into sentences, then breaks those sentences into words and punctuation, 
and applies a part of speech tag. The token is then normalized: made lower case,
then stripped of whitespace and other types of punctuation that may be appended. 
If the token is a stopword or if every character is punctuation, the token is ignored. 
If it is not ignored, the part of speech is used to lemmatize the token, which is then yielded.

                
###### The Lemmatization method 
is the process of looking up a single word form from the variety of morphologic affixes that can be applied to indicate tense, plurality, gender, etc. First we need to identify the WordNet tag form based on the Penn Treebank tag, which is returned from NLTK’s standard pos_tag function. 
We simply check whether the Penn tag starts with ‘N’, ‘V’, ‘R’, or ‘J’ to identify whether the token is a noun, verb, adverb, or adjective (any other tag is treated as a noun). We then use the new tag to look up the lemma in the lexicon.

In [5]:
class NLTKPreprocessor(BaseEstimator, TransformerMixin):
    """Transformer that turns raw documents into lists of lemmatized tokens.

    Parameters
    ----------
    stopwords : collection of str, optional
        Tokens to drop. ``None`` selects the NLTK English stopword list;
        an empty collection disables stopword removal.
    punct : collection of str, optional
        Characters regarded as punctuation. ``None`` selects
        ``string.punctuation``; an empty collection disables the
        punctuation filter for non-empty tokens.
    lower : bool, default True
        Lower-case every token before filtering.
    strip : bool, default True
        Strip surrounding whitespace, then ``'_'``, then ``'*'`` from
        every token before filtering.
    """

    def __init__(self, stopwords=None, punct=None,
                 lower=True, strip=True):
        self.lower      = lower
        self.strip      = strip
        # Test against None rather than truthiness: `stopwords or default`
        # would silently replace an explicitly passed empty collection with
        # the defaults, making it impossible to switch a filter off.
        self.stopwords  = set(sw.words('english')) if stopwords is None else stopwords
        self.punct      = set(string.punctuation) if punct is None else punct
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        """No-op fit: nothing is learned from the data. Returns ``self``."""
        return self

    def inverse_transform(self, X):
        """Join each token list in ``X`` back into one space-separated string."""
        return [" ".join(doc) for doc in X]

    def transform(self, X):
        """Return one list of lemmas per document in ``X``, in document order."""
        return [
            list(self.tokenize(doc)) for doc in X
        ]

    def tokenize(self, document):
        """Yield the lemma of every kept token in ``document``.

        The document is split into sentences, each sentence into
        part-of-speech tagged tokens; tokens are normalized, stopwords and
        punctuation-only tokens are skipped, and the rest are lemmatized.
        """
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                if self.lower:
                    token = token.lower()
                if self.strip:
                    token = token.strip().strip('_').strip('*')

                # If stopword, ignore token and continue
                if token in self.stopwords:
                    continue

                # If punctuation, ignore token and continue.  An empty token
                # (e.g. "__" after stripping) also lands here because all()
                # over an empty string is True.
                if all(char in self.punct for char in token):
                    continue

                # Lemmatize the token and yield
                yield self.lemmatize(token, tag)

    def lemmatize(self, token, tag):
        """Lemmatize ``token`` using the WordNet POS derived from ``tag``.

        Only the first letter of the tag is inspected; tags that do not
        start with N, V, R or J fall back to the noun part of speech.
        """
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)


NameError: name 'BaseEstimator' is not defined

#### Build and Evaluate
###### The Build Method  
- takes a classifier class or instance (if given a class, it instantiates the classifier with the defaults) and creates the pipeline with that classifier and fits it. 
- The function times the build process, evaluates it via the classification report that reports precision, recall, and F1. 
- Then builds a new model on the complete dataset and writes it out to disk

*Note that when using the TfidfVectorizer you must make sure that its default preprocessor, normalizer, and tokenizer are all turned off using the identity function and passing None to the other parameters.*

In [251]:
# decorator function to time functions
import time 
def timeit(method):
    """Decorator that measures how long each call to ``method`` takes.

    The decorated callable returns a ``(result, seconds)`` tuple instead of
    the bare result, where ``seconds`` is the elapsed time of that call as a
    float.  Callers must therefore unpack two values.
    """
    # functools.wraps keeps the wrapped function's __name__ and __doc__ so
    # decorated functions stay identifiable in logs and tracebacks.
    @functools.wraps(method)
    def timed(*args, **kw):
        # perf_counter is monotonic, so the measurement cannot be skewed by
        # system clock adjustments during a long model fit.
        started = time.perf_counter()
        result = method(*args, **kw)
        elapsed = time.perf_counter() - started
        return result, elapsed
    return timed

In [1]:
import pickle 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report as clsr
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split as tts

def identity(x):
    """Return the argument unchanged.

    Handed to ``TfidfVectorizer`` as its ``tokenizer`` so that the vectorizer
    does not tokenize again after ``NLTKPreprocessor`` in the pipeline.
    """
    return x

@timeit
def build_and_evaluate(X, y, classifier=SGDClassifier, outpath=None, verbose=True):
    """Evaluate a text-classification pipeline, then refit it on all the data.

    Steps: label-encode ``y``; fit the pipeline on a random 80% split and
    print a classification report for the remaining 20%; fit a fresh pipeline
    on all of ``X``/``y``; attach the fitted label encoder to it as
    ``model.labels_``; optionally pickle it to ``outpath``.

    Parameters
    ----------
    X : sequence of str
        Raw documents.
    y : sequence
        Target label of each document.
    classifier : estimator class or instance, default SGDClassifier
        A class is instantiated with its defaults; an instance is used as is.
    outpath : str, optional
        If given, the final model is pickled to this path.
    verbose : bool, default True
        Print progress messages and fit timings.

    Returns
    -------
    The final fitted pipeline.  Because of the ``@timeit`` decorator, callers
    actually receive ``(model, seconds)``.
    """

    @timeit
    def build(classifier, X, y=None):
        """
        Inner build function that builds a single model.
        """
        if isinstance(classifier, type):
            classifier = classifier()

        model = Pipeline([
            ('preprocessor', NLTKPreprocessor()),
            ('vectorizer', TfidfVectorizer(
                tokenizer=identity, preprocessor=None, lowercase=False
            )),
            ('classifier', classifier),
        ])

        model.fit(X, y)
        return model

    # Label encode the targets
    labels = LabelEncoder()
    y = labels.fit_transform(y)

    # Begin evaluation
    if verbose:
        print("Building for evaluation")

    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
    model, secs = build(classifier, X_train, y_train)

    if verbose:
        print("Evaluation test model fit in {:0.3f} seconds".format(secs))
        print("Classification Report:\n")

    y_pred = model.predict(X_test)

    # Report only the classes that occur in the test split or the predictions
    # and pass their names in the same order.  Handing over *all* class names
    # without `labels=` pairs names with the wrong rows (or raises on newer
    # scikit-learn) whenever the random split leaves some class out.
    present = sorted(set(y_test) | set(y_pred))
    print(clsr(y_test, y_pred, labels=present,
               target_names=labels.classes_[present]))

    if verbose:
        print("Building complete model and saving ...")
    model, secs = build(classifier, X, y)
    # Keep the encoder with the model so predictions can be decoded later.
    model.labels_ = labels

    if verbose:
        print("Complete model fit in {:0.3f} seconds".format(secs))

    if outpath:
        with open(outpath, 'wb') as f:
            pickle.dump(model, f)

        print("Model written out to {}".format(outpath))

    return model




NameError: name 'timeit' is not defined

In [74]:
# One-time download of NLTK data: the stopword corpus read via
# sw.words('english'), and the 'punkt' tokenizer models (presumably what
# sent_tokenize relies on).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\salee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\salee\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [76]:
# One-time download of the tagger model (presumably the one pos_tag loads).
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\salee\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [78]:
# One-time download of the WordNet corpus (presumably needed by
# WordNetLemmatizer and the `wn` constants used in NLTKPreprocessor).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\salee\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True


###### The next stage is to create the pipeline, train a classifier, then to evaluate it
- The data is shuffled and split into a training set and a testing set
- The model is trained on the training set, and evaluated on testing.
- A new model is then fit on all of the data and saved to disk.

##### Concatenating textual data from groups and events  

In [252]:
# The first CSV row is the header; the remaining rows are the records.
mp = meetup[1:]
meetup_df = pd.DataFrame(mp, columns=meetup[0])

# Build one space-separated text document per row out of the textual columns.
TEXT_COLUMNS = ['group_name', 'category_name', 'venue_name',
                'group_description', 'event_description', 'bio']
event_text = meetup_df[TEXT_COLUMNS[0]].map(str)
for column in TEXT_COLUMNS[1:]:
    event_text = event_text + " " + meetup_df[column].map(str)
meetup_df['event_X'] = event_text

# DataFrame.as_matrix() has been removed from pandas; `.values` on a single
# column yields the same 1-D array on both old and current pandas versions.
event_X = meetup_df['event_X'].values
event_Y = meetup_df['event_name'].values

In [253]:
# Hold out 10% of the rows for a final validation pass.
X_training, X_Validation, Y_training, Y_validation = tts(event_X, event_Y, test_size=0.1)

# Location of the pickled model; defined here so this cell runs on a fresh kernel.
PATH = "simple_ml_op/model.pickle"

# build_and_evaluate is wrapped by @timeit and therefore returns
# (model, seconds); unpack so that `model` is the pipeline, not a tuple.
model, build_secs = build_and_evaluate(X_training, Y_training, outpath=PATH)

Building for evaluation
Evaluation model fit in 1134.891 seconds
Classification Report:



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


                                                                                  precision    recall  f1-score   support

                100% discount! Kintone Connect Digital Transformation Conference       1.00      1.00      1.00        12
    101: How to do Assignments, Back-2-Back, Selling LLC's & All Closing Secrets       0.00      0.00      0.00         0
                                                            11/12 Live Art Event       0.00      0.00      0.00         1
                                               2017 Behavioral Finance Symposium       0.00      0.00      0.00         1
     3DHEALS 2018 Global Healthcare 3D Printing Conference: Ecosystem Innovation       1.00      1.00      1.00       128
                                                      3rd Fridays: An Art Event!       1.00      1.00      1.00        12
                                  40 & Over Annual HALLOWEEN Party (Prepay Only)       1.00      1.00      1.00        11
                       

Complete model fit in 1452.517 seconds
Model written out to simple_ml_op/model.pickle


In [255]:
import random
# 1-D array of member bios.  DataFrame.as_matrix() has been removed from
# pandas; `.values` works on both old and current versions.
member_bio = meetup_df['bio'].values
print(random.choice(member_bio))


Lifetime student. Interested primarily in learning more about music. I can teach violin, rock climbing, and programming.


In [261]:


# NOTE: pickle.load can execute arbitrary code; only load model files that
# were written by this notebook.
with open(PATH, 'rb') as f:
    model = pickle.load(f)


yhat = model.predict([random.choice(member_bio)])
# Decode with the encoder that was fitted during training and stored on the
# model.  Fitting a new LabelEncoder on all of event_Y can assign different
# integer codes than the ones the classifier was trained on.
labels = model.labels_
print(labels.inverse_transform(yhat))


['December SF FinTech Demo Day!']


In [264]:
# Select every row whose member bio is exactly the bio sampled earlier.
sampled_bio = 'Lifetime student. Interested primarily in learning more about music. I can teach violin, rock climbing, and programming.'
bio_matches = meetup_df['bio'] == sampled_bio
zzz = meetup_df.loc[bio_matches]

In [271]:
# Check whether any matching row really belongs to the predicted event.
# Compare first, then reduce with .any(); the original compared the
# uncalled bound method `.any` with a string, which is always False.
if (zzz['event_name'] == 'December SF FinTech Demo Day!').any():
    print('sssss')

In [4]:
PATH = "simple_ml_op/model.pickle"
# NOTE: unpickling needs NLTKPreprocessor (and identity) to be defined in the
# running session first, otherwise pickle cannot resolve them on __main__.
# pickle.load can execute arbitrary code; only load files this notebook wrote.
with open(PATH, 'rb') as f:
    model = pickle.load(f)

# The model predicts integer codes while Y_validation holds the original
# event names, so decode the predictions with the encoder stored on the model
# before comparing.  With string labels on both sides the report names its
# own rows, and no separately fitted `labels` encoder is required.
y_pred = model.predict(X_Validation)
y_pred_names = model.labels_.inverse_transform(y_pred)
print(clsr(Y_validation, y_pred_names))
    

AttributeError: Can't get attribute 'NLTKPreprocessor' on <module '__main__'>