In [19]:
import numpy as np
from keras.models import load_model
from xgboost import XGBClassifier
import xgboost as xgb
import pickle

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

In [20]:
# Load Models
# Keras network saved in HDF5 format.
nn_model = load_model('nn_sentiment_model.h5')
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load 'xgboost_1.dat' if it comes from a trusted source.
# The context manager closes the file handle as soon as the model is loaded
# (the previous bare open() left it open until garbage collection).
with open('xgboost_1.dat', 'rb') as xgb_file:
    xgb_model = pickle.load(xgb_file)

## Preprocessing

In [21]:
class PreProcessor:
    '''
    Cleans a single sentence (tweet) for the sentiment models.

    Pipeline: tokenize -> drop stopwords and URL fragments -> stem ->
    join back into one space-separated string.

    State kept on the instance:
      * sentence     - the most recent raw input string
      * stopwords    - set of English stopwords from NLTK
      * stemmer      - English Snowball stemmer
      * preprocessed - list of cleaned strings; every preprocess()/demo()
                       call APPENDS to it, so it grows across calls until
                       reset() is called.
    '''
    def __init__(self):
        self.sentence = ''
        self.stopwords = set(stopwords.words('english'))
        self.stemmer = SnowballStemmer("english")
        self.preprocessed = []

    def tokenize(self, sentence):
        '''
        Splits the sentence into a list of word/punctuation tokens
        using NLTK's word_tokenize.
        '''
        return word_tokenize(sentence)

    def remove_stopwords(self, sentence):
        '''
        Filters a list of tokens, dropping:
          * stopwords like 'a', 'the', 'and', etc.
          * single-character tokens (e.g. punctuation)
          * tokens starting with '//' and the token 'https'
            (presumably leftovers of tokenized URLs)

        The stopword comparison is case-sensitive, exactly as before;
        it is intentionally left unchanged so the output keeps matching
        whatever preprocessing the loaded models were trained with.
        '''
        return [
            w for w in sentence
            if w not in self.stopwords
            and len(w) > 1
            and w[:2] != '//'
            and w != 'https'
        ]

    def stem(self, sentence):
        '''
        Stems each token to its root form.
        For example, words like 'computer', 'computation'
        all get truncated to 'comput'.
        '''
        return [self.stemmer.stem(word) for word in sentence]

    def join_to_string(self, sentence):
        '''
        Joins the tokens into one space-separated string.
        '''
        return ' '.join(sentence)

    def reset(self):
        '''
        Clears the stored sentence and the accumulated results so the
        instance can be reused without carrying over earlier inputs.
        '''
        self.sentence = ''
        self.preprocessed = []

    def preprocess(self, sentence=None):
        '''
        Runs the full pipeline on one sentence and appends the cleaned
        string to self.preprocessed.

        sentence: optional raw string. When given, it replaces
                  self.sentence; when omitted, the sentence already
                  stored on the instance is used (original behaviour).

        Returns self.preprocessed, i.e. the list of ALL cleaned strings
        produced so far by this instance, not just the latest one.
        '''
        if sentence is not None:
            self.sentence = sentence
        tokenized = self.tokenize(self.sentence)
        cleaned = self.remove_stopwords(tokenized)
        stemmed = self.stem(cleaned)
        joined = self.join_to_string(stemmed)
        self.preprocessed.append(joined)
        return self.preprocessed

    def demo(self, sentence=None):
        '''
        Preprocesses one sentence and returns the accumulated results.

        sentence: optional raw string. When omitted, the sentence is
                  read interactively with input() (original behaviour).
                  Passing it explicitly keeps the notebook reproducible
                  under "Restart & Run All".
        '''
        if sentence is None:
            sentence = input()
        return self.preprocess(sentence)

In [22]:
# Create the preprocessor instance; its constructor loads the English stopword set and Snowball stemmer.
pp = PreProcessor()

In [24]:
# Interactive demo: demo() reads one sentence via input(), preprocesses it,
# and returns the instance's accumulated list of preprocessed strings.
pp.demo()

Hey, who are you? I have been waiting for hours!


['hey wait hour']

In [25]:
# Inspect the preprocessed strings stored on the instance (grows by one entry per preprocess()/demo() call).
pp.preprocessed

['hey wait hour']