In [2]:
import matplotlib.pyplot as plt
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import string
import pickle

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def remove_punctuation(text):
  """Return `text` with every ASCII punctuation character dropped.

  Iterates over `text` one character at a time and keeps only the
  characters that are not found in `string.punctuation`. Characters outside
  that set (letters, digits, whitespace, non-ASCII symbols such as curly
  quotes) are kept unchanged.

  Args:
    text: The string to clean.

  Returns:
    A new string made of the kept characters, in their original order.
  """
  kept_chars = []
  for char in text:
    if char in string.punctuation:
      continue
    kept_chars.append(char)
  return ''.join(kept_chars)

In [4]:
class RemovePunctuationTransformer(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that strips punctuation from one column.

    Parameters
    ----------
    text_column : column label
        Label of the column in `X` that holds the raw text.
    """

    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of `X` whose text column has punctuation removed.

        `X` is expected to support `.copy()` and column indexing
        (presumably a pandas DataFrame). The input object is not modified.
        """
        column = self.text_column
        cleaned = X.copy()
        # Each cell is passed through remove_punctuation() individually.
        cleaned[column] = cleaned[column].apply(remove_punctuation)
        return cleaned

In [5]:
def tokenizer(text):
  """Lower-case `text` and split it into tokens.

  The lower-cased string is handed to `nltk.word_tokenize`, and whatever that
  call returns is passed straight back to the caller.

  Args:
    text: The string to tokenize.

  Returns:
    The token sequence produced by `nltk.word_tokenize`.
  """
  return nltk.word_tokenize(text.lower())

In [6]:
class TokenizerTransformer(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that tokenizes one text column.

    Parameters
    ----------
    text_column : column label
        Label of the column in `X` that holds the text to tokenize.
    """

    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of `X` whose text column holds tokenizer() output.

        Every cell of the column is replaced by the result of calling
        tokenizer() on it. The input object is not modified.
        """
        column = self.text_column
        tokenized = X.copy()
        tokenized[column] = tokenized[column].apply(tokenizer)
        return tokenized

In [7]:
# English stopword list loaded from NLTK's stopwords corpus.
# remove_stopwords() tests token membership against this module-level name.
stopwords = nltk.corpus.stopwords.words('english')

In [8]:
def remove_stopwords(text):
  """Drop every token that appears in the module-level `stopwords` collection.

  The comparison is an exact membership test, so it is case-sensitive: a
  token is only removed when it matches a stopword entry exactly.

  Args:
    text: An iterable of tokens.

  Returns:
    A new list with the remaining tokens in their original order.
  """
  kept_tokens = []
  for token in text:
    if token not in stopwords:
      kept_tokens.append(token)
  return kept_tokens

In [9]:
class RemoveStopwordsTransformer(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that filters stopwords from one column.

    Parameters
    ----------
    text_column : column label
        Label of the column in `X` whose cells hold token sequences.
    """

    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of `X` with stopwords removed from the token column.

        Every cell of the column is replaced by the result of calling
        remove_stopwords() on it. The input object is not modified.
        """
        column = self.text_column
        filtered = X.copy()
        filtered[column] = filtered[column].apply(remove_stopwords)
        return filtered

In [10]:
def remove_shorttokens(text):
  """Keep only the tokens that are longer than two characters.

  Args:
    text: An iterable of tokens (anything `len()` works on).

  Returns:
    A new list with the tokens of length three or more, in original order.
  """
  return list(filter(lambda token: len(token) > 2, text))

In [11]:
class RemoveShortTokensTransformer(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that drops very short tokens.

    Parameters
    ----------
    text_column : column label
        Label of the column in `X` whose cells hold token sequences.
    """

    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of `X` with short tokens removed from the token column.

        Every cell of the column is replaced by the result of calling
        remove_shorttokens() on it. The input object is not modified.
        """
        column = self.text_column
        trimmed = X.copy()
        trimmed[column] = trimmed[column].apply(remove_shorttokens)
        return trimmed

In [12]:
# Single module-level Porter stemmer instance; stemming() calls its .stem() method.
stemmer = nltk.PorterStemmer()

In [13]:
def stemming(text):
  """Stem every token with the module-level `stemmer`.

  Args:
    text: An iterable of tokens.

  Returns:
    A new list holding `stemmer.stem(token)` for each token, in order.
  """
  return list(map(stemmer.stem, text))

In [14]:
class StemmingTransformer(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that stems the tokens of one column.

    Parameters
    ----------
    text_column : column label
        Label of the column in `X` whose cells hold token sequences.
    """

    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of `X` whose token column has been stemmed.

        Every cell of the column is replaced by the result of calling
        stemming() on it. The input object is not modified.
        """
        column = self.text_column
        stemmed = X.copy()
        stemmed[column] = stemmed[column].apply(stemming)
        return stemmed

In [15]:
class  ReturnStringTransformer(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that joins token lists back into strings.

    Unlike a transformer that returns the whole frame, `transform` returns
    only the selected column, as a one-dimensional Series of strings.

    Parameters
    ----------
    text_column : column label
        Label of the column in `X` whose cells hold iterables of string tokens.
    """

    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return the text column with each cell's tokens joined by single spaces.

        `X` is expected to support column indexing that yields an object with
        `.apply` (presumably a pandas DataFrame). The result keeps the index
        and name of the selected column.
        """
        # Series.apply builds a brand-new Series, so X is never mutated and
        # there is no need to deep-copy the whole frame just to read one
        # column out of it.
        return X[self.text_column].apply(" ".join)

In [16]:
# Load the previously trained model from disk.
# SECURITY: pickle.load can execute arbitrary code while deserializing, so
# only load 'model.pkl' if it comes from a trusted source.
# NOTE(review): the pickle presumably references the custom transformer
# classes defined in the cells above, which would have to be defined before
# this cell runs — confirm against the training notebook.
with open('model.pkl', 'rb') as model_file:
    # The context manager closes the file handle even if unpickling fails.
    pickled_model = pickle.load(model_file)

In [17]:
# Small hand-written batch of example comments, stored in a single 'text'
# column, that is passed to the loaded model's predict() further down.
# The bare name on the last line makes the notebook display the frame.
X_new = pd.DataFrame({'text': ["FUCK YOU",
                               "Barely 24 hours ago he said what special and loved people these terrorists were, and how he felt the same way they do.",
                               "You incited this attack. You own it. Go to hell.", 
                               "Every single illegal immigrant should be dropped from welfare programs, immediately. Absolutely absurd that Americans are on a 5 year waitlist, while Mr. & Mrs. hopped-the-border are helping themselves to our tax dollars. No. No. And NO.",
                               "You will forever be the winner & the greatest man this country has ever known. We will always be in your debt and owe you endless gratitude. I love you man and you gonna be my spirit animal forever!",
                               "When you worship a mad man it should tell you some about yourself.  I pray you don't have children",
                               "That is terrifying. I can’t imagine living with tornadoes.",
                               "Prayers for the families who lost loved ones.",
                               "Normally when people are on hallucinogenic,s, I,m couries to try the experience. But gurl-you are on a life long bad trip honey. Ain,t no one want what you havin!"]})
X_new

Unnamed: 0,text
0,FUCK YOU
1,Barely 24 hours ago he said what special and l...
2,You incited this attack. You own it. Go to hell.
3,Every single illegal immigrant should be dropp...
4,You will forever be the winner & the greatest ...
5,When you worship a mad man it should tell you ...
6,That is terrifying. I can’t imagine living wit...
7,Prayers for the families who lost loved ones.
8,"Normally when people are on hallucinogenic,s, ..."


In [18]:
# Run the unpickled model on the sample comments; the returned array is shown
# as the cell output, one entry per row of X_new.
# NOTE(review): what the 0/1 labels mean is not visible in this notebook —
# confirm against the training code.
pickled_model.predict(X_new)

array([0, 0, 0, 1, 0, 0, 0, 0, 0])