In [66]:
#!/usr/bin/env python
# coding: utf-8
import codecs
import csv
import re
import sys
import numpy as np
import pandas as pd
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from newspaper import Article
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation
csv.field_size_limit(sys.maxsize)

9223372036854775807

In [67]:
# helper functions
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):

    text = text.lower().split()

    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)

    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
        
    return text

def get_body(url):
    a = Article(url, language='en')
    a.download()
    a.parse()
    return u''.join(a.text).encode('utf-8').strip()

In [68]:
#do this only once while in production
model = load_model('./model.h5')

read = pd.read_csv('./data/train.csv')
texts = [str(row[1]) + str(row[2]) for row in read.values]
labels = read[["type"]].values

read = pd.read_csv('./data/test.csv')
test_texts = [str(row[1]) + str(row[2]) for row in read.values]
test_ids = read.index.values

In [69]:
# get input from backend
_input = { "title": "Brexit: David Davis 'pretty sure' of free trade deal - BBC News",
          "subtitle": "Pakistan oil tanker inferno kills at least 123",
          "description": "News",
          "website": "bbc.com",
          "poster": "BBC",
          "url": "http://www.bbc.com/news/uk-politics-40397312",
          "id": "world-south-asia-40396036" }

In [108]:
# get input from backend
_input = { "title": "Koch brothers plan stepped-up spending: ‘More optimistic now about what we can accomplish’",
           "url"  : "http://abcnews.go.com/International/wireStory/county-government-southwestern-china-100-people-40-homes-48248383"}

In [109]:
# clean input
text = [text_to_wordlist(_input['title'] + get_body(_input['url']))]
# tokenize input with train_data
tokenizer = Tokenizer(num_words=200000)
tokenizer.fit_on_texts(texts + test_texts + text)

# convert input to padded sequence
sequences = tokenizer.texts_to_sequences(text)
word_index = tokenizer.word_index
data = pad_sequences(sequences, maxlen=500)

In [110]:
# predict newsarticle validity
pred = model.predict([data], batch_size=1, verbose=0)

# output
output = {'id': _input['id'], 'prediction':int(round(pred[0][0])), 'confidence': pred[0][0]}

In [111]:
output

{'confidence': 0.88757902, 'id': 'world-south-asia-40396036', 'prediction': 1}

In [112]:
read

Unnamed: 0.1,Unnamed: 0,title,text,site_url,type
0,1514,Johnny Depp Spotted Wearing Fiancee Amber Hear...,Johnny Depp has been spotted wearing his fianc...,ibtimes.co.uk,0.0
1,1191,Film review: Muppets Most Wanted (U),"Created almost 60 years ago by Jim Henson, Ker...",yorkshireeveningpost.co.uk,0.0
2,9806,"Comment on After Fighting for Freedom, 76-yo V...",Home / Health / Medical Marijuana / After Figh...,thefreethoughtproject.com,1.0
3,1748,OBAMA TRIES TO PASS LAST SECOND EXECUTIVE ORDE...,"Posted by Matthew Bernstein | Nov 18, 2016 | A...",conservativedailypost.com,1.0
4,1748,"Business group, Dem senator move to stymie nat...",The main opposition to sending American natura...,dailycaller.com,0.0
5,1043,GERMAN ‘GESTAPO’ considers prosecuting parents...,BNI Store Oct 30 2016 GERMAN ‘GESTAPO’ conside...,barenakedislam.com,1.0
6,1043,Reporter's Notebook: In Search of Flight MH370,This transcript has been automatically generat...,live.wsj.com,0.0
7,5169,"This Is What Gold Does In A Currency Crisis, B...","This Is What Gold Does In A Currency Crisis, B...",investmentwatchblog.com,1.0
8,5169,Amazon 'not optimistic' about ending book stan...,Amazon.com defended the right of a retailer to...,pcworld.com,0.0
9,8426,"Trump Is Keeping His Promises, Cutting Taxes, ...",ZeroHedge.com ALL CONTENT ON 'SGTREPORT.COM' A...,sgtreport.com,1.0
