In [8]:
from prefect import task
import requests
import pandas as pd
import numpy as np
import sqlalchemy
from decouple import config 
import psycopg2


def hn_scrape(i,comments_list):
    r = requests.get('https://hacker-news.firebaseio.com/v0/item/'+str(i)+'.json').json()
    try:
        if ('deleted' in r.keys()):
            pass
        else:
            if r["type"] == 'comment':
                t = (r["by"],r["id"],r["text"],r["time"])
                comments_list.append(t)
    except:
        pass

def sentiment(text):
    payload = {'text':text}
    try:
        return requests.get('https://crawftv-nlp-api.herokuapp.com/sentiment',params=payload).json()["compound"]
    except:
        pass
    

@task
def extract():
    comments_list = []
    max_item = requests.get("https://hacker-news.firebaseio.com/v0/maxitem.json?print=pretty").json()
    for i in range(max_item-10,max_item):
        hn_scrape(i,comments_list)
    return comments_list
    
    

@task
def transform(comments_list):
    comments= pd.DataFrame(columns = ["by", "id","text","time"], data=comments_list)
    print(comments)
    comments["text"] = comments["text"].str.replace("&quot;","")
    comments["text"] = comments["text"].str.replace("&#x27;","'")
    comments["text"] = comments["text"].str.replace("&gt; "," ")
    comments["text"] = comments["text"].str.replace("<p>"," ")
    comments["text"] = comments["text"].str.replace("<a>"," ")
    comments["text"] = comments["text"].str.replace("</a>"," ")
    comments["text"] = comments["text"].str.replace("<i>"," ")
    comments["text"] = comments["text"].str.replace("</i>"," ")
    comments["text"] = comments["text"].str.replace("&#x2F;",'')
    comments["text"] = comments["text"].str.replace("https:",' ')
    comments["text"] = comments["text"].str.replace("\\n",' ')
    comments = comments.dropna()
    return comments


@task
def load(comments):
    comments["sentiment"] = comments["text"].apply(sentiment)
    comments.to_sql(name="comments",con=db, if_exists="append",chunksize=500)



In [5]:
host = config('AWS_DATABASE_URL')
db = sqlalchemy.create_engine(host)
conn = db.connect()
curs = conn.connection.cursor()

In [9]:
from prefect import Flow


max_item = requests.get("https://hacker-news.firebaseio.com/v0/maxitem.json?print=pretty").json()


with Flow('ETL') as flow:
    e = extract()
    t = transform(e)
    l = load(t)


flow.run()
conn.close()
curs.close()

[2019-05-30 22:59:00,923] INFO - prefect.FlowRunner | Beginning Flow run for 'ETL'
[2019-05-30 22:59:00,923] INFO - prefect.FlowRunner | Starting flow run.
[2019-05-30 22:59:00,926] INFO - prefect.TaskRunner | Task 'extract': Starting task run...
[2019-05-30 22:59:02,074] INFO - prefect.TaskRunner | Task 'extract': finished task run for task with final state: 'Success'
[2019-05-30 22:59:02,075] INFO - prefect.TaskRunner | Task 'transform': Starting task run...
[2019-05-30 22:59:02,088] INFO - prefect.TaskRunner | Task 'transform': finished task run for task with final state: 'Success'
[2019-05-30 22:59:02,089] INFO - prefect.TaskRunner | Task 'load': Starting task run...


             by        id                                               text  \
0  caddytodaddy  20056768  I have a feeling the “spare tire” chips are ch...   
1       y2kenny  20056770  Are there anything that help with switching ex...   
2   jamesblonde  20056771  Some background reading&#x2F;viewing:\n<a href...   
3         vokep  20056772  So then.....how do we start hobbyist chip-fab@...   
4       ngcc_hk  20056773         Surprise you can use those character here.   
5        czbond  20056774  I did - early beta. Based on my experience as ...   
6          rasz  20056775  No it doesnt, its a corporate giveaway program...   
7        jupp0r  20056776  &gt; and as a result are actually causing many...   
8       nwallin  20056777  I totally agree.<p>Every time I start in a new...   

         time  
0  1559257046  
1  1559257062  
2  1559257062  
3  1559257066  
4  1559257066  
5  1559257081  
6  1559257085  
7  1559257088  
8  1559257094  


[2019-05-30 22:59:04,148] INFO - prefect.TaskRunner | Task 'load': finished task run for task with final state: 'Success'
[2019-05-30 22:59:04,149] INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded


## test

In [32]:
comments

Unnamed: 0,by,id,text,time,sentiment
0,wmf,20056566,CPU design and RF stuff are both very difficul...,1559255253,-0.4201
1,the-rc,20056567,I run anywhere from 30 to 70 miles per week an...,1559255259,0.7496
2,bhaak,20056568,HISTFILESIZE and HISTSIZE don't solve the abys...,1559255260,0.2283
3,_emacsomancer_,20056570,Which it hasn't been since Reagan become presi...,1559255271,0.0
4,jnwatson,20056571,"Outside of academia, I’d wager the usage of th...",1559255274,0.0
5,hulahoof,20056572,Resist fingerprinting will also stop sensor da...,1559255276,-0.4215
6,whenchamenia,20056573,This is a rather poor refutation of the claims...,1559255313,0.6808
7,thethirdone,20056574,Firefox has ~250M users as of about 9 months ...,1559255314,-0.1536
8,merb,20056575,well you could help out gitea and make a hoste...,1559255317,0.8176


In [10]:
conn = db.connect()
curs = conn.connection.cursor()
curs.execute(" SELECT COUNT(id) FROM comments")
table_max = curs.fetchall()
print(table_max)
conn.close()
curs.close()

[('20056777',)]
