diff --git a/.bumpversion.cfg b/.bumpversion.cfg new file mode 100644 index 0000000..a592e96 --- /dev/null +++ b/.bumpversion.cfg @@ -0,0 +1,9 @@ +[bumpversion] +current_version = 0.1.0 + +[bumpversion:file:docker/prod/harvester/Dockerfile] + +[bumpversion:file:docker/prod/exporter/Dockerfile] + +[bumpversion:file:setup.py] + diff --git a/.gitignore b/.gitignore index 72364f9..f293de6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,6 @@ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] -*$py.class # C extensions *.so @@ -43,7 +42,6 @@ htmlcov/ nosetests.xml coverage.xml *,cover -.hypothesis/ # Translations *.mo @@ -51,14 +49,6 @@ coverage.xml # Django stuff: *.log -local_settings.py - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy # Sphinx documentation docs/_build/ @@ -66,24 +56,9 @@ docs/_build/ # PyBuilder target/ -# IPython Notebook -.ipynb_checkpoints - -# pyenv -.python-version - -# celery beat schedule file -celerybeat-schedule - -# dotenv -.env - -# virtualenv -venv/ -ENV/ +# PyCharm +.idea/ -# Spyder project settings -.spyderproject +# Test configuration +test_config.py -# Rope project settings -.ropeproject diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..a689d16 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,22 @@ +language: python +python: +- '2.7' +sudo: required +services: +- docker +before_install: +- sudo apt-get update +- sudo apt-get install -y -o Dpkg::Options::="--force-confnew" docker-engine +- sudo pip install -U docker-compose +- sudo pip install --upgrade ndg-httpsclient +- docker-compose -f docker/ci.docker-compose.yml pull +- docker-compose -f docker/ci.docker-compose.yml up -d +- sleep 30 +- docker exec docker_sfmtumblrharvester_1 python -m unittest discover +install: pip install -r requirements/master.txt +script: python -m unittest discover +notifications: + email: + - ychtan@email.gwu.edu + slack: + on_success: never diff --git a/LICENSE b/LICENSE index f382d36..0ce3235 
100644 --- a/LICENSE +++ b/LICENSE @@ -19,3 +19,4 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + diff --git a/README.md b/README.md index 80886f2..edb623c 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,33 @@ # sfm-tumblr-harvester -A basic harvester for Tumblr public post data as part of Social Feed Manager. http://gwu-libraries.github.io/sfm-ui +A basic harvester for Tumblr public post data as part of [Social Feed Manager](https://gwu-libraries.github.io/sfm-ui). + +[![Build Status](https://travis-ci.org/gwu-libraries/sfm-tumblr-harvester.svg?branch=master)](https://travis-ci.org/gwu-libraries/sfm-tumblr-harvester) + +Provides harvesters for [Tumblr API](https://www.tumblr.com/docs/en/api/v2) and harvesting is performed by official API client [pytumblr](https://github.com/tumblr/pytumblr). + +# Install +```bash +git clone https://github.com/gwu-libraries/sfm-tumblr-harvester +cd sfm-tumblr-harvester +pip install -r requirements/requirements.txt +``` + +# Ready to work +* Sign up for an account at [Tumblr](https://www.tumblr.com). +* Register an application [here](https://www.tumblr.com/oauth/apps) to get your `CONSUMER_KEY` and `CONSUMER_SECRET`. +* Provide your `CONSUMER_KEY` and `CONSUMER_SECRET` and get your access token [here](https://api.tumblr.com/console). +* Once you have successfully authorized your app, click the `Show Keys` button at the top-right. 
+* An example of the keys looks like (the following keys are invalid): + +```bash +CONSUMER_KEY = "3jlICwerCIWqEdUdAyuenNyercwkVuXOuYFoxTPafWx8DsUMe2" +CONSUMER_SECRET = "sTCdLJ9kdfgEwTPoYIdfdsteF0XB8WiHlczLx0GgvzRim1L47n" +ACCESS_TOKEN = "sdrsaPx5FtpJ0tfZAG13kMZMjenouGsdJw9W7ssK6husepcFoWg" +ACCESS_TOKEN_SECRET = "0VxKNAMSiNO8IT6PsdattmUsdsfI5X1hP4usBNZLllgkhwsdQiY" +________________________________________________________________________ +API_KEY = "3jlICwerCIWqEdUdAyuenNyercwkVuXOuYFoxTPafWx8DsUMe2" + +``` + + + diff --git a/docker/ci.docker-compose.yml b/docker/ci.docker-compose.yml new file mode 100644 index 0000000..7062326 --- /dev/null +++ b/docker/ci.docker-compose.yml @@ -0,0 +1,20 @@ +sfmrabbit: + image: rabbitmq@sha256:a5180a37b0baebb938ee9d12dd11eed64a909288d7f344e24771278f8a122367 + environment: + - TZ=America/New_York + - RABBITMQ_DEFAULT_USER=sfm_user + - RABBITMQ_DEFAULT_PASS=password +sfmtumblrharvester: + image: gwul/sfm-tumblr-harvester:dev + links: + - sfmrabbit:mq + volumes: + - "..:/opt/sfm-tumblr-harvester" + environment: + - TZ=America/New_York + - TUMBLR_CONSUMER_KEY + - TUMBLR_CONSUMER_SECRET + - TUMBLR_ACCESS_TOKEN + - TUMBLR_ACCESS_TOKEN_SECRET + + command: bash -c "pip install -r requirements/master.txt --upgrade && appdeps.py --port-wait mq:5672 && python tumblr_harvester.py --debug=True service mq sfm_user password" \ No newline at end of file diff --git a/docker/dev.docker-compose.yml b/docker/dev.docker-compose.yml new file mode 100644 index 0000000..ccf2fa0 --- /dev/null +++ b/docker/dev.docker-compose.yml @@ -0,0 +1,34 @@ +sfmrabbit: + image: rabbitmq@sha256:a5180a37b0baebb938ee9d12dd11eed64a909288d7f344e24771278f8a122367 + ports: + - "15672:15672" + restart: always + environment: + - TZ=America/New_York + - RABBITMQ_DEFAULT_USER=sfm_user + - RABBITMQ_DEFAULT_PASS=password +sfmtumblrharvester: + image: gwul/sfm-tumblr-harvester:dev + links: + - sfmrabbit:mq + volumes: + - "..:/opt/sfm-tumblr-harvester" + - 
"../../sfm-utils:/opt/sfm-utils" + - "../../warcprox:/opt/warcprox" + environment: + - TZ=America/New_York + - DEBUG=True +# restart: always + +sfmtumblrexporter: + image: gwul/sfm-tumblr-exporter:dev + links: + - sfmrabbit:mq + volumes: + - "..:/opt/sfm-tumblr-harvester" + - "../../sfm-utils:/opt/sfm-utils" + - "../../warcprox:/opt/warcprox" + environment: + - TZ=America/New_York + - DEBUG=True +# restart: always \ No newline at end of file diff --git a/docker/dev/exporter/Dockerfile b/docker/dev/exporter/Dockerfile new file mode 100644 index 0000000..80636d2 --- /dev/null +++ b/docker/dev/exporter/Dockerfile @@ -0,0 +1,24 @@ +FROM python@sha256:ad39551743b356efda7c61f46019b97d49d1aab01b97f0e6d87c9b34326f3bfe +MAINTAINER Vict Tan + +ARG DEBIAN_FRONTEND=noninteractive +RUN apt-get update && apt-get install -y \ + git=1:2.1.4-2.1+deb8u1 + +#pip set in 7.1.2 +RUN pip install pip==7.1.2 +#Avoid the warning of https +RUN pip install --upgrade ndg-httpsclient + +#copy the requirement +ADD https://raw.githubusercontent.com/gwu-libraries/sfm-tumblr-harvester/master/requirements/common.txt /tmp/ +ADD https://raw.githubusercontent.com/gwu-libraries/sfm-tumblr-harvester/master/requirements/requirements.txt /tmp/ + +RUN pip install -r /tmp/requirements.txt +RUN pip install appdeps +#It should mounted as your development dir +WORKDIR /opt/sfm-tumblr-harvester + +CMD pip install -r requirements/dev.txt --upgrade \ + && appdeps.py --port-wait mq:5672 \ + && python tumblr_exporter.py --debug=$DEBUG service mq $MQ_ENV_RABBITMQ_DEFAULT_USER $MQ_ENV_RABBITMQ_DEFAULT_PASS http://api diff --git a/docker/dev/exporter/README.md b/docker/dev/exporter/README.md new file mode 100644 index 0000000..be23545 --- /dev/null +++ b/docker/dev/exporter/README.md @@ -0,0 +1,20 @@ +# sfm-tumblr-exporter dev docker container + +A docker container for running sfm-tumblr-exporter as a service. 
+The harvester code must be mounted as `/opt/sfm-tumblr-harvester`, the sfm-utils code as `/opt/sfm-utils` and the warcprox code as `/opt/warcprox`. +For example: + +```python +volumes: + - "/my_directory/sfm-tumblr-harvester:/opt/sfm-tumblr-harvester" + - "/my_directory/sfm-utils:/opt/sfm-utils" + - "/my_directory/warcprox:/opt/warcprox" +``` + +This container requires a link to a container running the queue. This must be linked with the alias `mq`. +For example: + +```python +links: + - sfmrabbit:mq +``` diff --git a/docker/dev/harvester/Dockerfile b/docker/dev/harvester/Dockerfile new file mode 100644 index 0000000..dd58168 --- /dev/null +++ b/docker/dev/harvester/Dockerfile @@ -0,0 +1,24 @@ +FROM python@sha256:ad39551743b356efda7c61f46019b97d49d1aab01b97f0e6d87c9b34326f3bfe +MAINTAINER Vict Tan + +ARG DEBIAN_FRONTEND=noninteractive +RUN apt-get update && apt-get install -y \ + git=1:2.1.4-2.1+deb8u1 + +#pip set in 7.1.2 +RUN pip install pip==7.1.2 +#Avoid the warning of https +RUN pip install --upgrade ndg-httpsclient + +#copy the requirement +ADD https://raw.githubusercontent.com/gwu-libraries/sfm-tumblr-harvester/master/requirements/common.txt /tmp/ +ADD https://raw.githubusercontent.com/gwu-libraries/sfm-tumblr-harvester/master/requirements/requirements.txt /tmp/ + +RUN pip install -r /tmp/requirements.txt +RUN pip install appdeps +#It should mounted as your development dir +WORKDIR /opt/sfm-tumblr-harvester + +CMD pip install -r requirements/dev.txt --upgrade \ + && appdeps.py --port-wait mq:5672 \ + && python tumblr_harvester.py --debug=$DEBUG service mq $MQ_ENV_RABBITMQ_DEFAULT_USER $MQ_ENV_RABBITMQ_DEFAULT_PASS diff --git a/docker/dev/harvester/README.md b/docker/dev/harvester/README.md new file mode 100644 index 0000000..7431c8b --- /dev/null +++ b/docker/dev/harvester/README.md @@ -0,0 +1,20 @@ +# sfm-tumblr-harvester dev docker container + +A docker container for running sfm-tumblr-harvester as a service. 
+The harvester code must be mounted as `/opt/sfm-tumblr-harvester`, the sfm-utils code as `/opt/sfm-utils` and the warcprox code as `/opt/warcprox`. +For example: + +```python +volumes: + - "/my_directory/sfm-tumblr-harvester:/opt/sfm-tumblr-harvester" + - "/my_directory/sfm-utils:/opt/sfm-utils" + - "/my_directory/warcprox:/opt/warcprox" +``` + +This container requires a link to a container running the queue. This must be linked with the alias `mq`. +For example: + +```python +links: + - sfmrabbit:mq +``` diff --git a/docker/master.docker-compose.yml b/docker/master.docker-compose.yml new file mode 100644 index 0000000..cb4749e --- /dev/null +++ b/docker/master.docker-compose.yml @@ -0,0 +1,25 @@ +sfmrabbit: + image: rabbitmq@sha256:a5180a37b0baebb938ee9d12dd11eed64a909288d7f344e24771278f8a122367 + ports: + - "15672:15672" + restart: always + environment: + - TZ=America/New_York + - RABBITMQ_DEFAULT_USER=sfm_user + - RABBITMQ_DEFAULT_PASS=password +sfmtumblrharvester: + image: gwul/sfm-tumblr-harvester:master + links: + - sfmrabbit:mq + restart: always + environment: + - TZ=America/New_York + - DEBUG=True +sfmtumblrexporter: + image: gwul/sfm-tumblr-exporter:master + links: + - sfmrabbit:mq + restart: always + environment: + - TZ=America/New_York + - DEBUG=True \ No newline at end of file diff --git a/docker/master/exporter/Dockerfile b/docker/master/exporter/Dockerfile new file mode 100644 index 0000000..823e6a6 --- /dev/null +++ b/docker/master/exporter/Dockerfile @@ -0,0 +1,22 @@ +FROM python@sha256:ad39551743b356efda7c61f46019b97d49d1aab01b97f0e6d87c9b34326f3bfe +MAINTAINER Vict Tan + +ARG DEBIAN_FRONTEND=noninteractive +RUN apt-get update && apt-get install -y \ + zip=3.0-8 \ + git=1:2.1.4-2.1+deb8u1 +#Upgrade pip +RUN pip install pip==7.1.2 +#Avoid the warning of https +RUN pip install --upgrade ndg-httpsclient + +WORKDIR /tmp +ADD https://github.com/gwu-libraries/sfm-tumblr-harvester/archive/master.zip /tmp/ +RUN unzip master.zip +RUN mv 
sfm-tumblr-harvester-master /opt/sfm-tumblr-harvester +WORKDIR /opt/sfm-tumblr-harvester +RUN pip install -r requirements/master.txt +RUN pip install appdeps +CMD appdeps.py --port-wait mq:5672 \ + && python tumblr_exporter.py --debug=$DEBUG service mq $MQ_ENV_RABBITMQ_DEFAULT_USER $MQ_ENV_RABBITMQ_DEFAULT_PASS http://api + diff --git a/docker/master/exporter/README.md b/docker/master/exporter/README.md new file mode 100644 index 0000000..2df3641 --- /dev/null +++ b/docker/master/exporter/README.md @@ -0,0 +1,11 @@ +# sfm-tumblr-exporter master docker container + +A docker container for running sfm-tumblr-exporter as a service. + +This container requires a link to a container running the queue. This must be linked with the alias `mq`. +For example: + +```python +links: + - sfmrabbit:mq +``` diff --git a/docker/master/harvester/Dockerfile b/docker/master/harvester/Dockerfile new file mode 100644 index 0000000..0ee6211 --- /dev/null +++ b/docker/master/harvester/Dockerfile @@ -0,0 +1,22 @@ +FROM python@sha256:ad39551743b356efda7c61f46019b97d49d1aab01b97f0e6d87c9b34326f3bfe +MAINTAINER Vict Tan + +ARG DEBIAN_FRONTEND=noninteractive +RUN apt-get update && apt-get install -y \ + zip=3.0-8 \ + git=1:2.1.4-2.1+deb8u1 +#Upgrade pip +RUN pip install pip==7.1.2 +#Avoid the warning of https +RUN pip install --upgrade ndg-httpsclient + +WORKDIR /tmp +ADD https://github.com/gwu-libraries/sfm-tumblr-harvester/archive/master.zip /tmp/ +RUN unzip master.zip +RUN mv sfm-tumblr-harvester-master /opt/sfm-tumblr-harvester +WORKDIR /opt/sfm-tumblr-harvester +RUN pip install -r requirements/master.txt +RUN pip install appdeps +CMD appdeps.py --port-wait mq:5672 \ + && python tumblr_harvester.py --debug=$DEBUG service mq $MQ_ENV_RABBITMQ_DEFAULT_USER $MQ_ENV_RABBITMQ_DEFAULT_PASS + diff --git a/docker/master/harvester/README.md b/docker/master/harvester/README.md new file mode 100644 index 0000000..3bca6ef --- /dev/null +++ b/docker/master/harvester/README.md @@ -0,0 +1,11 @@ +# 
sfm-tumblr-harvester master docker container + +A docker container for running sfm-tumblr-harvester as a service. + +This container requires a link to a container running the queue. This must be linked with the alias `mq`. +For example: + +```python +links: + - sfmrabbit:mq +``` diff --git a/docker/prod/exporter/Dockerfile b/docker/prod/exporter/Dockerfile new file mode 100644 index 0000000..83c4d7f --- /dev/null +++ b/docker/prod/exporter/Dockerfile @@ -0,0 +1,24 @@ +FROM python@sha256:ad39551743b356efda7c61f46019b97d49d1aab01b97f0e6d87c9b34326f3bfe +MAINTAINER Vict Tan + +ARG DEBIAN_FRONTEND=noninteractive +ENV sfm_harvester_version 1.0.0 + +RUN apt-get update && apt-get install -y \ + zip=3.0-8 +#Upgrade pip +RUN pip install pip==7.1.2 +#Avoid the warning of https +RUN pip install --upgrade ndg-httpsclient + +WORKDIR /tmp +ADD https://github.com/gwu-libraries/sfm-weibo-harvester/archive/${sfm_harvester_version}.zip /tmp/ +RUN unzip ${sfm_harvester_version}.zip +RUN mv sfm-weibo-harvester-${sfm_harvester_version} /opt/sfm-weibo-harvester + +WORKDIR /opt/sfm-weibo-harvester +RUN pip install -r requirements/requirements.txt +RUN pip install appdeps +CMD appdeps.py --port-wait mq:5672 \ + && python weibo_exporter.py --debug=$DEBUG service mq $MQ_ENV_RABBITMQ_DEFAULT_USER $MQ_ENV_RABBITMQ_DEFAULT_PASS http://api + diff --git a/docker/prod/exporter/README.md b/docker/prod/exporter/README.md new file mode 100644 index 0000000..ca6b0c3 --- /dev/null +++ b/docker/prod/exporter/README.md @@ -0,0 +1,11 @@ +# sfm-tumblr-exporter prod docker container + +A docker container for running sfm-tumblr-exporter as a service. + +This container requires a link to a container running the queue. This must be linked with the alias `mq`. 
+For example: + +```python +links: + - sfmrabbit:mq +``` diff --git a/docker/prod/harvester/Dockerfile b/docker/prod/harvester/Dockerfile new file mode 100644 index 0000000..294a80a --- /dev/null +++ b/docker/prod/harvester/Dockerfile @@ -0,0 +1,24 @@ +FROM python@sha256:ad39551743b356efda7c61f46019b97d49d1aab01b97f0e6d87c9b34326f3bfe +MAINTAINER Vict Tan + +ARG DEBIAN_FRONTEND=noninteractive +ENV sfm_harvester_version 1.0.0 + +RUN apt-get update && apt-get install -y \ + zip=3.0-8 +#Upgrade pip +RUN pip install pip==7.1.2 +#Avoid the warning of https +RUN pip install --upgrade ndg-httpsclient + +WORKDIR /tmp +ADD https://github.com/gwu-libraries/sfm-weibo-harvester/archive/${sfm_harvester_version}.zip /tmp/ +RUN unzip ${sfm_harvester_version}.zip +RUN mv sfm-weibo-harvester-${sfm_harvester_version} /opt/sfm-weibo-harvester + +WORKDIR /opt/sfm-weibo-harvester +RUN pip install -r requirements/requirements.txt +RUN pip install appdeps +CMD appdeps.py --port-wait mq:5672 \ + && python weibo_harvester.py --debug=$DEBUG service mq $MQ_ENV_RABBITMQ_DEFAULT_USER $MQ_ENV_RABBITMQ_DEFAULT_PASS + diff --git a/docker/prod/harvester/README.md b/docker/prod/harvester/README.md new file mode 100644 index 0000000..0418d0d --- /dev/null +++ b/docker/prod/harvester/README.md @@ -0,0 +1,11 @@ +# sfm-tumblr-harvester prod docker container + +A docker container for running sfm-tumblr-harvester as a service. + +This container requires a link to a container running the queue. This must be linked with the alias `mq`. 
+For example: + +```python +links: + - sfmrabbit:mq +``` diff --git a/requirements/common.txt b/requirements/common.txt new file mode 100644 index 0000000..6ffe42f --- /dev/null +++ b/requirements/common.txt @@ -0,0 +1,29 @@ +amqp==1.4.9 +anyjson==0.3.3 +certauth==1.1.3 +cffi==1.5.0 +cryptography==1.2.1 +enum34==1.1.2 +funcsigs==0.4 +idna==2.0 +ipaddress==1.0.16 +kombu==3.0.33 +librabbitmq==1.6.1 +meld3==1.0.2 +mock==1.3.0 +oauthlib==1.0.3 +pbr==1.8.1 +py==1.4.31 +pyasn1==0.1.9 +pycparser==2.14 +pyOpenSSL==0.15.1 +pytest==2.8.7 +python-dateutil==2.4.2 +requests==2.9.1 +requests-oauthlib==0.6.0 +pytumblr +six==1.10.0 +vcrpy==1.7.4 +urllib3==1.14 +warc==0.2.1 +warctools==4.9.0 diff --git a/requirements/dev.txt b/requirements/dev.txt new file mode 100644 index 0000000..199b96e --- /dev/null +++ b/requirements/dev.txt @@ -0,0 +1,3 @@ +-r common.txt +-e ../warcprox +-e ../sfm-utils \ No newline at end of file diff --git a/requirements/dev_no_warcprox.txt b/requirements/dev_no_warcprox.txt new file mode 100644 index 0000000..fe252bb --- /dev/null +++ b/requirements/dev_no_warcprox.txt @@ -0,0 +1,2 @@ +-r common.txt +-e ../sfm-utils \ No newline at end of file diff --git a/requirements/master.txt b/requirements/master.txt new file mode 100644 index 0000000..858ffea --- /dev/null +++ b/requirements/master.txt @@ -0,0 +1,3 @@ +-r common.txt +git+https://github.com/gwu-libraries/warcprox.git@master#egg=warcprox-gwu +git+https://github.com/gwu-libraries/sfm-utils.git@master#egg=sfmutils diff --git a/requirements/requirements.txt b/requirements/requirements.txt new file mode 100644 index 0000000..0c048a9 --- /dev/null +++ b/requirements/requirements.txt @@ -0,0 +1,3 @@ +-r common.txt +git+https://github.com/gwu-libraries/warcprox.git@0.2.0#egg=warcprox-gwu +git+https://github.com/gwu-libraries/sfm-utils.git@0.4.1#egg=sfmutils diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..dd49eea --- /dev/null +++ b/setup.py @@ -0,0 +1,23 @@ +from setuptools import setup 
+ +setup( + name='sfmtumblrharvester', + version='0.1.0', + url='https://github.com/gwu-libraries/sfm-tumblr-harvester', + author='Vict Tan', + author_email='ychtan@email.gwu.edu', + description="Social Feed Manager Tumblr Harvester", + platforms=['POSIX'], + test_suite='tests', + scripts=['tumblr_harvester.py', + 'tumblr_warc_iter.py'], + py_modules=['tumblr_harvester','tumblr_warc_iter'], + install_requires=['sfmutils'], + tests_require=['mock>=1.3.0'], + classifiers=[ + 'Intended Audience :: Developers', + 'Topic :: Software Development :: Libraries :: Python Modules', + 'Programming Language :: Python :: 2.7', + 'Development Status :: 4 - Beta', + ], +) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..39cb6d9 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1,33 @@ +import logging +import unittest +import os +import socket + +try: + from test_config import * +except ImportError: + TUMBLR_CONSUMER_KEY = os.environ.get("TUMBLR_CONSUMER_KEY") + TUMBLR_CONSUMER_SECRET = os.environ.get("TUMBLR_CONSUMER_SECRET") + TUMBLR_ACCESS_TOKEN = os.environ.get("TUMBLR_ACCESS_TOKEN") + TUMBLR_ACCESS_TOKEN_SECRET = os.environ.get("TUMBLR_ACCESS_TOKEN_SECRET") + +test_config_available = True if TUMBLR_CONSUMER_KEY and TUMBLR_CONSUMER_SECRET \ + and TUMBLR_ACCESS_TOKEN and TUMBLR_ACCESS_TOKEN_SECRET else False + +mq_port_available = True +s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) +try: + s.connect(("mq", 5672)) +except socket.error: + mq_port_available = False + +mq_username = os.environ.get("MQ_ENV_RABBITMQ_DEFAULT_USER") +mq_password = os.environ.get("MQ_ENV_RABBITMQ_DEFAULT_PASS") +integration_env_available = mq_port_available and mq_username and mq_password + + +class TestCase(unittest.TestCase): + logging.basicConfig(level=logging.DEBUG) + logging.getLogger("tumblr_harvester").setLevel(logging.DEBUG) + logging.getLogger("requests").setLevel(logging.ERROR) + logging.getLogger("vcr").setLevel(logging.INFO) \ No newline at end of 
file diff --git a/tests/test_tumblr_exporter.py b/tests/test_tumblr_exporter.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_tumblr_harvester.py b/tests/test_tumblr_harvester.py new file mode 100644 index 0000000..bef16de --- /dev/null +++ b/tests/test_tumblr_harvester.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from __future__ import absolute_import +import tests +import vcr as base_vcr +from tests.tumblr import post1,post2 +import unittest +from mock import MagicMock, patch, call +from kombu import Connection, Exchange, Queue, Producer +from sfmutils.state_store import DictHarvestStateStore +from sfmutils.harvester import HarvestResult, EXCHANGE +import threading +import shutil +import tempfile +import time +import os +from datetime import datetime, date +from tumblr_harvester import TumblrHarvester + +vcr = base_vcr.VCR( + cassette_library_dir='tests/fixtures', + record_mode='once', +) + + +@unittest.skipIf(not tests.test_config_available, "Skipping test since test config not available.") +class TestTumblrHarvesterVCR(tests.TestCase): + def setUp(self): + self.harvester = TumblrHarvester() + self.harvester.state_store = DictHarvestStateStore() + self.harvester.harvest_result = HarvestResult() + self.harvester.stop_event = threading.Event() + self.harvester.harvest_result_lock = threading.Lock() + self.harvester.message = { + "id": "test:1", + "type": "tumblr_user_timeline", + "path": self.path, + "seeds": [ + { + "token": "codingjester" + } + ], + "credentials": { + "consumer_key": tests.TUMBLR_CONSUMER_KEY, + "consumer_secret": tests.TUMBLR_CONSUMER_SECRET, + "access_token": tests.TUMBLR_ACCESS_TOKEN, + "access_token_secret": tests.TUMBLR_ACCESS_TOKEN_SECRET + }, + "collection_set": { + "id": "test_collection_set" + }, + "options": { + } + } + + @vcr.use_cassette(filter_query_parameters=['api_key']) + def test_search_vcr(self): + self.harvester.harvest_seeds() + # check the total number, for new users don't how 
to check + self.assertEqual(self.harvester.harvest_result.stats_summary()["posts"], 2134) + # check the harvester status + self.assertTrue(self.harvester.harvest_result.success) + + @vcr.use_cassette(filter_query_parameters=['api_key']) + def test_incremental_search_vcr(self): + self.harvester.message["options"]["incremental"] = True + host_name = self.harvester.message["seeds"]["token"] + self.harvester.state_store.set_state("tumblr_harvester", "{}.offset".format(host_name), 20) + self.harvester.harvest_seeds() + + # Check harvest result + self.assertTrue(self.harvester.harvest_result.success) + # for check the number of get + self.assertEqual(self.harvester.harvest_result.stats_summary()["posts"], 2134) + # check the state + self.assertEqual(2134, self.harvester.state_store.get_state("tumblr_harvester", "{}.offset".format(host_name))) + + +@unittest.skipIf(not tests.test_config_available, "Skipping test since test config not available.") +@unittest.skipIf(not tests.integration_env_available, "Skipping test since integration env not available.") +class TestTumblrHarvesterIntegration(tests.TestCase): + def _create_connection(self): + return Connection(hostname="mq", userid=tests.mq_username, password=tests.mq_password) + + def setUp(self): + self.exchange = Exchange(EXCHANGE, type="topic") + self.result_queue = Queue(name="result_queue", routing_key="harvest.status.tumblr.*", exchange=self.exchange, + durable=True) + self.web_harvest_queue = Queue(name="web_harvest_queue", routing_key="harvest.start.web", + exchange=self.exchange) + self.warc_created_queue = Queue(name="warc_created_queue", routing_key="warc_created", exchange=self.exchange) + tumblr_harvester_queue = Queue(name="tumblr_harvester", exchange=self.exchange) + with self._create_connection() as connection: + self.result_queue(connection).declare() + self.result_queue(connection).purge() + self.web_harvest_queue(connection).declare() + self.web_harvest_queue(connection).purge() + 
self.warc_created_queue(connection).declare() + self.warc_created_queue(connection).purge() + # avoid raise NOT_FOUND error 404 + tumblr_harvester_queue(connection).declare() + tumblr_harvester_queue(connection).purge() + + self.path = tempfile.mkdtemp() + + def tearDown(self): + shutil.rmtree(self.path, ignore_errors=True) + + def test_search(self): + harvest_msg = { + "id": "test:1", + "type": "tumblr_user_timeline", + "path": self.path, + "seeds": [ + { + "token": "codingjester" + } + ], + "credentials": { + "consumer_key": tests.TUMBLR_CONSUMER_KEY, + "consumer_secret": tests.TUMBLR_CONSUMER_SECRET, + "access_token": tests.TUMBLR_ACCESS_TOKEN, + "access_token_secret": tests.TUMBLR_ACCESS_TOKEN_SECRET + }, + "collection_set": { + "id": "test_collection_set" + }, + "options": { + } + } + with self._create_connection() as connection: + bound_exchange = self.exchange(connection) + producer = Producer(connection, exchange=bound_exchange) + producer.publish(harvest_msg, routing_key="harvest.start.tumblr.tumblr_user_timeline") + + # Now wait for result message. + counter = 0 + bound_result_queue = self.result_queue(connection) + message_obj = None + while counter < 240 and not message_obj: + time.sleep(.5) + message_obj = bound_result_queue.get(no_ack=True) + counter += 1 + self.assertTrue(message_obj, "Timed out waiting for result at {}.".format(datetime.now())) + + result_msg = message_obj.payload + # Matching ids + self.assertEqual("test:1", result_msg["id"]) + # Success + self.assertEqual("completed success", result_msg["status"]) + # Some weibo posts + self.assertTrue(result_msg["stats"][date.today().isoformat()]["posts"]) + + # Web harvest message. + bound_web_harvest_queue = self.web_harvest_queue(connection) + message_obj = bound_web_harvest_queue.get(no_ack=True) + # the default value is not harvesting web resources. 
+ self.assertIsNotNone(message_obj, "No web harvest message.") + web_harvest_msg = message_obj.payload + # Some seeds + self.assertTrue(len(web_harvest_msg["seeds"])) + + # Warc created message. + bound_warc_created_queue = self.warc_created_queue(connection) + message_obj = bound_warc_created_queue.get(no_ack=True) + self.assertIsNotNone(message_obj, "No warc created message.") + # check path exist + warc_msg = message_obj.payload + self.assertTrue(os.path.isfile(warc_msg["warc"]["path"])) \ No newline at end of file diff --git a/tests/tumblr.py b/tests/tumblr.py new file mode 100644 index 0000000..1178407 --- /dev/null +++ b/tests/tumblr.py @@ -0,0 +1,203 @@ +post1={ + "blog_name": "codingjester", + "id": 145825561465, + "post_url": "http://tumblr.johnbunting.me/post/145825561465/eatsleepdraw-epic-death-miguel-co-blog", + "slug": "eatsleepdraw-epic-death-miguel-co-blog", + "type": "photo", + "date": "2016-06-12 22:24:21 GMT", + "timestamp": 1465770261, + "state": "published", + "format": "html", + "reblog_key": "d6u1qMKq", + "tags": [], + "short_url": "https://tmblr.co/Z9Yjvx27puNjv", + "summary": "“Epic Death”\n Miguel Co\n Blog\n Etsy", + "recommended_source": None, + "recommended_color": None, + "highlighted": [], + "note_count": 730, + "caption": "

eatsleepdraw:

\n
\n

“Epic Death”

\n

Miguel Co

\n

Blog

\n

Etsy

\n
", + "reblog": { + "tree_html": "

eatsleepdraw:

\n

“Epic Death”

\n

Miguel Co

\n

Blog

\n

Etsy

\n
", + "comment": "" + }, + "trail": [ + { + "blog": { + "name": "eatsleepdraw", + "active": True, + "theme": { + "header_full_width": 500, + "header_full_height": 500, + "header_focus_width": 500, + "header_focus_height": 281, + "avatar_shape": "circle", + "background_color": "#FFFFFF", + "body_font": "Helvetica Neue", + "header_bounds": "62,500,343,0", + "header_image": "https://secure.static.tumblr.com/4612212114889e352cae8257c9f20e1e/o5ggtk2/5FDn9udky/tumblr_static_24a2c9h92tvooccs04k8kkgs0.png", + "header_image_focused": "https://secure.static.tumblr.com/4612212114889e352cae8257c9f20e1e/o5ggtk2/PmNn9udkz/tumblr_static_tumblr_static_24a2c9h92tvooccs04k8kkgs0_focused_v3.png", + "header_image_scaled": "https://secure.static.tumblr.com/4612212114889e352cae8257c9f20e1e/o5ggtk2/5FDn9udky/tumblr_static_24a2c9h92tvooccs04k8kkgs0_2048_v2.png", + "header_stretch": True, + "link_color": "#1589CF", + "show_avatar": True, + "show_description": True, + "show_header_image": True, + "show_title": True, + "title_color": "#444444", + "title_font": "Capita", + "title_font_weight": "bold" + }, + "share_likes": False, + "share_following": False + }, + "post": { + "id": "145789411872" + }, + "content_raw": "

“Epic Death”

\n

Miguel Co

\n

Blog

\n

Etsy

", + "content": "

“Epic Death”

\n

Miguel Co

\n

Blog

\n

Etsy

", + "is_root_item": True + } + ], + "image_permalink": "http://tumblr.johnbunting.me/image/145825561465", + "photos": [ + { + "caption": "", + "alt_sizes": [ + { + "url": "https://66.media.tumblr.com/2263d0cfede6a84b7e9b40b9ff6a7775/tumblr_o8f0fqkmdN1qz7t0xo1_1280.jpg", + "width": 1125, + "height": 900 + }, + { + "url": "https://66.media.tumblr.com/2263d0cfede6a84b7e9b40b9ff6a7775/tumblr_o8f0fqkmdN1qz7t0xo1_500.jpg", + "width": 500, + "height": 400 + }, + { + "url": "https://65.media.tumblr.com/2263d0cfede6a84b7e9b40b9ff6a7775/tumblr_o8f0fqkmdN1qz7t0xo1_400.jpg", + "width": 400, + "height": 320 + }, + { + "url": "https://67.media.tumblr.com/2263d0cfede6a84b7e9b40b9ff6a7775/tumblr_o8f0fqkmdN1qz7t0xo1_250.jpg", + "width": 250, + "height": 200 + }, + { + "url": "https://65.media.tumblr.com/2263d0cfede6a84b7e9b40b9ff6a7775/tumblr_o8f0fqkmdN1qz7t0xo1_100.jpg", + "width": 100, + "height": 80 + }, + { + "url": "https://67.media.tumblr.com/2263d0cfede6a84b7e9b40b9ff6a7775/tumblr_o8f0fqkmdN1qz7t0xo1_75sq.jpg", + "width": 75, + "height": 75 + } + ], + "original_size": { + "url": "https://66.media.tumblr.com/2263d0cfede6a84b7e9b40b9ff6a7775/tumblr_o8f0fqkmdN1qz7t0xo1_1280.jpg", + "width": 1125, + "height": 900 + } + } + ], + "can_send_in_message": True + } + +post2= { + "blog_name": "codingjester", + "id": 145731610225, + "post_url": "http://tumblr.johnbunting.me/post/145731610225/waiting-the-late-night-deploy-to-finish", + "slug": "waiting-the-late-night-deploy-to-finish", + "type": "text", + "date": "2016-06-11 00:53:31 GMT", + "timestamp": 1465606411, + "state": "published", + "format": "html", + "reblog_key": "TgSSAC7S", + "tags": [], + "short_url": "https://tmblr.co/Z9Yjvx27kH_Pn", + "summary": "Waiting the late night deploy to finish", + "recommended_source": None, + "recommended_color": None, + "highlighted": [], + "note_count": 161, + "title": "Waiting the late night deploy to finish", + "body": "

devopsreactions:

\n
\n

by @uaiHebert

\n
\n\n

Current feels

", + "reblog": { + "tree_html": "

devopsreactions:

\n

\n

by @uaiHebert

\n
", + "comment": "

Current feels

" + }, + "trail": [ + { + "blog": { + "name": "devopsreactions", + "active": True, + "theme": { + "avatar_shape": "square", + "background_color": "#FAFAFA", + "body_font": "Helvetica Neue", + "header_bounds": 0, + "header_image": "https://secure.static.tumblr.com/398b388f3c62803c43c7264c16b05268/qfdjrjv/4Jsnb4pcz/tumblr_static_filename.jpg", + "header_image_focused": "https://secure.static.tumblr.com/398b388f3c62803c43c7264c16b05268/qfdjrjv/4Jsnb4pcz/tumblr_static_filename_2048_v2.jpg", + "header_image_scaled": "https://secure.static.tumblr.com/398b388f3c62803c43c7264c16b05268/qfdjrjv/4Jsnb4pcz/tumblr_static_filename_2048_v2.jpg", + "header_stretch": True, + "link_color": "#529ECC", + "show_avatar": True, + "show_description": True, + "show_header_image": True, + "show_title": True, + "title_color": "#444444", + "title_font": "Gibson", + "title_font_weight": "bold" + }, + "share_likes": False, + "share_following": False + }, + "post": { + "id": "145700589719" + }, + "content_raw": "

\n

by @uaiHebert

", + "content": "

\n

by @uaiHebert

", + "is_root_item": True + }, + { + "blog": { + "name": "codingjester", + "active": True, + "theme": { + "header_full_width": 1080, + "header_full_height": 720, + "header_focus_width": 1080, + "header_focus_height": 607, + "avatar_shape": "circle", + "background_color": "#fafafa", + "body_font": "Helvetica Neue", + "header_bounds": "56,1080,663,0", + "header_image": "https://secure.static.tumblr.com/eb8cd4f05abcaa206aeb8e4cd3c4b64a/lejyfcf/E4mng0imc/tumblr_static_2dkdz3tvwgissggco080wk0k8.jpg", + "header_image_focused": "https://secure.static.tumblr.com/eb8cd4f05abcaa206aeb8e4cd3c4b64a/lejyfcf/9Ghng0imf/tumblr_static_tumblr_static_2dkdz3tvwgissggco080wk0k8_focused_v3.jpg", + "header_image_scaled": "https://secure.static.tumblr.com/eb8cd4f05abcaa206aeb8e4cd3c4b64a/lejyfcf/E4mng0imc/tumblr_static_2dkdz3tvwgissggco080wk0k8_2048_v2.jpg", + "header_stretch": True, + "link_color": "#529ecc", + "show_avatar": True, + "show_description": True, + "show_header_image": True, + "show_title": True, + "title_color": "#444444", + "title_font": "Helvetica Neue", + "title_font_weight": "bold" + }, + "share_likes": True, + "share_following": False + }, + "post": { + "id": "145731610225" + }, + "content_raw": "

Current feels

", + "content": "

Current feels

", + "is_current_item": True + } + ], + "can_send_in_message": True + } \ No newline at end of file diff --git a/tumblr_harvester.py b/tumblr_harvester.py new file mode 100644 index 0000000..c8b31cf --- /dev/null +++ b/tumblr_harvester.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from __future__ import absolute_import +import logging +from sfmutils.harvester import BaseHarvester +import pytumblr +import time + +log = logging.getLogger(__name__) + +QUEUE = "tumblr_harvester" +TIMELINE_ROUTING_KEY = "harvest.start.tumblr.tumblr_user_timeline" +SEARCH_ROUTING_KEY = "harvest.start.tumblr.tumblr_search" + + +class TumblrHarvester(BaseHarvester): + def __init__(self, mq_config=None, debug=False): + BaseHarvester.__init__(self, mq_config=mq_config, debug=debug) + self.tumblrapi = None + + def harvest_seeds(self): + self._create_tumblrarc() + + harvest_type = self.message.get("type") + log.debug("Harvest type is %s", harvest_type) + if harvest_type == "tumblr_user_timeline": + self.user_timeline() + elif harvest_type == "tumblr_search": + self.tag_search() + else: + raise KeyError + + def user_timeline(self): + incremental = self.message.get("options", {}).get("incremental", False) + + for seed in self.message.get("seeds", []): + self._user_post(seed["id"], seed.get("token"), incremental) + if not self.harvest_result.success: + break + + def _user_post(self, seed_id, host_name, incremental): + log.info("Harvesting user %s with seed_id %s. Incremental is %s. 
Sizes is %s", host_name, seed_id, incremental) + assert host_name + # Get offset from state_store + offset = self.state_store.get_state(__name__, + "timeline.{}.offset".format(host_name)) if incremental else None + + max_offset = self._process_posts(self._post(host_name=host_name, offset=offset)) + log.debug("Timeline for %s offset %s returned %s posts.", host_name, + offset, self.harvest_result.stats_summary().get("posts")) + + # Update state store + if incremental and max_offset: + self.state_store.set_state(__name__, "timeline.{}.offset".format(host_name), max_offset) + + def _post(self, host_name, offset): + start_request = 0 + params = { + } + + while True: + if offset: + params['offset'] = offset + start_request += 1 + # log.debug("Fetching %s of %s times.", start_request, max_request) + resp = self.tumblrapi.posts(host_name, **params) + # total_posts = resp['response']['blog']['total_posts'] + posts = resp.json()['posts'] + if len(posts) == 0: + log.info("no new weibo post matching %s", params) + break + + for post in posts: + yield post + + offset = len(posts) + + if start_request == 50: + seconds = 60 + log.info("Reach max request, sleep %d.", seconds) + time.sleep(seconds) + # reset start request + start_request = 0 + + def _process_posts(self, posts): + max_offset = None + for count, post in enumerate(posts): + if not count % 100: + log.debug("Processed %s posts", count) + if self.stop_event.is_set(): + log.debug("Stopping since stop event set.") + break + if "id" in post: + max_offset += 1 + return max_offset + + def tag_search(self): + pass + + def _create_tumblrarc(self): + self.tumblrapi = pytumblr.TumblrRestClient(self.message["credentials"]["consumer_key"], + self.message["credentials"]["consumer_secret"], + self.message["credentials"]["access_token"], + self.message["credentials"]["access_token_secret"]) + + +if __name__ == "__main__": + TumblrHarvester.main(TumblrHarvester, QUEUE, [SEARCH_ROUTING_KEY, TIMELINE_ROUTING_KEY]) diff --git 
#!/usr/bin/env python
"""WARC iterator for records captured from the Tumblr v2 API."""

from __future__ import absolute_import
from sfmutils.warc_iter import BaseWarcIter

# Only records whose URL begins with this prefix are selected.
_TUMBLR_API_PREFIX = "https://api.tumblr.com/v2"


class TumblrWarcIter(BaseWarcIter):
    """BaseWarcIter subclass that selects Tumblr API records.

    Item iteration and item selection are still stubs that return None.
    """

    def __init__(self, file_paths, limit_user_ids=None):
        BaseWarcIter.__init__(self, file_paths)
        # Stored for later use; nothing in this class reads it yet.
        self.limit_user_ids = limit_user_ids

    def _select_record(self, url):
        """Return True when ``url`` points at the Tumblr v2 API."""
        is_tumblr_api = url.startswith(_TUMBLR_API_PREFIX)
        return is_tumblr_api

    def _item_iter(self, url, json_obj):
        """Stub: no items are produced yet."""
        return None

    @staticmethod
    def item_types():
        """Return the list of item type names handled by this iterator."""
        types = ["tumblr_status"]
        return types

    def _select_item(self, item):
        """Stub: no item filtering is implemented yet."""
        return None


if __name__ == "__main__":
    TumblrWarcIter.main(TumblrWarcIter)