Upgrade to Storm-crawler 1.8 and Elasticsearch 6.0
including refactoring of packages
sebastian-nagel committed Dec 19, 2017
1 parent 742f9b1 commit 022ad09
Showing 23 changed files with 328 additions and 269 deletions.
43 changes: 16 additions & 27 deletions Dockerfile
@@ -22,43 +22,34 @@ RUN apt-get update -qq && \
#
# Elasticsearch and Kibana
#
-RUN wget -qO - https://packages.elastic.co/GPG-KEY-elasticsearch \
+RUN wget -qO - https://artifacts.elastic.co/GPG-KEY-elasticsearch \
     | apt-key add -
-RUN echo "deb https://packages.elastic.co/elasticsearch/2.x/debian stable main" \
-    >> /etc/apt/sources.list.d/elasticsearch-2.x.list
-RUN echo "deb http://packages.elastic.co/kibana/4.5/debian stable main" \
-    >> /etc/apt/sources.list.d/elasticsearch-2.x.list
+RUN echo "deb https://artifacts.elastic.co/packages/6.x/apt stable main" \
+    >> /etc/apt/sources.list.d/elasticsearch-6.x.list
RUN apt-get update -qq && \
apt-get install -yq --no-install-recommends \
-    elasticsearch=2.3.1 \
-    kibana=4.5.1
+    elasticsearch=6.0.1 \
+    kibana=6.0.1
RUN ln -s /usr/share/elasticsearch/bin/elasticsearch /usr/bin/elasticsearch
-RUN ln -s /opt/kibana/bin/kibana /usr/bin/kibana
-RUN mkdir /var/log/kibana && chown -R kibana:kibana /var/log/kibana
-RUN chown -R kibana:kibana /opt/kibana/
-# install marvel, see https://www.elastic.co/downloads/marvel
-USER elasticsearch
-RUN /usr/share/elasticsearch/bin/plugin install license
-RUN /usr/share/elasticsearch/bin/plugin install marvel-agent
-USER kibana
-RUN /opt/kibana/bin/kibana plugin --install elasticsearch/marvel/latest
-RUN /opt/kibana/bin/kibana plugin --install elastic/sense
+RUN ln -s /usr/share/kibana/bin/kibana /usr/bin/kibana
+RUN chown -R kibana:kibana /usr/share/kibana/
USER root
-# system configuration, see https://www.elastic.co/guide/en/elasticsearch/reference/current/setup-configuration.html
+# system configuration, see https://www.elastic.co/guide/en/elasticsearch/reference/current/deb.html
ADD etc/sysctl.d/60-elasticsearch.conf /etc/sysctl.d/60-elasticsearch.conf
ADD etc/supervisor/conf.d/elasticsearch.conf /etc/supervisor/conf.d/elasticsearch.conf
ADD etc/supervisor/conf.d/kibana.conf /etc/supervisor/conf.d/kibana.conf
RUN chmod -R 644 /etc/sysctl.d/60-elasticsearch.conf /etc/supervisor/conf.d/*.conf
ENV ES_HEAP_SIZE=20g
-# enable updates via scripting
-RUN echo "\n\nscript.engine.groovy.inline.update: true\n" >>/etc/elasticsearch/elasticsearch.yml
+RUN sed -Ei 's@^path\.data: .*@path.data: /data/elasticsearch@' /etc/elasticsearch/elasticsearch.yml

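Two notes on the Elasticsearch settings above. First, Elasticsearch 6.x enforces bootstrap checks, so the sysctl file added above has to raise vm.max_map_count; a plausible minimal content (an assumption, the actual file in the repository may set more):

```sh
# assumed content of etc/sysctl.d/60-elasticsearch.conf:
# the 6.x bootstrap checks demand vm.max_map_count >= 262144
vm.max_map_count=262144
```

Second, ES_HEAP_SIZE was the 2.x-era mechanism; Elasticsearch 6.x normally reads its heap size from /etc/elasticsearch/jvm.options or ES_JAVA_OPTS, so the `ENV ES_HEAP_SIZE=20g` kept above may no longer take effect as-is.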
#
# Apache Storm
#
-ENV STORM_VERSION=1.0.1
-RUN wget -q -O - http://mirrors.ukfast.co.uk/sites/ftp.apache.org/storm/apache-storm-$STORM_VERSION/apache-storm-$STORM_VERSION.tar.gz \
-    | tar -xzf - -C /opt
+ENV STORM_VERSION=1.1.1
+COPY downloads/apache-storm-$STORM_VERSION.tar.gz /tmp/apache-storm-$STORM_VERSION.tar.gz
+RUN tar -xzf /tmp/apache-storm-$STORM_VERSION.tar.gz -C /opt
+RUN rm /tmp/apache-storm-$STORM_VERSION.tar.gz
ENV STORM_HOME /opt/apache-storm-$STORM_VERSION
RUN groupadd storm && \
useradd --gid storm --home-dir /home/storm \
@@ -89,15 +80,13 @@ RUN mkdir news-crawler/ && \
mkdir news-crawler/seeds/ && \
chmod -R a+rx news-crawler/
# add the news crawler uber-jar
-ADD target/crawler-1.4-SNAPSHOT.jar news-crawler/lib/crawler.jar
+ADD target/crawler-1.8-SNAPSHOT.jar news-crawler/lib/crawler.jar
# and configuration files
ADD conf/*.* news-crawler/conf/
-ADD seeds/feeds.txt news-crawler/seeds/
+ADD seeds/*.txt news-crawler/seeds/
ADD bin/*.sh news-crawler/bin/
ADD bin/es_status news-crawler/bin/
-# add storm-crawler/external/elasticsearch/ES_IndexInit.sh
-RUN wget -O news-crawler/bin/ES_IndexInit.sh \
-    https://raw.githubusercontent.com/DigitalPebble/storm-crawler/master/external/elasticsearch/ES_IndexInit.sh

USER root
RUN chown -R ubuntu:ubuntu /home/ubuntu && \
chmod -R a+r /home/ubuntu && \
20 changes: 16 additions & 4 deletions README.md
@@ -39,7 +39,7 @@ You can check that the URLs have been injected on [http://localhost:9200/status/
You can then run the crawl topology with:

``` sh
-storm jar target/crawler-1.0-SNAPSHOT.jar com.digitalpebble.stormcrawler.CrawlTopology -conf conf/es-conf.yaml -conf conf/crawler-conf.yaml
+storm jar target/crawler-1.0-SNAPSHOT.jar org.commoncrawl.stormcrawler.news.CrawlTopology -conf conf/es-conf.yaml -conf conf/crawler-conf.yaml
```

The topology will create WARC files in the directory specified in the configuration under the key `warc.dir`. This directory must be created beforehand.
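For illustration, the directory can be created and the key set like this (a sketch; the /data/warc path is an assumption matching the volume mounts used further below):

```sh
# illustrative only: create the WARC output directory and point warc.dir at it
mkdir -p /data/warc
grep -q '^warc.dir:' conf/crawler-conf.yaml \
  || echo 'warc.dir: "/data/warc"' >> conf/crawler-conf.yaml
```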
@@ -52,9 +52,21 @@ See instructions on [https://github.com/DigitalPebble/storm-crawler/tree/master/
Run Crawl from Docker Container
-------------

-Build the Docker image from the [Dockerfile](./Dockerfile):
+First, download Apache Storm:
```
-docker build -t newscrawler:1.0 .
+STORM_VERSION=1.1.1
+mkdir downloads
+wget -q -P downloads --timestamping http://mirrors.ukfast.co.uk/sites/ftp.apache.org/storm/apache-storm-$STORM_VERSION/apache-storm-$STORM_VERSION.tar.gz
```

+Second, download the script to create the Elasticsearch index:
+```
+wget -O bin/ES_IndexInit.sh https://raw.githubusercontent.com/DigitalPebble/storm-crawler/master/external/elasticsearch/ES_IndexInit.sh
+```
+
+Then build the Docker image from the [Dockerfile](./Dockerfile):
+```
+docker build -t newscrawler:1.8 .
```

Note: the uberjar is included in the Docker image and needs to be built first.
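A sketch of that build step, assuming the standard Maven layout implied by target/crawler-1.8-SNAPSHOT.jar:

```sh
# build the uberjar before running docker build (Maven assumed)
mvn clean package
```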
@@ -66,7 +78,7 @@ docker run --net=host \
-p 5601:5601 -p 8080:8080 \
-v .../newscrawl/elasticsearch:/data/elasticsearch \
-v .../newscrawl/warc:/data/warc \
-    --rm -i -t newscrawler:1.0 /bin/bash
+    --rm -i -t newscrawler:1.8 /bin/bash
```

NOTE: don't forget to adapt the paths to mounted volumes used to persist data on the host.
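With the placeholders substituted, a complete invocation could look like this; the /srv/newscrawl host paths are purely illustrative:

```sh
docker run --net=host \
    -p 5601:5601 -p 8080:8080 \
    -v /srv/newscrawl/elasticsearch:/data/elasticsearch \
    -v /srv/newscrawl/warc:/data/warc \
    --rm -i -t newscrawler:1.8 /bin/bash
```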
73 changes: 32 additions & 41 deletions aws/packer/bootstrap.sh
@@ -28,46 +28,38 @@ sudo cp /tmp/install/etc/supervisor/supervisord.conf /etc/supervisor/supervisord
#
# see https://www.elastic.co/guide/en/elasticsearch/reference/master/rpm.html
#
-sudo rpm --import https://packages.elastic.co/GPG-KEY-elasticsearch
+sudo rpm --import https://artifacts.elastic.co/GPG-KEY-elasticsearch
sudo bash -c 'cat >/etc/yum.repos.d/elasticsearch.repo <<"EOF"
-[elasticsearch-2.x]
-name=Elasticsearch repository for 2.x packages
-baseurl=https://packages.elastic.co/elasticsearch/2.x/centos
+[elasticsearch-6.x]
+name=Elasticsearch repository for 6.x packages
+baseurl=https://artifacts.elastic.co/packages/6.x/yum
gpgcheck=1
-gpgkey=https://packages.elastic.co/GPG-KEY-elasticsearch
+gpgkey=https://artifacts.elastic.co/GPG-KEY-elasticsearch
enabled=1
autorefresh=1
type=rpm-md
-[kibana-4.5]
-name=Kibana repository for 4.5.x packages
-baseurl=http://packages.elastic.co/kibana/4.5/centos
+[kibana-6.x]
+name=Kibana repository for 6.x packages
+baseurl=https://artifacts.elastic.co/packages/6.x/yum
gpgcheck=1
-gpgkey=http://packages.elastic.co/GPG-KEY-elasticsearch
+gpgkey=https://artifacts.elastic.co/GPG-KEY-elasticsearch
enabled=1
autorefresh=1
type=rpm-md
EOF'

-sudo yum install -y elasticsearch-2.3.1 kibana-4.5.1
+sudo yum install -y elasticsearch-6.0.1 kibana-6.0.1
sudo chkconfig --add elasticsearch

-sudo -u elasticsearch /usr/share/elasticsearch/bin/plugin install -b license
-sudo -u elasticsearch /usr/share/elasticsearch/bin/plugin install -b marvel-agent
-
-sudo /opt/kibana/bin/kibana plugin --install elasticsearch/marvel/latest
-sudo /opt/kibana/bin/kibana plugin --install elastic/sense
+sudo /usr/share/elasticsearch/bin/elasticsearch-plugin install -b repository-s3

sudo ln -s /usr/share/elasticsearch/bin/elasticsearch /usr/bin/elasticsearch
-sudo ln -s /opt/kibana/bin/kibana /usr/bin/kibana

-sudo groupadd kibana && sudo useradd --gid kibana kibana
-sudo chown -R kibana:kibana /var/log/kibana
-sudo chown -R kibana:kibana /opt/kibana/

-sudo cp /tmp/install/etc/sysctl.d/60-elasticsearch.conf /etc/sysctl.d/
-sudo cp /tmp/install/elasticsearch.conf /etc/supervisor/conf.d/
-sudo cp /tmp/install/kibana.conf /etc/supervisor/conf.d/
+sudo cp /tmp/install/etc/sysctl.d/60-elasticsearch.conf /etc/sysctl.d/
+sudo cp /tmp/install/etc/supervisor/conf.d/elasticsearch.conf /etc/supervisor/conf.d/
+sudo cp /tmp/install/etc/supervisor/conf.d/kibana.conf /etc/supervisor/conf.d/

# must start elasticsearch via supervisorctl
# TODO: avoid issues if it's started erroneously via
@@ -79,11 +71,11 @@ sudo cp /tmp/install/kibana.conf /etc/supervisor/conf.d/
#
# Apache Storm and Zookeeper
#
-ZOOKEEPER_VERSION=3.4.8
+ZOOKEEPER_VERSION=3.4.11
wget -q -O - http://mirrors.ukfast.co.uk/sites/ftp.apache.org/zookeeper/zookeeper-$ZOOKEEPER_VERSION/zookeeper-$ZOOKEEPER_VERSION.tar.gz \
| sudo tar -xzf - -C /opt
ZOOKEEPER_HOME=/opt/zookeeper-$ZOOKEEPER_VERSION
-STORM_VERSION=1.0.1
+STORM_VERSION=1.1.1
wget -q -O - http://mirrors.ukfast.co.uk/sites/ftp.apache.org/storm/apache-storm-$STORM_VERSION/apache-storm-$STORM_VERSION.tar.gz \
| sudo tar -xzf - -C /opt
STORM_HOME=/opt/apache-storm-$STORM_VERSION
@@ -98,8 +90,8 @@ sudo ln -s $STORM_HOME/bin/storm /usr/bin/storm
sudo ln -s $ZOOKEEPER_HOME/conf/zoo_sample.cfg $ZOOKEEPER_HOME/conf/zoo.cfg
sudo ln -s $ZOOKEEPER_HOME /usr/share/zookeeper
sudo bash <<EOF
-cp etc/supervisor/conf.d/storm-*.conf /etc/supervisor/conf.d/
-cp etc/supervisor/conf.d/zookeeper.conf /etc/supervisor/conf.d/
+cp /tmp/install/etc/supervisor/conf.d/storm-*.conf /etc/supervisor/conf.d/
+cp /tmp/install/etc/supervisor/conf.d/zookeeper.conf /etc/supervisor/conf.d/
chmod 644 /etc/supervisor/conf.d/*.conf
EOF

@@ -108,32 +100,31 @@ EOF
#
# Storm crawler / News crawler
#
-cp /tmp/install/newscrawler .
+cp -r /tmp/install/news-crawler .
mkdir -p news-crawler/{conf,bin,lib,seeds}
-# seeds must readable for user "storm"
+# seeds must be readable for user "storm"
chmod a+rx news-crawler/seeds/
chmod 644 news-crawler/seeds/*
cp /tmp/install/bin/*.sh news-crawler/bin/
-cp /tmp/install/news-crawler/lib/crawler-1.0-SNAPSHOT.jar news-crawler/lib/
-wget -O news-crawler/bin/ES_IndexInit.sh https://raw.githubusercontent.com/DigitalPebble/storm-crawler/master/external/elasticsearch/ES_IndexInit.sh
+cp /tmp/install/news-crawler/lib/crawler-1.8-SNAPSHOT.jar news-crawler/lib/
chmod u+x news-crawler/bin/*


#
# Volumes
#
sudo bash <<EOF
-mkdir /data/elasticsearch /data/warc
-echo "/dev/xvdb /data/elasticsearch auto defaults,nofail,comment=cloudconfig 0 2" >>/etc/fstab
-echo "/dev/xvdc /data/warc auto defaults,nofail,comment=cloudconfig 0 2" >>/etc/fstab
+mkdir /data
+echo "/dev/sdb /data auto defaults,nofail,comment=cloudconfig 0 2" >>/etc/fstab
EOF

-# TODO: mount volumes and set owner and permissions
-# mount /data/elasticsearch
-# chown -R elasticsearch:elasticsearch /data/elasticsearch
-# mount /data/warc
-# chown -R storm:storm /data/warc

+# mount volumes and set owner and permissions
+sudo mount /data
+sudo mkdir /data/elasticsearch
+sudo chown -R elasticsearch:elasticsearch /data/elasticsearch
+sudo mkdir /data/warc
+sudo chown -R storm:storm /data/warc

-# TODO: cronjob to upload WARC files

+# TODO cronjobs:
+# - to upload WARC files
+# - to backup Elasticsearch status index
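After provisioning, sanity checks along these lines can confirm the services and the volume layout (illustrative commands, not part of the script):

```sh
sudo supervisorctl status      # elasticsearch, kibana, zookeeper and storm daemons should be RUNNING
df -h /data                    # the ephemeral volume from the fstab entry above should be mounted
curl -s 'http://localhost:9200/_cluster/health?pretty'   # Elasticsearch should respond
```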
48 changes: 35 additions & 13 deletions aws/packer/newscrawl-ami.json
@@ -1,32 +1,54 @@
{
"variables": {
"aws_access_key": "{{env `AWS_ACCESS_KEY_ID`}}",
"aws_secret_key": "{{env `AWS_SECRET_ACCESS_KEY`}}"
"aws_secret_key": "{{env `AWS_SECRET_ACCESS_KEY`}}",
"aws_vpc_id": "{{env `AWS_VPC_ID`}}",
"aws_subnet_id": "{{env `AWS_SUBNET_ID`}}",
"aws_security_group_id": "{{env `AWS_SECURITY_GROUP_ID`}}"
},
"builders": [{
"type": "amazon-ebs",
"source_ami_filter": {
"filters": {
"virtualization-type": "hvm",
"name": "amzn-ami-hvm-*",
"root-device-type": "ebs"
},
"most_recent": true
},
"instance_type": "r4.xlarge",
"region": "us-east-1",
"access_key": "{{user `aws_access_key`}}",
"secret_key": "{{user `aws_secret_key`}}",
"region": "us-east-1",
"source_ami": "ami-6869aa05",
"instance_type": "m3.xlarge",
"vpc_id": "{{user `aws_vpc_id`}}",
"subnet_id": "{{user `aws_subnet_id`}}",
"security_group_id": "{{user `aws_security_group_id`}}",
"associate_public_ip_address": true,
"ssh_username": "ec2-user",
"ssh_timeout": "1200s",
"ami_name": "Common Crawl News Crawler {{timestamp}}",
"ami_regions": ["us-east-1"],
"ami_groups": ["all"]
"ami_groups": ["all"],
"ami_block_device_mappings": [
{
"device_name": "/dev/sdb",
"virtual_name": "ephemeral0",
"volume_size": 200,
"volume_type": "gp2",
"delete_on_termination": true
}
]
}],
"provisioners": [
-    {
-      "type": "file",
-      "source": "etc",
-      "destination": "/tmp/install/etc"
-    },
+    {
+      "type": "shell",
+      "inline": ["mkdir -p /tmp/install/news-crawler/lib/",
+                 "mkdir -p /tmp/install/news-crawler/seeds/"]
+    },
+    {
+      "type": "file",
+      "source": "etc",
+      "destination": "/tmp/install/etc"
+    },
{
"type": "file",
"source": "conf",
@@ -39,8 +61,8 @@
},
{
"type": "file",
"source": "target/crawler-1.0-SNAPSHOT.jar",
"destination": "/tmp/install/news-crawler/lib/crawler-1.0-SNAPSHOT.jar"
"source": "target/crawler-1.8-SNAPSHOT.jar",
"destination": "/tmp/install/news-crawler/lib/crawler-1.8-SNAPSHOT.jar"
},
{
"type": "file",
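With the new variables in place, building the AMI might look like this (the resource IDs are placeholders; AWS credentials are read from the usual environment variables):

```sh
# placeholder IDs, substitute values from your AWS account
export AWS_VPC_ID=vpc-0123456789abcdef0
export AWS_SUBNET_ID=subnet-0123456789abcdef0
export AWS_SECURITY_GROUP_ID=sg-0123456789abcdef0
packer validate aws/packer/newscrawl-ami.json
packer build aws/packer/newscrawl-ami.json
```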