# Flume
![Flume](https://flume.apache.org/_static/flume-logo.png)

- https://flume.apache.org

## Setup

- Download the binary tarball (`apache-flume-1.9.0-bin.tar.gz`) from https://flume.apache.org/download.html
- Version used in this notebook: 1.9.0

In [6]:
%%bash

# Install Flume 1.9.0 under /opt and register it in /opt/envvars.sh.
# The cell is written so that it can be re-run without side effects piling up.

# Download package
# NOTE: the download below is commented out, so the tarball is expected to
# already be present in /opt/pkgs before this cell runs.
cd /opt/pkgs
#wget -q -c https://downloads.apache.org/flume/1.9.0/apache-flume-1.9.0-bin.tar.gz

# unpack file and create link
tar -zxf /opt/pkgs/apache-flume-1.9.0-bin.tar.gz -C /opt
# -f/-n: replace an existing /opt/flume link in place; a plain `ln -s` on a
# second run would instead create a nested link inside the directory that
# /opt/flume already points to.
ln -sfn /opt/apache-flume-1.9.0-bin /opt/flume

# update guava library on Flume
# Swap Flume's bundled guava 11.0.2 jar for the guava 27.0 jar shipped with
# Hadoop (presumably to avoid a guava version clash when the HDFS sink loads
# Hadoop classes -- confirm against your Hadoop version).
rm -f /opt/flume/lib/guava-11.0.2.jar
cp -f /opt/hadoop/share/hadoop/common/lib/guava-27.0-jre.jar /opt/flume/lib

# update envvars.sh
# Append the Flume stanza only once; without the guard every re-run adds a
# duplicate block and another copy of ${FLUME_HOME}/bin to PATH.
if ! grep -q '^export FLUME_HOME=' /opt/envvars.sh 2>/dev/null; then
cat >> /opt/envvars.sh << EOF
# Flume
export FLUME_HOME=/opt/flume
export PATH=\${PATH}:\${FLUME_HOME}/bin

EOF
fi

cat /opt/envvars.sh

export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64
export PDSH_RCMD_TYPE=ssh

export HADOOP_HOME=/opt/hadoop
export HADOOP_COMMON_HOME=${HADOOP_HOME}
export HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop
export HADOOP_HDFS_HOME=${HADOOP_HOME}
export HADOOP_MAPRED_HOME=${HADOOP_HOME}
export HADOOP_YARN_HOME=${HADOOP_HOME}

export PATH=${PATH}:${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin     

# Flume
export FLUME_HOME=/opt/flume
export PATH=${PATH}:${FLUME_HOME}/bin



In [7]:
# Load environment variables
# Load the dotenv IPython extension, which provides the %dotenv magic.
%load_ext dotenv
# Read /opt/envvars.sh into this kernel's environment. -o is passed so that
# values from the file override variables that are already set (per the
# python-dotenv IPython documentation).
%dotenv -o /opt/envvars.sh
# With no arguments, %env lists the kernel's current environment variables.
%env

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


{'HOSTNAME': 'hadoop',
 'OLDPWD': '/',
 'PWD': '/opt',
 'HOME': '/home/hadoop',
 'SHELL': '/bin/bash',
 'SHLVL': '1',
 'PATH': '/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/hadoop/bin:/opt/hadoop/sbin:/opt/hadoop/bin:/opt/hadoop/sbin:/opt/hadoop/bin:/opt/hadoop/sbin:/opt/flume/bin',
 '_': '/usr/bin/nohup',
 'JPY_PARENT_PID': '2960',
 'TERM': 'xterm-color',
 'CLICOLOR': '1',
 'PAGER': 'cat',
 'GIT_PAGER': 'cat',
 'MPLBACKEND': 'module://ipykernel.pylab.backend_inline',
 'JAVA_HOME': '/usr/lib/jvm/java-1.8.0-openjdk-amd64',
 'PDSH_RCMD_TYPE': 'ssh',
 'HADOOP_HOME': '/opt/hadoop',
 'HADOOP_COMMON_HOME': '/opt/hadoop',
 'HADOOP_CONF_DIR': '/opt/hadoop/etc/hadoop',
 'HADOOP_HDFS_HOME': '/opt/hadoop',
 'HADOOP_MAPRED_HOME': '/opt/hadoop',
 'HADOOP_YARN_HOME': '/opt/hadoop',
 'FLUME_HOME': '/opt/flume'}

## Tailagent example

- https://flume.apache.org/releases/content/1.9.0/FlumeUserGuide.html

In [8]:
%%writefile /opt/flume/conf/tailagent.conf
# tailagent: exec source -> memory channel -> HDFS sink

# Components declared for this agent
tailagent.sources = execsource
tailagent.channels = memchannel
tailagent.sinks = hdfssink

# Channel: in-memory buffer between the source and the sink
tailagent.channels.memchannel.type = memory

# Source: events are read from the output of `tail -F /tmp/events`
tailagent.sources.execsource.type = exec
tailagent.sources.execsource.command = tail -F /tmp/events
tailagent.sources.execsource.channels = memchannel

# Sink: write events to HDFS under /tmp, in files prefixed "tailevents-"
tailagent.sinks.hdfssink.type = hdfs
tailagent.sinks.hdfssink.hdfs.path = /tmp
tailagent.sinks.hdfssink.hdfs.filePrefix = tailevents-
tailagent.sinks.hdfssink.hdfs.fileType = DataStream
tailagent.sinks.hdfssink.channel = memchannel

Writing /opt/flume/conf/tailagent.conf


In [9]:
%%bash

# Start the "tailagent" Flume agent as a background job from the Flume
# install directory; stdout and stderr both go to ./tailagent.output.
cd /opt/flume

flume-ng agent -n tailagent -c ./conf -f ./conf/tailagent.conf > ./tailagent.output 2>&1 &

# Record the background job's PID so later cells can inspect and stop it.
echo $! > ./tailagent.pid

# Show the process that was just launched.
ps -fp $(< ./tailagent.pid)

UID          PID    PPID  C STIME TTY          TIME CMD
hadoop      6523    6522  0 21:21 ?        00:00:00 /bin/bash /opt/flume/bin/flume-ng agent -n tailagent -c ./conf -f ./conf/tailagent.conf


In [10]:
%%bash

# run random generator in background
cd /opt/flume

# Generate the helper script. The heredoc delimiter is quoted, so the body is
# written verbatim and ${RANDOM} is evaluated each time the generated script
# loops, not when this cell runs.
# The shebang pins the interpreter to bash: ${RANDOM} is a bash feature, and
# without a shebang the script only works when the launching shell is bash.
cat > randomgen.sh << 'EOF'
#!/bin/bash
# Append one random number per second to /tmp/events.
while true
do
    echo ${RANDOM} >> /tmp/events
    sleep 1
done
EOF

chmod +x randomgen.sh
./randomgen.sh > /dev/null 2>&1 &
# Record the background job's PID so it can be stopped later.
echo $! > ./randomgen.pid

ps -fp $(cat ./randomgen.pid)

UID          PID    PPID  C STIME TTY          TIME CMD
hadoop      6656    6653  0 21:22 ?        00:00:00 bash


In [12]:
%%bash

# check files generated in HDFS
# The pattern is quoted so that it reaches `hdfs dfs` unexpanded; unquoted, the
# local shell would expand it first if anything in the local /tmp matched.
hdfs dfs -ls '/tmp/tailevents*'

-rw-r--r--   2 hadoop supergroup         55 2021-06-27 21:22 /tmp/tailevents-.1624828948256
-rw-r--r--   2 hadoop supergroup         56 2021-06-27 21:22 /tmp/tailevents-.1624828948257
-rw-r--r--   2 hadoop supergroup         58 2021-06-27 21:22 /tmp/tailevents-.1624828948258
-rw-r--r--   2 hadoop supergroup         58 2021-06-27 21:23 /tmp/tailevents-.1624828948259
-rw-r--r--   2 hadoop supergroup          0 2021-06-27 21:23 /tmp/tailevents-.1624828948260.tmp


In [13]:
%%bash

# cat files
# The pattern is quoted so that it reaches `hdfs dfs` unexpanded; unquoted, the
# local shell would expand it first if anything in the local /tmp matched.
hdfs dfs -cat '/tmp/tailevents*'

4308
30433
24720
11196
24570
8321
2580
16701
7703
2517
2763
19774
24041
7424
1814
18229
7919
25994
25194
10512
30738
1007
25130
21030
25301
8267
22361
16686
15422
13443
12395
22523
11434
3349
26125
1759
15299
16053
10045
19450


In [14]:
%%bash

# Stop the demo processes and remove everything the example created.
cd /opt/flume

# kill random generator
kill $(cat randomgen.pid)
# also remove the generated script; -f keeps a partial re-run from failing
rm -f randomgen.pid randomgen.sh

# kill tailagent
agent_pid=$(cat tailagent.pid)
kill "${agent_pid}"
# Give the agent up to 30 seconds to exit before touching its HDFS output, so
# it is no longer writing files while they are being deleted.
for _ in $(seq 1 30); do
    kill -0 "${agent_pid}" 2>/dev/null || break
    sleep 1
done
rm -f tailagent.pid tailagent.output

# remove files
# local event log written by the random generator
rm -f /tmp/events
# The pattern is quoted so that it reaches `hdfs dfs` unexpanded rather than
# being expanded by the local shell against the local /tmp.
hdfs dfs -rm '/tmp/tailevents*'

Deleted /tmp/tailevents-.1624828948256
Deleted /tmp/tailevents-.1624828948257
Deleted /tmp/tailevents-.1624828948258
Deleted /tmp/tailevents-.1624828948259
Deleted /tmp/tailevents-.1624828948260
Deleted /tmp/tailevents-.1624828948261
Deleted /tmp/tailevents-.1624828948262
Deleted /tmp/tailevents-.1624828948263.tmp
