<a href="https://colab.research.google.com/github/d-vinha/SPBD/blob/main/lab2/SPBD_Labs_mapreduce2_exercise_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MrJob MapReduce Python Example

Word count implemented in pure Python, using the library MrJob.

[MrJob](https://mrjob.readthedocs.io/en/latest/) can be used to write MapReduce jobs and run them on several platforms.

Some key advantages:
+ Write **multi-step** MapReduce jobs in pure Python;
+ Test on your **local machine**;
+ Deploy jobs on cloud platforms from several vendors.

In [None]:
#@title Download the dataset and install MrJob
# Quote the URL so the shell never treats `?` as a glob character.
!wget -q -O os_maias.txt "https://www.dropbox.com/s/n24v0z7y79np319/os_maias.txt?dl=0"
# %pip installs into the running kernel's environment; the version is pinned for reproducibility.
%pip install --quiet mrjob==0.7.4
# Configuration file picked up by the jobs below (they log "Using configs in /etc/mrjob.conf").
!wget -q -O /etc/mrjob.conf "https://raw.githubusercontent.com/smduarte/spbd-2223/main/lab2/mrjob.conf"

## 1. MrJob MapReduce Word Frequency

In [None]:
%%file desc_word_freq.py

import string
from mrjob.job import MRJob, MRStep

MAX_FREQ=100000

class MRWordCountFrequency(MRJob):
    """Three-step job that counts words and lists them by descending frequency.

    Step 1 counts the occurrences of each word, step 2 re-keys every count
    with a zero-padded inverted frequency, and step 3 gathers all pairs under
    a single key and sorts them by count, highest first.
    """

    def steps(self):
        """Return the three map/reduce stages in execution order."""
        count_step = MRStep(mapper=self.mapper_words,
                            reducer=self.reducer_words)
        partition_step = MRStep(mapper=self.mapper_partition_sort,
                                reducer=self.reducer_partition_sort)
        total_step = MRStep(mapper=self.mapper_total_sort,
                            reducer=self.reducer_total_sort)
        return [count_step, partition_step, total_step]

    def mapper_words(self, _, line):
        """Emit (word, 1) for each whitespace-separated token of the line."""
        # Drop ASCII punctuation and the guillemets before tokenising.
        unwanted = str.maketrans('', '', string.punctuation+'«»')
        cleaned = line.strip().translate(unwanted)
        for token in cleaned.split():
            yield token, 1

    def reducer_words(self, key, values):
        """Emit the word together with the sum of its partial counts."""
        total = sum(values)
        yield key, total

    def mapper_partition_sort(self, word, freq):
        """Emit the word under a zero-padded key equal to MAX_FREQ - freq."""
        # Larger counts map to smaller keys. The 5-digit width only holds
        # while 0 < freq <= MAX_FREQ.
        inverted = MAX_FREQ - freq
        yield '%05d' % inverted, word

    def reducer_partition_sort(self, freq, words):
        """Undo the key inversion and emit (word, count) for each word."""
        for word in words:
            count = MAX_FREQ - int(freq)
            yield word, count

    def mapper_total_sort(self, word, freq):
        """Send every (word, count) pair to the single key None."""
        pair = (word, freq)
        yield None, pair

    def reducer_total_sort(self, _, values):
        """Emit all (word, count) pairs ordered by count, largest first."""
        ranked = list(values)
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        for word, freq in ranked:
            yield word, freq

# Run the job only when this file is executed as a script
# (the notebook launches it with `python -m desc_word_freq ...`).
if __name__ == '__main__':
    MRWordCountFrequency.run()

Overwriting desc_word_freq.py


In [None]:
# Remove output left by any earlier run before writing to `results` again.
!rm -rf results
# Run the word-frequency job on os_maias.txt, writing its output to `results`.
!python -m desc_word_freq  --output-dir results --cleanup NONE os_maias.txt
# Preview the first lines of each output file.
!head results/*

Using configs in /etc/mrjob.conf
No configs specified for inline runner
Running step 1 of 3...
Creating temp directory /tmp/desc_word_freq.root.20221011.145554.896627
Running step 2 of 3...
Running step 3 of 3...
job output is in results
"de"	8311
"a"	6736
"o"	6615
"que"	4986
"e"	4533
"um"	3026
"com"	2794
"do"	2571
"da"	2202
"uma"	2170


## 2. Weblog DDOS Attack Analysis

In [None]:
# Quote the URL so the shell never treats `?` as a glob character.
!wget -q -O web.log "https://www.dropbox.com/s/0r8902uj9yum7dg/web.log?dl=0"
# Line, word and byte counts as a quick sanity check of the download.
!wc web.log

  143457   860742 11758533 web.log


### 1. Count the number of unique IP addresses involved in the DDOS attack.



In [None]:
%%file unique_ips.py

from mrjob.job import MRJob, MRStep

class MRUniqueIPs(MRJob):
    """Two-step job that counts the distinct source IPs found in the log."""

    def mapper_ip(self, _, line):
        """Emit the second space-separated field of the line as a key."""
        # Only the key matters here, so the value is None.
        _first, address, _rest = line.strip().split(' ', 2)
        yield address, None

    def reducer_ip(self, ip_source, _):
        """Emit a single 1 for this key, under the shared key None."""
        yield None, 1

    def reducer_filter(self, _, values):
        """Add up the ones emitted by the first step into the final total."""
        total = sum(values)
        yield "UNIQUE IPs", total

    def steps(self):
        """Return the two stages; the second one has a reducer only."""
        return [
            MRStep(mapper=self.mapper_ip, reducer=self.reducer_ip),
            MRStep(reducer=self.reducer_filter),
        ]

# Run the job only when this file is executed as a script
# (the notebook launches it with `python -m unique_ips ...`).
if __name__ == '__main__':
    MRUniqueIPs.run()

Overwriting unique_ips.py


In [None]:
# Remove output left by any earlier run before writing to `results` again.
!rm -rf results
# Run the unique-IP job on web.log, writing its output to `results`.
!python -m unique_ips  --output-dir results --cleanup NONE web.log
# Preview the first lines of each output file.
!head results/*

Using configs in /etc/mrjob.conf
No configs specified for inline runner
Running step 1 of 2...
Creating temp directory /tmp/unique_ips.root.20221011.145610.145745
Running step 2 of 2...
job output is in results
"UNIQUE IPs"	167


### 2. For each interval of 10 seconds, provide the following information: [number of requests, minimum time, average execution time, maximum time]





In [None]:
%%file interval_stats.py

from statistics import *
from mrjob.job import MRJob, MRStep

class MRIntervalStats(MRJob):
    """Per-interval request statistics for the web log.

    Requests are grouped by the first 18 characters of their timestamp and,
    for each group, the job emits (count, min, mean, max) of the execution
    times, in that order.
    """

    def mapper(self, _, line):
        """Emit (interval, execution_time) for one log line.

        Lines with fewer than six space-separated fields, or whose sixth
        field is not a number, are skipped instead of aborting the job.
        """
        vals = line.strip().split(' ')
        if len(vals) < 6:
            return
        try:
            execution_time = float(vals[5])
        except ValueError:
            return
        timestamp = vals[0]
        # YYYY-MM-DDTHH:MM:S -> dropping the last seconds digit gives 10s intervals.
        interval = timestamp[0:18]
        yield interval, execution_time

    def reducer(self, interval, values):
        """Emit (interval, (count, min, mean, max)) of the execution times."""
        # The values are materialised because they are traversed several times.
        times = list(values)
        yield interval, (len(times), min(times), mean(times), max(times))

# Run the job only when this file is executed as a script
# (the notebook launches it with `python -m interval_stats ...`).
if __name__ == '__main__':
    MRIntervalStats.run()

Overwriting interval_stats.py


In [None]:
%%shell
# Remove output left by any earlier run before writing to `results` again.
rm -rf results
# Run the job on web.log; the preview only runs if the job exits successfully.
python -m interval_stats --output-dir results --cleanup NONE web.log && head results/*

Using configs in /etc/mrjob.conf
No configs specified for inline runner
Running step 1 of 1...
Creating temp directory /tmp/interval_stats.root.20221011.151041.444906
job output is in results
==> results/part-00000 <==
"2016-12-06T08:58:3"	[483,0.013,7.593424430641822,46.849]
"2016-12-06T08:58:4"	[2611,0.014,30.15984565300651,69.654]
"2016-12-06T08:58:5"	[5500,0.017,38.52511163636364,80.846]
"2016-12-06T08:59:0"	[6914,0.018,38.534382123228234,81.659]
"2016-12-06T08:59:1"	[6271,0.017,32.96384978472333,83.993]
"2016-12-06T08:59:2"	[5434,0.051,17.29333143172617,77.967]
"2016-12-06T08:59:3"	[8015,0.056,11.21015221459763,67.441]
"2016-12-06T08:59:4"	[7947,0.914,7.7618157795394485,65.706]

==> results/part-00001 <==
"2016-12-06T08:59:5"	[5983,0.678,3.8216643824168477,54.29]
"2016-12-06T09:00:0"	[6882,0.017,8.649971519907004,45.314]
"2016-12-06T09:00:1"	[9719,0.225,7.857372672085606,34.406]
"2016-12-06T09:00:2"	[6616,0.014,4.611345223700121,25.847]
"2016-12-06T09:00:3"	[6771,0.007,1.604763845



### 3. Create an inverted index that, for each 10-second interval and each URL, lists the (unique) IPs that accessed it.

In [None]:
%%file inverted_index.py

from mrjob.job import MRJob, MRStep

class MRInvertedIndex(MRJob):
    """Inverted index from "<interval>-<url>" to the unique source IPs.

    The interval is the first 18 characters of the request timestamp, so each
    key covers one URL within one 10-second window.
    """

    def mapper(self, _, line):
        """Emit ("<interval>-<url>", source_ip) for one log line.

        Lines with fewer than six space-separated fields are skipped.
        """
        vals = line.strip().split(' ')
        if len(vals) < 6:
            return
        timestamp = vals[0]
        interval = timestamp[0:18]  # YYYY-MM-DDTHH:MM:S -> 10s intervals
        source_ip = vals[1]
        target_url = vals[4]
        yield "{}-{}".format(interval, target_url), source_ip

    def reducer(self, key, values):
        """Emit the key with its distinct source IPs, sorted as strings."""
        # The mapper emits one pair per request, so an IP that hits the same
        # URL repeatedly appears many times; the set keeps one copy of each
        # and sorting makes the output order deterministic.
        yield key, sorted(set(values))

# Run the job only when this file is executed as a script
# (the notebook launches it with `python -m inverted_index ...`).
if __name__ == '__main__':
    MRInvertedIndex.run()

Overwriting inverted_index.py


In [None]:
%%shell
# Remove output left by any earlier run before writing to `results` again.
rm -rf results
# Run the job on web.log; the preview only runs if the job exits successfully.
python -m inverted_index --output-dir results web.log && head results/*

Using configs in /etc/mrjob.conf
No configs specified for inline runner
Running step 1 of 1...
Creating temp directory /tmp/inverted_index.root.20221011.151213.583589
job output is in results
Removing temp directory /tmp/inverted_index.root.20221011.151213.583589...
==> results/part-00000 <==
"2016-12-06T08:58:3-\/codemove\/01IX95N3AFP4"	["120.52.73.98","120.52.73.98","120.52.73.98","120.52.73.98","120.52.73.98","120.52.73.98","120.52.73.98","120.52.73.98","120.52.73.98","120.52.73.98","120.52.73.98"]
"2016-12-06T08:58:3-\/codemove\/0GLNQSHCISWJ"	["120.52.73.98","120.52.73.98","120.52.73.98","120.52.73.98","120.52.73.98","120.52.73.98","120.52.73.98","120.52.73.98","120.52.73.98","120.52.73.98","120.52.73.98"]
"2016-12-06T08:58:3-\/codemove\/1N80W0N2R36C"	["120.52.73.97","120.52.73.97","120.52.73.97","120.52.73.97","120.52.73.97","120.52.73.97","120.52.73.97","120.52.73.97","120.52.73.97","120.52.73.97","120.52.73.97"]
"2016-12-06T08:58:3-\/codemove\/1U6HCG3V2S9D"	["185.28.193.95","185

