### Setup

In [3]:
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-2.3.1/spark-2.3.1-bin-hadoop2.7.tgz
!tar xf spark-2.3.1-bin-hadoop2.7.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.3.1-bin-hadoop2.7"

# !ls

import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate() 
spark

0% [Working]            Hit:1 http://security.ubuntu.com/ubuntu bionic-security InRelease
0% [Connecting to archive.ubuntu.com (185.125.190.39)] [Waiting for headers] [C                                                                               Hit:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
0% [Connecting to archive.ubuntu.com (185.125.190.39)] [Connected to developer.0% [1 InRelease gpgv 88.7 kB] [Connecting to archive.ubuntu.com (185.125.190.39                                                                               Hit:3 http://archive.ubuntu.com/ubuntu bionic InRelease
0% [1 InRelease gpgv 88.7 kB] [Waiting for headers] [Waiting for headers] [Conn                                                                               Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
0% [1 InRelease gpgv 88.7 kB] [Waiting for headers] [Waiting for headers] [Wait                           

### Challenge
1. Add a column to say if country is Mexico
2. Group by Country Column and sum bytes used
3. Group by Country and count the number of IP addresses in each Country

In [9]:
from pyspark.sql.types import *
# Explicit schema for the challenge CSV: three string columns followed by a
# float bytes_used column, so Spark does not have to infer the types.
# NOTE(review): bytes_used is read as a 32-bit float - confirm that this
# precision is sufficient for the values in the file.
schema = (
    StructType()
    .add('ip_address', StringType())
    .add('country', StringType())
    .add('domain_name', StringType())
    .add('bytes_used', FloatType())
)

# The first line of the file is treated as a header row; column names and
# types come from the schema above.
csv_path = '/content/drive/MyDrive/challenge.csv'
df = spark.read.csv(csv_path, schema=schema, header=True)
# Preview the first three rows.
df.show(3)

+--------------+-------+----------------+----------+
|    ip_address|country|     domain_name|bytes_used|
+--------------+-------+----------------+----------+
| 52.81.192.172|  China|odnoklassniki.ru|     463.0|
|119.239.207.13|  China|        youtu.be|      51.0|
| 68.69.217.210|  China|       adobe.com|      10.0|
+--------------+-------+----------------+----------+
only showing top 3 rows



In [10]:
# Import only the helpers the notebook actually uses. A wildcard import of
# pyspark.sql.functions replaces Python builtins such as sum, min, max, abs
# and round with Spark column functions for every cell that runs afterwards.
# `col` is kept available because a later cell sorts with it.
from pyspark.sql.functions import col, when

# Challenge step 1: flag each row with 1 when its country is Mexico and 0
# otherwise. Re-running the cell replaces the 'Mexico?' column rather than
# adding a second one.
df = df.withColumn('Mexico?', when(df.country == 'Mexico', 1).otherwise(0))
df.show()

+---------------+--------------+-----------------+----------+-------+
|     ip_address|       country|      domain_name|bytes_used|Mexico?|
+---------------+--------------+-----------------+----------+-------+
|  52.81.192.172|         China| odnoklassniki.ru|     463.0|      0|
| 119.239.207.13|         China|         youtu.be|      51.0|      0|
|  68.69.217.210|         China|        adobe.com|      10.0|      0|
|   7.191.21.223|      Bulgaria|     linkedin.com|     853.0|      0|
|   211.13.10.68|     Indonesia|          hud.gov|      29.0|      0|
|   239.80.21.97|      Suriname|       smh.com.au|     218.0|      0|
|106.214.106.233|       Jamaica|    amazonaws.com|      95.0|      0|
| 127.242.24.138|         China| surveymonkey.com|     123.0|      0|
|     99.2.6.139|Czech Republic|     geocities.jp|     322.0|      0|
|   237.54.11.63|         China|       amazon.com|      83.0|      0|
| 252.141.157.25|         Japan|      cornell.edu|     374.0|      0|
|185.220.128.248|   

In [13]:
import pyspark.sql.functions as sqlfunc

# Challenge step 2: total bytes_used per country, exposed as 'total_used'.
total_bytes = sqlfunc.sum('bytes_used').alias('total_used')
df_bytes_per_country = df.groupBy('country').agg(total_bytes)
# Show the default preview of the aggregated rows (no explicit ordering).
df_bytes_per_country.show()

+-----------+----------+
|    country|total_used|
+-----------+----------+
|       Chad|     220.0|
|     Russia|   32193.0|
|   Paraguay|     183.0|
|      Yemen|     670.0|
|     Sweden|   15422.0|
|Philippines|   30338.0|
|   Malaysia|    2550.0|
|     Turkey|     880.0|
|     Malawi|     927.0|
|    Germany|    2575.0|
|    Comoros|     742.0|
|Afghanistan|    2538.0|
|     Rwanda|     371.0|
|      Sudan|     714.0|
|     France|   10559.0|
|     Greece|    3423.0|
|  Sri Lanka|    1918.0|
|   Dominica|     161.0|
|  Argentina|    8574.0|
|    Belgium|     389.0|
+-----------+----------+
only showing top 20 rows



In [18]:
# Challenge step 3: number of distinct IP addresses observed per country.
distinct_ips = sqlfunc.countDistinct('ip_address').alias('number_of_ips')
df_ip_per_country = df.groupBy('country').agg(distinct_ips)
# Display the countries with the most distinct addresses first.
df_ip_per_country.orderBy('number_of_ips', ascending=False).show()

+--------------+-------------+
|       country|number_of_ips|
+--------------+-------------+
|         China|          172|
|     Indonesia|          114|
|   Philippines|           65|
|        Russia|           56|
|        Brazil|           35|
|        Poland|           31|
|        Sweden|           28|
|         Japan|           25|
|Czech Republic|           23|
|      Portugal|           23|
|        France|           21|
|          Peru|           19|
|      Colombia|           17|
| United States|           15|
|       Ukraine|           14|
|     Argentina|           14|
|        Mexico|           13|
|      Thailand|           12|
|       Nigeria|           11|
|        Canada|           11|
+--------------+-------------+
only showing top 20 rows

