In [5]:
import os

import pyspark
import pyspark.sql.functions as sqlfunc
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [3]:
# Obtain a Spark session via the builder: getOrCreate() returns the existing
# session when there is one and starts a new one otherwise.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)
# Bare expression so the notebook displays the session object.
spark

In [4]:
# Location of the challenge dataset. Set the CHALLENGE_CSV environment variable
# to point at your own copy of the file; when it is unset, the original
# author's local path is used so existing behaviour is unchanged.
challenge_csv = os.environ.get(
    "CHALLENGE_CSV",
    r"C:\Users\maheshbabu.devathi\OneDrive - Advance Auto Parts\Desktop\AAP\practice\pyspark\data\challenge.csv",
)

# First pass: read using only the header row. No schema is supplied here, so
# every column comes back as a string (see the dtypes check further down).
df_chl = spark.read.csv(challenge_csv, header=True)

In [7]:
# Preview the first two rows of the freshly loaded data.
df_chl.show(n=2)

+--------------+-------+----------------+----------+
|    ip_address|Country|     Domain Name|Bytes_used|
+--------------+-------+----------------+----------+
| 52.81.192.172|  China|odnoklassniki.ru|       463|
|119.239.207.13|  China|        youtu.be|        51|
+--------------+-------+----------------+----------+
only showing top 2 rows



In [6]:
# Inspect the column types of the schema-less read: all four columns,
# including Bytes_used, are reported as strings.
df_chl.dtypes

[('ip_address', 'string'),
 ('Country', 'string'),
 ('Domain Name', 'string'),
 ('Bytes_used', 'string')]

In [12]:
# Explicit schema for challenge.csv: three text columns plus Bytes_used,
# which is parsed as a float instead of being left as a string.
myschema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in (
        ('ip_address', StringType()),
        ('Country', StringType()),
        ('Domain Name', StringType()),
        ('Bytes_used', FloatType()),
    )
])

In [13]:
# Location of the challenge dataset. Set the CHALLENGE_CSV environment variable
# to point at your own copy of the file; when it is unset, the original
# author's local path is used so existing behaviour is unchanged.
challenge_csv = os.environ.get(
    "CHALLENGE_CSV",
    r"C:\Users\maheshbabu.devathi\OneDrive - Advance Auto Parts\Desktop\AAP\practice\pyspark\data\challenge.csv",
)

# Second pass: re-read the same file, this time applying `myschema` so that
# Bytes_used is loaded as a numeric column. This replaces the earlier
# all-string `df_chl`.
df_chl = spark.read.csv(challenge_csv, header=True, schema=myschema)

In [14]:
# Preview two rows of the typed frame; Bytes_used now prints as a decimal number.
df_chl.show(n=2)

+--------------+-------+----------------+----------+
|    ip_address|Country|     Domain Name|Bytes_used|
+--------------+-------+----------------+----------+
| 52.81.192.172|  China|odnoklassniki.ru|     463.0|
|119.239.207.13|  China|        youtu.be|      51.0|
+--------------+-------+----------------+----------+
only showing top 2 rows



In [15]:
# Confirm the explicit schema took effect: Bytes_used is now reported as
# float while the other three columns remain strings.
df_chl.dtypes

[('ip_address', 'string'),
 ('Country', 'string'),
 ('Domain Name', 'string'),
 ('Bytes_used', 'float')]

C1) Add a column that says "Yes" or "No" depending on whether the country is Mexico <br>
C2) Group by your new column and sum `Bytes_used`<br>
C3) Group by Country and use the `sqlfunc.countDistinct` function to calculate the number of distinct IP addresses seen in each country

In [16]:
## C1
# Flag every row: 'Yes' when its Country is exactly 'Mexico', 'No' otherwise.
is_mexico = df_chl['Country'] == 'Mexico'
mexico_flag = when(is_mexico, 'Yes').otherwise('No')
df_c1 = df_chl.withColumn('Cnt_Mex', mexico_flag)
df_c1.show()

+---------------+--------------+-----------------+----------+-------+
|     ip_address|       Country|      Domain Name|Bytes_used|Cnt_Mex|
+---------------+--------------+-----------------+----------+-------+
|  52.81.192.172|         China| odnoklassniki.ru|     463.0|     No|
| 119.239.207.13|         China|         youtu.be|      51.0|     No|
|  68.69.217.210|         China|        adobe.com|      10.0|     No|
|   7.191.21.223|      Bulgaria|     linkedin.com|     853.0|     No|
|   211.13.10.68|     Indonesia|          hud.gov|      29.0|     No|
|   239.80.21.97|      Suriname|       smh.com.au|     218.0|     No|
|106.214.106.233|       Jamaica|    amazonaws.com|      95.0|     No|
| 127.242.24.138|         China| surveymonkey.com|     123.0|     No|
|     99.2.6.139|Czech Republic|     geocities.jp|     322.0|     No|
|   237.54.11.63|         China|       amazon.com|      83.0|     No|
| 252.141.157.25|         Japan|      cornell.edu|     374.0|     No|
|185.220.128.248|   

In [24]:
# C2
# Sum Bytes_used separately for the Mexico ('Yes') and non-Mexico ('No') rows.
# `sqlfunc` (pyspark.sql.functions) is imported in the notebook's import cell
# rather than here, so later cells do not depend on this one having run.
df_c2 = (
    df_c1
    .groupBy('Cnt_Mex')
    .agg(sqlfunc.sum('Bytes_used').alias('Total Bytes Used'))
)

In [25]:
# Display the total bytes used for each Cnt_Mex group.
df_c2.show()

+-------+----------------+
|Cnt_Mex|Total Bytes Used|
+-------+----------------+
|     No|        508076.0|
|    Yes|          6293.0|
+-------+----------------+



In [26]:
#C3
# For each country, count how many distinct ip_address values appear.
df_c3 = (
    df_chl
    .groupBy('Country')
    .agg(
        sqlfunc.countDistinct('ip_address').alias('Unique_IP_address')
    )
)
df_c3.show()

+-----------+-----------------+
|    Country|Unique_IP_address|
+-----------+-----------------+
|       Chad|                1|
|     Russia|               56|
|   Paraguay|                1|
|      Yemen|                1|
|     Sweden|               28|
|Philippines|               65|
|   Malaysia|                5|
|     Turkey|                1|
|     Malawi|                2|
|    Germany|                5|
|    Comoros|                1|
|Afghanistan|                5|
|     Rwanda|                1|
|      Sudan|                1|
|     France|               21|
|     Greece|                8|
|  Sri Lanka|                3|
|   Dominica|                1|
|  Argentina|               14|
|    Belgium|                1|
+-----------+-----------------+
only showing top 20 rows



In [27]:
# Show the countries with the most distinct IP addresses first.
df_c3.orderBy(df_c3['Unique_IP_address'].desc()).show()

+--------------+-----------------+
|       Country|Unique_IP_address|
+--------------+-----------------+
|         China|              172|
|     Indonesia|              114|
|   Philippines|               65|
|        Russia|               56|
|        Brazil|               35|
|        Poland|               31|
|        Sweden|               28|
|         Japan|               25|
|Czech Republic|               23|
|      Portugal|               23|
|        France|               21|
|          Peru|               19|
|      Colombia|               17|
| United States|               15|
|     Argentina|               14|
|       Ukraine|               14|
|        Mexico|               13|
|      Thailand|               12|
|       Nigeria|               11|
|        Canada|               11|
+--------------+-----------------+
only showing top 20 rows

