In [1]:
import os
import sys
import pandas as pd
import numpy as np
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Create Spark context
from pyspark import SparkContext, SparkConf
from pyspark.sql import functions as F
from pyspark.sql.types import *
conf = SparkConf().setAppName('dj').setMaster('local[*]')\
    .set("spark.memory.offHeap.enabled","true") \
    .set("spark.memory.offHeap.size","2g")
sc = SparkContext(conf=conf)

# Create Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('dj').getOrCreate()
spark

In [2]:
(sc.applicationId, sc.master, sc.version, sc.uiWebUrl, sc.defaultParallelism, sc.pythonVer, sc.appName, sc.sparkUser,sc,spark.catalog)

('local-1698639345619',
 'local[*]',
 '3.5.0',
 'http://USHYDJDAMODH6.us.deloitte.com:4040',
 16,
 '3.11',
 'dj',
 <bound method SparkContext.sparkUser of <SparkContext master=local[*] appName=dj>>,
 <SparkContext master=local[*] appName=dj>,
 <pyspark.sql.catalog.Catalog at 0x1aede7ca050>)

In [3]:
# Import the data to a DataFrame
departures_df = spark.read.csv('./AA_DFW_2015_Departures_Short.csv.gz', header=True)

# Remove any duration of 0
departures_df = departures_df\
    .filter(departures_df['Actual elapsed time (Minutes)'] == 0)

# Add an ID column
departures_df = departures_df.withColumn('id', F.monotonically_increasing_id())

# Write the file out to JSON format
departures_df.write.json('tmp_output.json', mode='overwrite')
departures_df.show(4,truncate=False)

+-----------------+-------------+-------------------+-----------------------------+---+
|Date (MM/DD/YYYY)|Flight Number|Destination Airport|Actual elapsed time (Minutes)|id |
+-----------------+-------------+-------------------+-----------------------------+---+
|01/01/2015       |0029         |ONT                |0                            |0  |
|01/01/2015       |0043         |DTW                |0                            |1  |
|01/01/2015       |0074         |CLE                |0                            |2  |
|01/01/2015       |0150         |MSY                |0                            |3  |
+-----------------+-------------+-------------------+-----------------------------+---+
only showing top 4 rows



In [4]:
# http://vision.stanford.edu/aditya86/ImageNetDogs/

In [5]:
# !pip install --upgrade scipy

In [29]:

annotations_df = spark.read.csv('annotation.csv',header=True,inferSchema=True,sep=',')
annotations_df.count()
annotations_df = annotations_df.withColumn('dog_list', F.concat_ws('\t', F.split('dog_list', ', ')))
# annotations_df = annotations_df.withColumn('dog_listn', F.substring(F.col('dog_list').cast('string'), 2))
annotations_df = annotations_df.withColumn("dog_list", F.regexp_replace("dog_list", r"[\[\]']+", ""))
annotations_df.show(4,truncate=False)
annotations_df.printSchema()

annotations_df.write.csv('tmp_annotations.csv.gz', compression='gzip',sep='\t',mode='overwrite')

annotations_df = spark.read.csv('tmp_annotations.csv.gz',sep='|')
annotations_df = annotations_df.withColumn("_c0", F.regexp_replace("_c0", r"[\"]+", ""))

annotations_df.show(4,truncate=False)

+--------+---------------+-----+------+----------------------------+
|folder  |filename       |width|height|dog_list                    |
+--------+---------------+-----+------+----------------------------+
|02085620|n02085620_10074|333  |500   |Chihuahua\t25\t10\t276\t498 |
|02085620|n02085620_10131|395  |495   |Chihuahua\t49\t9\t393\t493  |
|02085620|n02085620_10621|500  |298   |Chihuahua\t142\t43\t335\t250|
|02085620|n02085620_1073 |345  |500   |Chihuahua\t0\t27\t312\t498  |
+--------+---------------+-----+------+----------------------------+
only showing top 4 rows

root
 |-- folder: string (nullable = true)
 |-- filename: string (nullable = true)
 |-- width: integer (nullable = true)
 |-- height: integer (nullable = true)
 |-- dog_list: string (nullable = false)

+-----------------------------------------------------------------+
|_c0                                                              |
+-----------------------------------------------------------------+
|02085620\tn02085

In [30]:
# Import the file to a DataFrame and perform a row count
# annotations_df = spark.read.csv('tmp_annotations.csv.gz', sep='|')
full_count = annotations_df.count()
from pyspark.sql.functions import col
# Count the number of rows beginning with '#'
comment_count = annotations_df.where(col('_c0').startswith('#')).count()

# Import the file to a new DataFrame, without commented rows
no_comments_df = spark.read.csv('tmp_annotations.csv.gz', sep='|', comment='#')

# Count the new DataFrame and verify the difference is as expected
no_comments_count = no_comments_df.count()
print("Full count: %d\nComment count: %d\nRemaining count: %d" % (full_count, comment_count, no_comments_count))

Full count: 20580
Comment count: 0
Remaining count: 20580


In [31]:
# Split _c0 on the tab character and store the list in a variable
tmp_fields = F.split(annotations_df['_c0'], "\t")

# Create the colcount column on the DataFrame
annotations_df = annotations_df.withColumn('colcount', F.size(tmp_fields))

# Remove any rows containing fewer than 5 fields
annotations_df_filtered = annotations_df.filter(~ (annotations_df.colcount < 5))

# Count the number of rows
final_count = annotations_df_filtered.count()
print("Initial count: %d\nFinal count: %d" % (full_count, final_count))

Initial count: 20580
Final count: 20580


In [32]:

# Remove any rows containing fewer than 5 fields
annotations_df_filtered = annotations_df.filter(~ (annotations_df.colcount < 5))

# Count the number of rows
final_count = annotations_df_filtered.count()
print("Initial count: %d\nFinal count: %d" % (full_count, final_count))

Initial count: 20580
Final count: 20580


In [33]:
# Split the content of _c0 on the tab character (aka, '\t')
split_cols = F.split(annotations_df["_c0"], '\t')

# Add the columns folder, filename, width, and height
split_df = annotations_df.withColumn('folder', split_cols.getItem(0))
split_df = split_df.withColumn('filename', split_cols.getItem(1))
split_df = split_df.withColumn('width', split_cols.getItem(2))
split_df = split_df.withColumn('height', split_cols.getItem(3))

# Add split_cols as a column
split_df = split_df.withColumn('split_cols', split_cols)

In [34]:
annotations_df.show(4,truncate=False)
split_df.show(4,truncate=False)

+-----------------------------------------------------------------+--------+
|_c0                                                              |colcount|
+-----------------------------------------------------------------+--------+
|02085620\tn02085620_10074\t333\t500\tChihuahua\t25\t10\t276\t498 |9       |
|02085620\tn02085620_10131\t395\t495\tChihuahua\t49\t9\t393\t493  |9       |
|02085620\tn02085620_10621\t500\t298\tChihuahua\t142\t43\t335\t250|9       |
|02085620\tn02085620_1073\t345\t500\tChihuahua\t0\t27\t312\t498   |9       |
+-----------------------------------------------------------------+--------+
only showing top 4 rows

+-----------------------------------------------------------------+--------+--------+---------------+-----+------+-------------------------------------------------------------------+
|_c0                                                              |colcount|folder  |filename       |width|height|split_cols                                                    

In [35]:
# # Split the content of _c0 on the tab character (aka, '\t')
# split_cols = F.split(annotations_df["dog_list"], ',')
# # |_c0     |_c1            |_c2|_c3|
# # |  folder|       filename|width|height|
# # Add the columns folder, filename, width, and height
# split_df = annotations_df.withColumnRename('_c0', 'folder')
# split_df = split_df.withColumnRename('_c1', 'filename')
# split_df = split_df.withColumnRename('_c2', 'width')
# split_df = split_df.withColumnRename('_c3', 'height')

# # Add split_cols as a column
# split_df = split_df.withColumn('split_cols', split_cols)

In [36]:
def retriever(cols, colcount):
  # Return a list of dog data
  return cols[4:colcount]

# Define the method as a UDF
udfRetriever = F.udf(retriever, ArrayType(StringType()))

# Create a new column using your UDF
split_df = split_df.withColumn(
    'dog_list',
    udfRetriever(split_df.split_cols, split_df.colcount)
)

# Remove the original column, split_cols, and the colcount
split_df = split_df\
    .drop('_c0')\
    .drop('split_cols')\
    .drop('colcount')

In [37]:
split_df.show(2,truncate=False)
split_df.printSchema()
annotations_df.show(2,truncate=False)

+--------+---------------+-----+------+-----------------------------+
|folder  |filename       |width|height|dog_list                     |
+--------+---------------+-----+------+-----------------------------+
|02085620|n02085620_10074|333  |500   |[Chihuahua, 25, 10, 276, 498]|
|02085620|n02085620_10131|395  |495   |[Chihuahua, 49, 9, 393, 493] |
+--------+---------------+-----+------+-----------------------------+
only showing top 2 rows

root
 |-- folder: string (nullable = true)
 |-- filename: string (nullable = true)
 |-- width: string (nullable = true)
 |-- height: string (nullable = true)
 |-- dog_list: array (nullable = true)
 |    |-- element: string (containsNull = true)

+----------------------------------------------------------------+--------+
|_c0                                                             |colcount|
+----------------------------------------------------------------+--------+
|02085620\tn02085620_10074\t333\t500\tChihuahua\t25\t10\t276\t498|9       |
|0208

In [38]:
# Rename the column in valid_folders_df
valid_folders_df = annotations_df.withColumn('_c0', F.split(annotations_df['_c0'], '\t')[0])
valid_folders_df = valid_folders_df.withColumnRenamed('_c0', 'folder')
valid_folders_df = valid_folders_df.drop('colcount')
valid_folders_df = valid_folders_df.withColumn('folder',valid_folders_df.folder.cast('int'))
split_df = split_df.withColumn('folder',split_df.folder.cast('int'))
valid_folders_df.printSchema()  

root
 |-- folder: integer (nullable = true)



In [39]:
# Count the number of rows in split_df
split_count = split_df.count()

# Join the DataFrames
joined_df = split_df.join(F.broadcast(valid_folders_df), "folder")

# Compare the number of rows remaining
joined_df = joined_df.dropDuplicates()
joined_count = joined_df.count()    

print("Before: %d\nAfter: %d" % (split_count, joined_count))
joined_df = joined_df.dropna(subset=['dog_list'])
joined_count = joined_df.count()    
print("Before: %d\nAfter: %d" % (split_count, joined_count))

Before: 20580
After: 19956
Before: 20580
After: 19956


In [40]:
joined_df.show(2,truncate=False)

+-------+---------------+-----+------+------------------------------------+
|folder |filename       |width|height|dog_list                            |
+-------+---------------+-----+------+------------------------------------+
|2085782|n02085782_82   |480  |427   |[Japanese_spaniel, 44, 25, 460, 386]|
|2085936|n02085936_22127|500  |500   |[Maltese_dog, 6, 31, 423, 498]      |
+-------+---------------+-----+------+------------------------------------+
only showing top 2 rows



In [41]:
# Determine the row counts for each DataFrame
split_count = split_df.count()
joined_count = joined_df.count()

# Create a DataFrame containing the invalid rows
invalid_df = split_df.join(F.broadcast(joined_df), 'folder', 'left_anti')

# Validate the count of the new DataFrame is as expected
invalid_count = invalid_df.count()
print(" split_df:\t%d\n joined_df:\t%d\n invalid_df: \t%d" % (split_count, joined_count, invalid_count))

# Determine the number of distinct folder columns removed
invalid_folder_count = invalid_df.select('folder').distinct().count()
print("%d distinct invalid folders found" % invalid_folder_count)

 split_df:	20580
 joined_df:	19956
 invalid_df: 	624
1 distinct invalid folders found


In [42]:
# joined_df = annotations_df
# joined_df.select('dog_list').show(10, truncate=False)

In [43]:
joined_df.dtypes

[('folder', 'int'),
 ('filename', 'string'),
 ('width', 'string'),
 ('height', 'string'),
 ('dog_list', 'array<string>')]

In [44]:
# Select the dog details and show 10 untruncated rows
print(joined_df.select('dog_list').show(10, truncate=False))

# Define a schema type for the details in the dog list
DogType = StructType([
	StructField("breed", StringType(), False),
    StructField("start_x", IntegerType(), False),
    StructField("start_y", IntegerType(), False),
    StructField("end_x", IntegerType(), False),
    StructField("end_y", IntegerType(), False)
])

+------------------------------------+
|dog_list                            |
+------------------------------------+
|[Japanese_spaniel, 44, 25, 460, 386]|
|[Maltese_dog, 6, 31, 423, 498]      |
|[Maltese_dog, 103, 51, 267, 427]    |
|[Maltese_dog, 129, 96, 389, 348]    |
|[Maltese_dog, 14, 28, 396, 332]     |
|[Maltese_dog, 180, 13, 471, 357]    |
|[basset, 16, 24, 350, 318]          |
|[beagle, 86, 0, 498, 319]           |
|[beagle, 110, 47, 369, 470]         |
|[bloodhound, 35, 55, 260, 227]      |
+------------------------------------+
only showing top 10 rows

None


In [45]:
joined_df.where(joined_df.filename =='n02086240_7195').show(10, truncate=False)

+-------+--------------+-----+------+-----------------------------------------------------------+
|folder |filename      |width|height|dog_list                                                   |
+-------+--------------+-----+------+-----------------------------------------------------------+
|2086240|n02086240_7195|500  |375   |[Shih-Tzu, 90, 238, 237, 369, Shih-Tzu, 218, 219, 331, 357]|
+-------+--------------+-----+------+-----------------------------------------------------------+



In [46]:
# Create a function to return the number and type of dogs as a tuple
def dogParse(doglist):
    dogs= []
    for _ in range(len(doglist)//5):
        (breed, start_x, start_y, end_x, end_y) = doglist[_*5:_*5+5]
        dogs.append((breed, int(start_x), int(start_y), int(end_x), int(end_y)))
    return dogs
dogParse(['Shih-Tzu', '90', '238', '237', '369','Shih-Tzu', '90', '238', '237', '369'])

[('Shih-Tzu', 90, 238, 237, 369), ('Shih-Tzu', 90, 238, 237, 369)]

In [47]:
# Create a function to return the number and type of dogs as a tuple
def dogParse(doglist):
  dogs = []
  for _ in range(len(doglist)//5):
    (breed, start_x, start_y, end_x, end_y) = doglist[_*5:_*5+5]
    dogs.append((breed, int(start_x), int(start_y), int(end_x), int(end_y)))
  return dogs

# Create a UDF
udfDogParse = F.udf(dogParse, ArrayType(DogType))

# Use the UDF to list of dogs and drop the old column
joined_df = joined_df\
    .withColumn('dog_list', udfDogParse('dog_list'))\
    


In [48]:
joined_df.show(3)

+-------+---------------+-----+------+--------------------+
| folder|       filename|width|height|            dog_list|
+-------+---------------+-----+------+--------------------+
|2085782|   n02085782_82|  480|   427|[{Japanese_spanie...|
|2085936|n02085936_22127|  500|   500|[{Maltese_dog, 6,...|
|2085936| n02085936_3348|  375|   500|[{Maltese_dog, 10...|
+-------+---------------+-----+------+--------------------+
only showing top 3 rows



In [49]:

# Show the number of dogs in the first 10 rows
joined_df.select(F.size('dog_list')).orderBy(F.size('dog_list').desc()).show(10)

+--------------+
|size(dog_list)|
+--------------+
|             6|
|             5|
|             5|
|             4|
|             4|
|             4|
|             4|
|             4|
|             4|
|             4|
+--------------+
only showing top 10 rows



In [52]:
# Define a UDF to determine the number of pixels per image
def dogPixelCount(doglist):
  totalpixels = 0
  for dog in doglist:
    totalpixels += (dog[3] - dog[1]) * (dog[4] - dog[2])
  return totalpixels

# Define a UDF for the pixel count
udfDogPixelCount = F.udf(dogPixelCount, IntegerType())
joined_df = joined_df.withColumn('dog_pixels', udfDogPixelCount('dog_list'))

# Create a column representing the percentage of pixels
joined_df = joined_df\
    .withColumn(
        'dog_percent',
        (joined_df.dog_pixels / (joined_df.width * joined_df.height)) * 100
    )

# Show the first 10 annotations with more than 60% dog
joined_df.where('dog_percent > 60').show(10, False)

+-------+---------------+-----+------+----------------------------------------+----------+-----------------+
|folder |filename       |width|height|dog_list                                |dog_pixels|dog_percent      |
+-------+---------------+-----+------+----------------------------------------+----------+-----------------+
|2085782|n02085782_82   |480  |427   |[{Japanese_spaniel, 44, 25, 460, 386}]  |150176    |73.27088212334114|
|2085936|n02085936_22127|500  |500   |[{Maltese_dog, 6, 31, 423, 498}]        |194739    |77.8956          |
|2085936|n02085936_807  |500  |333   |[{Maltese_dog, 14, 28, 396, 332}]       |116128    |69.74654654654655|
|2088364|n02088364_14892|500  |375   |[{beagle, 86, 0, 498, 319}]             |131428    |70.09493333333333|
|2090622|n02090622_3156 |337  |500   |[{borzoi, 0, 106, 334, 497}]            |130594    |77.50385756676557|
|2091467|n02091467_3589 |300  |245   |[{Norwegian_elkhound, 37, 0, 293, 238}] |60928     |82.8952380952381 |
|2092002|n02092002_

In [53]:
joined_df.where(joined_df.filename =='n02086240_7195').show(10, truncate=False)

+-------+--------------+-----+------+---------------------------------------------------------------+----------+-----------+
|folder |filename      |width|height|dog_list                                                       |dog_pixels|dog_percent|
+-------+--------------+-----+------+---------------------------------------------------------------+----------+-----------+
|2086240|n02086240_7195|500  |375   |[{Shih-Tzu, 90, 238, 237, 369}, {Shih-Tzu, 218, 219, 331, 357}]|34851     |18.5872    |
+-------+--------------+-----+------+---------------------------------------------------------------+----------+-----------+



In [51]:
joined_df.printSchema()

root
 |-- folder: integer (nullable = true)
 |-- filename: string (nullable = true)
 |-- width: string (nullable = true)
 |-- height: string (nullable = true)
 |-- dog_list: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- breed: string (nullable = false)
 |    |    |-- start_x: integer (nullable = false)
 |    |    |-- start_y: integer (nullable = false)
 |    |    |-- end_x: integer (nullable = false)
 |    |    |-- end_y: integer (nullable = false)
 |-- dog_pixels: integer (nullable = true)
 |-- dog_percent: double (nullable = true)

