## Getting unique elements

Let us perform a few tasks to understand how to extract unique elements. We can use either of these approaches.
* We can create a list of elements first and then convert it into a set.
* We can also build the set directly while extracting the information.

In [None]:
%%sh

ls -ltr /data/retail_db/orders/part-00000

In [None]:
%%sh

tail /data/retail_db/orders/part-00000

In [1]:
# Raw string: Windows backslashes are taken literally, so no doubling is needed
# and sequences such as `\o` or `\p` cannot be misread as (invalid) escapes.
path = r'D:\BIGDATA-LEARN\data-engineering-spark-main\data\retail_db\orders\part-00000'
# C:\\users\\itversity\\Research\\data\\retail_db\\orders\\part-00000
# Open the orders file for reading; a later cell consumes it via orders_file.read().
# NOTE(review): no visible cell closes this handle - consider orders_file.close()
# after reading, or a `with open(path) as ...:` block.
orders_file = open(path)

In [2]:
type(orders_file)

_io.TextIOWrapper

In [3]:
orders_raw = orders_file.read()

In [4]:
type(orders_raw)

str

In [5]:
orders_raw.splitlines?

Signature: orders_raw.splitlines(keepends=False)
Docstring:
Return a list of the lines in the string, breaking at line boundaries.

Line breaks are not included in the resulting list unless keepends is given and
true.
Type:      builtin_function_or_method


In [6]:
orders = orders_raw.splitlines()

In [7]:
type(orders)

list

In [8]:
orders[:10]

['1,2013-07-25 00:00:00.0,11599,CLOSED',
 '2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT',
 '3,2013-07-25 00:00:00.0,12111,COMPLETE',
 '4,2013-07-25 00:00:00.0,8827,CLOSED',
 '5,2013-07-25 00:00:00.0,11318,COMPLETE',
 '6,2013-07-25 00:00:00.0,7130,COMPLETE',
 '7,2013-07-25 00:00:00.0,4530,COMPLETE',
 '8,2013-07-25 00:00:00.0,2911,PROCESSING',
 '9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT',
 '10,2013-07-25 00:00:00.0,5648,PENDING_PAYMENT']

In [9]:
type(orders[0])

str

In [10]:
len(orders)

68883

In [11]:
# Cross-check len(orders) against the file itself.
# Done in pure Python rather than `%%sh` + `wc -l`, because the shell magic
# needs an `sh` executable and therefore fails on kernels that lack one.
with open(path) as orders_check_file:
    file_line_count = sum(1 for _ in orders_check_file)
file_line_count


### Task 1

Get all the unique dates from orders data.

In [12]:
order = '1,2013-07-25 00:00:00.0,11599,CLOSED'

In [13]:
order.split(',')[1]

'2013-07-25 00:00:00.0'

In [14]:
order_dates = set()

In [15]:
order_dates.add('2013-07-25 00:00:00.0')

In [16]:
order_dates

{'2013-07-25 00:00:00.0'}

In [17]:
order_dates.add('2013-07-26 00:00:00.0')

In [18]:
order_dates

{'2013-07-25 00:00:00.0', '2013-07-26 00:00:00.0'}

In [19]:
order_dates.add('2013-07-25 00:00:00.0')

In [20]:
order_dates

{'2013-07-25 00:00:00.0', '2013-07-26 00:00:00.0'}

In [21]:
# Approach 1: start from an empty set and add the date field of every record.
# Adding a value the set already holds is a no-op, so only distinct dates remain.
order_dates = set()
for order in orders:
    # Only the second field is needed, so stop splitting after the second comma.
    order_dates.add(order.split(',', 2)[1])

In [22]:
list(order_dates)[:10]

['2013-11-25 00:00:00.0',
 '2013-11-29 00:00:00.0',
 '2014-01-21 00:00:00.0',
 '2013-12-18 00:00:00.0',
 '2014-07-21 00:00:00.0',
 '2014-01-26 00:00:00.0',
 '2013-09-17 00:00:00.0',
 '2013-08-06 00:00:00.0',
 '2013-12-10 00:00:00.0',
 '2014-02-11 00:00:00.0']

In [23]:
len(order_dates)

364

In [24]:
# Approach 2: a set comprehension collects the distinct dates in one expression.
order_dates = {
    record.split(',')[1]  # second comma-separated field is the order date
    for record in orders
}

In [25]:
list(order_dates)[:10]

['2013-11-25 00:00:00.0',
 '2013-11-29 00:00:00.0',
 '2014-01-21 00:00:00.0',
 '2013-12-18 00:00:00.0',
 '2014-07-21 00:00:00.0',
 '2014-01-26 00:00:00.0',
 '2013-09-17 00:00:00.0',
 '2013-08-06 00:00:00.0',
 '2013-12-10 00:00:00.0',
 '2014-02-11 00:00:00.0']

In [26]:
len(order_dates)

364

### Task 2

Get all the unique weekend dates from orders data.

In [27]:
order_date = '2014-01-25 00:00:00.0'

In [28]:
import datetime as dt

In [29]:
dt.datetime.strptime(order_date, '%Y-%m-%d %H:%M:%S.%f')

datetime.datetime(2014, 1, 25, 0, 0)

In [30]:
d = dt.datetime.strptime(order_date, '%Y-%m-%d %H:%M:%S.%f')

In [31]:
d.weekday?

Docstring:
Return the day of the week represented by the date.
Monday == 0 ... Sunday == 6
Type:      builtin_function_or_method


In [32]:
dt.datetime.strptime(order_date, '%Y-%m-%d %H:%M:%S.%f').weekday() # Returns 0 to 6 (for Monday to Sunday)

5

In [33]:
import calendar

In [34]:
list(calendar.day_name)

['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

In [35]:
calendar.day_name[5]

'Saturday'

In [36]:
calendar.day_name[dt.datetime.strptime(order_date, '%Y-%m-%d %H:%M:%S.%f').weekday()]

'Saturday'

In [37]:
calendar.day_abbr[dt.datetime.strptime(order_date, '%Y-%m-%d %H:%M:%S.%f').weekday()]

'Sat'

In [38]:
dt.datetime.strptime(order_date, '%Y-%m-%d %H:%M:%S.%f').weekday() in (5, 6)

True

In [39]:
import datetime as dt
def is_weekend(order_date):
    """Tell whether an order date string falls on a Saturday or Sunday.

    Parameters
    ----------
    order_date : str
        Timestamp text in the form ``'%Y-%m-%d %H:%M:%S.%f'``,
        e.g. ``'2014-01-25 00:00:00.0'``.

    Returns
    -------
    bool
        True for Saturday or Sunday, False for Monday through Friday.

    Raises
    ------
    ValueError
        If ``order_date`` does not match the expected format.
    """
    parsed = dt.datetime.strptime(order_date, '%Y-%m-%d %H:%M:%S.%f')
    # weekday() runs from Monday == 0 to Sunday == 6, so 5 and 6 are the weekend.
    return parsed.weekday() >= 5

In [40]:
is_weekend('2014-01-25 00:00:00.0')

True

In [41]:
is_weekend('2014-01-22 00:00:00.0')

False

In [42]:
# Collect the distinct order dates that fall on a Saturday or Sunday.
# The date strings are deduplicated first so that is_weekend (which parses the
# string with strptime) runs once per distinct date instead of once per order.
unique_order_dates = {order.split(',')[1] for order in orders}
weekend_dates = set()
for order_date in unique_order_dates:
    if is_weekend(order_date):
        weekend_dates.add(order_date)

In [43]:
list(weekend_dates)[:10]

['2013-09-08 00:00:00.0',
 '2014-05-25 00:00:00.0',
 '2013-09-28 00:00:00.0',
 '2014-04-13 00:00:00.0',
 '2013-09-01 00:00:00.0',
 '2013-09-21 00:00:00.0',
 '2014-06-21 00:00:00.0',
 '2013-11-16 00:00:00.0',
 '2013-10-26 00:00:00.0',
 '2014-01-26 00:00:00.0']

In [44]:
len(weekend_dates)

103