# Data Wrangling 1.1

In [None]:
import base64

# Lab: Building Blocks of Storage and Encoding

## binary and hex (hexadecimal) numbers

## binary is base 2, with digits 0 and 1

## hex is base 16 with digits 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f, where a = 10, b = 11, c = 12, d = 13, e = 14, f = 15

In [None]:
# Print 0..32 as an 8-bit binary value split into two nibbles, next to the
# matching two hex digits (one hex digit per nibble).
for value in range(33):
    bits = format(value, "08b")
    digits = format(value, "02x")

    high_nibble, low_nibble = bits[:4], bits[4:]
    high_hex, low_hex = digits[0], digits[1]

    print(f"{value: 3}", ': binary:', high_nibble, low_nibble, 'hex:', high_hex, low_hex)
    

In [None]:
# Every value from 0 to 15 fits in exactly one hex digit.
for number in range(16):
    print(f"{number: 3}", 'hex:', format(number, "x"))

## storage sizing

In [None]:
# Binary storage units: each step up multiplies the byte count by 1024.
# The prefix letters are listed in increasing order, so the position of a
# letter (starting at 1) is the power of 1024 for that unit.
for exponent, prefix in enumerate("KMGTPEZY", start=1):
    print(f'1 {prefix}iB = ', f"{1024 ** exponent: 35,}")


## ascii - print a string in its binary format - note 1 byte = 8 bits = 2 nibbles per character

In [None]:
def my_print_string_binary(s):
    """Return the characters of ``s`` as space-separated binary groups.

    Each character is converted with ``ord`` and zero-padded to at least
    8 binary digits. Despite the name, nothing is printed; the joined
    string is returned.
    """
    bit_groups = [format(ord(ch), "08b") for ch in s]
    return " ".join(bit_groups)

In [None]:
# The call returns one zero-padded binary group per character, separated
# by spaces; as the cell's final expression the notebook displays it.
my_print_string_binary("Oski the Bear")

In [None]:
# Same conversion for a string that includes punctuation characters.
my_print_string_binary("Go Bears!!!")

In [None]:
# One more example: every character becomes a space-separated binary group.
my_print_string_binary("Cali Rules!!!")

## ascii - print a string in its hex format - note 1 byte = 2 hex characters = 2 nibbles ("nibble pair") per character

In [None]:
def my_print_string_hex(s):
    """Print the hex code of every character in ``s`` in three layouts.

    Writes three lines to stdout: space-separated lower-case hex pairs,
    the same pairs upper-cased, and a colon-separated "digest" form.
    Returns ``None``.
    """
    # Convert once and reuse the pairs for all three layouts.
    hex_pairs = [format(ord(ch), "02x") for ch in s]
    spaced = " ".join(hex_pairs)

    print("nibble pair (lower case):", spaced)
    print("nibble pair (upper case):", spaced.upper())
    print("digest                  :", ":".join(hex_pairs))
    return

In [None]:
# Prints three lines: lower-case hex pairs, upper-case hex pairs, and a
# colon-separated form. The function returns None, so nothing else is shown.
my_print_string_hex("Oski the Bear")

In [None]:
# Same three hex layouts for a string that includes punctuation.
my_print_string_hex("Go Bears!!!")

In [None]:
# One more example of the three hex layouts.
my_print_string_hex("Cali Rules!!!")

## unicode - multibyte character set to represent all languages of the world, plus emojis, glyphs, etc.

https://home.unicode.org/

This website uses the U+ notation, such as U+1F47D.  

In Python use '\uxxxx' for 4 hex digits, '\Uxxxxxxxx' for 8 hex digits.  Pad with leading 0's.

U+1F47D would be '\U0001f47d'

U+270C would be '\u270c'


## utf-8 is a way to encode unicode; it allows us to mix single-byte characters and multi-byte characters; the bytes of multi-byte characters are shown as hex escapes, as in \xc3\xa9 is e with an acute accent (é), \xc3\xb1 is n with a tilde (ñ)

In [None]:
# Build the text from its raw UTF-8 bytes; each \xc3\xa9 pair decodes to
# a single character.
resume = str(b'r\xc3\xa9sum\xc3\xa9', "utf-8")
resume

In [None]:
# Encode the text back into UTF-8 bytes; the notebook displays the bytes object.
resume.encode("utf-8")

In [None]:
# Build the text from its raw UTF-8 bytes; the \xc3\xb1 pair decodes to
# a single character.
el_nino = str(b'El Ni\xc3\xb1o', "utf-8")
el_nino

In [None]:
# Encode the text back into UTF-8 bytes; the notebook displays the bytes object.
el_nino.encode("utf-8")


In [None]:
# The \N{...} escape selects a character by its Unicode name.
delta = "\N{GREEK CAPITAL LETTER DELTA}"
delta

In [None]:
# Show the UTF-8 bytes that represent the character.
delta.encode("utf-8")

## Emojis are part of unicode and can be encoded in UTF-8; note that emojis can change or enhance meanings, so emoji analysis is often part of natural language processing


In [None]:
# Emoji selected by its Unicode name via the \N{...} escape.
grin = "\N{GRINNING FACE}"
grin

In [None]:
# Show the UTF-8 bytes that represent the emoji.
grin.encode("utf-8")

In [None]:
# Emoji selected by its Unicode name via the \N{...} escape.
wink = "\N{WINKING FACE}"
wink

In [None]:
# Show the UTF-8 bytes that represent the emoji.
wink.encode("utf-8")

In [None]:
# Symbol selected by its Unicode name via the \N{...} escape.
knight = "\N{BLACK CHESS KNIGHT}"
knight

In [None]:
# Show the UTF-8 bytes that represent the symbol.
knight.encode("utf-8")

In [None]:
# U+1F47D written with the 8-hex-digit \U escape (padded with leading zeros).
alien = '\U0001f47d'
alien

In [None]:
# Show the UTF-8 bytes that represent the character.
alien.encode("utf-8")

In [None]:
# U+270C written with the 4-hex-digit \u escape.
peace = '\u270c'
peace

In [None]:
# Show the UTF-8 bytes that represent the character.
peace.encode("utf-8")

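## UTF-8 can be easily written to / read from files using Python; note that `open()` uses a platform-dependent default text encoding (UTF-8 on most Linux/macOS systems, but often not on Windows), so pass `encoding="utf-8"` explicitly when the file must be portable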

In [None]:
# Write the sample strings (accented letters, Greek, emoji) to a text file.
# The encoding is given explicitly because open()'s default text encoding
# follows the platform locale, and a non-UTF-8 locale can fail on the emoji.
# The `with` block closes the file even if one of the writes raises.
with open("my_file", "w", encoding="utf-8") as f:
    f.write("Did you finish your " + resume + "?\n")
    f.write("Will " + el_nino + " affect our weather this year?\n")
    f.write(delta + " is a Greek letter\n")
    f.write("Life is great! " + grin + "\n")
    f.write("I couldn't have done it better myself! " + wink + "\n")
    f.write("Paladin!" + knight + "\n")
    f.write("If an " + alien + " lands a space ship in my backyard, I hope they want " + peace + "\n")

In [None]:
# Read the file back and print its contents.
# Decoding is pinned to UTF-8 rather than left to the platform locale, and
# the `with` block closes the file even if the read raises.
with open("my_file", "r", encoding="utf-8") as f:
    print(f.read())

## Base64 is the modern successor to uuencoding; it encodes binary data as ASCII text so that it can pass through text-based channels: Internet protocols, web pages, email (MIME), etc.

In [None]:
# Base64 operates on bytes, so the text is first encoded as UTF-8.
my_string = "Uuencoding was invented by Mary Ann Horton at UC Berkeley in 1980".encode("utf-8")

base_64_encoded = base64.b64encode(my_string)
base_64_encoded

In [None]:
# Reverse the encoding: b64decode returns the original bytes (not a str).
base_64_decoded = base64.b64decode(base_64_encoded)
base_64_decoded

In [None]:
# Read the whole image as raw bytes ("rb" means no text decoding).
# The `with` block closes the file even if the read raises.
with open("berkeley_logo.png", "rb") as f:
    binary_data = f.read()

print("binary data for an image (first 100 bytes):\n", binary_data[:100], '\n')

# Base64 turns the arbitrary bytes into ASCII-only bytes.
base_64_data = base64.b64encode(binary_data)

print("base 64 encoding for the image (first 100 characters):\n", base_64_data[:100])


## You try it 

* Print the string "Hello, world!" in binary and hex

* Translate the Unicode character U+1F9E1 into a Python string, and then encode it as UTF-8 bytes

* Encode the string "Hello, world!" in Base64, then decode it back