# Data wrangling basics

## Mutable and immutable examples

In [35]:
a_list= ['ken', 1, 2, [3,4]]

In [36]:
a_list[2] = (3,4)

In [37]:
a_list

['ken', 1, (3, 4), [3, 4]]

In [39]:
a_tuple = (3,5, (4,5))

In [40]:
a_tuple [1] = 'four'

TypeError: 'tuple' object does not support item assignment

In [41]:
a = "This is a string"

In [48]:
b = a.replace('string', 'this is a longer string')

In [49]:
b

'This is a this is a longer string'

In [50]:
a

'This is a string'

In [51]:
a = 999999999

In [53]:
s = str(a)
print(s)

999999999


In [54]:
c = 'python'

In [55]:
list(c)

['p', 'y', 't', 'h', 'o', 'n']

In [56]:
c[:3]

'pyt'

In [57]:
a = "There is one thing we say to death..."
b= "not today"
a + b

'There is one thing we say to death...not today'

### Built-in-sequence functions

In [63]:
#mapping & enumerate to keep track of index and to map the values of a sequence

some_list = ['jon', "arya", "sansa"]

mapping = {}

for i, v in enumerate(some_list):
    mapping[v] = i

mapping

{'jon': 0, 'arya': 1, 'sansa': 2}

In [72]:
# sorted returns a new list from the elements of a sequence

sorted([10,9,8,7,6,5])


[5, 6, 7, 8, 9, 10]

In [73]:
sorted("game of thrones")

[' ', ' ', 'a', 'e', 'e', 'f', 'g', 'h', 'm', 'n', 'o', 'o', 'r', 's', 't']

In [81]:
#zip pairs up the elements of a number of lists, tuples, or sequences to create a list of tuples

s1= ['jon','sansa','arya']
s2=['ghost','lady','nymeria']

zipped = zip(s1,s2)

list(zipped)

[('jon', 'ghost'), ('sansa', 'lady'), ('arya', 'nymeria')]

In [82]:
# zip can take an arbitrary number of sequences, 
# and the number of elements it produces is determined by the shortest sequence

s3= [True, False]

list(zip(s1,s2,s3))

[('jon', 'ghost', True), ('sansa', 'lady', False)]

In [83]:
#zip used to iterate over multiple sequence, used with enumerate

for i, (a,b) in enumerate(zip(s1,s2)):
    print('{0}: {1},{2}'. format(i,a,b))

0: jon,ghost
1: sansa,lady
2: arya,nymeria


In [92]:
# can also be used to unzip, or convert a list of rows into a list of columns

himym = [('Mosby', 'Ted'), ('Stinson', 'Barney'),('Sherbatsky', 'Robin')]

In [100]:
last_names, first_names = zip(*(himym))

first_names

('Ted', 'Barney', 'Robin')

In [101]:
last_names

('Mosby', 'Stinson', 'Sherbatsky')

## Creating dicts from sequences

In [103]:
# pair up two sequences element wise in a dictionary

key_list=[]
value_list=[]

mapping = {}
for key, value in zip(key_list, value_list):
    mapping[key] = value

In [105]:
mapping = dict(zip(range(5), reversed(range(5))))

In [106]:
mapping

{0: 4, 1: 3, 2: 2, 3: 1, 4: 0}

### key values and default values

```python
if key in some_dict:
    value = some_dict[key]
else:
    value = default_value
```
    
simplified to:

```value = some_dict.get(key, default_value)```

In [111]:
words = ['apple', 'banana','blueberries','aphid','ant']

by_letter ={}

In [112]:
for word in words:
    letter = word[0]
    if letter not in by_letter:
        by_letter[letter] = [word]
    else:
        by_letter[letter].append(word)

In [113]:
by_letter

{'a': ['apple', 'aphid', 'ant'], 'b': ['banana', 'blueberries']}

In [116]:
# setdefault dict method

words = ['apple', 'banana','blueberries','aphid','ant']

by_letter ={}

for word in words:
    letter = word[0]
    by_letter.setdefault(letter,[]).append(word)

In [117]:
by_letter

{'a': ['apple', 'aphid', 'ant'], 'b': ['banana', 'blueberries']}

In [119]:
# collections module

words = ['apple', 'banana','blueberries','aphid','ant']

by_letter ={}

from collections import defaultdict
by_letter = defaultdict(list)
for word in words:
    by_letter[word[0]].append(word)
    
by_letter

defaultdict(list,
            {'a': ['apple', 'aphid', 'ant'], 'b': ['banana', 'blueberries']})

### Valid dict types and hashability

Hashability: the values of a dict can be any Python object, but the keys have to be hashable, which generally means immutable objects such as scalar types or tuples:
* int
* float
* string
* tuples (all the objects in the tuple need to be immutable)

In [120]:
hash('string')

2513491321090692020

In [123]:
hash((1,2,(2,3)))

1097636502276347782

In [124]:
# lists are mutable
hash((1, 2, [2, 3]))

TypeError: unhashable type: 'list'

In [125]:
# to use a list as a key, convert it to a tuple, which can be hashed as long as its elements can be

d={}
d[tuple([1,2,3])] = 5

d

{(1, 2, 3): 5}

## Sets

In [126]:
set([2,2,2,1,3,3])

{1, 2, 3}

In [127]:
{2,2,2,1,3,3}

{1, 2, 3}

In [128]:
## sets support mathematical set operations like union, intersection, difference

a= {1,2,3,4,5}
b= {3,4,5,6,7,8}

In [129]:
a.union(b)

{1, 2, 3, 4, 5, 6, 7, 8}

In [130]:
a | b

{1, 2, 3, 4, 5, 6, 7, 8}

In [131]:
a.intersection(b)
a & b

{3, 4, 5}

In [135]:
c = a.copy()
c |=b
c

{1, 2, 3, 4, 5, 6, 7, 8}

In [136]:
d = a.copy()
d &= b
d

{3, 4, 5}

In [137]:
# sets as mutable or immutable objects

my_data =[1,2,3,4]

my_set = {tuple(my_data)}

my_set

{(1, 2, 3, 4)}

In [139]:
# check if a set is a subset (contained in) or a superset (contains all)

a_set = {1,2,3,4,5}

{1,2,3}.issubset(a_set)



True

In [140]:
a_set.issuperset({1,2,3})

True

In [141]:
#sets are equal if and only if their contents are equal

{1,2,3} == {3,2,1}

True

## List, Set, and Dict Comprehensions

How to form a new list by filtering the elements of a collection, transforming the elements passing the filter
in one concise expression

```[expr for val in collection if condition]```

or

```python
result = []
for val in collection:
    if condition:
        result.append(expr)
```


In [142]:
#filter condition based on length of word

strings = ['a','alpha','b','beta','d','delta']

[x.upper() for x in strings if len(x) >2]

['ALPHA', 'BETA', 'DELTA']

### set and dict comprehensions are natural extensions, producing sets and dicts in a similar way instead of lists

```python
dict_comp = {key-expr: value-expr for value in collection if condition}
```


A set comprehension looks like the equivalent list comprehension except with curly braces instead of square brackets:

```python
set_comp = {expr for value in collection if condition}
```



In [143]:
# set of length of strings
strings = ['a','alpha','b','beta','d','delta']
unique_lengths = {len(x) for x in strings}
unique_lengths

{1, 4, 5}

In [144]:
# or set of length of strings

set(map(len,strings))

{1, 4, 5}

In [145]:
# lookup map of these strings in the list

loc_mapping = {val: index for index, val in enumerate(strings)}

loc_mapping

{'a': 0, 'alpha': 1, 'b': 2, 'beta': 3, 'd': 4, 'delta': 5}

### nested list comprehensions

In [150]:
# for loop

marios = [['mario','link','yoshi','luigi','toad','bowser','goomba']]

# Collect, across every inner list, the names that contain at least two 'i's.
# The accumulator is initialised once, right before the loop that fills it.
names_of_interest = []
for names in marios:
    enough_is = [name for name in names if name.count('i') >= 2]
    names_of_interest.extend(enough_is)

names_of_interest

['luigi']

In [154]:
# one operation

result = [name for names in marios for name in names if name.count('i') >= 2]

result

['luigi']

In [156]:
# flatten a list of tuples of integers into a simple list of integers

some_tuples = [(1,2,3),(4,5,6),(7,8,9)]

flattened = [x for tup in some_tuples for x in tup]

flattened

[1, 2, 3, 4, 5, 6, 7, 8, 9]

In [158]:
# or the long way with a for loop

flattened2 = []

for tup in some_tuples:
    for x in tup:
        flattened2.append(x)

flattened2

[1, 2, 3, 4, 5, 6, 7, 8, 9]

## Functions review

In [160]:
# functions declared with the def keyword and returned with the return keyword
# x & y positional arguments and z keyword argument
#

def my_function(x, y, z=1.5):
    """Combine the sum of x and y with z.

    When z is greater than 1 the sum ``x + y`` is multiplied by z;
    otherwise z is divided by that sum.
    """
    return z * (x + y) if z > 1 else z / (x + y)

In [163]:
my_function(5,6, z=0.7)

0.06363636363636363

In [164]:
my_function(10,20)

45.0

In [165]:
my_function(3.14,7,3.5)

35.49

## Namespaces, scope, and local functions

Variables assigned within a function by default are assigned to the local namespace. The local namespace is created when
the function is called and immediately populated by the function's arguments.
After the function is finished, the local namespace is destroyed.


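Avoid relying on the ```global``` keyword: global variables are generally used to store some kind of state in a system, and if you find yourself using many of them, that state is usually better managed another way (for example with classes).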

In [169]:
def func():
    """Fill a local list with 0 through 4 and then discard it.

    ``a`` is assigned inside the function, so it lives only in this call's
    local namespace; nothing is returned.
    """
    a = []
    a.extend(range(5))

a

[]

### Returning multiple values with simple syntax

In [177]:
# function returning a tuple, then unpacked into variables
def f():
    """Return the three values 5, 6 and 7 packed into a single tuple."""
    first, second, third = 5, 6, 7
    return (first, second, third)

a,b,c = f()

f()

(5, 6, 7)

In [174]:
# or simpler way

return_value = f()

return_value

(5, 6, 7)

In [179]:
# or return a dict

def f():
    """Return the values 5, 6 and 7 in a dict keyed by 'a', 'b' and 'c'."""
    pairs = (('a', 5), ('b', 6), ('c', 7))
    return dict(pairs)

f()

{'a': 5, 'b': 6, 'c': 7}

## Functions are objects

This is great especially for data cleaning!

In [182]:
states = [' Alabama ', 'Georgia!', 'Washingt!on', 'CalifORNIa!?', 'FlOrIda',
'south carolina##', 'West virginia?', 'AriZONa']

In [183]:
#use a re library
import re

def clean_strings(strings):
    result=[]
    for value in strings:
        value = value.strip()
        value = re.sub('[!#?]', '', value)
        value = value.title()
        result.append(value)
    return result

clean_strings(states)

['Alabama',
 'Georgia',
 'Washington',
 'California',
 'Florida',
 'South Carolina',
 'West Virginia',
 'Arizona']

In [184]:
# or create a list of operations you want to apply to a particular set of strings

def remove_punctuation(value):
    """Return value with every '!', '#' and '?' character removed."""
    unwanted = re.compile('[!#?]')
    return unwanted.sub('', value)

clean_ops = [str.strip, remove_punctuation, str.title]

def clean_strings(strings, ops):
    """Apply every callable in ops, in order, to each string and collect the results.

    Each item of strings is passed through the first callable, its result
    through the second, and so on; the final values are returned as a new list.
    """
    def _run_pipeline(item):
        # Thread one item through the whole chain of operations.
        for step in ops:
            item = step(item)
        return item

    return [_run_pipeline(entry) for entry in strings]

In [185]:
clean_strings(states, clean_ops)

['Alabama',
 'Georgia',
 'Washington',
 'California',
 'Florida',
 'South Carolina',
 'West Virginia',
 'Arizona']

## Using lambdas as functions

Useful where data transformations will take functions as arguments. It is often easier to pass a lambda function than to write a full function declaration or to assign the lambda function to a local variable.

```python
def short_function(x):
    return x*2

equiv_an = lambda x : x*2
```

In [187]:
# one simple example

def apply_to_list(some_list, f):
    """Call f on each element of some_list and return the results as a new list."""
    return list(map(f, some_list))

ints = [4,0,1,5,6]
apply_to_list(ints, lambda x: x*2)

[8, 0, 2, 10, 12]

In [192]:
# lambda function to list's sort method, based on the number of distinct letters in each string

strings = ['aaaaaa','ab','aaabbb','abc','aabb','bb']
strings.sort(key=lambda x: len(set(list(x))))

strings

['aaaaaa', 'bb', 'ab', 'aaabbb', 'aabb', 'abc']

## Currying

Deriving new functions from existing ones by partial argument application. 

In [195]:
def add_numbers(x, y):
    """Return the result of ``x + y``."""
    total = x + y
    return total

#second argument is curried, define a new function that calls an existing function
add_five = lambda y: add_numbers (5,y)

In [196]:
# simplify this process

from functools import partial

add_five = partial(add_numbers, 5)

## Iterators vs Generators

The iterator protocol is a consistent way to iterate over sequences. An iterator is any object that will yield objects to the Python interpreter when used in a context like a for loop. Most methods expecting a list or list-like object will also accept any iterable object. This includes built-in functions such as min, max, and sum, and type constructors like list and tuple.

In [199]:
# iterator
some_dict = {'a': 1, 'b':2, 'c':3}

for key in some_dict:
    print(key)

a
b
c


In [200]:
# python creates an iterator out of some_dict

dict_iterator= iter(some_dict)

dict_iterator

<dict_keyiterator at 0x2b4e1d909a8>

### Generator

Is a concise way to construct a new iterable object. Normal functions execute and return a single result at a time,
generators return a sequence of multiple results by pausing after each one until the next one is requested. 

To create a generator, use the ```yield``` keyword instead of ```return``` in a function

In [202]:
def squares(n=10):
    """Generate the squares of the integers 1 through n.

    A message naming the largest square is printed first. Because this is a
    generator, the message appears when the first value is requested, not
    when squares() is called.
    """
    largest = n ** 2
    print('Generating squares from 1 to {0}'.format(largest))
    yield from (i ** 2 for i in range(1, n + 1))

In [203]:
gen = squares()

# it is not until you request elements from the generator that it begins executing its code
gen

<generator object squares at 0x000002B4E8B4C748>

In [204]:
for x in gen:
    print(x, end=' ')

Generating squares from 1 to 100
1 4 9 16 25 36 49 64 81 100 

### Generator Expression

Generator analogue to list, dict, set comprehensions. To create one, enclose what would otherwise be a list comprehension
within parentheses instead of brackets.

In [205]:
gen = (x ** 2 for x in range (100))

In [206]:
gen

<generator object <genexpr> at 0x000002B4E8B4C8C8>

In [207]:
# or equivalent to 

def _make_gen():
    """Yield the squares of the integers 0 through 99, one at a time."""
    for x in range(100):
        yield x ** 2


# Call the generator function at module level so ``gen`` is a generator object,
# the counterpart of the expression (x ** 2 for x in range(100)).
gen = _make_gen()

In [208]:
# generator expressions can be used instead of list comprehensions as function arguments

sum(x**2 for x in range (100))

328350

In [209]:
dict((i, i **2) for i in range(5))

{0: 0, 1: 1, 2: 4, 3: 9, 4: 16}

### itertools module

Is a collection of generators for common data algorithms.

[itertools module](https://docs.python.org/3/library/itertools.html)

In [210]:
import itertools

In [211]:
first_letter = lambda x:x[0]

In [218]:
names = ['Ken', 'Kenneth','Julia','James','Anna','Guo','Frankie']

In [219]:
# Use a separate loop variable so the ``names`` list is not rebound to a group
# iterator (which would break re-running this cell). groupby only groups
# consecutive items, so names sharing a first letter must already be adjacent.
for letter, group in itertools.groupby(names, first_letter):
    print(letter, list(group))  # group is a generator

K ['Ken', 'Kenneth']
J ['Julia', 'James']
A ['Anna']
G ['Guo']
F ['Frankie']


### Errors and Exception handling during data analysis

Many functions only work on certain kinds of input. For example, Python's float function is capable of casting a string
to a floating-point number, but fails with ```ValueError``` on improper inputs:

In [220]:
float('1.23456')

1.23456

In [221]:
float('something')

ValueError: could not convert string to float: 'something'

In [222]:
# can write a function that encloses the call to float in a try/except block

def attempt_float(x):
    """Return ``float(x)``, or x itself when the conversion fails.

    Any ordinary error raised by ``float`` (for example ValueError or
    TypeError) is suppressed and the original argument is returned unchanged.
    """
    try:
        return float(x)
    # Catch Exception rather than using a bare ``except:`` so that
    # KeyboardInterrupt and SystemExit are not silently swallowed.
    except Exception:
        return x

#### the code in the ```except``` block is only executed if ```float(x)``` raises an exception


In [225]:
attempt_float('1.2345')


1.2345

In [226]:
attempt_float('something')

'something'

In [227]:
float((1,2))

TypeError: float() argument must be a string or a number, not 'tuple'

#### in some cases you only want to suppress ```ValueError```, since a ```TypeError``` 
(the input was not a string or numeric value) might indicate
a bug in the program. To do this, write the exception type after except:

In [231]:
def attempt_float(x):
    """Return ``float(x)``, or x itself when ``float`` raises ValueError.

    Only ValueError is suppressed; any other exception raised by the
    conversion propagates to the caller.
    """
    try:
        converted = float(x)
    except ValueError:
        return x
    return converted

In [232]:
attempt_float((1,2))

TypeError: float() argument must be a string or a number, not 'tuple'

In [233]:
# catch multiple exception types by writing a tuple of exception types instead using parentheses

def attempt_float(x):
    """Return ``float(x)``, or x itself when the conversion is not possible.

    Both ValueError and TypeError raised by ``float`` are suppressed, in which
    case the original argument is handed back unchanged.
    """
    try:
        result = float(x)
    except (ValueError, TypeError):
        result = x
    return result

#### may not want to suppress an exception

You may want some code to be executed regardless of whether the code in the try block succeeds or not. To do this, use ```finally```:

```python
f = open(path, 'w')

try:
    write_to_file(f)
except:
    print('Failed')
else:
    print('Succeeded')
finally:
    f.close()
    ```

### Files and the Operating System

Review of the basics of how to work with files in Python. Really simple

In [242]:
path = 'zen.txt'

In [243]:
f = open(path)

In [244]:
for line in f:
    pass

In [246]:
lines = [x.rstrip() for x in open(path)]

lines

['The Zen of Python, by Tim Peters',
 '',
 'Beautiful is better than ugly.',
 'Explicit is better than implicit.',
 'Simple is better than complex.',
 'Complex is better than complicated.',
 'Flat is better than nested.',
 'Sparse is better than dense.',
 'Readability counts.',
 "Special cases aren't special enough to break the rules.",
 'Although practicality beats purity.',
 'Errors should never pass silently.',
 'Unless explicitly silenced.',
 'In the face of ambiguity, refuse the temptation to guess.',
 'There should be one-- and preferably only one --obvious way to do it.',
 "Although that way may not be obvious at first unless you're Dutch.",
 'Now is better than never.',
 'Although never is often better than *right* now.',
 "If the implementation is hard to explain, it's a bad idea.",
 'If the implementation is easy to explain, it may be a good idea.',
 "Namespaces are one honking great idea -- let's do more of those!"]

In [247]:
f.close()

In [248]:
# to make it easier to clean up open files use the with statements

with open(path) as f:
    lines = [x.rstrip() for x in f]

In [252]:
# common use methods for opening files

# to read words in the title of the path
f = open(path)

f.read(3)

f2 = open(path, 'rb')
f2.read(10)

b'The Zen of'

In [253]:
# tell

f.tell()

3

In [254]:
f.close()
f2.close()