# Python built-in types

## Reference
https://jakevdp.github.io/WhirlwindTourOfPython/05-built-in-scalar-types.html

In [1]:
# One sample value for each built-in scalar type.
a = 1          # int
b = 'abc'      # str
c = 1.         # float
d = 2**1000    # int as well: Python integers have arbitrary precision
n = None       # NoneType
m = True       # bool
# Show the type of every sample in the order it was defined.
tuple(map(type, (a, b, c, d, n, m)))

(int, str, float, int, NoneType, bool)

In [2]:
# Python's int has arbitrary precision, so even 2**1000 is represented
# exactly instead of overflowing.
2**1000

10715086071862673209484250490600018105614048117055336074437503883703510511249361224931983788156958581275946729175531468251871452856923140435984577574698574803934567774824230985421074605062371141877954182153046474983581941267398767559165543946077062914571196477686542167660429831652624386837205668069376

In [3]:
# Other important built-in types - the containers: list, dict, set and tuple.
# The trailing backslashes continue a single tuple expression across lines.
type([1, 2, 3]), \
type({'a': 1}), \
type({1, 2, 3}), \
type( ([1, 2], 2, "3", 3.14) )

(list, dict, set, tuple)

In [4]:
from collections import defaultdict

# A defaultdict calls its factory to fill in missing keys; this factory
# always produces None.
d = defaultdict(lambda: None)
d['a'] = 1
# Looking up the absent key 'b' inserts it with the factory's value.
d['b'] is None

True

## Iteration, enumerate, zip

In [5]:
# Iteration: range(10) yields the integers 0 through 9 (the stop value is excluded).
for i in range(10):
    print(i)

0
1
2
3
4
5
6
7
8
9


In [6]:
# enumerate pairs every element with its position, counting from 0 by default.
a = [2, 3, 4]
for idx, el in enumerate(a):
    print((idx, el))

(0, 2)
(1, 3)
(2, 4)


In [7]:
# The second argument of enumerate is the starting index; 0 is also the default.
list(enumerate('abcdefg', 0))

[(0, 'a'), (1, 'b'), (2, 'c'), (3, 'd'), (4, 'e'), (5, 'f'), (6, 'g')]

In [8]:
# Start counting at 2 instead of 0; the enumerated elements are unchanged.
list(enumerate('abcdefg', 2))

[(2, 'a'), (3, 'b'), (4, 'c'), (5, 'd'), (6, 'e'), (7, 'f'), (8, 'g')]

In [9]:
# zip: iterate over several sequences in lockstep.

a = [1, 2, 3]
b = [4, 5, 6]

# Use loop names that differ from the lists: `for a, b in zip(a, b)` would
# rebind `a` and `b` to the last pair (3 and 6) once the loop finishes.
for left, right in zip(a, b):
    print((left, right))

(1, 4)
(2, 5)
(3, 6)


## List, dict and set comprehensions

In [10]:
# List comprehension: builds a list by evaluating 2*a for every a in range(10).

[2*a for a in range(10)]

[0, 2, 4, 6, 8, 10, 12, 14, 16, 18]

In [11]:
# Set comprehension: the same syntax with curly braces produces a set.

{2*a for a in range(10)}

{0, 2, 4, 6, 8, 10, 12, 14, 16, 18}

In [12]:
# Every iteration produces the same value, and a set keeps only one copy of it.
{1 for a in range(10)}

{1}

In [13]:
# Dict comprehension: `key: value` pairs inside curly braces; here index -> character.

{i: a for i, a in enumerate('abc') }

{0: 'a', 1: 'b', 2: 'c'}

In [14]:
# kahoot time

# Numpy and Pandas basics 

### Numpy

In [2]:
# just standard convention to import them as aliases pd and np
import pandas as pd
import numpy as np

In [3]:
# Define a 2-D array from nested lists: each inner list becomes one row.
np.array([[1, 2, 3], [4, 5, 6]])

array([[1, 2, 3],
       [4, 5, 6]])

In [4]:
# Matrix multiplication: a (2, 3) matrix times a (3, 1) column gives a (2, 1) result.
arr = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[1], [2], [3]])

# The inner dimensions (3 and 3) must agree for the product to be defined.
print(arr.shape, arr2.shape)
arr @ arr2

(2, 3) (3, 1)


array([[14],
       [32]])

In [5]:
# np.linalg collects NumPy's linear-algebra routines; inv returns the matrix inverse.
np.linalg.inv([[1, 2], [3, 4]])

array([[-2. ,  1. ],
       [ 1.5, -0.5]])

In [6]:
# Vector norm: with default arguments this is the Euclidean (2-)norm of a 1-D input.
# The value is not displayed because this is not the cell's last expression.
np.linalg.norm([1, 2, 3])

def eucledean_distance(a, b):
    """Return the Euclidean distance between corresponding points of ``a`` and ``b``.

    Each row is treated as one point, so two ``(n, d)`` inputs give ``n``
    distances. The inputs are subtracted with NumPy broadcasting, so a single
    ``(1, d)`` row can be compared against every row of the other argument.
    A single pair of 1-D points is also accepted and yields one scalar
    distance instead of raising an axis error.

    The (misspelled) function name is kept so existing calls keep working.

    Parameters
    ----------
    a, b : array_like
        Point coordinates; lists and tuples are converted with ``np.asarray``.

    Returns
    -------
    numpy.ndarray or numpy.floating
        Norm of ``a - b`` along axis 1, or the scalar norm when the
        difference has fewer than two dimensions.
    """
    diff = np.asarray(a) - np.asarray(b)
    if diff.ndim < 2:
        # A single pair of points: there is no axis 1 to reduce over.
        return np.linalg.norm(diff)
    return np.linalg.norm(diff, axis=1)

# Use a seeded generator so the ten sample distances are identical on every run.
rng = np.random.default_rng(0)
eucledean_distance(rng.random((10, 2)), rng.random((10, 2)))

array([0.30834731, 0.4230303 , 0.2884723 , 0.32544544, 0.62981501,
       0.35751044, 0.53700063, 0.48897287, 0.46969759, 0.21710429])

In [None]:
# Random samples: m has shape (10, 2) and n has shape (1, 2).
# NOTE(review): neither array is used in the visible cells and this cell has
# no execution count - presumably an unfinished broadcasting example; confirm.
m = np.random.rand(10, 2)
n = np.random.rand(1, 2)


In [8]:
# Solve the linear system arr @ x = rhs.
# np.linalg.solve factorizes arr instead of forming its explicit inverse,
# which is cheaper and numerically more accurate than inv(arr) @ rhs.
arr = np.array([[4, -2, 3], [1, 3, 4], [3, 1, 2]])
rhs = np.array([[1], [-7], [5]])
np.linalg.solve(arr, rhs)

array([[ 3.66666667],
       [ 1.33333333],
       [-3.66666667]])

### Pandas

In [10]:
# Data source: https://www.kaggle.com/uciml/adult-census-income
# The file is read from the notebook's working directory.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, which
# suggests the CSV was saved together with its index - consider index_col=0; confirm.
data = pd.read_csv('./adult.csv')

In [87]:
# Preview only the first rows rather than rendering the whole DataFrame.
data.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income,high_income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K,0.0
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K,0.0
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K,0.0
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K,0.0
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K,0.0
5,34,Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States,<=50K,0.0
6,38,Private,150601,10th,6,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,United-States,<=50K,0.0
7,74,State-gov,88638,Doctorate,16,Never-married,Prof-specialty,Other-relative,White,Female,0,3683,20,United-States,>50K,1.0
8,68,Federal-gov,422013,HS-grad,9,Divorced,Prof-specialty,Not-in-family,White,Female,0,3683,40,United-States,<=50K,0.0
9,41,Private,70037,Some-college,10,Never-married,Craft-repair,Unmarried,White,Male,0,3004,60,?,>50K,1.0


In [15]:
# Distinct values that occur in the income column.
data.income.unique()

array(['<=50K', '>50K'], dtype=object)

In [20]:
# Bracket assignment is required to add a column: `data.is_high_income = ...`
# only sets a Python attribute on the DataFrame object (pandas emits a
# UserWarning) and the values never become part of the table.
# The indicator is 1.0 where income is '>50K' and 0.0 otherwise.
data['is_high_income'] = (data.income == '>50K').astype(float)

  """Entry point for launching an IPython kernel.


In [22]:
# The mean of a 0/1 indicator is the fraction of rows whose income is '>50K'.
data.is_high_income.mean()

0.2408095574460244

In [23]:
# Distinct values that occur in the education column.
data.education.unique()

array(['HS-grad', 'Some-college', '7th-8th', '10th', 'Doctorate',
       'Prof-school', 'Bachelors', 'Masters', '11th', 'Assoc-acdm',
       'Assoc-voc', '1st-4th', '5th-6th', '12th', '9th', 'Preschool'],
      dtype=object)

In [28]:
# Share of '>50K' earners per education level, sorted from lowest to highest.
# The mean of a boolean mask is the fraction of True values, so a vectorized
# groupby-mean replaces `groupby.apply` with a per-group Python lambda.
(data.income == '>50K').groupby(data.education).mean().sort_values()

education
Preschool       0.000000
1st-4th         0.035714
5th-6th         0.048048
11th            0.051064
9th             0.052529
7th-8th         0.061920
10th            0.066452
12th            0.076212
HS-grad         0.159509
Some-college    0.190235
Assoc-acdm      0.248360
Assoc-voc       0.261216
Bachelors       0.414753
Masters         0.556587
Prof-school     0.734375
Doctorate       0.740920
dtype: float64