In [4]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# show all outputs of a cell (such as if df.head() and df.tail() are in the same cell)
#default is 'last_expr'

In [5]:
import pandas as pd
import numpy as np

# Transforming categorical data into numerical values

**label_encoder.fit()** to determine all unique values <br>
**label_encoder.transform()** to transform all unique values into count numbers starting at 0. <br>
**label_encoded_data=label_encoder.fit_transform()** does both of the above in one step<br>
**one_hot_encoder.fit_transform(label_encoded_data.reshape(-1,1)** to make a dummy for each distinct numerical value created using label_encoder (since some methods need dummies).  This method CANNOT take a 1D array, which is what will be outputted from label_encoder, so convert it into a 2D array using reshape.

In [96]:
df=pd.DataFrame([0,1,2,3])
s=pd.Series(['hi','bye','him','her'])

In [84]:
label_encoder.fit(s).classes_

array(['bye', 'her', 'hi', 'him'], dtype=object)

## Transform column of categorical values into integer data

In [116]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
label_encoded_data=label_encoder.fit_transform(s)

In [118]:
label_encoded_data

array([2, 0, 3, 1])

## Transform column of integer data into columns of dummy variables using OneHotEncoder

In [117]:
one_hot_encoder=preprocessing.OneHotEncoder()
dummies=one_hot_encoder.fit_transform(label_encoded_data.reshape(-1,1))

In [108]:
list(enumerate(dummies.toarray()))

[(0, array([ 0.,  0.,  1.,  0.])),
 (1, array([ 1.,  0.,  0.,  0.])),
 (2, array([ 0.,  0.,  0.,  1.])),
 (3, array([ 0.,  1.,  0.,  0.]))]

Add the dummies to your dataframe.

In [112]:
dummies.shape #2D array with shape (4,4)
dummies.toarray() 
for index, dimension in enumerate(dummies.toarray()): # for each 2D array, makes tuple of its index and the 2D array
    for element in dimension:
        df[s[index]]=element
df

(4, 4)

array([[ 0.,  0.,  1.,  0.],
       [ 1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  0.]])

Unnamed: 0,0,hi,bye,him,her
0,0,0.0,0.0,1.0,0.0
1,1,0.0,0.0,1.0,0.0
2,2,0.0,0.0,1.0,0.0
3,3,0.0,0.0,1.0,0.0


In [115]:
#Enumerate produces tuples for each element that are (index of element, element)
list(enumerate(dummies.toarray()))

[(0, array([ 0.,  0.,  1.,  0.])),
 (1, array([ 1.,  0.,  0.,  0.])),
 (2, array([ 0.,  0.,  0.,  1.])),
 (3, array([ 0.,  1.,  0.,  0.]))]

In [90]:
label_encoded_data.reshape(-1,1) #a 2D array where the 2nd dimension has length 4 and 1st dimension has length 1.

array([[2],
       [0],
       [3],
       [1]])

# Transform categorical data into dummies using pandas.get_dummies

In [129]:
df_dummies=pd.get_dummies(s)
df_dummies
df=pd.DataFrame([0,1,2,3])
pd.concat([df,df_dummies],axis=1)

Unnamed: 0,bye,her,hi,him
0,0,0,1,0
1,1,0,0,0
2,0,0,0,1
3,0,1,0,0


Unnamed: 0,0,bye,her,hi,him
0,0,0,0,1,0
1,1,1,0,0,0
2,2,0,0,0,1
3,3,0,1,0,0


In [132]:
pd.concat([df,df_dummies],axis=1).select_dtypes(include=[object])

0
1
2
3


# Numpy Arrays

Numpy arrays are a way to represent a table of elements. <br>
Numpy arrays must contain the same data type. <br>
Numpy arrays are preferrable to Python lists since they take up less space and are thus faster to work with, and also have many methods. <br>
A numpy array's elements are grouped into **dimensions**, which are called **axes**.  An n-dimensional array is said to have **rank n**. <br>

A multi-dimensional numpy array, or N-Dimensional array, is composed of multiple arrays, where each array itself can contain arrays.  It's like a tree with branches, and points on the branches extending from an earlier point on the tree are all arrays contained within the array at that earlier point.  The later points are like children of that array and that array is like the parent.  The parent array is in a higher dimension than all of its children arrays.  The 1st dimension array just contains elements and there are no branches extending from it.  An array in the 5th dimension contains arrays in the 4th dimension, which contains arrays in the 3rd dimension, and so on.  An array in dimension N contains arrays in dimension (N-1), (N-2),..., and 1.  Access an element by specifying which dimension N array it is in, which dimension N-1 array it is in,..., and which dimension 1 array it is in.  Do so using the axis <br>

The location of an element is uniquely specified by that element's location in each axis.  For an array with N axes, the Nth axis has index N-1 (since indices start at zero), ..., and first axis (with just elements in it and not arrays) has axis index 0.  Thus for axis 1, or dimension 2, you specify which array it is in (0,...,L) where L is the length of dimension 2 and represents how many arrays are in dimension 2.  You can view each axis's index by looking at its location in the shape method. <br>

To access a certain element in an array, you specify its location within each axes (the array its in), since each axes contains multiple arrays (though for the 1D axes, you specify its element position).  <br>

**np.shape** provides a tuple of elements, where element j is the size of (number of arrays in) dimension j (where for dimension 1, its the number of values).  In other words, the size of the array in dimension j.  The length of the shape tuple (number of tuple elments) is the number of dimensions of the numpy array and is the rank. <br>

A single list must be passed as an argument to np.array(), and each list or tuple within this list represents a dimension. <br>
**np.size** returns the total number of elements in the numpy array. <br>



Alternatively, I like to think of arrays as containers.  The N-Dim array contains all arrays with lower dimensions that are children of that array.  So if a 3-D array is a kingdom, then it naturally has villages (all 2d arrays that are part of that kingdom), and villagers (all 1d arrays, meaning elements, that are part of each of the kingdoms villages).<br>
1D Villagers <br>
2D Villages <br>
3D Kingdoms<br>
4D Lands<br>
5D Worlds<br>
6D Universes<br>
7D Multiverses<br>


# Numpy Array Reshape Method

**array.reshape()** method reshapes an array without changing the data. <br>
You specify the length of each dimension the same way as for the shape method, which shows the length of each dimension for an array. <br>
array.reshape(# of A arrays, # of B arrays within A arrays, # of C arrays within B arrays, ... , # of elements within J arrays) <br> 
array.reshape(# of Ndim arrays, # of (N-1)dim arrays, # of (N-2)dim arrays, ... , # of elements within 1dim arrays) <br> 
The last specified argument always represents the number of elements. <br>

reshape(3,-1) --> create 3 arrays, and within each of these, however many arrays needed so that the size of the data remains the same<br>
reshape(3,2,1) --> create 3 arrays, 2 arrays within each of these, and 1 element within each of these. <br>
reshape(4,3,2,2) --> create 4 arrays, 3 within each of these, 2 within each of these, 2 elements within each of these.

For the second two reshapes, the number of elements needed to use these is (3\*2\*1=6, 4\*3\*2\*2=48).

np.arange() returns a numpy array<br>
Python has a built in range() function, which in Python 2, returns a list and in Python 3, returns an iterator.  

In [275]:
print len(np.array(np.arange(24)).reshape(3,4,2,1)), # 3 arrays
print len(np.array(np.arange(24)).reshape(3,4,2,1)[0]), #4 arrays within each of the above 3 arrays
print len(np.array(np.arange(24)).reshape(3,4,2,1)[0][0]), #2 arrays within each of the above 4 arrays
print len(np.array(np.arange(24)).reshape(3,4,2,1)[0][0][0]), #1 element within each of the above 2 arrays

3 4 2 1


The way in in which numbers go into the new array using reshape is by default, according to the "C-like" index order, an first element is placed in the first spot in the 1st dimension of the first array in the 2nd dimension, etc, then the next element is placed in the 2nd spot in the 1st dimension of the first array in the 2nd dimension, etc. This means the first axis index changes fastest and the last axis index changes slowest:<br>
reshape(3,4,2,1) ---> <br>
reshape(3 lands, 4 kingdoms, 2 villages, 1 element) <br>
The 1st element is placed in (1st land, 1st kingdom, 1st village, the only spot) <br>
The 2nd element is placed in (1st land, 1st kingdom, 2nd village) <br>
The 3rd element is placed in (1st land, 2nd kingdom, 3rd village) <br>

In [15]:
reshaped[0] #access 1st array in 4th dimension array (), meaning we'd like to visit land 0.
reshaped[0][0] #access 1st array in the 3rd dimension array that is the 1st array in the 4th dimension array
               #we're visiting kingdom 0 of land 0.
reshaped[0][0][0] #access [0,0,0] meaning village 0, kingdom 0, land 0

array([[[0],
        [1]],

       [[2],
        [3]],

       [[4],
        [5]],

       [[6],
        [7]]])

array([[0],
       [1]])

array([0])

# Shape Method of Numpy Arrays

The following array is a rank 4 array since it has 4 dimensions, each with their own respective lengths.

In [9]:
reshaped.shape

(3, 4, 2, 1)

Shows the length of each axes (dimension), meaning the number of arrays in it.  <br>
For the first dimension, it's the number of elements. <br>
Len(array.shape) returns the number of dimensions. <br>
(# of Dim 3 arrays, # of Dim 2 arrays, # of Dim 1 elements) --> rank 3 array<br>
(# of Dim 2 arrays, # of Dim 1 arrays) --> rank 2 array<br>
(# of elements in a Dim 1 array, )--> rank 1 array<br>

In [32]:
a=np.array([4,3]) 
b=np.array([[1],[5],[3]]) 
c=np.array([[1,0],[1,2],[5,4]]) 
d=np.array([[[1,2],[3,4]],[[1,2],[3,4]]])
e=np.array([[[1],[2]]])
a.shape, b.shape, c.shape, d.shape, e.shape,

((2,), (3, 1), (3, 2), (2, 2, 2), (1, 2, 1))

# Accessing a certain element in an array of rank N

In [66]:
x=np.arange(24)
x=x.reshape(4,3,2) #create a rank 3 array
x

array([[[ 0,  1],
        [ 2,  3],
        [ 4,  5]],

       [[ 6,  7],
        [ 8,  9],
        [10, 11]],

       [[12, 13],
        [14, 15],
        [16, 17]],

       [[18, 19],
        [20, 21],
        [22, 23]]])

1) Access array.shape() <br>
2) For each axis, identify the index number associated with the element for that axis (meaning which array the element is located in in that axis).<br>

In [73]:
print x[0,2,1],
print x[1,1,0], 
print x[2,0,1],
print x[3, 1, 0]

5 8 13 20
