In [1]:
#Review: the ndarray is NumPy's basic container for array/matrix data;
#Series and DataFrame are the structures more commonly used in Pandas
    #Pandas is built on NumPy, but with a focus on labeled, tabular data
    #NumPy is mostly for numerical computing. We'll mostly use Pandas

In [2]:
import numpy as np

#fix the legacy global seed so the random draws below are reproducible
np.random.seed(123)

#m1: ten integers drawn from [0, 100); m2: a 3x4 grid of integers drawn from [0, 10)
m1 = np.random.randint(low=0, high=100, size=10)
m2 = np.random.randint(low=0, high=10, size=(3, 4))

#the "\n" inside each caption puts the array on the line after its label
print("The one-dimensional array looks like this:\n", m1)
print("The two-dimensional array looks like this:\n", m2)

The one-dimensional array looks like this:
 [66 92 98 17 83 57 86 97 96 47]
The two-dimensional array looks like this:
 [[9 0 0 9]
 [3 4 0 0]
 [4 1 7 3]]


In [6]:
#Indexing in NumPy: access the i-th value (counting from zero) by putting the desired index in square brackets
#Indexing a one-dimensional array with a single integer returns that one element, not a sub-array

In [3]:
#index 0 is the first element
m1[0]

66

In [4]:
#negative indices count from the end: -1 is the last element, -2 the second-to-last
m1[-2]

96

In [7]:
#multi-dimensional array: give one index per axis, separated by commas
m2[0, -1] #first row, last column

9

In [8]:
#SUBARRAYS
#square brackets also select subarrays via slice notation, start:stop:step, with the parts separated by colons

In [10]:
#slice syntax is start:stop:step
#start (1): index in m1 where the slice begins
#stop (7): index where the slice ends; the element at index 7 (the 8th element) is not included
#step (2): stride between selected indices, i.e. take every 2nd element
m1[1:7:2]

array([92, 17, 57])

In [11]:
#an omitted start defaults to 0: the first five elements (indices 0-4)
m1[:5]

array([66, 92, 98, 17, 83])

In [12]:
#an omitted stop runs to the end: everything from index 5 (the 6th element) onward, 6th element included
m1[5:]

array([57, 86, 97, 96, 47])

In [13]:
#omitting start and stop covers the whole array; step 2 selects every other element
m1[::2]

array([66, 98, 83, 86, 96])

In [14]:
#Challenge: how would you access every other element starting from the 2nd?
#answer: start at index 1 (the 2nd element) and step by 2
m1[1::2]

array([92, 17, 57, 97, 47])

In [15]:
#a negative step walks backwards: all elements, reversed
m1[::-1]

array([47, 96, 97, 86, 57, 83, 17, 98, 92, 66])

In [16]:
#multi-dimensional slices work the same way, with multiple slices separated by commas

In [17]:
#row slice first, then column slice: first two rows, first 3 columns
m2[:2, :3]

array([[9, 0, 0],
       [3, 4, 0]])

In [18]:
#":" alone selects every row; index 0 picks the first column
m2[:, 0]

array([9, 3, 4])

In [19]:
#both expressions select the first row; a trailing ":" (all columns) may be omitted
#only the last expression in a cell is displayed
m2[0, :]
m2[0]

array([9, 0, 0, 9])

In [20]:
#boolean mask: m2>1 is True where an element exceeds 1; indexing with it returns those elements as a flat 1-D array
m2[m2>1]

array([9, 9, 3, 4, 4, 7, 3])

In [21]:
#other useful tricks for array manipulation
#np.concatenate: joins arrays along an existing axis (the first axis by default); the other dimensions must match
#np.vstack: similar to np.concatenate along the first axis, but the name makes the intent clearer. Stacks arrays together vertically.
#np.hstack: stack arrays together horizontally

In [22]:
#2x3 example grid reused by the stacking cells below
grid = np.array([
    [1, 2, 3],
    [4, 5, 6],
])
#concatenate joins along axis 0 by default, so the second copy's rows are appended below the first
np.concatenate((grid, grid), axis=0)

array([[1, 2, 3],
       [4, 5, 6],
       [1, 2, 3],
       [4, 5, 6]])

In [23]:
#grid+1 adds 1 to every element of grid; vstack places the result below grid
np.vstack([grid, grid+1])

array([[1, 2, 3],
       [4, 5, 6],
       [2, 3, 4],
       [5, 6, 7]])

In [24]:
#hstack places grid+1 to the right of grid, so rows are extended rather than added
np.hstack([grid, grid+1])

array([[1, 2, 3, 2, 3, 4],
       [4, 5, 6, 5, 6, 7]])

In [25]:
#each array has attributes ndim (the number of dimensions), shape (the size of each dimension), and size (the total number of elements)
#reshape() gives the same data a new shape
#np.split() divides an array into several sub-arrays

In [27]:
#the integers 0..5 arranged as 3 rows by 2 columns
a = np.reshape(np.arange(6), (3, 2))
a

array([[0, 1],
       [2, 3],
       [4, 5]])

In [28]:
#default C-like (row-major) index ordering: elements are read and written with the last axis varying fastest
np.reshape(a, (2, 3))

array([[0, 1, 2],
       [3, 4, 5]])

In [29]:
#same result as reshaping a directly: flatten in C order with ravel, then reshape in C order
np.reshape(np.ravel(a), (2, 3))

array([[0, 1, 2],
       [3, 4, 5]])

In [30]:
import pandas as pd
#a Series pairs each value with an explicit index label
s1 = pd.Series(
    data=[0.25, 0.5, 0.75, 1.0],
    index=list('abcd'),
)
s1

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [31]:
#look up a value by its explicit index label
s1['b']

0.5

In [33]:
#access by implicit (array/Python-style) position with .iloc;
#plain s1[1] on a label-indexed Series relies on a deprecated integer-as-position fallback
s1.iloc[1]

0.5

In [40]:
 #2 is a position here, not a label: .iloc[2] returns the third value (label 'c');
 #.iloc states this explicitly rather than relying on the deprecated positional fallback of s1[2]
 s1.iloc[2]

0.75

In [41]:
#slicing by explicit index:
#label slices include the end label, so this selects 'a' through 'c' inclusive;
#the equivalent positional slice is s1[0:3] (positions 0-2), whose stop is exclusive
s1['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [42]:
#boolean mask; each comparison needs parentheses because & binds more tightly than > and <
s1[(s1>0.3) & (s1<0.8)]

b    0.50
c    0.75
dtype: float64

In [43]:
#don't get confused by slicing and indexing conventions
#if the Series has an explicit integer index, an indexing operation such as data[1] will use explicit indices (labels),
#while a slicing operation like data[1:3] will use the implicit Python-style (positional) index
#to avoid the ambiguity, use data.loc[...] for labels and data.iloc[...] for positions

In [44]:
#a Series whose explicit index labels are the integers 1, 3, 5
s2 = pd.Series(data=list('abc'), index=[1, 3, 5])
s2

1    a
3    b
5    c
dtype: object

In [45]:
#a single integer is looked up as an explicit label here: 1 is the first label
s2[1]

'a'

In [49]:
#the index labels are the integers 1, 3, 5, so the string '1' is not a valid label;
#the lookup is wrapped so the KeyError is shown without halting a full run of the notebook
try:
    s2['1']
except KeyError as err:
    print("KeyError:", err)

KeyError: '1'

In [50]:
#an integer slice is positional even with an integer index: positions 1 and 2, i.e. labels 3 and 5
s2[1:3]

3    b
5    c
dtype: object

In [None]:
#SKIPPING TO COMBINING DATASETS

In [2]:
import pandas as pd

In [3]:
def make_df(cols, ind):
    """Quickly make a small example DataFrame.

    Each cell holds the column label followed by the row label, e.g.
    make_df('AB', [1, 2]) has 'A1' at row 1, column 'A'.

    Parameters
    ----------
    cols : iterable
        Column labels; a string contributes one label per character.
    ind : iterable
        Row labels, used as the DataFrame index.

    Returns
    -------
    pandas.DataFrame
        One row per label in ``ind`` and one column per label in ``cols``.
    """
    #a one-shot iterator would be exhausted after the first column is built,
    #leaving the remaining columns empty; materialize it in that case only,
    #so re-iterable inputs (list, range, ...) are passed through untouched
    if iter(ind) is ind:
        ind = list(ind)
    data = {c: [str(c) + str(i) for i in ind] for c in cols}
    return pd.DataFrame(data, index=ind)

#example DataFrame: columns A, B, C and rows 0, 1, 2
make_df('ABC', range(3))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [4]:
#two frames with the same columns but different row labels
df1 = make_df('AB', [1, 2])
df2 = make_df('AB', [3, 4])
print(df1, "\n\n\n")
print(df2, "\n\n\n")
#pd.concat() stacks the frames row-wise by default, matching columns by name and keeping each frame's row labels
print(pd.concat([df1, df2]))

    A   B
1  A1  B1
2  A2  B2 



    A   B
3  A3  B3
4  A4  B4 



    A   B
1  A1  B1
2  A2  B2
3  A3  B3
4  A4  B4


In [5]:
#concatenation of two dataframes which have only some columns in common (B and C)
df3 = make_df('ABC', [1, 2])
df4 = make_df('BCD', [3, 4])
print(df3, "\n\n\n")
print(df4, "\n\n\n")
#columns missing from one frame (A in df4, D in df3) are filled with NaN
print(pd.concat([df3, df4]))

    A   B   C
1  A1  B1  C1
2  A2  B2  C2 



    B   C   D
3  B3  C3  D3
4  B4  C4  D4 



     A   B   C    D
1   A1  B1  C1  NaN
2   A2  B2  C2  NaN
3  NaN  B3  C3   D3
4  NaN  B4  C4   D4


In [7]:
#pd.concat above shows NaN where a frame lacks a column.
#join='inner' keeps only the columns present in every input (here B and C), so no NaN-filled cells appear
pd.concat([df3, df4], join = 'inner')

Unnamed: 0,B,C
1,B1,C1
2,B2,C2
3,B3,C3
4,B4,C4


In [8]:
#join='outer' keeps the union of all columns and fills the gaps with NaN; same result as the earlier call without join
pd.concat([df3, df4], join = 'outer')

Unnamed: 0,A,B,C,D
1,A1,B1,C1,
2,A2,B2,C2,
3,,B3,C3,D3
4,,B4,C4,D4


In [None]:
#pd.merge() is more common and can implement multiple types of joins: one-to-one, many-to-one, many-to-many