In [1]:
import pandas as pd
import numpy as np

# A Note on Column (Re)Assignment

* It is not always clear when Pandas returns a copy vs view.
* Assigning into a selection in place is therefore dangerous -- don't do it!
* See Chapter 2.3 and 4.1 in the textbook.
* Caution: Tutorials lie to you!

### TL;DR: Make copies whenever possible, use `assign` for adding columns

In [2]:
def create_rands(seed=42, n_rows=10, n_cols=5):
    """Build a reproducible demo DataFrame of small random integers.

    Parameters
    ----------
    seed : int, default 42
        Seed for the private random generator used to draw the data.
    n_rows : int, default 10
        Number of rows.
    n_cols : int, default 5
        Number of columns; they are named ``col0`` ... ``col<n_cols - 1>``.

    Returns
    -------
    pandas.DataFrame
        A fresh ``n_rows`` x ``n_cols`` frame of integers drawn from 0..4.
        Calling with the defaults always yields the same table.
    """
    # A private RandomState produces the same stream as np.random.seed(seed)
    # followed by np.random.randint, but does not reset NumPy's global RNG
    # every time a demo cell asks for a fresh table.
    rng = np.random.RandomState(seed)
    data = rng.randint(0, 5, size=(n_rows, n_cols))
    return pd.DataFrame(data, columns=['col%d' % n for n in range(n_cols)])

# 1 -- slice rows, then change column

In [3]:
# Build the demo table and display it as this cell's output.
df = create_rands()
df

Unnamed: 0,col0,col1,col2,col3,col4
0,3,4,2,4,4
1,1,2,2,2,4
2,3,2,4,1,3
3,1,3,4,0,3
4,1,4,3,0,0
5,2,2,1,3,3
6,2,3,3,0,2
7,4,2,4,0,1
8,3,0,3,1,1
9,0,1,4,1,3


In [4]:
# What are we doing here?
# Boolean-mask row selection: keep only the rows of df whose col0 is even.

evens = df[df['col0'] % 2 == 0]
# Alternative to try instead: select only the row labelled 9
# evens = df.loc[9]
evens

Unnamed: 0,col0,col1,col2,col3,col4
5,2,2,1,3,3
6,2,3,3,0,2
7,4,2,4,0,1
9,0,1,4,1,3


# Array Broadcasting

* Arrays with different sizes (usually) cannot be added, subtracted, or generally be used in arithmetic.
* A way to overcome this is to duplicate the smaller array so that it has the same dimensionality and size as the larger array.
* This is called array **broadcasting** and is available in NumPy when performing array arithmetic.



https://machinelearningmastery.com/broadcasting-with-numpy-arrays/

In [None]:
# Scalar + array: the scalar is conceptually replicated to match the
# array (along the last mismatched dimension), then added element-wise.

a = np.array([1, 2, 3])
b = 2
c = a + b

# Show both operands, then the broadcast result.
print(a)
print(b)
print(c)

In [None]:
# One-dimensional array added to a two-dimensional array:
# b (3 elements) is applied to each of the two rows of A.
# Uses np.array from the notebook's top-level numpy import instead of a
# second, mid-notebook `from numpy import array`.
A = np.array([[1, 2, 3], [1, 2, 3]])
print(A)
print()
b = np.array([10, 20, 30])
print(b)
print()
C = A + b
print(C)


In [None]:
# Limitations
# Broadcasting can only be performed when, for each dimension, the sizes of
# the two arrays are equal or one of them is 1.

a = np.array([1, 2])      # 2 elements
b = np.array([1, 2, 3])   # 3 elements

# 2 != 3 and neither size is 1, so the addition raises ValueError.
# The error is the point of this cell: catch it and print it, so that
# "Restart Kernel -> Run All" still reaches the cells below.
try:
    c = a + b
except ValueError as err:
    print('ValueError: %s' % err)

In [None]:
# What am I trying to do? What is the output I expect?
# Modifying the existing dataframe: set col0 to -1000 in the filtered frame `evens`.
# NOTE(review): this assignment is expected to trigger the pandas warning
# discussed in the next cell -- confirm on the pandas version in use.

evens['col0'] = -1000

In [None]:
# Meaning of the warning:
# You just modified `evens` here by setting a new value in that dataframe,
# but that dataframe is a COPY of a DIFFERENT dataframe (`df`).
# The result may not be what you had in mind.

In [None]:
# Display the filtered frame after the col0 assignment above.
evens

In [None]:
# Display the original frame: did the assignment to `evens` change it?
df

In [None]:
# ^^ A copy (`evens`) was created

# 2 -- slice column, then change row

In [None]:
# Rebuild the demo table so this example starts from unmodified data.
df = create_rands()
df

In [None]:
# What am I doing?
# Select column col0 via .loc, then -- writing through `col`, not `df` --
# set the entries where df's col0 is even to -1000.

col = df.loc[:, 'col0']
col.loc[df['col0'] % 2 == 0] = -1000
col

In [None]:
# Display the original frame: did the write through `col` reach df?
df

In [None]:
# ^^ it was a reference

# 3 -- select row, then change entry

# Single row is selected

Unlike the first example we selected a **single** row. What did we get? A copy or a reference?

In [None]:
# Rebuild the demo table so this example starts from unmodified data.
df = create_rands()
df

In [None]:
# .loc label slice 0:3 (the end label is included), then set col0 on the selection.
# NOTE(review): the text above this cell describes selecting a *single* row,
# but 0:3 selects rows 0 through 3 -- confirm which one is intended.
row = df.loc[0:3]
row  # not displayed: only the last expression of a cell is shown
row['col0'] = -1000

In [None]:
df

In [None]:
# ^^ reference!
# Pandas takes a slice of the dataframe and refers to it by reference


# 4 -- select row, then change entry (w/string column)

In [None]:
# Fresh demo table, with col4 replaced by the string 'a' in every row,
# so the frame now mixes integer and string columns.
df = create_rands().assign(col4='a')
df

In [None]:
# Select the single row labelled 0, then set its col0 entry to -1000.
row = df.loc[0]
row['col0'] = -1000

In [None]:
# Display the original frame: did the write through `row` reach df?
df

## References vs Copies
* If the table is homogeneous (all columns share one dtype) -- Pandas outsources memory management to numpy (reference)
* If the table is heterogeneous (columns of mixed dtypes) -- Pandas makes copies

In [None]:
# Homogeneous case: fresh demo table containing only integer columns.
df = create_rands()
df

In [None]:
# Get the frame's values as a NumPy array, then overwrite every 0 in that
# array in place (writing through `arr`, not `df`).
arr = df.values
arr[arr == 0] = -100000

In [None]:
# Display the modified array.
arr

In [None]:
# Display the frame: did the write through `arr` reach df?
df

In [None]:
# Heterogeneous case: fresh demo table plus a new string column col5.
df = create_rands().assign(col5='a')
df

In [None]:
# Same experiment as in the all-integer case: get the values as a NumPy
# array and overwrite every 0 in that array in place.
arr = df.values
arr[arr == 0] = -100000

In [None]:
# Display the modified array.
arr

In [None]:
# Display the frame: did the write through `arr` reach df this time?
df

In [None]:
# Use .copy and .assign as much as you can