# Extracting rows and columns
* How can I extract specific rows and columns from a DataFrame?
* How can I add or delete columns from a DataFrame?
* How can I find and change missing values in a DataFrame?

## Reading in the data

In [1]:
import pandas as pd
# Load SN7577.tab into a DataFrame; the file is tab-separated, hence sep='\t'
df = pd.read_csv("SN7577.tab", sep='\t')

In [2]:
# Show the first two rows to check that the file loaded as expected
df.head(2)

Unnamed: 0,Q1,Q2,Q3,Q4,Q5ai,Q5aii,Q5aiii,Q5aiv,Q5av,Q5avi,...,numhhd,numkid,numkid2,numkid31,numkid32,numkid33,numkid34,numkid35,numkid36,wts
0,1,-1,1,8,0,0,0,1,0,0,...,3,11,2,0,0,0,0,1,0,1.11439
1,3,-1,1,4,0,0,0,0,0,0,...,3,11,2,0,0,0,0,1,0,2.56604


## Selecting rows and columns

In [6]:
# Jupyter keyboard shortcuts (command mode):
# Adding a cell above: a
# Adding a cell below: b
# Deleting (cutting) a cell: x

In [4]:
# Accessing a single column (only the last expression of a cell is displayed)
df.Q1 # alternative: attribute access, only works when the column name is a valid Python identifier
df['Q1'] # preferred way of accessing a column

0        1
1        3
2       10
3        9
4       10
        ..
1281     2
1282    10
1283     9
1284    11
1285    10
Name: Q1, Length: 1286, dtype: int64

In [None]:
[] # square brackets: used for accessing (indexing) stuff
() # parentheses: used for calling stuff

In [7]:
# Access multiple columns: pass a list of column names inside the indexing brackets
df[['Q1', 'Q2', 'Q3']]

Unnamed: 0,Q1,Q2,Q3
0,1,-1,1
1,3,-1,1
2,10,3,2
3,9,-1,10
4,10,2,6
...,...,...,...
1281,2,-1,3
1282,10,10,2
1283,9,-1,8
1284,11,11,1


### Exercise: Pandas columns
What happens if you:
1. List the columns you want out of order from the way they appear in the file?
2. Put the same column name in twice?
3. Put in a non-existing column name? (a.k.a. a typo)

In [9]:
# 1. Columns listed in a different order than in the file: the result uses the order requested
new_df = df[['Q3', 'Q1', 'Q2']]
new_df.head(2)

Unnamed: 0,Q3,Q1,Q2
0,1,1,-1
1,1,3,-1


In [10]:
# 2. The same column name twice: the column is returned twice
new_df = df[['Q3', 'Q3']]
new_df.head(2)

Unnamed: 0,Q3,Q3.1
0,1,1
1,1,1


In [11]:
# 3. A column name that does not exist: this raises a KeyError (intended here)
# stack trace (traceback) = the different steps that lead to the error
df['random']

KeyError: 'random'

## Filtering by rows

In [12]:
# select rows with index of 1, 2, 3 (rows 2, 3, 4 in the DataFrame)
# the end of the slice (4) is exclusive, so the row with index 4 is not returned
df[1:4]

Unnamed: 0,Q1,Q2,Q3,Q4,Q5ai,Q5aii,Q5aiii,Q5aiv,Q5av,Q5avi,...,numhhd,numkid,numkid2,numkid31,numkid32,numkid33,numkid34,numkid35,numkid36,wts
1,3,-1,1,4,0,0,0,0,0,0,...,3,11,2,0,0,0,0,1,0,2.56604
2,10,3,2,6,0,0,0,0,0,0,...,2,11,2,0,0,0,0,1,0,2.04468
3,9,-1,10,10,0,0,0,0,0,0,...,1,-1,2,0,0,0,0,1,0,1.07592


In [13]:
# First three rows (index 0, 1, 2)
df.head(3)

Unnamed: 0,Q1,Q2,Q3,Q4,Q5ai,Q5aii,Q5aiii,Q5aiv,Q5av,Q5avi,...,numhhd,numkid,numkid2,numkid31,numkid32,numkid33,numkid34,numkid35,numkid36,wts
0,1,-1,1,8,0,0,0,1,0,0,...,3,11,2,0,0,0,0,1,0,1.11439
1,3,-1,1,4,0,0,0,0,0,0,...,3,11,2,0,0,0,0,1,0,2.56604
2,10,3,2,6,0,0,0,0,0,0,...,2,11,2,0,0,0,0,1,0,2.04468


## Using criteria to filter rows

In [14]:
# Element-wise comparison: gives one True/False value per row
df['Q2'] == -1 # A mask or a filter

0        True
1        True
2       False
3        True
4       False
        ...  
1281     True
1282    False
1283     True
1284    False
1285    False
Name: Q2, Length: 1286, dtype: bool

In [15]:
# Keep only the rows whose Q2 value equals -1 (the result is a new DataFrame)
q2_is_minus_one = df['Q2'] == -1
df[q2_is_minus_one]

Unnamed: 0,Q1,Q2,Q3,Q4,Q5ai,Q5aii,Q5aiii,Q5aiv,Q5av,Q5avi,...,numhhd,numkid,numkid2,numkid31,numkid32,numkid33,numkid34,numkid35,numkid36,wts
0,1,-1,1,8,0,0,0,1,0,0,...,3,11,2,0,0,0,0,1,0,1.11439
1,3,-1,1,4,0,0,0,0,0,0,...,3,11,2,0,0,0,0,1,0,2.56604
3,9,-1,10,10,0,0,0,0,0,0,...,1,-1,2,0,0,0,0,1,0,1.07592
5,1,-1,1,1,0,0,0,0,0,0,...,2,-1,2,0,0,0,0,1,0,0.85126
6,1,-1,1,8,0,0,0,0,0,0,...,2,-1,2,0,0,0,0,1,0,1.51383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1277,2,-1,5,4,0,0,0,0,0,0,...,3,1,1,0,0,1,0,0,0,0.99600
1278,2,-1,4,5,0,0,0,0,0,0,...,4,2,1,0,0,0,1,0,0,0.82948
1279,1,-1,2,3,0,0,0,0,0,0,...,1,-1,2,0,0,0,0,1,0,0.87891
1281,2,-1,3,6,0,0,0,0,0,0,...,2,-1,2,0,0,0,0,1,0,0.85231


In [16]:
# Combine two boolean masks with & (element-wise AND):
# Q2 equals -1 AND numage is larger than 60
q2_is_minus_one = df['Q2'] == -1
numage_over_60 = df['numage'] > 60
df[q2_is_minus_one & numage_over_60]

Unnamed: 0,Q1,Q2,Q3,Q4,Q5ai,Q5aii,Q5aiii,Q5aiv,Q5av,Q5avi,...,numhhd,numkid,numkid2,numkid31,numkid32,numkid33,numkid34,numkid35,numkid36,wts
0,1,-1,1,8,0,0,0,1,0,0,...,3,11,2,0,0,0,0,1,0,1.11439
6,1,-1,1,8,0,0,0,0,0,0,...,2,-1,2,0,0,0,0,1,0,1.51383
7,1,-1,1,1,0,0,0,0,0,0,...,2,-1,2,0,0,0,0,1,0,0.86168
8,9,-1,10,10,0,0,0,0,0,0,...,2,11,2,0,0,0,0,1,0,0.48973
9,2,-1,1,1,0,0,0,0,0,0,...,1,-1,2,0,0,0,0,1,0,0.36381
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1243,2,-1,6,10,0,0,0,0,0,0,...,1,-1,2,0,0,0,0,1,0,0.44283
1267,4,-1,3,4,0,0,0,1,0,1,...,1,-1,2,0,0,0,0,1,0,0.57340
1268,2,-1,4,4,0,0,0,0,0,1,...,1,-1,2,0,0,0,0,1,0,0.45117
1271,2,-1,1,6,0,0,0,0,0,0,...,2,-1,2,0,0,0,0,1,0,0.44399


In [19]:
# Rows: Q2 equals -1 AND numage is larger than 60
row_mask = (df['Q2'] == -1) & (df['numage'] > 60)
# Columns: keep only these three, in this order
wanted_columns = ['Q1', 'Q2', 'numage']
df[row_mask][wanted_columns]

Unnamed: 0,Q1,Q2,numage
0,1,-1,64
6,1,-1,74
7,1,-1,79
8,9,-1,65
9,2,-1,62
...,...,...,...
1243,2,-1,61
1267,4,-1,73
1268,2,-1,82
1271,2,-1,73


## Sampling

In [18]:
# Draw ten rows at random; replace=False means the same row is never picked twice.
# No random_state is given, so each run returns a different set of rows.
df.sample(n=10, replace=False)

Unnamed: 0,Q1,Q2,Q3,Q4,Q5ai,Q5aii,Q5aiii,Q5aiv,Q5av,Q5avi,...,numhhd,numkid,numkid2,numkid31,numkid32,numkid33,numkid34,numkid35,numkid36,wts
1060,10,10,4,8,0,0,0,1,0,0,...,4,11,2,0,0,0,0,1,0,3.75422
774,5,-1,1,3,0,0,1,0,1,1,...,1,-1,2,0,0,0,0,1,0,1.13536
290,2,-1,5,8,0,0,0,0,0,0,...,2,-1,2,0,0,0,0,1,0,0.70153
1067,2,-1,2,1,0,0,0,0,0,0,...,4,2,1,0,0,1,1,0,0,0.65176
393,2,-1,1,6,0,0,0,0,0,0,...,5,3,1,0,0,1,1,0,0,1.03287
19,2,-1,3,8,0,0,0,1,1,0,...,4,11,2,0,0,0,0,1,0,0.80929
639,11,11,1,2,0,0,0,0,0,0,...,2,-1,2,0,0,0,0,1,0,1.86015
243,2,-1,1,3,0,0,0,0,0,0,...,5,11,2,0,0,0,0,1,0,0.49588
792,1,-1,1,1,0,0,0,0,0,0,...,1,-1,2,0,0,0,0,1,0,0.82116
327,11,11,1,1,0,0,0,1,0,1,...,2,11,2,0,0,0,0,1,0,1.65172


* We can select columns by using brackets
* We can also select rows by giving a slice within brackets
* We can do conditional or criteria-based filtering

### Exercise: filtering
Select all the rows where numkid is larger than 5 OR Q3 == 4, then select only numkid and Q3 columns.

In [30]:
# A plain Python list compared with an integer is simply False; no DataFrame column is involved
["Hello Bram"]==4

False

In [28]:
# Mistaken attempt: ["Q3"] is a plain list (the `df` is missing), so (["Q3"]==4) is just False
# and the filter below ends up selecting on numkid > 5 only
df[(df["numkid"]>5) | (["Q3"]==4)]

Unnamed: 0,Q1,Q2,Q3,Q4,Q5ai,Q5aii,Q5aiii,Q5aiv,Q5av,Q5avi,...,numhhd,numkid,numkid2,numkid31,numkid32,numkid33,numkid34,numkid35,numkid36,wts
0,1,-1,1,8,0,0,0,1,0,0,...,3,11,2,0,0,0,0,1,0,1.11439
1,3,-1,1,4,0,0,0,0,0,0,...,3,11,2,0,0,0,0,1,0,2.56604
2,10,3,2,6,0,0,0,0,0,0,...,2,11,2,0,0,0,0,1,0,2.04468
8,9,-1,10,10,0,0,0,0,0,0,...,2,11,2,0,0,0,0,1,0,0.48973
15,2,-1,1,3,1,0,0,0,0,0,...,2,11,2,0,0,0,0,1,0,0.85808
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1261,3,-1,1,1,0,0,1,0,0,1,...,5,11,2,0,0,0,0,1,0,1.48170
1262,1,-1,7,7,0,0,0,1,0,0,...,4,11,2,0,0,0,0,1,0,0.79911
1264,10,10,10,10,0,0,0,0,0,0,...,5,11,2,0,0,0,0,1,0,1.29831
1266,2,-1,1,1,0,0,0,0,0,0,...,3,11,2,0,0,0,0,1,0,1.73196


In [24]:
# Incorrect attempt, kept for comparison (["Q3"] is a list literal, not the column):
# df[(df["numkid"]>5) | ["Q3"]==4]
# Corrected: index the DataFrame with df["Q3"] and wrap each condition in parentheses
df[(df["numkid"]>5) | (df["Q3"]==4)]

Unnamed: 0,Q1,Q2,Q3,Q4,Q5ai,Q5aii,Q5aiii,Q5aiv,Q5av,Q5avi,...,numhhd,numkid,numkid2,numkid31,numkid32,numkid33,numkid34,numkid35,numkid36,wts
0,1,-1,1,8,0,0,0,1,0,0,...,3,11,2,0,0,0,0,1,0,1.11439
1,3,-1,1,4,0,0,0,0,0,0,...,3,11,2,0,0,0,0,1,0,2.56604
2,10,3,2,6,0,0,0,0,0,0,...,2,11,2,0,0,0,0,1,0,2.04468
8,9,-1,10,10,0,0,0,0,0,0,...,2,11,2,0,0,0,0,1,0,0.48973
15,2,-1,1,3,1,0,0,0,0,0,...,2,11,2,0,0,0,0,1,0,0.85808
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1268,2,-1,4,4,0,0,0,0,0,1,...,1,-1,2,0,0,0,0,1,0,0.45117
1273,3,-1,4,5,0,0,0,0,0,0,...,2,-1,2,0,0,0,0,1,0,0.70474
1275,10,2,4,7,0,0,0,0,0,0,...,3,1,1,1,0,0,0,0,0,1.06163
1276,10,10,4,6,0,0,0,0,0,0,...,3,11,2,0,0,0,0,1,0,0.88358


In [None]:
df['Q3'] # accessing column (preferred)
df.Q3 # alternative way of accessing column

In [33]:
# Name each condition, then combine them with | (element-wise OR) to build the row filter
numkid_over_5 = df["numkid"] > 5
q3_is_four = df["Q3"] == 4
df[numkid_over_5 | q3_is_four]

Unnamed: 0,Q1,Q2,Q3,Q4,Q5ai,Q5aii,Q5aiii,Q5aiv,Q5av,Q5avi,...,numhhd,numkid,numkid2,numkid31,numkid32,numkid33,numkid34,numkid35,numkid36,wts
0,1,-1,1,8,0,0,0,1,0,0,...,3,11,2,0,0,0,0,1,0,1.11439
1,3,-1,1,4,0,0,0,0,0,0,...,3,11,2,0,0,0,0,1,0,2.56604
2,10,3,2,6,0,0,0,0,0,0,...,2,11,2,0,0,0,0,1,0,2.04468
8,9,-1,10,10,0,0,0,0,0,0,...,2,11,2,0,0,0,0,1,0,0.48973
15,2,-1,1,3,1,0,0,0,0,0,...,2,11,2,0,0,0,0,1,0,0.85808
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1268,2,-1,4,4,0,0,0,0,0,1,...,1,-1,2,0,0,0,0,1,0,0.45117
1273,3,-1,4,5,0,0,0,0,0,0,...,2,-1,2,0,0,0,0,1,0,0.70474
1275,10,2,4,7,0,0,0,0,0,0,...,3,1,1,1,0,0,0,0,0,1.06163
1276,10,10,4,6,0,0,0,0,0,0,...,3,11,2,0,0,0,0,1,0,0.88358


In [35]:
# Filter rows and pick columns in a single step with .loc[row_mask, column_list]
df.loc[(df["numkid"] > 5) | (df["Q3"] == 4), ['numkid', 'Q3']]

Unnamed: 0,numkid,Q3
0,11,1
1,11,1
2,11,2
8,11,10
15,11,1
...,...,...
1268,-1,4
1273,-1,4
1275,1,4
1276,11,4


In [37]:
# Keep the filtered rows in a variable so that columns can be picked from it in a separate step
selection_on_rows = df[(df["numkid"] > 5) | (df["Q3"] == 4)]
selection_on_rows.head(2)

Unnamed: 0,Q1,Q2,Q3,Q4,Q5ai,Q5aii,Q5aiii,Q5aiv,Q5av,Q5avi,...,numhhd,numkid,numkid2,numkid31,numkid32,numkid33,numkid34,numkid35,numkid36,wts
0,1,-1,1,8,0,0,0,1,0,0,...,3,11,2,0,0,0,0,1,0,1.11439
1,3,-1,1,4,0,0,0,0,0,0,...,3,11,2,0,0,0,0,1,0,2.56604


In [39]:
# Pick the columns of interest from the previously filtered rows
selection_on_rows[['Q3', 'numkid']]

Unnamed: 0,Q3,numkid
0,1,11
1,1,11
2,2,11
8,10,11
15,1,11
...,...,...
1268,4,-1
1273,4,-1
1275,4,1
1276,4,11
