## Create a pandas dataframe with random data

In [1]:
import pandas as pd
import numpy as np
from numpy.random import randn

In [2]:
# Seed NumPy's global random generator so the random values drawn below are reproducible.
np.random.seed(101)

In [3]:
# Build a 5x4 frame of standard-normal draws with labelled rows (A-E) and columns (W-Z).
df = pd.DataFrame(
    randn(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],
    columns=['W', 'X', 'Y', 'Z'],
)

In [4]:
# Display the frame: rows A-E, columns W-Z.
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


## Selecting columns

In [5]:
# Check what kind of object df is.
type(df)

pandas.core.frame.DataFrame

In [7]:
# A single column label in [] returns that column as a Series.
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [9]:
# A list of column labels in [] selects several columns and returns a DataFrame.
df[['W', 'Z']]

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


## Adding new column

In [10]:
# Create a new column from existing ones: the row-wise sum of W and Z.
df['new'] = df['W'] + df['Z']

In [11]:
# Display the frame again; it now has the extra 'new' column.
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.210676
B,0.651118,-0.319318,-0.848077,0.605965,1.257083
C,-2.018168,0.740122,0.528813,-0.589001,-2.607169
D,0.188695,-0.758872,-0.933237,0.955057,1.143752
E,0.190794,1.978757,2.605967,0.683509,0.874303


## Removing columns

In [12]:
# axis=0 refers to rows and axis=1 to columns. drop() returns a new DataFrame
# and leaves df itself unchanged. The axis is passed by keyword because
# pandas 2.0 no longer accepts it positionally.
df.drop('W', axis=1)

Unnamed: 0,X,Y,Z,new
A,0.628133,0.907969,0.503826,3.210676
B,-0.319318,-0.848077,0.605965,1.257083
C,0.740122,0.528813,-0.589001,-2.607169
D,-0.758872,-0.933237,0.955057,1.143752
E,1.978757,2.605967,0.683509,0.874303


In [13]:
# Rebind df to the result of drop() so that the removal of column W persists.
# The axis is passed by keyword because pandas 2.0 no longer accepts it positionally.
df = df.drop('W', axis=1)

In [14]:
# Display df again: column W is no longer present.
df

Unnamed: 0,X,Y,Z,new
A,0.628133,0.907969,0.503826,3.210676
B,-0.319318,-0.848077,0.605965,1.257083
C,0.740122,0.528813,-0.589001,-2.607169
D,-0.758872,-0.933237,0.955057,1.143752
E,1.978757,2.605967,0.683509,0.874303


## Selecting rows

In [16]:
# .loc selects by index label; a single label returns that row as a Series.
df.loc['A']

X      0.628133
Y      0.907969
Z      0.503826
new    3.210676
Name: A, dtype: float64

In [17]:
# .iloc selects by integer position (0 is the first row), whatever the index labels are.
df.iloc[0]

X      0.628133
Y      0.907969
Z      0.503826
new    3.210676
Name: A, dtype: float64

## Selecting subsets of rows and columns

In [18]:
# Individual value at row C, column Y, fetched with one .loc[row, column]
# lookup instead of chained [] indexing.
df.loc['C', 'Y']

0.5288134940893595

In [21]:
# Rows A and B (positions 0 and 1), columns X and Z.
# A positional slice excludes its end, so 0:2 stops before row C.
df[['X', 'Z']].iloc[0:2]

Unnamed: 0,X,Z
A,0.628133,0.503826
B,-0.319318,0.605965


In [29]:
# Rows A through C, columns X and Z, in a single .loc[rows, columns] call.
# Unlike positional slices, label slices include both endpoints.
df.loc['A':'C', ['X', 'Z']]

Unnamed: 0,X,Z
A,0.628133,0.503826
B,-0.319318,0.605965
C,0.740122,-0.589001


## Conditional selection

In [32]:
# Keep only the values below 0; every other cell becomes NaN.
df.where(df < 0)

Unnamed: 0,X,Y,Z,new
A,,,,
B,-0.319318,-0.848077,,
C,,,-0.589001,-2.607169
D,-0.758872,-0.933237,,
E,,,,


In [34]:
# Element-wise comparison: a boolean DataFrame that is True where the value is below 0.
bools = df < 0

In [35]:
# Indexing with the boolean frame keeps the values where it is True and shows NaN elsewhere.
df[bools]

Unnamed: 0,X,Y,Z,new
A,,,,
B,-0.319318,-0.848077,,
C,,,-0.589001,-2.607169
D,-0.758872,-0.933237,,
E,,,,


## Multiple conditions

In [47]:
# Keep only the rows whose Z value is negative; all columns of those rows are returned.
df.loc[df['Z'] < 0]

Unnamed: 0,X,Y,Z,new
C,0.740122,0.528813,-0.589001,-2.607169


In [49]:
# Rows where Z is negative, restricted to columns Z and new, selected with
# one .loc[rows, columns] call instead of chained [] indexing.
df.loc[df['Z'] < 0, ['Z', 'new']]

Unnamed: 0,Z,new
C,-0.589001,-2.607169


In [37]:
# Goal: select the rows where Z < 0 or X > 0.5

In [42]:
# One boolean Series per condition, each with one entry per row of df.
bool1 = df['Z'] < 0    # True where Z is negative
bool2 = df['X'] > 0.5  # True where X is greater than 0.5

In [43]:
# Inspect the first mask (Z < 0).
bool1

A    False
B    False
C     True
D    False
E    False
Name: Z, dtype: bool

In [44]:
# Inspect the second mask (X > 0.5).
bool2

A     True
B    False
C     True
D    False
E     True
Name: X, dtype: bool

In [45]:
# Rows where at least one mask is True (element-wise OR of bool1 and bool2).
df.loc[bool1 | bool2]

Unnamed: 0,X,Y,Z,new
A,0.628133,0.907969,0.503826,3.210676
C,0.740122,0.528813,-0.589001,-2.607169
E,1.978757,2.605967,0.683509,0.874303


In [46]:
# The same filter written as a single expression. Each comparison is wrapped in
# parentheses because | binds more tightly than < and >.
df[(df['Z'] < 0) | (df['X'] > 0.5)]

Unnamed: 0,X,Y,Z,new
A,0.628133,0.907969,0.503826,3.210676
C,0.740122,0.528813,-0.589001,-2.607169
E,1.978757,2.605967,0.683509,0.874303
