In [1]:
import pandas as pd


def get_csv_df(path='titanic.csv'):
    """Load a CSV file into a fresh DataFrame.

    Every call re-reads the file, so each caller receives its own frame
    and can modify it without affecting frames returned by other calls.

    Args:
        path: Location of the CSV file to read. Defaults to 'titanic.csv',
            the file used by the examples in this notebook.

    Returns:
        pandas.DataFrame: The parsed contents of the file.
    """
    return pd.read_csv(path)

## Shapes


In [6]:
df = get_csv_df()

# A frame reports (rows, columns); a single column reports (rows,).
frame_shape = df.shape
column_shape = df["Name"].shape
print(f'DataFrame Shape: {frame_shape}')
print(f'Series Shape: {column_shape}')

DataFrame Shape: (891, 12)
Series Shape: (891,)


## Filtering Down Columns

Note that you need **extra brackets**, unlike in numpy: the outer pair indexes the DataFrame and the inner pair is the list of column names to keep.


In [8]:
df = get_csv_df()

# The outer brackets index the frame; the inner brackets are a list of the
# column names to keep, returned in the order listed.
df[["Age", "Name"]]

Unnamed: 0,Age,Name
0,22.0,"Braund, Mr. Owen Harris"
1,38.0,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
2,26.0,"Heikkinen, Miss. Laina"
3,35.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
4,35.0,"Allen, Mr. William Henry"
...,...,...
886,27.0,"Montvila, Rev. Juozas"
887,19.0,"Graham, Miss. Margaret Edith"
888,,"Johnston, Miss. Catherine Helen ""Carrie"""
889,26.0,"Behr, Mr. Karl Howell"


## Arbitrary Reordering/Repeating, etc.


In [9]:
df = get_csv_df()

# The list may give columns in any order and may name a column more than once;
# the result has one column per list entry.
df[["Name", "Age", "Name", "Name"]]

Unnamed: 0,Name,Age,Name.1,Name.2
0,"Braund, Mr. Owen Harris",22.0,"Braund, Mr. Owen Harris","Braund, Mr. Owen Harris"
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...","Cumings, Mrs. John Bradley (Florence Briggs Th..."
2,"Heikkinen, Miss. Laina",26.0,"Heikkinen, Miss. Laina","Heikkinen, Miss. Laina"
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)","Futrelle, Mrs. Jacques Heath (Lily May Peel)"
4,"Allen, Mr. William Henry",35.0,"Allen, Mr. William Henry","Allen, Mr. William Henry"
...,...,...,...,...
886,"Montvila, Rev. Juozas",27.0,"Montvila, Rev. Juozas","Montvila, Rev. Juozas"
887,"Graham, Miss. Margaret Edith",19.0,"Graham, Miss. Margaret Edith","Graham, Miss. Margaret Edith"
888,"Johnston, Miss. Catherine Helen ""Carrie""",,"Johnston, Miss. Catherine Helen ""Carrie""","Johnston, Miss. Catherine Helen ""Carrie"""
889,"Behr, Mr. Karl Howell",26.0,"Behr, Mr. Karl Howell","Behr, Mr. Karl Howell"


## Conditions

A condition applied to a Series returns a Series of booleans. You can use all the comparisons you'd expect, such as `==`, and combine conditions with operators such as `|` as well.


In [12]:
df = get_csv_df()

# Comparing a column against a scalar is evaluated element by element.
ages = df["Age"]
ser = ages > 35
print(type(ser))
ser

<class 'pandas.core.series.Series'>


0      False
1       True
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: Age, Length: 891, dtype: bool

## Arithmetic


In [13]:
df = get_csv_df()

# Arithmetic on a column is applied element by element; missing ages stay NaN.
ages = df["Age"]
ser = ages**2
ser

0       484.0
1      1444.0
2       676.0
3      1225.0
4      1225.0
        ...  
886     729.0
887     361.0
888       NaN
889     676.0
890    1024.0
Name: Age, Length: 891, dtype: float64

## Other Methods


In [17]:
df = get_csv_df()

# isin() marks each row whose value equals one of the candidates exactly.
candidate_ages = range(10, 20)
ser = df["Age"].isin(candidate_ages)
ser

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887     True
888    False
889    False
890    False
Name: Age, Length: 891, dtype: bool

## Masked Indices

A boolean series can be used to **filter rows** from the dataframe.


In [16]:
df = get_csv_df()

# Build the boolean mask first, then keep only the rows where it is True.
over_35 = df["Age"] > 35
df[over_35]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S
13,14,0,3,"Andersson, Mr. Anders Johan",male,39.0,1,5,347082,31.2750,,S
15,16,1,2,"Hewlett, Mrs. (Mary D Kingcome)",female,55.0,0,0,248706,16.0000,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
865,866,1,2,"Bystrom, Mrs. (Karolina)",female,42.0,0,0,236852,13.0000,,S
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S
873,874,0,3,"Vander Cruyssen, Mr. Victor",male,47.0,0,0,345765,9.0000,,S
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C


## Subset of Rows and Columns at Same Time

The **loc** attribute is indexed as `[rows, cols]`: here the rows are chosen with a boolean mask and the columns with a list of labels.


In [19]:
df = get_csv_df()

# Row selector (a boolean mask) and column selector (a list of labels).
row_mask = df["Age"] > 35
wanted_columns = ["Name", "Age"]
df.loc[row_mask, wanted_columns]

Unnamed: 0,Name,Age
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0
6,"McCarthy, Mr. Timothy J",54.0
11,"Bonnell, Miss. Elizabeth",58.0
13,"Andersson, Mr. Anders Johan",39.0
15,"Hewlett, Mrs. (Mary D Kingcome)",55.0
...,...,...
865,"Bystrom, Mrs. (Karolina)",42.0
871,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",47.0
873,"Vander Cruyssen, Mr. Victor",47.0
879,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",56.0


The **iloc** attribute is indexed the same way, `[rows, cols]`, but with integer positions instead of labels.


In [20]:
df = get_csv_df()

# Positional slices exclude their end: rows 2..9 and columns 1..2.
df.iloc[2:10, 1:3]

Unnamed: 0,Survived,Pclass
2,1,3
3,1,1
4,0,3
5,0,3
6,0,1
7,0,3
8,1,3
9,1,2


## Subset of Rows


In [42]:
df = get_csv_df()

# A bare slice on the frame selects rows (here rows 2, 3 and 4), not columns.
df[2:5]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Subset of Rows & Cols by Double Indexing


In [47]:
df = get_csv_df()

# Two separate lookups: the slice picks rows first, then 'Name' picks a
# column from that intermediate frame, giving a Series.
df[2:5]['Name']

2                          Heikkinen, Miss. Laina
3    Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                        Allen, Mr. William Henry
Name: Name, dtype: object

## Get a single Column of a Row


In [49]:
df = get_csv_df()

# iloc[0] returns the first row as a Series keyed by column name, so indexing
# it with 'Name' yields a single scalar value.
df.iloc[0]['Name']

'Braund, Mr. Owen Harris'

## Modifying Entries


In [25]:
df = get_csv_df()

# By Column
df["Name"] = "Bob Bobberson"  # Broadcasting single value to all rows
# Setting all rows one-by-one: the list must have exactly one entry per row,
# so derive its length from the frame instead of hard-coding the row count.
df["Sex"] = [("a", "b")[i % 2] for i in range(len(df))]

# By Row
df.loc[df["Age"] > 35, "Survived"] = 2

# Display the modified frame.
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,Bob Bobberson,a,22.0,1,0,A/5 21171,7.2500,,S
1,2,2,1,Bob Bobberson,b,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,Bob Bobberson,a,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,Bob Bobberson,b,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,Bob Bobberson,a,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,Bob Bobberson,a,27.0,0,0,211536,13.0000,,S
887,888,1,1,Bob Bobberson,b,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,Bob Bobberson,a,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,Bob Bobberson,b,26.0,0,0,111369,30.0000,C148,C


## New Columns


In [29]:
df = get_csv_df()

# Assigning to a label that does not exist yet appends a new column.
age_squared = df["Age"]**2
df["Name2"] = "Bob Bobberson"
df["Age Squared"] = age_squared  # NOTE: spaces allowed in column names

df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name2,Age Squared
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,Bob Bobberson,484.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Bob Bobberson,1444.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,Bob Bobberson,676.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,Bob Bobberson,1225.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,Bob Bobberson,1225.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,Bob Bobberson,729.0
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,Bob Bobberson,361.0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,Bob Bobberson,
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,Bob Bobberson,676.0


## Rename Columns

Note that this makes a **new data frame** instead of updating in-place.


In [32]:
df = get_csv_df()

# Map each old column name to its replacement; rename() hands back a new
# frame, so the result is bound to df again.
column_renames = {'Name': 'Who?'}
df = df.rename(columns=column_renames)

df

Unnamed: 0,PassengerId,Survived,Pclass,Who?,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## Remove Columns


In [37]:
df = get_csv_df()

# drop() returns a frame without the listed columns.
unwanted_columns = ['Survived', 'Pclass']
df.drop(columns=unwanted_columns)

Unnamed: 0,PassengerId,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...
886,887,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## Conversion to Numpy


In [34]:
df = get_csv_df()

# All columns are packed into one 2-D array; because the column types
# differ, the result uses dtype=object.
df.to_numpy()

array([[1, 0, 3, ..., 7.25, nan, 'S'],
       [2, 1, 1, ..., 71.2833, 'C85', 'C'],
       [3, 1, 3, ..., 7.925, nan, 'S'],
       ...,
       [889, 0, 3, ..., 23.45, nan, 'S'],
       [890, 1, 1, ..., 30.0, 'C148', 'C'],
       [891, 0, 3, ..., 7.75, nan, 'Q']], dtype=object)

## From Numpy


In [35]:
import numpy as np

# A plain np.array() of mixed strings and integers converts every element to
# a string, so force dtype=object to keep the ages as integers.
matrix = np.array([["Bob", 20], ["John", 25]], dtype=object)
# Columns built from an object array are untyped; make Age numeric explicitly.
df = pd.DataFrame(matrix, columns=['Name', 'Age']).astype({'Age': 'int64'})

df

Unnamed: 0,Name,Age
0,Bob,20
1,John,25


## Selective Numpy Array


In [51]:
df = get_csv_df()

# to_numpy() is the accessor pandas recommends over .values, and it matches
# the conversion used elsewhere in this notebook.
df[['Name', 'Age']].to_numpy()

array([['Braund, Mr. Owen Harris', 22.0],
       ['Cumings, Mrs. John Bradley (Florence Briggs Thayer)', 38.0],
       ['Heikkinen, Miss. Laina', 26.0],
       ...,
       ['Johnston, Miss. Catherine Helen "Carrie"', nan],
       ['Behr, Mr. Karl Howell', 26.0],
       ['Dooley, Mr. Patrick', 32.0]], dtype=object)