In [32]:
import pandas as pd


def get_default_df():
    """Return a fresh three-row example DataFrame with "Name" and "Age" columns.

    A new frame is built on every call, so each cell can modify its copy
    without affecting any other cell.
    """
    columns = {
        "Name": ["bob", "john", "sarah"],
        "Age": [10, 15, 20],
    }
    return pd.DataFrame(columns)


def get_csv_df(path='titanic.csv'):
    """Load a CSV file into a fresh DataFrame.

    Parameters
    ----------
    path : str or path-like, default 'titanic.csv'
        Location of the CSV file. The default keeps the original behaviour of
        reading 'titanic.csv' from the working directory.

    Returns
    -------
    pandas.DataFrame
        A newly parsed frame on every call.
    """
    return pd.read_csv(path)

## From Dictionary of Lists


In [3]:
# Dictionary keys become the column names; each list holds that column's values.
df = pd.DataFrame(
    {
        "Name": ["bob", "john", "sarah"],
        "Age": [10, 15, 20],
    }
)

df

Unnamed: 0,Name,Age
0,bob,10
1,john,15
2,sarah,20


## From List of Dictionaries


In [5]:
# Each dictionary describes one row; its keys name the columns.
df = pd.DataFrame(
    [
        {"Name": "bob", "Age": 10},
        {"Name": "john", "Age": 15},
        {"Name": "sarah", "Age": 20},
    ]
)

df

Unnamed: 0,Name,Age
0,bob,10
1,john,15
2,sarah,20


## Displaying DataFrame


In [6]:
# Rebuild the example frame here so this cell does not rely on a `df`
# left over from whichever cell happened to run before it.
df = get_default_df()

print(df)  # Pure text table

df  # More visual table


    Name  Age
0    bob   10
1   john   15
2  sarah   20


Unnamed: 0,Name,Age
0,bob,10
1,john,15
2,sarah,20


## Columns

A column from a DataFrame is a **Series** object.

A DataFrame acts like a dictionary with **columns as keys**.


In [14]:
df = get_default_df()
column = df["Name"]  # selecting a single key yields a Series

# Show the type, then the text rendering, each followed by a blank line.
print(type(column), end="\n\n")
print(column, end="\n\n")  # Basically looks like a 1-column table (almost)
column  # Rich display; for a Series it matches the print() output

<class 'pandas.core.series.Series'>

0      bob
1     john
2    sarah
Name: Name, dtype: object



0      bob
1     john
2    sarah
Name: Name, dtype: object

## Series from Scratch


## Alternate Column Syntax


In [55]:
df = get_default_df()
# Attribute access is shorthand for df["Name"]; it only works when the column
# name is a valid Python identifier that does not clash with a DataFrame attribute.
column = df.Name
column

0      bob
1     john
2    sarah
Name: Name, dtype: object

In [15]:
# Build a standalone Series directly from a list of values, labelled "Age".
column = pd.Series(data=[10, 15, 20], name="Age")

print(column)

0    10
1    15
2    20
Name: Age, dtype: int64


## DataFrame Technical Info


In [42]:
df = get_default_df()

# Prints the index range, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    3 non-null      object
 1   Age     3 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 176.0+ bytes


## Numerical Data Summary

`describe()` returns a table of summary statistics (count, mean, std, min, quartiles, max) for the numerical columns of the DataFrame.


In [20]:
df = get_default_df()

# Only the numeric "Age" column is summarised; the string "Name" column is left out.
df.describe()

Unnamed: 0,Age
count,3.0
mean,15.0
std,5.0
min,10.0
25%,12.5
50%,15.0
75%,17.5
max,20.0


## Individual Stats on Numerical Series


In [26]:
df = get_default_df()

# Stats on a single numeric column return plain scalar values.
print(df["Age"].max())
print(df["Age"].mean())
print(df["Age"].std())

print(df["Name"].max())  # Works on strings too: they are compared lexicographically, so "sarah" is the largest

20
15.0
5.0
sarah


## Individual Stats on Whole Table

Calling a stat method on the whole DataFrame gives you a Series holding that stat for **each column**.


In [28]:
df = get_default_df()

# The result is a Series indexed by column name; its dtype is object
# because it holds both a string ("Name") and a number ("Age").
df.max()

Name    sarah
Age        20
dtype: object

## Loading from CSV

Also see the **compression** option (for reading zipped or gzipped files directly) and the **low_memory** option (whether the file is parsed in chunks to save memory or all at once).


In [31]:
# Parse the CSV file from the working directory; rows get a default 0-based index.
df = pd.read_csv('titanic.csv')

df  # a large frame is displayed truncated, with "..." standing in for the middle rows

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## First Rows


In [35]:
df = get_csv_df()

# Show the first 10 rows.
df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


## Last Rows


In [36]:
df = get_csv_df()

# Show the last 10 rows.
df.tail(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
881,882,0,3,"Markun, Mr. Johann",male,33.0,0,0,349257,7.8958,,S
882,883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22.0,0,0,7552,10.5167,,S
883,884,0,2,"Banfield, Mr. Frederick James",male,28.0,0,0,C.A./SOTON 34068,10.5,,S
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.05,,S
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.125,,Q
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


## Data Types of Columns


In [37]:
df = get_csv_df()

# A Series mapping each column name to its dtype; text columns show up as object.
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

## Saving File

Existing file will be **overwritten**.

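You can load and save **various formats** (for example CSV, Excel and JSON) using the matching `pd.read_*` functions and `DataFrame.to_*` methods.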


In [40]:
df = get_csv_df()

# index=False leaves the automatic 0..N-1 row index out of the file. Without it the
# index is written as an extra unnamed first column, which comes back as
# "Unnamed: 0" when the saved file is read again.
df.to_csv("titanic2.csv", index=False)

## Changing Index Column

You can use a chosen column as the row index (the row labels) instead of the default 0-based numbering.

You can specify the column by **position or name**.


In [50]:
# index_col makes PassengerId the row index instead of a regular data column.
pd.read_csv('titanic.csv', index_col="PassengerId")

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## Series with Index Column

A Series taken from a DataFrame keeps that DataFrame's index (here `PassengerId`) as its own row labels, but otherwise behaves the same as any other Series.


In [52]:
df = pd.read_csv('titanic.csv', index_col='PassengerId')
ser = df['Name']  # the Series carries the frame's PassengerId index along with it
ser

PassengerId
1                                Braund, Mr. Owen Harris
2      Cumings, Mrs. John Bradley (Florence Briggs Th...
3                                 Heikkinen, Miss. Laina
4           Futrelle, Mrs. Jacques Heath (Lily May Peel)
5                               Allen, Mr. William Henry
                             ...                        
887                                Montvila, Rev. Juozas
888                         Graham, Miss. Margaret Edith
889             Johnston, Miss. Catherine Helen "Carrie"
890                                Behr, Mr. Karl Howell
891                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

## Index Property

The `index` property holds the row labels; it is related to how slicing and indexing work in pandas.


In [56]:
df = get_csv_df()

# With no index column chosen, the index is a RangeIndex counting up from 0 in steps of 1.
df.index

RangeIndex(start=0, stop=891, step=1)

## String Operations


In [54]:
df = get_csv_df()

# str.replace returns a new Series (it is not in-place), so the result is assigned
# back to the column. regex=False treats 'Harris' as a literal substring no matter
# which default the installed pandas version uses for `regex`.
df['Name'] = df['Name'].str.replace('Harris', "BLABLABLA", regex=False)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen BLABLABLA",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C
