Dataframe is a 2-dimensional labeled data structure with columns of potentially different types. You can think of it like a spreadsheet or SQL table, or a dict of Series objects. It is generally the most commonly used pandas object.

In [2]:
import numpy as np
import pandas as pd



In [2]:
np.random.seed(101)
mydata = np.random.randint(0, 101, (4, 3))

In [4]:
myindex = ['CA', 'NY', 'AZ', 'TX']
mycolumns = ['Jan', 'Feb', 'Mar']
df = pd.DataFrame(data=mydata, index=myindex, columns=mycolumns)

In [7]:
df

Unnamed: 0,Jan,Feb,Mar
CA,95,11,81
NY,70,63,87
AZ,75,9,77
TX,40,4,63


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, CA to TX
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Jan     4 non-null      int64
 1   Feb     4 non-null      int64
 2   Mar     4 non-null      int64
dtypes: int64(3)
memory usage: 128.0+ bytes


In [5]:
df = pd.read_csv('tips.csv') # read_csv() is a function\
# https://github.com/mwaskom/seaborn-data/blob/master/tips.csv

In [6]:
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [7]:
df.index

RangeIndex(start=0, stop=244, step=1)

In [8]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [9]:
df.tail()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.0,Female,Yes,Sat,Dinner,2
241,22.67,2.0,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2
243,18.78,3.0,Female,No,Thur,Dinner,2


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB


In [11]:
df.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [12]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
total_bill,244.0,19.785943,8.902412,3.07,13.3475,17.795,24.1275,50.81
tip,244.0,2.998279,1.383638,1.0,2.0,2.9,3.5625,10.0
size,244.0,2.569672,0.9511,1.0,2.0,2.0,3.0,6.0


# Working with columns

In [13]:
type(df['total_bill'])

pandas.core.series.Series

In [14]:
mycols = ['total_bill', 'tip']
df[mycols]

Unnamed: 0,total_bill,tip
0,16.99,1.01
1,10.34,1.66
2,21.01,3.50
3,23.68,3.31
4,24.59,3.61
...,...,...
239,29.03,5.92
240,27.18,2.00
241,22.67,2.00
242,17.82,1.75


In [15]:
df['tip_pct'] = 100 * df['tip'] / df['total_bill'] # create a new column
# will be overwritten if already exists

In [16]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,5.944673
1,10.34,1.66,Male,No,Sun,Dinner,3,16.054159
2,21.01,3.5,Male,No,Sun,Dinner,3,16.658734
3,23.68,3.31,Male,No,Sun,Dinner,2,13.978041
4,24.59,3.61,Female,No,Sun,Dinner,4,14.680765


In [17]:
df['tip_pct'] = np.round(df['tip_pct'], 2) # round to 2 decimal places

In [18]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,5.94
1,10.34,1.66,Male,No,Sun,Dinner,3,16.05
2,21.01,3.5,Male,No,Sun,Dinner,3,16.66
3,23.68,3.31,Male,No,Sun,Dinner,2,13.98
4,24.59,3.61,Female,No,Sun,Dinner,4,14.68


# Remove columns

In [26]:
df.drop('tip_pct', axis=1,inplace=True) # axis=1 means column
# inplace=True means overwrite the original dataframe
df.head()

KeyError: "['tip_pct'] not found in axis"

In [28]:
df['tip_pct'] = 100 * df['tip'] / df['total_bill']
df.head()
df.drop('tip_pct', axis=1,inplace=False)
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,5.944673
1,10.34,1.66,Male,No,Sun,Dinner,3,16.054159
2,21.01,3.5,Male,No,Sun,Dinner,3,16.658734
3,23.68,3.31,Male,No,Sun,Dinner,2,13.978041
4,24.59,3.61,Female,No,Sun,Dinner,4,14.680765


# Workign with rows

In [51]:
df['PaymentId'] = np.random.randint(0, 101, len(df))
df

Unnamed: 0_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct,PaymentId
PaymentId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
88,16.99,1.01,Female,No,Sun,Dinner,2,5.944673,66
84,10.34,1.66,Male,No,Sun,Dinner,3,16.054159,54
34,21.01,3.50,Male,No,Sun,Dinner,3,16.658734,92
47,23.68,3.31,Male,No,Sun,Dinner,2,13.978041,77
66,24.59,3.61,Female,No,Sun,Dinner,4,14.680765,74
...,...,...,...,...,...,...,...,...,...
56,29.03,5.92,Male,No,Sat,Dinner,3,20.392697,26
7,27.18,2.00,Female,Yes,Sat,Dinner,2,7.358352,16
13,22.67,2.00,Male,Yes,Sat,Dinner,2,8.822232,70
17,17.82,1.75,Male,No,Sat,Dinner,2,9.820426,42


In [48]:
df.set_index('PaymentId', inplace=True)

In [None]:
df.reset_index(inplace=True)

In [49]:
df.iloc[0] # first row

total_bill       16.99
tip               1.01
sex             Female
smoker              No
day                Sun
time            Dinner
size                 2
tip_pct       5.944673
Name: 88, dtype: object

In [55]:
df.loc[3] # row with index 'n'

Unnamed: 0_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct,PaymentId
PaymentId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
3,7.51,2.0,Male,No,Thur,Lunch,2,26.631158,38
3,13.27,2.5,Female,Yes,Sat,Dinner,2,18.839488,8


In [39]:
# slicing
df.iloc[0:3]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct,PaymentId
0,16.99,1.01,Female,No,Sun,Dinner,2,5.944673,28
1,10.34,1.66,Male,No,Sun,Dinner,3,16.054159,28
2,21.01,3.5,Male,No,Sun,Dinner,3,16.658734,28
