In [2]:
import numpy as np
import pandas as pd

### 1. Objects in pandas

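- DataFrame: a two-dimensional table with labeled rows and columns
- Series: a single labeled column (one-dimensional); selecting one column of a DataFrame returns a Series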

![](https://media.geeksforgeeks.org/wp-content/uploads/finallpandas.png)

![](http://t3h.edu.vn/sites/default/files/pictures/thanh-phan-chinh-cua-pandas.png)

#### 1.1. Create Objects

In [5]:
# Column-oriented data: each key becomes a column, each list holds that column's values.
dict_1 = dict(
    name=['Truong', 'Nam', 'Hoai'],
    age=[12, 23, 45],
    university=['UK', 'MIT', 'Standford'],
)
# Build the table from the mapping; rows get the default 0-based integer index.
df = pd.DataFrame(data=dict_1)
df

Unnamed: 0,name,age,university
0,Truong,12,UK
1,Nam,23,MIT
2,Hoai,45,Standford


In [7]:
# Selecting a single column of the DataFrame yields a Series.
series = df['name']
# Attribute-style alternative for the same column:
# series = df.name
series

0    Truong
1       Nam
2      Hoai
Name: name, dtype: object

In [8]:
# 5x4 table holding the floats 1.0 ... 20.0 filled row by row,
# with row labels 1..5 and column labels col_1..col_4.
df = pd.DataFrame(
    data=np.linspace(start=1, stop=20, num=20).reshape((5, 4)),
    index=pd.RangeIndex(start=1, stop=6),
    columns=['col_' + str(i) for i in range(1, 5)],
)
df

Unnamed: 0,col_1,col_2,col_3,col_4
1,1.0,2.0,3.0,4.0
2,5.0,6.0,7.0,8.0
3,9.0,10.0,11.0,12.0
4,13.0,14.0,15.0,16.0
5,17.0,18.0,19.0,20.0


In [10]:
# A named integer Series with custom string labels index_1..index_5.
# NOTE(review): dtype='int' is the platform default integer; the recorded
# output shows int32, other platforms may report a different width.
series = pd.Series(
    data=range(1, 6),
    index=['index_' + str(i) for i in range(1, 6)],
    dtype='int',
    name='number',
)
series

index_1    1
index_2    2
index_3    3
index_4    4
index_5    5
Name: number, dtype: int32

### 2. Indexing

In [12]:
import seaborn as sns

In [52]:
# Load seaborn's "tips" example dataset (comma-separated values).
tip_df = sns.load_dataset('tips')
# Replace the default integer index with string labels index_0, index_1, ...
tip_df.index = ['index_' + str(i) for i in range(len(tip_df))]
# Preview the first five rows.
tip_df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
index_0,16.99,1.01,Female,No,Sun,Dinner,2
index_1,10.34,1.66,Male,No,Sun,Dinner,3
index_2,21.01,3.5,Male,No,Sun,Dinner,3
index_3,23.68,3.31,Male,No,Sun,Dinner,2
index_4,24.59,3.61,Female,No,Sun,Dinner,4


**Slicing**
- loc: df.loc[row_label, column_label] --> series.loc[row_label]  --> label-based; the end of a slice is inclusive
- iloc: df.iloc[row_index, column_index] --> series.iloc[row_index]  --> position-based; the end of a slice is exclusive

**Cell values**
- at: df.at[row_label, column_label]
- iat: df.iat[row_index, column_index]

In [36]:
# Label-based selection: the row slice includes its end label 'index_5';
# columns are picked by an explicit list of names.
tip_df.loc['index_0':'index_5', ['total_bill', 'tip', 'time']]

Unnamed: 0,total_bill,tip,time
index_0,16.99,1.01,Dinner
index_1,10.34,1.66,Dinner
index_2,21.01,3.5,Dinner
index_3,23.68,3.31,Dinner
index_4,24.59,3.61,Dinner
index_5,25.29,4.71,Dinner


In [35]:
# Position-based selection: rows 0..5 (stop position 6 is excluded);
# columns at positions 0, 1 and second-to-last.
tip_df.iloc[:6, [0, 1, -2]]

Unnamed: 0,total_bill,tip,time
index_0,16.99,1.01,Dinner
index_1,10.34,1.66,Dinner
index_2,21.01,3.5,Dinner
index_3,23.68,3.31,Dinner
index_4,24.59,3.61,Dinner
index_5,25.29,4.71,Dinner


In [33]:
# Label slices on both axes; both end labels ('index_5', 'smoker') are included.
tip_df.loc['index_0':'index_5', 'total_bill':'smoker']

Unnamed: 0,total_bill,tip,sex,smoker
index_0,16.99,1.01,Female,No
index_1,10.34,1.66,Male,No
index_2,21.01,3.5,Male,No
index_3,23.68,3.31,Male,No
index_4,24.59,3.61,Female,No
index_5,25.29,4.71,Male,No


In [34]:
# Position slices on both axes; stop positions (6 and 4) are excluded.
tip_df.iloc[:6, :4]

Unnamed: 0,total_bill,tip,sex,smoker
index_0,16.99,1.01,Female,No
index_1,10.34,1.66,Male,No
index_2,21.01,3.5,Male,No
index_3,23.68,3.31,Male,No
index_4,24.59,3.61,Female,No
index_5,25.29,4.71,Male,No


In [31]:
# A single row label (all columns) comes back as a Series named after that label.
tip_df.loc['index_0', :]

total_bill     16.99
tip             1.01
sex           Female
smoker            No
day              Sun
time          Dinner
size               2
Name: index_0, dtype: object

In [32]:
# The same first row, addressed by integer position instead of label.
tip_df.iloc[0, :]

total_bill     16.99
tip             1.01
sex           Female
smoker            No
day              Sun
time          Dinner
size               2
Name: index_0, dtype: object

In [18]:
# Selecting with a list of column names keeps the result a DataFrame (one column).
tip_df.loc[:, ['total_bill']]

Unnamed: 0,total_bill
0,16.99
1,10.34
2,21.01
3,23.68
4,24.59
...,...
239,29.03
240,27.18
241,22.67
242,17.82


In [16]:
# Take the column as a Series, then convert it to a one-column DataFrame.
tip_df['total_bill'].to_frame()

Unnamed: 0,total_bill
0,16.99
1,10.34
2,21.01
3,23.68
4,24.59
...,...
239,29.03
240,27.18
241,22.67
242,17.82


In [37]:
# Show the last four rows of the table.
tip_df.tail(n=4)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
index_240,27.18,2.0,Female,Yes,Sat,Dinner,2
index_241,22.67,2.0,Male,Yes,Sat,Dinner,2
index_242,17.82,1.75,Male,No,Sat,Dinner,2
index_243,18.78,3.0,Female,No,Thur,Dinner,2


In [38]:
# Single-cell lookup by row label and column label.
tip_df.at['index_243', 'tip']

3.0

In [40]:
# Single-cell lookup by integer position: last row, column at position 1.
tip_df.iat[-1, 1]

3.0

In [41]:
# Work with one column as a Series; it keeps the DataFrame's row labels.
series = tip_df['total_bill']
series.head()

index_0    16.99
index_1    10.34
index_2    21.01
index_3    23.68
index_4    24.59
Name: total_bill, dtype: float64

In [44]:
# Label slice on a Series; the end label 'index_4' is included.
series.loc['index_2':'index_4']

index_2    21.01
index_3    23.68
index_4    24.59
Name: total_bill, dtype: float64

In [42]:
# Look up a single value of the Series by its label.
series.loc['index_2']

21.01

In [43]:
# Look up a single value of the Series by its integer position.
series.iloc[2]

21.01

### 3. Masking

In [67]:
# Limit how many rows pandas shows when displaying a table; longer tables are truncated.
pd.set_option('display.max_rows', 6)

In [68]:
# Display the whole table; the output is truncated by the display.max_rows setting.
tip_df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
index_0,16.99,1.01,Female,No,Sun,Dinner,2
index_1,10.34,1.66,Male,No,Sun,Dinner,3
index_2,21.01,3.50,Male,No,Sun,Dinner,3
...,...,...,...,...,...,...,...
index_241,22.67,2.00,Male,Yes,Sat,Dinner,2
index_242,17.82,1.75,Male,No,Sat,Dinner,2
index_243,18.78,3.00,Female,No,Thur,Dinner,2


In [50]:
# Peek at the first two rows to recall the column names and values.
tip_df.head(n=2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
index_0,16.99,1.01,Female,No,Sun,Dinner,2
index_1,10.34,1.66,Male,No,Sun,Dinner,3


In [63]:
# Keep the rows that satisfy both conditions, written as a query expression.
tip_df.query(expr="sex == 'Male' & smoker == 'No'")
# Boolean-mask alternative:
# is_male_non_smoker = (tip_df.sex == 'Male') & (tip_df.smoker == 'No')
# tip_df[is_male_non_smoker]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
index_1,10.34,1.66,Male,No,Sun,Dinner,3
index_2,21.01,3.50,Male,No,Sun,Dinner,3
index_3,23.68,3.31,Male,No,Sun,Dinner,2
...,...,...,...,...,...,...,...
index_235,10.07,1.25,Male,No,Sat,Dinner,2
index_239,29.03,5.92,Male,No,Sat,Dinner,3
index_242,17.82,1.75,Male,No,Sat,Dinner,2


In [58]:
# Boolean mask: keep only the rows whose `sex` value equals 'Male'.
# Query-string alternative: tip_df.query('sex == "Male"')
tip_df.loc[tip_df['sex'] == 'Male']

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
index_1,10.34,1.66,Male,No,Sun,Dinner,3
index_2,21.01,3.50,Male,No,Sun,Dinner,3
index_3,23.68,3.31,Male,No,Sun,Dinner,2
...,...,...,...,...,...,...,...
index_239,29.03,5.92,Male,No,Sat,Dinner,3
index_241,22.67,2.00,Male,Yes,Sat,Dinner,2
index_242,17.82,1.75,Male,No,Sat,Dinner,2


In [59]:
# Stack the first five and the last five rows into one table.
pd.concat([tip_df.head(5), tip_df.tail(5)])

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
index_0,16.99,1.01,Female,No,Sun,Dinner,2
index_1,10.34,1.66,Male,No,Sun,Dinner,3
index_2,21.01,3.50,Male,No,Sun,Dinner,3
...,...,...,...,...,...,...,...
index_241,22.67,2.00,Male,Yes,Sat,Dinner,2
index_242,17.82,1.75,Male,No,Sat,Dinner,2
index_243,18.78,3.00,Female,No,Thur,Dinner,2


### 4. I/O: Input/Output

In [73]:
# Read the CSV and use its 'Unnamed: 0' column as the row index.
attention_df = pd.read_csv(
    filepath_or_buffer='attention.csv',
    index_col='Unnamed: 0',
)
attention_df.head(n=2)

Unnamed: 0,subject,attention,solutions,score
0,1,divided,1,2.0
1,2,divided,1,3.0


In [75]:
# Save the first 30 rows to a new CSV; index=False leaves the row index out of the file.
attention_df.head(30).to_csv(path_or_buf='first_30_attention.csv', index=False)

In [76]:
# Read the file back; no index column was written, so pandas assigns a default index.
attention_30_df = pd.read_csv(filepath_or_buffer='first_30_attention.csv')
attention_30_df.head(n=2)

Unnamed: 0,subject,attention,solutions,score
0,1,divided,1,2.0
1,2,divided,1,3.0


In [77]:
# (rows, columns) of the re-loaded table; the recorded output is (30, 4).
attention_30_df.shape

(30, 4)