# Filter col and row

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns

In [6]:
iris = sns.load_dataset('iris')
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


## 1) loc

In [7]:
iris.loc[0:3, "sepal_width" :]

Unnamed: 0,sepal_width,petal_length,petal_width,species
0,3.5,1.4,0.2,setosa
1,3.0,1.4,0.2,setosa
2,3.2,1.3,0.2,setosa
3,3.1,1.5,0.2,setosa


row start: row_end (inclusive)

## 1a) filter by row values of specfic columns

In [5]:
# get rows with setosa and petal_length, and columsn with petal_width and species

iris.loc[ (iris.species == "setosa") & (iris.petal_length== 1.4) , 'petal_width': 'species']

Unnamed: 0,petal_width,species
0,0.2,setosa
1,0.2,setosa
4,0.2,setosa
6,0.3,setosa
8,0.2,setosa
12,0.1,setosa
17,0.3,setosa
28,0.2,setosa
33,0.2,setosa
37,0.1,setosa


- better than chaining as below

In [6]:
iris[(iris.species == "setosa") & (iris.petal_length== 1.4) ].petal_width

0     0.2
1     0.2
4     0.2
6     0.3
8     0.2
12    0.1
17    0.3
28    0.2
33    0.2
37    0.1
45    0.3
47    0.2
49    0.2
Name: petal_width, dtype: float64

## 2) iloc
[: 0:4]

In [7]:
iris.iloc[0:3, 0:2]

Unnamed: 0,sepal_length,sepal_width
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2


row_start: row_end (exclusive) , col_start: col_end (exclusive)

In [12]:
iris[['sepal_length', 'sepal_width']]

Unnamed: 0,sepal_length,sepal_width
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6
...,...,...
145,6.7,3.0
146,6.3,2.5
147,6.5,3.0
148,6.2,3.4


# Groupby columns
- groupby requires aggrgation function (i.e. sum() , max() ), else cannot aggregate.
- groupby outputs groupby object

In [8]:
tips = sns.load_dataset('tips')
tips.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3


In [9]:
A = tips.groupby(['sex', 'smoker'])[['total_bill', 'tip']].sum()
A

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,Yes,1337.07,183.07
Male,No,1919.75,302.0
Female,Yes,593.27,96.74
Female,No,977.68,149.77


## a) Ranking with groupby

In [69]:
tips2= tips.groupby('day').mean().sort_values(by=['tip'], ascending = False)
tips2

Unnamed: 0_level_0,total_bill,tip,size
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Sun,21.41,3.255132,2.842105
Sat,20.441379,2.993103,2.517241
Thur,17.682742,2.771452,2.451613
Fri,17.151579,2.734737,2.105263


### groupby will automatically make the group_by columns into indices, AND remove from the table
- thus need .index to retreive that column
- if don't want them as indices, use index = False. df.groupby(['col2','col3'], as_index=False)

to make day column into a list, use .index 

In [75]:
tips2.index.tolist()

['Sun', 'Sat', 'Thur', 'Fri']

# Modify column

### a) apply(lambda, axis)

In [10]:
A['tip']= A['tip'].apply(lambda x: "{:02}".format(x %100))
A

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip
sex,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,Yes,1337.07,83.07
Male,No,1919.75,2.0
Female,Yes,593.27,96.74
Female,No,977.68,49.77000000000001


In [11]:
tips2= tips.copy()
tips2['Extra']= tips2.apply(lambda row: row['tip']/ row['total_bill'], axis =1)
tips2

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,Extra
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.50,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.139780
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,0.203927
240,27.18,2.00,Female,Yes,Sat,Dinner,2,0.073584
241,22.67,2.00,Male,Yes,Sat,Dinner,2,0.088222
242,17.82,1.75,Male,No,Sat,Dinner,2,0.098204


# Combine Dataframes
- merge/ join
- concat/ append

## 1) Merge
- column-wise combinations in SQL-like way. 

In [53]:
# datasets

from io import StringIO
with StringIO("""name,num,rand
bug,1,d
rug,2,d
lug,3,d
mug,4,d""") as fp:
    A = pd.read_csv(fp)
print("=== table A ===")
display(A)

with StringIO("""name,num,rand
hug,-1,e
smug,-2,e
rug,-3,e
tug,-4,e
bug,1,e""") as fp:
    B = pd.read_csv(fp)
print("\n=== table B ===")
display(B)

=== table A ===


Unnamed: 0,name,num,rand
0,bug,1,d
1,rug,2,d
2,lug,3,d
3,mug,4,d



=== table B ===


Unnamed: 0,name,num,rand
0,hug,-1,e
1,smug,-2,e
2,rug,-3,e
3,tug,-4,e
4,bug,1,e


### a) Merge on column name

In [42]:
C = A.merge(B, on= ['name', 'name'], how = 'outer')  # outer-join
C

Unnamed: 0,name,num_x,rand_x,num_y,rand_y
0,bug,1.0,d,1.0,e
1,rug,2.0,d,-3.0,e
2,lug,3.0,d,,
3,mug,4.0,d,,
4,hug,,,-1.0,e
5,smug,,,-2.0,e
6,tug,,,-4.0,e


In [44]:
D = A.merge(B, on= ['num', 'num'], how = 'left')  # left join
D

Unnamed: 0,name_x,num,rand_x,name_y,rand_y
0,bug,1,d,bug,e
1,rug,2,d,,
2,lug,3,d,,
3,mug,4,d,,


In [54]:
# left_on, right_on for joining differently named columns in the tables

E = A.merge(B, left_on = 'name', right_on="rand", how = 'left')  # left join
E

# output vi

Unnamed: 0,name_x,num_x,rand_x,name_y,num_y,rand_y
0,bug,1,d,,,
1,rug,2,d,,,
2,lug,3,d,,,
3,mug,4,d,,,


### b) merge on index

In [55]:
A.merge(B, left_index = True, right_index= True)

Unnamed: 0,name_x,num_x,rand_x,name_y,num_y,rand_y
0,bug,1,d,hug,-1,e
1,rug,2,d,smug,-2,e
2,lug,3,d,rug,-3,e
3,mug,4,d,tug,-4,e


## 2) concat()

- only stacks either column or row-wise (thus only outer join) 
- only simplistic combinations (thus we need merge)
- dataframes need same indices
- can pass list in

In [19]:
# concat by rows

pd.concat([A, B], sort = False)

Unnamed: 0,x,y,z,w
0,bug,1,d,
1,rug,2,d,
2,lug,3,d,
3,mug,4,d,
0,hug,-1,,e
1,smug,-2,,e
2,rug,-3,,e
3,tug,-4,,e
4,bug,1,,e


In [18]:
# concat by column

pd.concat([A,B], axis =1)

Unnamed: 0,x,y,z,x.1,y.1,w
0,bug,1.0,d,hug,-1,e
1,rug,2.0,d,smug,-2,e
2,lug,3.0,d,rug,-3,e
3,mug,4.0,d,tug,-4,e
4,,,,bug,1,e


In [23]:
# label on dataset where the row datas are from

pd.concat([A,B], sort = False, keys =['tableA', 'tableB'])

Unnamed: 0,Unnamed: 1,x,y,z,w
tableA,0,bug,1,d,
tableA,1,rug,2,d,
tableA,2,lug,3,d,
tableA,3,mug,4,d,
tableB,0,hug,-1,,e
tableB,1,smug,-2,,e
tableB,2,rug,-3,,e
tableB,3,tug,-4,,e
tableB,4,bug,1,,e


# canonicalize

In [93]:
canonical_in_csv = """,c,a,b
2,hat,x,1
0,rat,y,4
3,cat,x,2
1,bat,x,2"""

with StringIO(canonical_in_csv) as fp:
    input1 = pd.read_csv(fp, index_col=0)
print("=== Input ===")
display(input1)
print("")

=== Input ===


Unnamed: 0,c,a,b
2,hat,x,1
0,rat,y,4
3,cat,x,2
1,bat,x,2





In [96]:
var_names = sorted(input1.columns)
table_copy = input1[var_names].copy()
table_copy

table_copy = table_copy.sort_values(by= var_names)
table_copy


Unnamed: 0,a,b,c
2,x,1,hat
1,x,2,bat
3,x,2,cat
0,y,4,rat


## map
- compares series to list/ dict, and returns values that match keys, else NaN
- https://pandas.pydata.org/pandas-docs/version/0.25.3/reference/api/pandas.Series.map.html
    

# convert dataframe to numpy array
- df.to_numpy()
- old versions: df.values and df.array()

In [None]:
to_numpy()