# Chapter 3.4 Introduction to Tables (Pandas ver.)

# 코딩 문제 시험에 나옴(e.g. 여기서 제일 몸 값이 비싼 사람은?, 코드를 보고 해석이 되는지 질문, sort by 문제)

## Create a DataFrame

In [6]:
import pandas as pd

# Define cones' attributes
conesAttributes = ["Flavor", "Color", "Price"]

# Define cones' data, must be same sequence as attributes
conesData = [["strawberry", "pink", 3.55],
             ["chocolate", "light brown", 4.75],
             ["chocolate", "dark brown", 5.25],
             ["strawberry", "pink", 5.25],
             ["chocolate", "dark brown", 5.25],
             ["bubblegem", "pink", 4.75]]

# Define cones
cones = pd.DataFrame(data=conesData, columns=conesAttributes)

In [7]:
cones  # icecream cones

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
3,strawberry,pink,5.25
4,chocolate,dark brown,5.25
5,bubblegem,pink,4.75


## Display rows

In [8]:
# Show first two rows
cones[:2]

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
1,chocolate,light brown,4.75


In [9]:
cones.iloc[:2]

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
1,chocolate,light brown,4.75


In [10]:
cones.iloc[0:2, :]

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
1,chocolate,light brown,4.75


## Choosing Sets of Columns

In [11]:
# Show Flavor
cones.loc[:, ['Flavor']]

Unnamed: 0,Flavor
0,strawberry
1,chocolate
2,chocolate
3,strawberry
4,chocolate
5,bubblegem


In [12]:
cones['Flavor']

0    strawberry
1     chocolate
2     chocolate
3    strawberry
4     chocolate
5     bubblegem
Name: Flavor, dtype: object

In [13]:
type(cones.loc[:, ['Flavor']])

pandas.core.frame.DataFrame

In [14]:
type(cones['Flavor'])

pandas.core.series.Series

In [15]:
cones['Flavor'].to_frame()

Unnamed: 0,Flavor
0,strawberry
1,chocolate
2,chocolate
3,strawberry
4,chocolate
5,bubblegem


In [16]:
cones  # original DataFrame not changed

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
3,strawberry,pink,5.25
4,chocolate,dark brown,5.25
5,bubblegem,pink,4.75


In [17]:
# Show Flavor, Price
cones.loc[:, ['Flavor', 'Price']]

Unnamed: 0,Flavor,Price
0,strawberry,3.55
1,chocolate,4.75
2,chocolate,5.25
3,strawberry,5.25
4,chocolate,5.25
5,bubblegem,4.75


In [18]:
# Drop Color
cones.drop(columns='Color')


Unnamed: 0,Flavor,Price
0,strawberry,3.55
1,chocolate,4.75
2,chocolate,5.25
3,strawberry,5.25
4,chocolate,5.25
5,bubblegem,4.75


In [19]:
cones

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
3,strawberry,pink,5.25
4,chocolate,dark brown,5.25
5,bubblegem,pink,4.75


In [20]:
cones_without_color = cones.drop(columns='Color')
cones_without_color

Unnamed: 0,Flavor,Price
0,strawberry,3.55
1,chocolate,4.75
2,chocolate,5.25
3,strawberry,5.25
4,chocolate,5.25
5,bubblegem,4.75


## Sorting Rows

In [21]:
# Sort Price
cones.sort_values(by=["Price"])

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
1,chocolate,light brown,4.75
5,bubblegem,pink,4.75
2,chocolate,dark brown,5.25
3,strawberry,pink,5.25
4,chocolate,dark brown,5.25


In [22]:
# Sort Price Descending
cones.sort_values(by=["Price"], ascending=False)

Unnamed: 0,Flavor,Color,Price
2,chocolate,dark brown,5.25
3,strawberry,pink,5.25
4,chocolate,dark brown,5.25
1,chocolate,light brown,4.75
5,bubblegem,pink,4.75
0,strawberry,pink,3.55


In [23]:
cones.sort_values(by=["Flavor", "Price"],
                  ascending=[True, True])  # alphabetical한 순서로 정렬된다.같은 맛이라면 값을 기준으로 ascending # 첫번째 인덱스가 우선 기준이 된다.

Unnamed: 0,Flavor,Color,Price
5,bubblegem,pink,4.75
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
4,chocolate,dark brown,5.25
0,strawberry,pink,3.55
3,strawberry,pink,5.25


## Selecting Rows that Satisfy a Condition

In [24]:
# Making boolean series for a cones
filter = cones['Flavor'] == "chocolate"

# Pass filter to cones
cones.where(filter, inplace=False)  # 원본 데이터 파일에 적용할지말지 => inplace, 원본이 바뀐다. 추천 X

Unnamed: 0,Flavor,Color,Price
0,,,
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
3,,,
4,chocolate,dark brown,5.25
5,,,


In [25]:
filter

0    False
1     True
2     True
3    False
4     True
5    False
Name: Flavor, dtype: bool

In [26]:
cones.where(cones['Flavor'] == 'chocolate')

Unnamed: 0,Flavor,Color,Price
0,,,
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
3,,,
4,chocolate,dark brown,5.25
5,,,


In [27]:
# cones.where(filter, inplace=True)

In [28]:
# Making boolean series for a cones
filter = cones['Flavor'] == "chocolate"

# Pass filter to cones
cones.loc[filter]  # loc을 써서 flavour가 chocolate인 icecream만 가져온다. 

Unnamed: 0,Flavor,Color,Price
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
4,chocolate,dark brown,5.25


**Index가 1, 2, 3이 아니라 1, 2, 4가 나온다**

In [29]:
# Making boolean series for a cones
filter = cones['Flavor'] == "Chocolate" # case sensitive

# Pass filter to cones
cones.loc[filter]

#cones.where(filter)

Unnamed: 0,Flavor,Color,Price


**Case Sensitive**

## Example: Salaries in the NBA

In [30]:
# Read nba csv
nba = pd.read_csv('nba_salaries_3.4.4.csv')

# Show nba
nba

Unnamed: 0,PLAYER,POSITION,TEAM,2015-2016 SALARY
0,Paul Millsap,PF,Atlanta Hawks,18.671659
1,Al Horford,C,Atlanta Hawks,12.000000
2,Tiago Splitter,C,Atlanta Hawks,9.756250
3,Jeff Teague,PG,Atlanta Hawks,8.000000
4,Kyle Korver,SG,Atlanta Hawks,5.746479
...,...,...,...,...
412,Gary Neal,PG,Washington Wizards,2.139000
413,DeJuan Blair,C,Washington Wizards,2.000000
414,Kelly Oubre Jr.,SF,Washington Wizards,1.920240
415,Garrett Temple,SG,Washington Wizards,1.100602


In [31]:
nba.head()

Unnamed: 0,PLAYER,POSITION,TEAM,2015-2016 SALARY
0,Paul Millsap,PF,Atlanta Hawks,18.671659
1,Al Horford,C,Atlanta Hawks,12.0
2,Tiago Splitter,C,Atlanta Hawks,9.75625
3,Jeff Teague,PG,Atlanta Hawks,8.0
4,Kyle Korver,SG,Atlanta Hawks,5.746479


In [32]:
nba.tail()

Unnamed: 0,PLAYER,POSITION,TEAM,2015-2016 SALARY
412,Gary Neal,PG,Washington Wizards,2.139
413,DeJuan Blair,C,Washington Wizards,2.0
414,Kelly Oubre Jr.,SF,Washington Wizards,1.92024
415,Garrett Temple,SG,Washington Wizards,1.100602
416,Jarell Eddie,SG,Washington Wizards,0.561716


In [33]:
# Filter "Stephen Curry"
filter = nba["PLAYER"] == "Stephen Curry" # 커리를 필터로 생성
nba.loc[filter] # location으로 찾아낸다

Unnamed: 0,PLAYER,POSITION,TEAM,2015-2016 SALARY
121,Stephen Curry,PG,Golden State Warriors,11.370786


In [34]:
nba["PLAYER"] == "Stephen Curry"

0      False
1      False
2      False
3      False
4      False
       ...  
412    False
413    False
414    False
415    False
416    False
Name: PLAYER, Length: 417, dtype: bool

In [35]:
nba[nba["PLAYER"] == "Stephen Curry"] # 직관적이고 편한 방법이다.

Unnamed: 0,PLAYER,POSITION,TEAM,2015-2016 SALARY
121,Stephen Curry,PG,Golden State Warriors,11.370786


In [36]:
nba[121:123]

Unnamed: 0,PLAYER,POSITION,TEAM,2015-2016 SALARY
121,Stephen Curry,PG,Golden State Warriors,11.370786
122,Jason Thompson,PF,Golden State Warriors,7.008475


### Find players by a team name

In [37]:
# Filter "Golden State Warriors"
filter = nba["TEAM"] == "Golden State Warriors"
warriors = nba.loc[filter]
warriors

Unnamed: 0,PLAYER,POSITION,TEAM,2015-2016 SALARY
117,Klay Thompson,SG,Golden State Warriors,15.501
118,Draymond Green,PF,Golden State Warriors,14.26087
119,Andrew Bogut,C,Golden State Warriors,13.8
120,Andre Iguodala,SF,Golden State Warriors,11.710456
121,Stephen Curry,PG,Golden State Warriors,11.370786
122,Jason Thompson,PF,Golden State Warriors,7.008475
123,Shaun Livingston,PG,Golden State Warriors,5.543725
124,Harrison Barnes,SF,Golden State Warriors,3.873398
125,Marreese Speights,C,Golden State Warriors,3.815
126,Leandro Barbosa,SG,Golden State Warriors,2.5


#### Renaming columns

In [38]:
nba.columns

Index(['PLAYER', 'POSITION', 'TEAM', '2015-2016 SALARY'], dtype='object')

In [39]:
nba_renamed = nba.copy(deep='all') # deep = True가 아니라 'all'로 주어야 한다. # deep=True only works for the data # deep copy, shallow copy

### deep = True X, deep = 'all'이어야 한다. 

In [40]:
nba_renamed[:2]

Unnamed: 0,PLAYER,POSITION,TEAM,2015-2016 SALARY
0,Paul Millsap,PF,Atlanta Hawks,18.671659
1,Al Horford,C,Atlanta Hawks,12.0


In [41]:
nba_renamed.columns.values[3] = 'SALARY2'
nba_renamed[:2]

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY2
0,Paul Millsap,PF,Atlanta Hawks,18.671659
1,Al Horford,C,Atlanta Hawks,12.0


In [42]:
nba_renamed = nba_renamed.rename(columns={'SALARY2': 'SALARY'}) # rename method을 이용하여 columsn argument에 'oldname' : 'newname'을 넣어주면 'newname'으로 변경됨
nba_renamed[:2]

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
0,Paul Millsap,PF,Atlanta Hawks,18.671659
1,Al Horford,C,Atlanta Hawks,12.0


In [43]:
nba.head(2)

Unnamed: 0,PLAYER,POSITION,TEAM,2015-2016 SALARY
0,Paul Millsap,PF,Atlanta Hawks,18.671659
1,Al Horford,C,Atlanta Hawks,12.0


In [44]:
nba = nba.rename(columns={'2025-2016 SALARY': 'SALARY'})
nba.head(2)

Unnamed: 0,PLAYER,POSITION,TEAM,2015-2016 SALARY
0,Paul Millsap,PF,Atlanta Hawks,18.671659
1,Al Horford,C,Atlanta Hawks,12.0


In [45]:
col_name = nba.columns.values[3]
nba = nba.rename(columns={col_name: 'SALARY ($M)'})
nba.head(2)

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY ($M)
0,Paul Millsap,PF,Atlanta Hawks,18.671659
1,Al Horford,C,Atlanta Hawks,12.0


In [46]:
# Sort nba based on SALARY, ascending=True
nba.sort_values(by=["SALARY ($M)"], ascending=True)

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY ($M)
267,Thanasis Antetokounmpo,SF,New York Knicks,0.030888
327,Cory Jefferson,PF,Phoenix Suns,0.049709
326,Jordan McRae,SG,Phoenix Suns,0.049709
324,Orlando Johnson,SG,Phoenix Suns,0.055722
325,Phil Pressey,PG,Phoenix Suns,0.055722
...,...,...,...,...
131,Dwight Howard,C,Houston Rockets,22.359364
255,Carmelo Anthony,SF,New York Knicks,22.875000
72,LeBron James,SF,Cleveland Cavaliers,22.970500
29,Joe Johnson,SF,Brooklyn Nets,24.894863


In [47]:
# Sort nba based on SALARY, ascending=False
nba.sort_values(by=["SALARY ($M)"], ascending=False)

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY ($M)
169,Kobe Bryant,SF,Los Angeles Lakers,25.000000
29,Joe Johnson,SF,Brooklyn Nets,24.894863
72,LeBron James,SF,Cleveland Cavaliers,22.970500
255,Carmelo Anthony,SF,New York Knicks,22.875000
131,Dwight Howard,C,Houston Rockets,22.359364
...,...,...,...,...
200,Elliot Williams,SG,Memphis Grizzlies,0.055722
324,Orlando Johnson,SG,Phoenix Suns,0.055722
327,Cory Jefferson,PF,Phoenix Suns,0.049709
326,Jordan McRae,SG,Phoenix Suns,0.049709


In [48]:
# Sort nba first by 'TEAM', then by 'SALARY'
nba.sort_values(by=['TEAM', 'SALARY ($M)'], ascending=False)

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY ($M)
400,John Wall,PG,Washington Wizards,15.851950
401,Nene Hilario,C,Washington Wizards,13.000000
402,Marcin Gortat,C,Washington Wizards,11.217391
403,Markieff Morris,PF,Washington Wizards,8.000000
404,Bradley Beal,SG,Washington Wizards,5.694674
...,...,...,...,...
9,Tim Hardaway Jr.,SG,Atlanta Hawks,1.304520
10,Walter Tavares,C,Atlanta Hawks,1.000000
11,Jason Richardson,SG,Atlanta Hawks,0.947276
12,Lamar Patterson,SG,Atlanta Hawks,0.525093


# Chapter 6 Tables (Pandas ver.)

## Creating DataFrame revisited

In [49]:
import numpy as np

In [50]:
pd.DataFrame()

In [51]:
pd.DataFrame(data=np.array([8, 34, 5]), columns=['Number of petals'])

Unnamed: 0,Number of petals
0,8
1,34
2,5


In [52]:
pd.DataFrame({'Number of petals': np.array([8, 34, 5]),
              'Name': np.array(['lotus', 'sunflower', 'rose'])})

Unnamed: 0,Number of petals,Name
0,8,lotus
1,34,sunflower
2,5,rose


### Adding a new column to an existing DataFrame

In [53]:
flowers = pd.DataFrame({'Number of petals': np.array([8, 34, 5]),
                        'Name': np.array(['lotus', 'sunflower', 'rose'])})
flowers['Color'] = np.array(['pink', 'yellow', 'red'])
flowers

Unnamed: 0,Number of petals,Name,Color
0,8,lotus,pink
1,34,sunflower,yellow
2,5,rose,red


### minard data

In [54]:
# Read CSV
minard_cities = pd.read_csv('https://vincentarelbundock.github.io/Rdatasets/csv/HistData/Minard.cities.csv')
minard_temp = pd.read_csv('https://vincentarelbundock.github.io/Rdatasets/csv/HistData/Minard.temp.csv')
minard_troops = pd.read_csv('https://vincentarelbundock.github.io/Rdatasets/csv/HistData/Minard.troops.csv')

URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1000)>

In [None]:
minard_cities.info()

In [None]:
minard_cities.head()

In [55]:
minard_temp.info()

NameError: name 'minard_temp' is not defined

In [56]:
minard_temp.head()

NameError: name 'minard_temp' is not defined

In [57]:
minard_troops.info()

NameError: name 'minard_troops' is not defined

In [58]:
minard_troops.head()

NameError: name 'minard_troops' is not defined

Remove rownames

In [59]:
# Remove the 1st column which is rank
minard_cities = minard_cities.iloc[:, 1:] 
minard_temp = minard_temp.iloc[:, 1:]
minard_troops = minard_troops.iloc[:, 1:]

NameError: name 'minard_cities' is not defined

In [60]:
# Join minard_troops and minard_cities
minard = minard_troops.join(minard_cities.set_index(['long', 'lat']), on=['long', 'lat']) # Null은 NaN으로 다 표시된다.
minard.head(20)

NameError: name 'minard_troops' is not defined

In [61]:
# Filter the NaN(Null) value in city column
filter = minard['city'].notnull() # NaN모두 필터에 걸리게 한다.
minard = minard[filter]
minard

NameError: name 'minard' is not defined

In [62]:
# The columns of the Table
minard.columns

NameError: name 'minard' is not defined

In [63]:
# Rename column 'city' to 'city name', inplace=False
minard.rename(columns={'city': 'city name'})

NameError: name 'minard' is not defined

In [64]:
minard

NameError: name 'minard' is not defined

In [65]:
# minard.rename(columns={'city': 'city name'}, inplace=True)
minard = minard.rename(columns={'city': 'city name'})
minard

NameError: name 'minard' is not defined

In [66]:
# minard.rename(columns={'city': 'city name'}, inplace=True)
minard.rename(columns={'city name': 'city name2'},inplace=True)
minard

NameError: name 'minard' is not defined

In [67]:
minard.shape

NameError: name 'minard' is not defined

In [68]:
len(minard)

NameError: name 'minard' is not defined

In [69]:
# Accessing the Data in a Column
minard['survivors']

NameError: name 'minard' is not defined

In [70]:
# OR
minard.survivors

NameError: name 'minard' is not defined

In [71]:
type(minard.survivors)

NameError: name 'minard' is not defined

In [72]:
# OR
minard.iloc[:, 2] # row index _:_ => all, column index 2, _ is blank
# iloc[row index, column index]

NameError: name 'minard' is not defined

#### iloc[row index, column index] (indexing location)?

In [73]:
minard.iloc[2]
# return i-th row # 2-th row => 0, 1, 2 세번째 행의 column들이 row로 들어가게 된다.

NameError: name 'minard' is not defined

In [74]:
type(minard.iloc[2]) # Series

NameError: name 'minard' is not defined

In [75]:
minard

NameError: name 'minard' is not defined

In [76]:
minard.reset_index(drop=True, inplace=True) # drop=False하면 기존에 존재하던 인덱스가 새로운 Column으로 들어가 중복 인덱싱이 된다.

NameError: name 'minard' is not defined

In [77]:
minard

NameError: name 'minard' is not defined

In [78]:
minard.loc[5]

NameError: name 'minard' is not defined

In [79]:
minard.iloc[5]

NameError: name 'minard' is not defined

#### iloc과 loc의 차이점?

In [80]:
# Working with the Data in a Column
initial = minard['survivors'].iloc[0]
minard['percent surviving'] = minard['survivors'] / initial
minard

NameError: name 'minard' is not defined

In [81]:
# Formart columns 'percent surviving'
minard2 = minard.style.format({'percent surviving': '{:.2%}'})

NameError: name 'minard' is not defined

In [82]:
minard2  # this is not a dataframe # 

NameError: name 'minard2' is not defined

#### Dataframe이 아닌 styler를 출력한 것이다.

In [83]:
minard

NameError: name 'minard' is not defined

The method loc creates a new table that contains only the specified columns.

In [84]:
minard[['long', 'lat']]

NameError: name 'minard' is not defined

In [85]:
minard.loc[:, ['long', 'lat']]

NameError: name 'minard' is not defined

In [86]:
minard['long']  # returns an array

NameError: name 'minard' is not defined

In [87]:
minard[['long']]  # returns a DataFrame

NameError: name 'minard' is not defined

In [88]:
minard.loc[:, ['long']]

NameError: name 'minard' is not defined

In [89]:
minard.iloc[:, :1]

NameError: name 'minard' is not defined

In [90]:
minard.drop(columns=['long', 'lat', 'direction'])

NameError: name 'minard' is not defined

In [91]:
minard  # applying iloc, loc, drop won't change the original table

NameError: name 'minard' is not defined

In [92]:
minard = minard.drop(columns=['long', 'lat', 'direction'])

NameError: name 'minard' is not defined

In [93]:
minard

NameError: name 'minard' is not defined

## selecting rows

### See {x} Tab

In [94]:
%reset

In [95]:
import pandas as pd

nba = pd.read_csv('nba_salaries_3.4.4.csv')

In [96]:
# Show the 1st row
nba.iloc[[0]]

Unnamed: 0,PLAYER,POSITION,TEAM,2015-2016 SALARY
0,Paul Millsap,PF,Atlanta Hawks,18.671659


In [97]:
nba[:1]

Unnamed: 0,PLAYER,POSITION,TEAM,2015-2016 SALARY
0,Paul Millsap,PF,Atlanta Hawks,18.671659


In [98]:
# Show rows between 3 to 5
nba.iloc[3:6]

Unnamed: 0,PLAYER,POSITION,TEAM,2015-2016 SALARY
3,Jeff Teague,PG,Atlanta Hawks,8.0
4,Kyle Korver,SG,Atlanta Hawks,5.746479
5,Thabo Sefolosha,SF,Atlanta Hawks,4.0


In [99]:
nba = nba.rename(columns={'2015-2016 SALARY': 'SALARY'})

In [100]:
nba

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
0,Paul Millsap,PF,Atlanta Hawks,18.671659
1,Al Horford,C,Atlanta Hawks,12.000000
2,Tiago Splitter,C,Atlanta Hawks,9.756250
3,Jeff Teague,PG,Atlanta Hawks,8.000000
4,Kyle Korver,SG,Atlanta Hawks,5.746479
...,...,...,...,...
412,Gary Neal,PG,Washington Wizards,2.139000
413,DeJuan Blair,C,Washington Wizards,2.000000
414,Kelly Oubre Jr.,SF,Washington Wizards,1.920240
415,Garrett Temple,SG,Washington Wizards,1.100602


## sorting rows

In [179]:
# top 5 hiest paid players
nba.sort_values(by=["SALARY"], ascending=False).iloc[:5]

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
169,Kobe Bryant,SF,Los Angeles Lakers,25.0
29,Joe Johnson,SF,Brooklyn Nets,24.894863
72,LeBron James,SF,Cleveland Cavaliers,22.9705
255,Carmelo Anthony,SF,New York Knicks,22.875
131,Dwight Howard,C,Houston Rockets,22.359364


In [180]:
nba.sort_values(by=["SALARY"], ascending=False)[:5]

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
169,Kobe Bryant,SF,Los Angeles Lakers,25.0
29,Joe Johnson,SF,Brooklyn Nets,24.894863
72,LeBron James,SF,Cleveland Cavaliers,22.9705
255,Carmelo Anthony,SF,New York Knicks,22.875
131,Dwight Howard,C,Houston Rockets,22.359364


## selecting rows by values

In [182]:
# Filter the rows where 'SALARY' >= 10
nba[nba['SALARY'] > 10]

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
0,Paul Millsap,PF,Atlanta Hawks,18.671659
1,Al Horford,C,Atlanta Hawks,12.000000
29,Joe Johnson,SF,Brooklyn Nets,24.894863
30,Thaddeus Young,PF,Brooklyn Nets,11.235955
42,Al Jefferson,C,Charlotte Hornets,13.500000
...,...,...,...,...
368,DeMar DeRozan,SG,Toronto Raptors,10.050000
383,Gordon Hayward,SF,Utah Jazz,15.409570
400,John Wall,PG,Washington Wizards,15.851950
401,Nene Hilario,C,Washington Wizards,13.000000


In [183]:
# Filter the rows where 'SALARY' >= 10, then sort by 'SALARY', ascending=False
nba[nba['SALARY'] > 10].sort_values('SALARY', ascending=False)

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
169,Kobe Bryant,SF,Los Angeles Lakers,25.000000
29,Joe Johnson,SF,Brooklyn Nets,24.894863
72,LeBron James,SF,Cleveland Cavaliers,22.970500
255,Carmelo Anthony,SF,New York Knicks,22.875000
131,Dwight Howard,C,Houston Rockets,22.359364
...,...,...,...,...
95,Wilson Chandler,SF,Denver Nuggets,10.449438
144,Monta Ellis,SG,Indiana Pacers,10.300000
204,Luol Deng,SF,Miami Heat,10.151612
298,Gerald Wallace,SF,Philadelphia 76ers,10.105855


In [184]:
# Filter the rows where 'PLARYER' == 'Stephen Curry'
nba[nba['PLAYER'] == 'Stephen Curry']

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
121,Stephen Curry,PG,Golden State Warriors,11.370786


In [185]:
nba['PLAYER'] == 'Stephen Curry'

0      False
1      False
2      False
3      False
4      False
       ...  
412    False
413    False
414    False
415    False
416    False
Name: PLAYER, Length: 417, dtype: bool

In [268]:
nba.sample(
    n=5)

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
241,Jrue Holiday,PG,New Orleans Pelicans,10.595507
238,Damjan Rudez,SF,Minnesota Timberwolves,1.1495
179,Anthony Brown,SF,Los Angeles Lakers,0.7
388,Trey Burke,PG,Utah Jazz,2.65824
60,Derrick Rose,PG,Chicago Bulls,20.093064


In [269]:
# Get a table of all the Warriors
nba[nba['TEAM'] == 'Golden State Warriors']

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
117,Klay Thompson,SG,Golden State Warriors,15.501
118,Draymond Green,PF,Golden State Warriors,14.26087
119,Andrew Bogut,C,Golden State Warriors,13.8
120,Andre Iguodala,SF,Golden State Warriors,11.710456
121,Stephen Curry,PG,Golden State Warriors,11.370786
122,Jason Thompson,PF,Golden State Warriors,7.008475
123,Shaun Livingston,PG,Golden State Warriors,5.543725
124,Harrison Barnes,SF,Golden State Warriors,3.873398
125,Marreese Speights,C,Golden State Warriors,3.815
126,Leandro Barbosa,SG,Golden State Warriors,2.5


In [270]:
# Fuzzy search for records in 'TEAM' columns containing 'Warriors'
nba[nba['TEAM'].str.contains('Warriors', regex=False)]

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
117,Klay Thompson,SG,Golden State Warriors,15.501
118,Draymond Green,PF,Golden State Warriors,14.26087
119,Andrew Bogut,C,Golden State Warriors,13.8
120,Andre Iguodala,SF,Golden State Warriors,11.710456
121,Stephen Curry,PG,Golden State Warriors,11.370786
122,Jason Thompson,PF,Golden State Warriors,7.008475
123,Shaun Livingston,PG,Golden State Warriors,5.543725
124,Harrison Barnes,SF,Golden State Warriors,3.873398
125,Marreese Speights,C,Golden State Warriors,3.815
126,Leandro Barbosa,SG,Golden State Warriors,2.5


In [277]:
nba[nba['TEAM'].str.contains('w*',regex =True)]

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
0,Paul Millsap,PF,Atlanta Hawks,18.671659
1,Al Horford,C,Atlanta Hawks,12.000000
2,Tiago Splitter,C,Atlanta Hawks,9.756250
3,Jeff Teague,PG,Atlanta Hawks,8.000000
4,Kyle Korver,SG,Atlanta Hawks,5.746479
...,...,...,...,...
412,Gary Neal,PG,Washington Wizards,2.139000
413,DeJuan Blair,C,Washington Wizards,2.000000
414,Kelly Oubre Jr.,SF,Washington Wizards,1.920240
415,Garrett Temple,SG,Washington Wizards,1.100602


In [101]:
nba[nba['TEAM'].str.contains('New', regex=False)]

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
239,Eric Gordon,SG,New Orleans Pelicans,15.514031
240,Tyreke Evans,SG,New Orleans Pelicans,10.734586
241,Jrue Holiday,PG,New Orleans Pelicans,10.595507
242,Omer Asik,C,New Orleans Pelicans,9.213483
243,Ryan Anderson,PF,New Orleans Pelicans,8.5
244,Anthony Davis,PF,New Orleans Pelicans,7.07073
245,Alexis Ajinca,C,New Orleans Pelicans,4.389607
246,Quincy Pondexter,SF,New Orleans Pelicans,3.382023
247,Norris Cole,PG,New Orleans Pelicans,3.036927
248,Dante Cunningham,PF,New Orleans Pelicans,2.85


## selecting rows by multiple features

In [102]:
# Filter 'POSITION' == 'PG' & 'SALARY' > 15
nba[(nba['POSITION'] == 'PG') & (nba['SALARY'] > 15)] # the 'and' in python is not working

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
60,Derrick Rose,PG,Chicago Bulls,20.093064
74,Kyrie Irving,PG,Cleveland Cavaliers,16.407501
156,Chris Paul,PG,Los Angeles Clippers,21.468695
269,Russell Westbrook,PG,Oklahoma City Thunder,16.744218
400,John Wall,PG,Washington Wizards,15.85195


In [103]:
# Filter a SALARY range
nba[(nba['SALARY'] < 10.3) & (nba['SALARY'] >= 10)]

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
204,Luol Deng,SF,Miami Heat,10.151612
298,Gerald Wallace,SF,Philadelphia 76ers,10.105855
356,Danny Green,SG,San Antonio Spurs,10.0
368,DeMar DeRozan,SG,Toronto Raptors,10.05


In [104]:
nba[nba['PLAYER'] == 'Barack Obama'] # empty dataframe with column name

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY


## Example Population Trends

In [106]:
# As of Jan 2017, this census file is online here:

# A local copy can be accessed here in case census.gov moves the file:
# data = path_data + 'nc-est2015-agesex-res.csv'

full_census_table = pd.read_csv("./nc-est2015-agesex-res.csv")
full_census_table

Unnamed: 0,SEX,AGE,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015
0,0,0,3944153,3944160,3951330,3963087,3926540,3931141,3949775,3978038
1,0,1,3978070,3978090,3957888,3966551,3977939,3942872,3949776,3968564
2,0,2,4096929,4096939,4090862,3971565,3980095,3992720,3959664,3966583
3,0,3,4119040,4119051,4111920,4102470,3983157,3992734,4007079,3974061
4,0,4,4063170,4063186,4077551,4122294,4112849,3994449,4005716,4020035
...,...,...,...,...,...,...,...,...,...,...
301,2,97,53582,53605,54118,57159,59533,61255,62779,69285
302,2,98,36641,36675,37532,40116,42857,44359,46208,47272
303,2,99,26193,26214,26074,27030,29320,31112,32517,34064
304,2,100,44202,44246,45058,47556,50661,53902,58008,61886


In [107]:
# Select relevant columns
partial_census_table = full_census_table[['SEX', 'AGE', 'POPESTIMATE2010', 'POPESTIMATE2014']]
partial_census_table

Unnamed: 0,SEX,AGE,POPESTIMATE2010,POPESTIMATE2014
0,0,0,3951330,3949775
1,0,1,3957888,3949776
2,0,2,4090862,3959664
3,0,3,4111920,4007079
4,0,4,4077551,4005716
...,...,...,...,...
301,2,97,54118,62779
302,2,98,37532,46208
303,2,99,26074,32517
304,2,100,45058,58008


In [108]:
# Simplify the columns name
us_pop = partial_census_table.rename(columns={'POPESTIMATE2010': '2010', 'POPESTIMATE2014': '2014'})
us_pop

Unnamed: 0,SEX,AGE,2010,2014
0,0,0,3951330,3949775
1,0,1,3957888,3949776
2,0,2,4090862,3959664
3,0,3,4111920,4007079
4,0,4,4077551,4005716
...,...,...,...,...
301,2,97,54118,62779
302,2,98,37532,46208
303,2,99,26074,32517
304,2,100,45058,58008


In [109]:
# The change in population between 2010 and 2014
change = us_pop['2014'] - us_pop['2010']

In [110]:
# Make a new df census from us_pop, then add two new columns
census = us_pop.copy()
census['Change'] = change
census['Percent Change'] = change / census['2010']
census

Unnamed: 0,SEX,AGE,2010,2014,Change,Percent Change
0,0,0,3951330,3949775,-1555,-0.000394
1,0,1,3957888,3949776,-8112,-0.002050
2,0,2,4090862,3959664,-131198,-0.032071
3,0,3,4111920,4007079,-104841,-0.025497
4,0,4,4077551,4005716,-71835,-0.017617
...,...,...,...,...,...,...
301,2,97,54118,62779,8661,0.160039
302,2,98,37532,46208,8676,0.231163
303,2,99,26074,32517,6443,0.247104
304,2,100,45058,58008,12950,0.287407


In [111]:
census.head().style.format({'Percent Change': '{:.2%}'})

Unnamed: 0,SEX,AGE,2010,2014,Change,Percent Change
0,0,0,3951330,3949775,-1555,-0.04%
1,0,1,3957888,3949776,-8112,-0.20%
2,0,2,4090862,3959664,-131198,-3.21%
3,0,3,4111920,4007079,-104841,-2.55%
4,0,4,4077551,4005716,-71835,-1.76%


In [112]:
# Sort census by Change, ascending=False
census.sort_values(by='Change', ascending=False)

Unnamed: 0,SEX,AGE,2010,2014,Change,Percent Change
101,0,999,309346863,318907401,9560538,0.030906
203,1,999,152088043,156955337,4867294,0.032003
305,2,999,157258820,161952064,4693244,0.029844
67,0,67,2693707,3485241,791534,0.293846
64,0,64,2706055,3487559,781504,0.288798
...,...,...,...,...,...,...
39,0,39,4324463,3982507,-341956,-0.079075
48,0,48,4534663,4159738,-374925,-0.082680
46,0,46,4529716,4077151,-452565,-0.099910
47,0,47,4535473,4082883,-452590,-0.099789


In [113]:
census_mf = census[(census['SEX'] == 0) & (census['AGE'] < 999)] ### 999는 전체 합계이므로 뺴고 싶다.
census_mf[['AGE', 'Percent Change']]

Unnamed: 0,AGE,Percent Change
0,0,-0.000394
1,1,-0.002050
2,2,-0.032071
3,3,-0.025497
4,4,-0.017617
...,...,...
96,96,0.240055
97,97,0.206059
98,98,0.269766
99,99,0.288707


In [114]:
census_mf['age group'] = pd.qcut(census_mf.AGE, q=5, labels=['0-20', '21-40', '41-60', '61-80', '81-100'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  census_mf['age group'] = pd.qcut(census_mf.AGE, q=5, labels=['0-20', '21-40', '41-60', '61-80', '81-100'])


In [115]:
agg_df = census_mf.groupby('age group').sum()

  agg_df = census_mf.groupby('age group').sum()


In [116]:
agg_df

Unnamed: 0_level_0,SEX,AGE,2010,2014,Change,Percent Change
age group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0-20,0,210,87753968,86577469,-1176499,-0.265658
21-40,0,610,82818476,85845611,3027135,0.755757
41-60,0,1010,84921736,85703773,782037,0.264756
61-80,0,1410,43870456,50150773,6280317,2.776676
81-100,0,1810,9982227,10629775,647548,2.999173


In [117]:
change = agg_df['2014'] - agg_df['2010']
agg_df['Change'] = change
agg_df['Percent Change'] = change / agg_df['2010']
agg_df

Unnamed: 0_level_0,SEX,AGE,2010,2014,Change,Percent Change
age group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0-20,0,210,87753968,86577469,-1176499,-0.013407
21-40,0,610,82818476,85845611,3027135,0.036551
41-60,0,1010,84921736,85703773,782037,0.009209
61-80,0,1410,43870456,50150773,6280317,0.143156
81-100,0,1810,9982227,10629775,647548,0.06487


## Example Sex Ratios

In [118]:
us_pop  #1 for male, 2 for female

Unnamed: 0,SEX,AGE,2010,2014
0,0,0,3951330,3949775
1,0,1,3957888,3949776
2,0,2,4090862,3959664
3,0,3,4111920,4007079
4,0,4,4077551,4005716
...,...,...,...,...
301,2,97,54118,62779
302,2,98,37532,46208
303,2,99,26074,32517
304,2,100,45058,58008


In [119]:
us_pop[us_pop.AGE == 999]

Unnamed: 0,SEX,AGE,2010,2014
101,0,999,309346863,318907401
203,1,999,152088043,156955337
305,2,999,157258820,161952064


In [298]:
# Drop column '2010' then filter 'AGE'==999
us_pop_2014 = us_pop.drop(columns='2010')
all_ages = us_pop_2014[us_pop_2014['AGE'] == 999]
all_ages

Unnamed: 0,SEX,AGE,2014
101,0,999,318907401
203,1,999,156955337
305,2,999,161952064


In [299]:
# Add a column Proportion
all_ages['Proportion'] = all_ages['2014'] / (list(all_ages[all_ages['SEX'] == 0]['2014']) * 3)
all_ages

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_ages['Proportion'] = all_ages['2014'] / (list(all_ages[all_ages['SEX'] == 0]['2014']) * 3)


Unnamed: 0,SEX,AGE,2014,Proportion
101,0,999,318907401,1.0
203,1,999,156955337,0.492166
305,2,999,161952064,0.507834
