In [1]:
import pandas as pd

In [4]:
# Read the NBA roster CSV into a DataFrame.
nba = pd.read_csv(filepath_or_buffer="nba.csv")

## Shared methods and attributes between series and data frames
A method() is just a command to an object. Different objects can still respond to the same command. Some objects have exclusive methods.

In [9]:
nba.head(n=7)  # preview the first seven rows

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
5,Amir Johnson,Boston Celtics,90.0,PF,29.0,6-9,240.0,,12000000.0
6,Jordan Mickey,Boston Celtics,55.0,PF,21.0,6-8,235.0,LSU,1170960.0


In [10]:
nba.tail(n=5)  # last five rows (5 is the default)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0
457,,,,,,,,,


In [11]:
nba.index  # the row labels; here a RangeIndex

RangeIndex(start=0, stop=458, step=1)

In [13]:
nba.values  # the frame's data as a NumPy array (dtype=object here)

array([['Avery Bradley', 'Boston Celtics', 0.0, ..., 180.0, 'Texas',
        7730337.0],
       ['Jae Crowder', 'Boston Celtics', 99.0, ..., 235.0, 'Marquette',
        6796117.0],
       ['John Holland', 'Boston Celtics', 30.0, ..., 205.0,
        'Boston University', nan],
       ...,
       ['Tibor Pleiss', 'Utah Jazz', 21.0, ..., 256.0, nan, 2900000.0],
       ['Jeff Withey', 'Utah Jazz', 24.0, ..., 231.0, 'Kansas', 947276.0],
       [nan, nan, nan, ..., nan, nan, nan]], dtype=object)

In [15]:
nba.shape  # tuple of (number of rows, number of columns)

(458, 9)

In [16]:
nba.dtypes  # the dtype of each column, keyed by column name

Name         object
Team         object
Number      float64
Position     object
Age         float64
Height       object
Weight      float64
College      object
Salary      float64
dtype: object

In [17]:
nba.columns  # Index object holding the column labels

Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
       'College', 'Salary'],
      dtype='object')

In [19]:
# .axes bundles both axes of the DataFrame into one list:
# [row index, column index]
nba.axes

[RangeIndex(start=0, stop=458, step=1),
 Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
        'College', 'Salary'],
       dtype='object')]

## Exclusive attributes for Data Frames (only)

In [24]:
# The "dtypes: float64(4), object(5)" line of the summary says how many columns
# have each dtype (4 float64 columns, 5 object columns) -- the same counts that
# get_dtype_counts() reports.
nba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
Name        457 non-null object
Team        457 non-null object
Number      457 non-null float64
Position    457 non-null object
Age         457 non-null float64
Height      457 non-null object
Weight      457 non-null float64
College     373 non-null object
Salary      446 non-null float64
dtypes: float64(4), object(5)
memory usage: 32.3+ KB


In [23]:
# Count how many columns share each dtype.
# DataFrame.get_dtype_counts() was deprecated in pandas 0.25 and removed in 1.0;
# dtypes.value_counts() is the supported replacement and gives the same counts.
nba.dtypes.value_counts()

float64    4
object     5
dtype: int64

## Differences between shared methods (series vs. data frames)

In [27]:
# Load the revenue data, using the "Date" column as the row index.
rev = pd.read_csv(filepath_or_buffer="revenue.csv", index_col="Date")
rev.head(n=3)

Unnamed: 0_level_0,New York,Los Angeles,Miami
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/1/16,985,122,499
1/2/16,738,788,534
1/3/16,14,20,933


In [29]:
# On a Series, .sum() collapses all the values into one scalar.
s = pd.Series(data=[1, 2, 3])
s.sum()

6

In [32]:
rev.sum()  # returns a brand-new Series: one total per column

New York       5475
Los Angeles    5134
Miami          5641
dtype: int64

In [38]:
# Vertical sums are available to both Series and DataFrames;
# horizontal sums are only available to DataFrames.
# To get a total per date (across the columns), pass the axis parameter.
rev.sum(axis=1)  # or rev.sum(axis="columns")

Date
1/1/16     1606
1/2/16     2060
1/3/16      967
1/4/16     2519
1/5/16      438
1/6/16     1935
1/7/16     1234
1/8/16     2313
1/9/16     2623
1/10/16     555
dtype: int64

In [40]:
rev.sum(axis="index")  # or rev.sum(axis=0): one total per column

New York       5475
Los Angeles    5134
Miami          5641
dtype: int64

## Select ONE column from a DataFrame

In [41]:
# Start this section from a freshly loaded frame.
nba = pd.read_csv(filepath_or_buffer="nba.csv")
nba.head(n=3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [43]:
nba.Name  # This returns a SERIES because a single column is extracted from the DF

0                Avery Bradley
1                  Jae Crowder
2                 John Holland
3                  R.J. Hunter
4                Jonas Jerebko
5                 Amir Johnson
6                Jordan Mickey
7                 Kelly Olynyk
8                 Terry Rozier
9                 Marcus Smart
10             Jared Sullinger
11               Isaiah Thomas
12                 Evan Turner
13                 James Young
14                Tyler Zeller
15            Bojan Bogdanovic
16                Markel Brown
17             Wayne Ellington
18     Rondae Hollis-Jefferson
19                Jarrett Jack
20              Sergey Karasev
21             Sean Kilpatrick
22                Shane Larkin
23                 Brook Lopez
24            Chris McCullough
25                 Willie Reed
26             Thomas Robinson
27                  Henry Sims
28                Donald Sloan
29              Thaddeus Young
                ...           
428            Al-Farouq Aminu
429     

In [44]:
nba.Number  # attribute-style access to the Number column (a float64 Series)

0       0.0
1      99.0
2      30.0
3      28.0
4       8.0
5      90.0
6      55.0
7      41.0
8      12.0
9      36.0
10      7.0
11      4.0
12     11.0
13     13.0
14     44.0
15     44.0
16     22.0
17     21.0
18     24.0
19      2.0
20     10.0
21      6.0
22      0.0
23     11.0
24      1.0
25     33.0
26     41.0
27     14.0
28     15.0
29     30.0
       ... 
428     8.0
429     5.0
430    23.0
431    17.0
432     4.0
433     9.0
434    35.0
435    11.0
436     0.0
437     3.0
438    44.0
439    24.0
440     2.0
441    21.0
442    33.0
443     3.0
444    10.0
445    11.0
446    15.0
447    27.0
448    20.0
449     5.0
450     2.0
451    23.0
452    41.0
453     8.0
454    25.0
455    21.0
456    24.0
457     NaN
Name: Number, Length: 458, dtype: float64

In [45]:
nba.Salary  # attribute-style access to the Salary column

0       7730337.0
1       6796117.0
2             NaN
3       1148640.0
4       5000000.0
5      12000000.0
6       1170960.0
7       2165160.0
8       1824360.0
9       3431040.0
10      2569260.0
11      6912869.0
12      3425510.0
13      1749840.0
14      2616975.0
15      3425510.0
16       845059.0
17      1500000.0
18      1335480.0
19      6300000.0
20      1599840.0
21       134215.0
22      1500000.0
23     19689000.0
24      1140240.0
25       947276.0
26       981348.0
27       947276.0
28       947276.0
29     11235955.0
          ...    
428     8042895.0
429      625093.0
430      947276.0
431     6980802.0
432     2894059.0
433     6000000.0
434     5016000.0
435     3075880.0
436     4236287.0
437     2525160.0
438      525093.0
439     1415520.0
440     2854940.0
441     2637720.0
442     4775000.0
443     2658240.0
444     9463484.0
445     3777720.0
446    12000000.0
447     1175880.0
448    15409570.0
449     1348440.0
450     2050000.0
451      981348.0
452     22

In [46]:
# Attribute-style access to three columns; each expression yields a Series.
# The trailing semicolon on the last expression stops the notebook from
# rendering it, so no throwaway variable is needed to silence the cell.
nba.Name
nba.Number
nba.Salary;

In [51]:
# Bracket syntax is the teacher's preferred choice because it also handles
# column names that contain spaces.
# The trailing semicolon on the last expression suppresses the cell's output.
nba["Name"]
nba["Salary"]
nba["Number"];

In [53]:
type(nba["Name"])  # again, extracting one column from a DF results in a SERIES

pandas.core.series.Series

In [55]:
nba["Name"].head(n=10)  # methods can be chained onto the extracted Series

0    Avery Bradley
1      Jae Crowder
2     John Holland
3      R.J. Hunter
4    Jonas Jerebko
5     Amir Johnson
6    Jordan Mickey
7     Kelly Olynyk
8     Terry Rozier
9     Marcus Smart
Name: Name, dtype: object

## Selecting two or more columns from a DataFrame

In [56]:
# Reload the data for the multi-column selection examples.
nba = pd.read_csv(filepath_or_buffer="nba.csv")
nba.head(n=3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [58]:
nba[["Name", "Team"]]  # a list of two or more column names returns a new DataFrame

Unnamed: 0,Name,Team
0,Avery Bradley,Boston Celtics
1,Jae Crowder,Boston Celtics
2,John Holland,Boston Celtics
3,R.J. Hunter,Boston Celtics
4,Jonas Jerebko,Boston Celtics
5,Amir Johnson,Boston Celtics
6,Jordan Mickey,Boston Celtics
7,Kelly Olynyk,Boston Celtics
8,Terry Rozier,Boston Celtics
9,Marcus Smart,Boston Celtics


In [66]:
# The result's column order follows the order of the names in the list.
# Only the cell's last expression is rendered, so the first line is not shown.
nba[["Team", "Name"]].head(n=5)
nba[["Number", "College", "Salary"]].tail(n=10)

Unnamed: 0,Number,College,Salary
448,20.0,Butler,15409570.0
449,5.0,Duke,1348440.0
450,2.0,,2050000.0
451,23.0,Dayton,981348.0
452,41.0,Kentucky,2239800.0
453,8.0,Butler,2433333.0
454,25.0,,900000.0
455,21.0,,2900000.0
456,24.0,Kansas,947276.0
457,,,


In [67]:
# A tidy way to extract multiple columns from a wide frame:
# keep the wanted column names in a list and index the frame with that list.
select = ['Salary', 'Team', 'Name']
nba[select]

Unnamed: 0,Salary,Team,Name
0,7730337.0,Boston Celtics,Avery Bradley
1,6796117.0,Boston Celtics,Jae Crowder
2,,Boston Celtics,John Holland
3,1148640.0,Boston Celtics,R.J. Hunter
4,5000000.0,Boston Celtics,Jonas Jerebko
5,12000000.0,Boston Celtics,Amir Johnson
6,1170960.0,Boston Celtics,Jordan Mickey
7,2165160.0,Boston Celtics,Kelly Olynyk
8,1824360.0,Boston Celtics,Terry Rozier
9,3431040.0,Boston Celtics,Marcus Smart


## Adding new columns to DataFrame

In [68]:
# Reload the data before adding new columns.
nba = pd.read_csv(filepath_or_buffer="nba.csv")
nba.head(n=3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [70]:
nba["Sport"]  # Raises KeyError because the column doesn't exist yet

KeyError: 'Sport'

In [71]:
# Assigning a scalar creates the column and repeats the value on every row:
# every value in the new column will equal "Basketball".
nba["Sport"] = "Basketball"

In [73]:
nba.head(n=5)  # the new "Sport" column is appended at the far right

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Sport
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,Basketball
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,Basketball
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,Basketball
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0,Basketball
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0,Basketball


In [75]:
nba.shape  # the column count grows from 9 to 10 after adding "Sport"

(458, 10)

In [77]:
# Another scalar assignment: every row gets the same league name.
nba["League"] = "National Basketball Association"
nba.head(n=5)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Sport,League
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,Basketball,National Basketball Association
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,Basketball,National Basketball Association
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,Basketball,National Basketball Association
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0,Basketball,National Basketball Association
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0,Basketball,National Basketball Association


In [89]:
# Discard the added columns by reloading the original DataFrame.
nba = pd.read_csv(filepath_or_buffer="nba.csv")
nba.head(n=3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [93]:
# .insert() modifies nba in place, putting the new column at position 3.
# allow_duplicates=True lets the call succeed even when a "Sport" column already
# exists, so re-running this cell adds another copy -- presumably why the
# recorded output shows both "Sport" and "Sport.1".
nba.insert(loc=3, column="Sport", value="Basketball", allow_duplicates=True)
nba

Unnamed: 0,Name,Team,Number,Sport,Sport.1,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,Basketball,Basketball,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,Basketball,Basketball,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,Basketball,Basketball,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,Basketball,Basketball,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,Basketball,Basketball,PF,29.0,6-10,231.0,,5000000.0
5,Amir Johnson,Boston Celtics,90.0,Basketball,Basketball,PF,29.0,6-9,240.0,,12000000.0
6,Jordan Mickey,Boston Celtics,55.0,Basketball,Basketball,PF,21.0,6-8,235.0,LSU,1170960.0
7,Kelly Olynyk,Boston Celtics,41.0,Basketball,Basketball,C,25.0,7-0,238.0,Gonzaga,2165160.0
8,Terry Rozier,Boston Celtics,12.0,Basketball,Basketball,PG,22.0,6-2,190.0,Louisville,1824360.0
9,Marcus Smart,Boston Celtics,36.0,Basketball,Basketball,PG,22.0,6-4,220.0,Oklahoma State,3431040.0


In [97]:
# Insert a constant "League" column at position 7 (duplicates allowed, so a
# re-run of this cell inserts the column again).
nba.insert(loc=7, column="League", value="NBA", allow_duplicates=True)
nba.head(n=5)

Unnamed: 0,Name,Team,Number,Sport,Sport.1,Position,Age,League,League.1,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,Basketball,Basketball,PG,25.0,NBA,NBA,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,Basketball,Basketball,SF,25.0,NBA,NBA,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,Basketball,Basketball,SG,27.0,NBA,NBA,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,Basketball,Basketball,SG,22.0,NBA,NBA,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,Basketball,Basketball,PF,29.0,NBA,NBA,6-10,231.0,,5000000.0


In [98]:
# Without allow_duplicates, inserting an already existing column name raises.
nba.insert(loc=3, column="Dinner", value="Fajitas")

In [100]:
nba.head(n=5)  # "Dinner" now sits at column position 3

Unnamed: 0,Name,Team,Number,Dinner,Sport,Sport.1,Position,Age,League,League.1,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,Fajitas,Basketball,Basketball,PG,25.0,NBA,NBA,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,Fajitas,Basketball,Basketball,SF,25.0,NBA,NBA,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,Fajitas,Basketball,Basketball,SG,27.0,NBA,NBA,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,Fajitas,Basketball,Basketball,SG,22.0,NBA,NBA,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,Fajitas,Basketball,Basketball,PF,29.0,NBA,NBA,6-10,231.0,,5000000.0


## Broadcasting Operations
Some methods (.sort_values()) apply to entire series. Others (.apply()) only apply to each value in a series. Like a broadcasting tower. The command operates on the values of the series, rather than acting on the series as a whole.

In [101]:
# Reload the data for the broadcasting examples.
nba = pd.read_csv(filepath_or_buffer="nba.csv")
nba.head(n=5)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


### Examples of methods that apply one by one
**Cool stuff!** .add(), .sub(), .mul(), .div(), +, -, *

In [111]:
# Each operation below is applied to every value of the Series.
# Only the final assignment has a lasting effect; the other lines are
# expressions whose results are not stored.
nba["Age"].add(5)  # works even when the column contains NaN values
nba["Age"] + 5  # operator form of the line above

nba["Salary"].sub(5000000)
nba["Salary"] - 5000000

nba["Weight"].mul(0.453592)
nba["Weight in Kilograms"] = nba["Weight"].mul(0.453592)

In [112]:
nba.head(n=5)  # shows the new "Weight in Kilograms" column

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Weight in Kilograms
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,81.64656
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,106.59412
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,92.98636
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0,83.91452
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0,104.779752


In [115]:
# Division is broadcast the same way; store the result as a new column.
nba["Salary"] / 1000000
nba["Salary in Millions"] = nba["Salary"].div(1000000)

In [116]:
nba.head(n=5)  # shows the new "Salary in Millions" column

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Weight in Kilograms,Salary in Millions
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,81.64656,7.730337
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,106.59412,6.796117
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,92.98636,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0,83.91452,1.14864
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0,104.779752,5.0


## A review of the .value_counts() method
Can only be used on a SERIES (in the pandas version used here). A great way to count how often each unique value occurs in a single column (or Series).

In [117]:
# Reload the data for the .value_counts() review.
nba = pd.read_csv(filepath_or_buffer="nba.csv")
nba.head(n=3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [122]:
# .value_counts() lists the most frequent value first, so .head(1) is the
# most common entry and .tail(1) the least common one.
# Only the last expression (the Salary counts) is rendered by the notebook.
nba["Team"].value_counts()
nba["Position"].value_counts().head(n=1)  # the most popular position in the dataset
nba["Weight"].value_counts().tail(n=1)
nba["Salary"].value_counts()

947276.0      31
845059.0      18
525093.0      13
981348.0       6
1100602.0      5
16407500.0     5
5000000.0      5
12000000.0     5
8000000.0      5
4000000.0      5
3000000.0      4
7000000.0      4
2814000.0      4
1000000.0      4
19689000.0     4
200600.0       3
8500000.0      3
2500000.0      3
1015421.0      3
2854940.0      3
13500000.0     3
5543725.0      3
2288205.0      2
1270964.0      2
2900000.0      2
1007026.0      2
111444.0       2
13000000.0     2
1500000.0      2
1842000.0      2
              ..
2239800.0      1
1474440.0      1
19688000.0     1
7900000.0      1
2008748.0      1
13800000.0     1
2841960.0      1
1404600.0      1
1584480.0      1
273038.0       1
9213483.0      1
3272091.0      1
3075880.0      1
2250000.0      1
4626960.0      1
1304520.0      1
12100000.0     1
7500000.0      1
295327.0       1
2836186.0      1
6486486.0      1
5016000.0      1
3333333.0      1
1824360.0      1
8042895.0      1
1242720.0      1
2489530.0      1
5103120.0     

In [131]:
nba["Name"][1]  # the value stored at index label 1

'Jae Crowder'

In [133]:
# Gotcha: `in` on a Series tests membership among the index labels, not the
# values, which is why this is False even though the name is stored at label 1.
# To test the values instead, use: 'Jae Crowder' in nba["Name"].values
'Jae Crowder' in nba["Name"]

False

In [135]:
nba["Name"][:10]  # slicing returns the first ten entries as a Series

0    Avery Bradley
1      Jae Crowder
2     John Holland
3      R.J. Hunter
4    Jonas Jerebko
5     Amir Johnson
6    Jordan Mickey
7     Kelly Olynyk
8     Terry Rozier
9     Marcus Smart
Name: Name, dtype: object

## Drop rows with null values
Use the .dropna() method. Its main parameters are how= (e.g. how = "all"), inplace=, axis= and subset=[].

In [136]:
# Reload the data for the .dropna() examples.
nba = pd.read_csv(filepath_or_buffer="nba.csv")
nba.head(n=3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [137]:
nba.tail(n=3)  # the final row (457) is entirely null

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0
457,,,,,,,,,


In [142]:
# how="all" removes a row only when ALL of its values are null.
# Without inplace=True this returns a new frame and leaves nba untouched.
nba.dropna(how="all").head(n=3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [144]:
# Same drop, but applied to nba itself (nothing is returned).
nba.dropna(inplace=True, how="all")

In [145]:
nba.tail(n=3)  # the all-null row 457 is gone

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [148]:
# With axis="columns" (same as axis=1), every COLUMN containing ANY null is removed.
nba.dropna(axis="columns").head(n=5)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0


In [153]:
# To look for nulls only in specific columns, pass subset= a list of column names.
# Ex. subset=["Salary"] drops a row only if its Salary value is null.
# Here a row is dropped when either Salary or College is null.
nba.dropna(subset=["Salary", "College"]).head(n=5)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
6,Jordan Mickey,Boston Celtics,55.0,PF,21.0,6-8,235.0,LSU,1170960.0
7,Kelly Olynyk,Boston Celtics,41.0,C,25.0,7-0,238.0,Gonzaga,2165160.0


## Fill in Null values with .fillna() Method

In [154]:
# Reload the data (nulls included) for the .fillna() examples.
nba = pd.read_csv(filepath_or_buffer="nba.csv")
nba.head(n=5)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [156]:
# A frame-wide fill puts the same value into every column, so text columns such
# as College end up holding 0 as well -- not ideal for mixed data types.
# It is usually better to call .fillna() on one SERIES at a time.
nba.fillna(value=0)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,0,5000000.0
5,Amir Johnson,Boston Celtics,90.0,PF,29.0,6-9,240.0,0,12000000.0
6,Jordan Mickey,Boston Celtics,55.0,PF,21.0,6-8,235.0,LSU,1170960.0
7,Kelly Olynyk,Boston Celtics,41.0,C,25.0,7-0,238.0,Gonzaga,2165160.0
8,Terry Rozier,Boston Celtics,12.0,PG,22.0,6-2,190.0,Louisville,1824360.0
9,Marcus Smart,Boston Celtics,36.0,PG,22.0,6-4,220.0,Oklahoma State,3431040.0


In [157]:
# Fill missing salaries with 0 and assign the result back to the column.
# nba["Salary"].fillna(0, inplace=True) is chained assignment on an intermediate
# Series: newer pandas versions warn about it, and under Copy-on-Write it no
# longer updates nba at all.
nba["Salary"] = nba["Salary"].fillna(0)

In [158]:
nba.head(n=5)  # row 2 now shows a Salary of 0.0 instead of a null

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [161]:
# Replace missing colleges with a placeholder string. The result is assigned
# back because an inplace fill on nba["College"] acts on an intermediate Series
# and is not reliable across pandas versions.
nba["College"] = nba["College"].fillna("No College")
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,No College,5000000.0


## The .astype() Method
Converts the data type of a Series, but a conversion to an integer type can't be used while the Series still has NULL values.
Note: .astype() doesn't have an inplace parameter, so the result has to be reassigned instead.

In [163]:
# NULL values have to be cleared before using .astype().
# Drop the all-null row, then fill the remaining gaps column by column. Each
# fill is assigned back to its column: an inplace fill on nba[col] is chained
# assignment, which newer pandas warns about and which does not update nba
# under Copy-on-Write.
nba = pd.read_csv("nba.csv").dropna(how = "all")
nba["Salary"] = nba["Salary"].fillna(0)
nba["College"] = nba["College"].fillna("None")
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [166]:
# Only .info() produces output here; the bare .dtypes expression above it is
# not the cell's last expression, so it is not rendered.
nba.dtypes
nba.info(verbose=None)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
Name        457 non-null object
Team        457 non-null object
Number      457 non-null float64
Position    457 non-null object
Age         457 non-null float64
Height      457 non-null object
Weight      457 non-null float64
College     457 non-null object
Salary      457 non-null float64
dtypes: float64(4), object(5)
memory usage: 35.7+ KB


In [170]:
# .astype() returns a new Series, so assign it back to the column.
nba["Salary"] = nba["Salary"].astype("int")
nba["Salary"].head(n=5)

0    7730337
1    6796117
2          0
3    1148640
4    5000000
Name: Salary, dtype: int64

In [172]:
# Convert Age from float to integer.
nba["Age"] = nba["Age"].astype("int")
nba["Age"].head(n=3)

0    25
1    25
2    27
Name: Age, dtype: int64

In [175]:
# Convert the jersey Number from float to integer.
nba["Number"] = nba["Number"].astype("int")
nba["Number"].head(n=5)

0     0
1    99
2    30
3    28
4     8
Name: Number, dtype: int64

In [184]:
# NOTE(review): the recorded output lists Team and Position as `category`, but
# the cells that convert them only appear further down in this notebook. The
# output reflects out-of-order execution; a top-to-bottom run would still show
# `object` for those two columns at this point.
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
Name        457 non-null object
Team        457 non-null category
Number      457 non-null int64
Position    457 non-null category
Age         457 non-null int64
Height      457 non-null object
Weight      457 non-null float64
College     457 non-null object
Salary      457 non-null int64
dtypes: category(2), float64(1), int64(3), object(3)
memory usage: 51.1+ KB


**Unique Pandas-only (not in Python) dtype: Categories**
Really ideal when you have a small number of unique values in a data frame. Ex. Employees with 1 million rows with a Gender column or Month column (limited values - male/female; jan/feb/mar...). Categories are good at reducing the memory of the DF.

## .nunique() returns the number of distinct values (essentially a COUNTD)

In [177]:
nba["Position"].nunique()  # number of distinct values in the column

5

In [179]:
# Few distinct values, so store Position as a category.
nba["Position"] = nba["Position"].astype("category")

In [181]:
nba["Team"].nunique()  # number of distinct teams in the column

30

In [185]:
# Team has few distinct values as well, so convert it to a category too.
nba["Team"] = nba["Team"].astype("category")
nba.head(n=5)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0,PG,25,6-2,180.0,Texas,7730337
1,Jae Crowder,Boston Celtics,99,SF,25,6-6,235.0,Marquette,6796117
2,John Holland,Boston Celtics,30,SG,27,6-5,205.0,Boston University,0
3,R.J. Hunter,Boston Celtics,28,SG,22,6-5,185.0,Georgia State,1148640
4,Jonas Jerebko,Boston Celtics,8,PF,29,6-10,231.0,,5000000


## Sort DataFrame with .sort_values method - Part 1

In [186]:
# Reload the data for the sorting examples.
nba = pd.read_csv(filepath_or_buffer="nba.csv")
nba.head(n=3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [191]:
# Each call returns a new, sorted frame (nba itself is not changed).
# Only the last one is rendered: the three highest salaries.
nba.sort_values(by="Name", ascending=False)
nba.sort_values(by="Age", ascending=False)
nba.sort_values(by="Salary", ascending=False).head(n=3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
109,Kobe Bryant,Los Angeles Lakers,24.0,SF,37.0,6-6,212.0,,25000000.0
169,LeBron James,Cleveland Cavaliers,23.0,SF,31.0,6-8,250.0,,22970500.0
33,Carmelo Anthony,New York Knicks,7.0,SF,32.0,6-8,240.0,Syracuse,22875000.0


In [198]:
# When the sort column contains nulls (NaN), the na_position parameter
# decides whether they are placed at the beginning or at the end.
nba.sort_values(by="Salary", na_position="first").head(n=5)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
46,Elton Brand,Philadelphia 76ers,42.0,PF,37.0,6-9,254.0,Duke,
171,Dahntay Jones,Cleveland Cavaliers,30.0,SG,35.0,6-6,225.0,Duke,
264,Jordan Farmar,Memphis Grizzlies,4.0,PG,29.0,6-2,180.0,UCLA,
269,Ray McCallum,Memphis Grizzlies,5.0,PG,24.0,6-3,190.0,Detroit,


In [200]:
# Descending sort with the nulls first, so the tail holds the lowest salaries.
nba.sort_values(by="Salary", na_position="first", ascending=False).tail(n=5)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
175,Jordan McRae,Cleveland Cavaliers,12.0,SG,25.0,6-5,179.0,Tennessee,111196.0
135,Alan Williams,Phoenix Suns,15.0,C,23.0,6-8,260.0,UC Santa Barbara,83397.0
291,Orlando Johnson,New Orleans Pelicans,0.0,SG,27.0,6-5,220.0,UC Santa Barbara,55722.0
130,Phil Pressey,Phoenix Suns,25.0,PG,25.0,5-11,175.0,Missouri,55722.0
32,Thanasis Antetokounmpo,New York Knicks,43.0,SF,23.0,6-7,205.0,,30888.0


## Sort a DataFrame with MULTIPLE COLUMNS using .sort_values() - Part 2
The OPTIMAL data type to use in Pandas when you want to specify more than one thing is the Python LIST

In [None]:
# Reload the data for the multi-column sorting examples.
nba = pd.read_csv(filepath_or_buffer="nba.csv")
nba.head(n=3)

In [203]:
# Sorting by two columns in the same direction: pass the column names as a
# list and give a single ascending flag that applies to both.
nba.sort_values(["Team", "Name"], ascending=False).head(n=5)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
379,Ramon Sessions,Washington Wizards,7.0,PG,30.0,6-3,190.0,Nevada,2170465.0
378,Otto Porter Jr.,Washington Wizards,22.0,SF,23.0,6-8,198.0,Georgetown,4662960.0
375,Nene Hilario,Washington Wizards,42.0,C,33.0,6-11,250.0,,13000000.0
376,Markieff Morris,Washington Wizards,5.0,PF,26.0,6-10,245.0,Kansas,8000000.0
381,Marcus Thornton,Washington Wizards,15.0,SF,29.0,6-4,205.0,LSU,200600.0


In [206]:
# To give each sort column its own direction, pass ascending= a list of
# booleans, one per column in `by` (here: Team ascending, Name descending).
# inplace=True reorders nba itself.
nba.sort_values(["Team", "Name"], inplace=True, ascending=[True, False])
nba.head(n=5)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
322,Walter Tavares,Atlanta Hawks,22.0,C,24.0,7-3,260.0,,1000000.0
310,Tim Hardaway Jr.,Atlanta Hawks,10.0,SG,24.0,6-6,205.0,Michigan,1304520.0
321,Tiago Splitter,Atlanta Hawks,11.0,C,31.0,6-11,245.0,,9756250.0
320,Thabo Sefolosha,Atlanta Hawks,25.0,SF,32.0,6-7,220.0,,4000000.0
315,Paul Millsap,Atlanta Hawks,4.0,PF,31.0,6-8,246.0,Louisiana Tech,18671659.0


## Sort DataFrame with .sort_index() Method
Can help return data to original order/structure

In [207]:
# Reload the data for the .sort_index() examples.
nba = pd.read_csv(filepath_or_buffer="nba.csv")
nba.head(n=3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [213]:
# Scramble the row order on purpose by sorting on three columns in place.
# The last three index labels are now 68, 1 and 457.
nba.sort_values(by=["Number", "Salary", "Name"], inplace=True)
nba.tail(n=3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
68,Lucas Nogueira,Toronto Raptors,92.0,C,23.0,7-0,220.0,,1842000.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
457,,,,,,,,,


In [221]:
# .sort_index() orders the rows by their index labels. With ascending = False
# the labels run from highest to lowest (457 first), i.e. the REVERSE of the
# original order; the default ascending = True restores the original order.
nba.sort_index(ascending = False, inplace = True)
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
457,,,,,,,,,
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0


In [224]:
nba.shape  # not rendered: only the cell's last expression is displayed
nba.columns  # the column labels in their current order

Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
       'College', 'Salary'],
      dtype='object')

In [238]:
# axis="columns" (same as axis=1) sorts the column labels instead of the row labels.
nba.sort_index(axis="columns", ascending=True).head(n=5)

Unnamed: 0,Age,College,Height,Name,Number,Position,Salary,Team,Weight
457,,,,,,,,,
456,26.0,Kansas,7-0,Jeff Withey,24.0,C,947276.0,Utah Jazz,231.0
455,26.0,,7-3,Tibor Pleiss,21.0,C,2900000.0,Utah Jazz,256.0
454,24.0,,6-1,Raul Neto,25.0,PG,900000.0,Utah Jazz,179.0
453,26.0,Butler,6-3,Shelvin Mack,8.0,PG,2433333.0,Utah Jazz,203.0


In [239]:
# axis="index" (same as axis=0) sorts the row labels; ascending order gives 0, 1, 2, ...
nba.sort_index(axis="index", ascending=True).head(n=5)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [246]:
# NOTE(review): assigning to .columns only RELABELS the existing columns by
# position; it does not move any data. In the recorded output the team names
# now sit under "Position", the positions under "Team", and the College/Salary
# values are swapped in the same way. To reorder columns together with their
# data, select them instead (nba[[...list of names...]]) -- confirm the intent.
nba.columns = ["Name", "Position", "Number", "Team", "Age", "Height",
              "Weight", "Salary", "College"]
nba.head()

Unnamed: 0,Name,Position,Number,Team,Age,Height,Weight,Salary,College
457,,,,,,,,,
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0


In [248]:
nba.sort_index()  # returns a sorted copy that is discarded here; nba is unchanged
nba.columns[3:]  # the column labels from position 3 onward

Index(['Team', 'Age', 'Height', 'Weight', 'Salary', 'College'], dtype='object')

## Rank values with the .rank() Method
*Note:* Gotta clear all null values first. .rank() is called on a single SERIES. By default, .rank(ascending=True) will rank in reverse order (e.g., the lowest salary will rank #1).

In [249]:
# Reload the data for the .rank() examples.
nba = pd.read_csv(filepath_or_buffer="nba.csv")
nba.head(n=5)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [255]:
# Load the data, then drop only the rows in which every column is null.
nba = pd.read_csv("nba.csv")
nba = nba.dropna(how="all")

# Missing salaries are replaced with 0. fillna() on its own leaves the column
# as floats, so inplace=True is not enough here: the filled values are cast to
# integers and the result is assigned back to the column before ranking.
nba["Salary"] = (
    nba["Salary"]
    .fillna(0)
    .astype("int")
)
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0


In [263]:
# Rank the salaries, highest first. A bare nba["Salary"].rank() has two
# problems:
#   1. It ranks in ascending order, so the 0 salary at index 2 would be
#      ranked 6th; pass ascending=False to put the highest salary first.
#   2. It returns floats; cast the ranks to "int".
# Only the last expression of a cell is displayed, so the cell evaluates just
# the final form instead of also computing and discarding the two
# intermediate versions.
nba["Salary"].rank(ascending=False).astype("int").head()

0     97
1    110
2    452
3    322
4    147
Name: Salary, dtype: int64

In [266]:
# Store the descending, integer-cast salary ranks in a new "Salary Rank"
# column.
# NOTE: rank() defaults to method="average", so tied salaries share an
# averaged rank, and the int cast then truncates any fractional part.
nba["Salary Rank"] = (
    nba["Salary"]
    .rank(ascending=False)
    .astype("int")
)
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Salary Rank
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337,97
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117,110
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0,452
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640,322
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000,147


In [268]:
# Order the rows from the highest salary to the lowest.
nba.sort_values("Salary", ascending=False)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Salary Rank
109,Kobe Bryant,Los Angeles Lakers,24.0,SF,37.0,6-6,212.0,,25000000,1
169,LeBron James,Cleveland Cavaliers,23.0,SF,31.0,6-8,250.0,,22970500,2
33,Carmelo Anthony,New York Knicks,7.0,SF,32.0,6-8,240.0,Syracuse,22875000,3
251,Dwight Howard,Houston Rockets,12.0,C,30.0,6-11,265.0,,22359364,4
339,Chris Bosh,Miami Heat,1.0,PF,32.0,6-11,235.0,Georgia Tech,22192730,5
100,Chris Paul,Los Angeles Clippers,3.0,PG,31.0,6-0,175.0,Wake Forest,21468695,6
414,Kevin Durant,Oklahoma City Thunder,35.0,SF,27.0,6-9,240.0,Texas,20158622,7
164,Derrick Rose,Chicago Bulls,1.0,PG,27.0,6-3,190.0,Memphis,20093064,8
349,Dwyane Wade,Miami Heat,3.0,SG,34.0,6-4,220.0,Marquette,20000000,9
174,Kevin Love,Cleveland Cavaliers,0.0,PF,27.0,6-10,251.0,UCLA,19689000,11
