## Pandas DataFrames

- Two-dimensional

- Size is described by both rows and columns now, not just rows, so locating a value takes two points of reference: a row label and a column label

- 

In [1]:
import pandas as pd

In [2]:
# Load nba.csv into a DataFrame and preview the first rows
# (head() shows five rows when no count is given).
nba = pd.read_csv("nba.csv")
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [3]:
# Count the missing values in each column: isnull() flags every null cell
# as True, and sum() adds those flags up column by column.
nba.isnull().sum()

Name         1
Team         1
Number       1
Position     1
Age          1
Height       1
Weight       1
College     85
Salary      12
dtype: int64

In [4]:
# Summarise the frame: index range, per-column non-null counts and dtypes,
# and approximate memory usage.
nba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
Name        457 non-null object
Team        457 non-null object
Number      457 non-null float64
Position    457 non-null object
Age         457 non-null float64
Height      457 non-null object
Weight      457 non-null float64
College     373 non-null object
Salary      446 non-null float64
dtypes: float64(4), object(5)
memory usage: 32.3+ KB


### Shared Methods and Attributes

In [5]:
# Methods are instructions to an object to do something and are called with
# parentheses; .index is an attribute (no parentheses) that exposes the row
# labels of the DataFrame.
nba.index

RangeIndex(start=0, stop=458, step=1)

In [6]:
# .values exposes the underlying data as a NumPy array of rows; the array is
# object-typed here because the columns mix strings and floats.
nba.values

array([['Avery Bradley', 'Boston Celtics', 0.0, ..., 180.0, 'Texas',
        7730337.0],
       ['Jae Crowder', 'Boston Celtics', 99.0, ..., 235.0, 'Marquette',
        6796117.0],
       ['John Holland', 'Boston Celtics', 30.0, ..., 205.0,
        'Boston University', nan],
       ...,
       ['Tibor Pleiss', 'Utah Jazz', 21.0, ..., 256.0, nan, 2900000.0],
       ['Jeff Withey', 'Utah Jazz', 24.0, ..., 231.0, 'Kansas', 947276.0],
       [nan, nan, nan, ..., nan, nan, nan]], dtype=object)

In [7]:
# .shape is a (rows, columns) tuple.
nba.shape

(458, 9)

In [8]:
# Remember that .dtypes returns a new Series: the index holds the column
# names and the values are each column's data type.
nba.dtypes

Name         object
Team         object
Number      float64
Position     object
Age         float64
Height       object
Weight      float64
College      object
Salary      float64
dtype: object

### Exclusive Attributes

In [9]:
# .columns is the Index of column labels (a DataFrame-only attribute).
nba.columns

Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
       'College', 'Salary'],
      dtype='object')

In [10]:
# .axes is a list holding the row index followed by the column index.
nba.axes

[RangeIndex(start=0, stop=458, step=1),
 Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
        'College', 'Salary'],
       dtype='object')]

### Exclusive Methods

In [11]:
# info() makes it easy to see whether any column contains nulls: compare each
# column's non-null count with the total number of entries in the index.
nba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
Name        457 non-null object
Team        457 non-null object
Number      457 non-null float64
Position    457 non-null object
Age         457 non-null float64
Height      457 non-null object
Weight      457 non-null float64
College     373 non-null object
Salary      446 non-null float64
dtypes: float64(4), object(5)
memory usage: 32.3+ KB


In [12]:
# Tally how many columns share each dtype.
# DataFrame.get_dtype_counts() was deprecated in pandas 0.25 and removed in
# 1.0; counting the values of .dtypes is the documented replacement and works
# on both old and new versions (the rows are ordered by count, largest first).
nba.dtypes.value_counts()

float64    4
object     5
dtype: int64

### Differences between Shared Methods

In [13]:
# Read the revenue table, using its "Date" column as the row labels,
# then preview the first three rows.
rev = pd.read_csv("revenue.csv", index_col="Date")
rev.head(n=3)

Unnamed: 0_level_0,New York,Los Angeles,Miami
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/1/16,985,122,499
1/2/16,738,788,534
1/3/16,14,20,933


In [20]:
# The row labels now come from the Date column; they are plain strings
# (object dtype), not parsed datetimes.
rev.index

Index(['1/1/16', '1/2/16', '1/3/16', '1/4/16', '1/5/16', '1/6/16', '1/7/16',
       '1/8/16', '1/9/16', '1/10/16'],
      dtype='object', name='Date')

In [14]:
# Series.sum() collapses the whole Series into a single scalar total.
s = pd.Series(data=[1, 2, 3])
s.sum()

6

In [17]:
# Calling sum() on a DataFrame returns a brand new Series: the index labels
# are the column labels and the values are the totals of each column.
# The three calls are equivalent (axis=0 / axis="index" is the default);
# only the last expression in the cell is displayed.
rev.sum()
rev.sum(axis=0)
rev.sum(axis="index")

New York       5475
Los Angeles    5134
Miami          5641
dtype: int64

In [21]:
# To total the numbers horizontally by date, sum along the columns axis:
# the values in each row are added up across the columns.
# axis=1 and axis="columns" are equivalent; only the last result is displayed.
rev.sum(axis=1)
rev.sum(axis="columns")

Date
1/1/16     1606
1/2/16     2060
1/3/16      967
1/4/16     2519
1/5/16      438
1/6/16     1935
1/7/16     1234
1/8/16     2313
1/9/16     2623
1/10/16     555
dtype: int64

### Select One Column from a DataFrame

In [11]:
# Reload nba.csv and preview the first three rows.
nba = pd.read_csv("nba.csv")
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [23]:
# Extracting a single column from a DataFrame returns a Series.
# Attribute (dot) access only works when the column name has no spaces or
# symbols, i.e. when it is a valid Python identifier.
# On a Series, the same dot syntax returns the value stored under that
# key/index label.
name = nba.Name
name.head()

0    Avery Bradley
1      Jae Crowder
2     John Holland
3      R.J. Hunter
4    Jonas Jerebko
Name: Name, dtype: object

In [27]:
# Pull the Number column out as a Series and preview three values.
number = nba.Number
number.head(3)

0     0.0
1    99.0
2    30.0
Name: Number, dtype: float64

In [28]:
# Pull the Salary column out as a Series; missing salaries show up as NaN.
salary = nba.Salary
salary.head(3)

0    7730337.0
1    6796117.0
2          NaN
Name: Salary, dtype: float64

In [29]:
# Bracket syntax extracts a column by label and works for any column name,
# including names that contain spaces or symbols.
# The result is a Series, so Series methods can be chained directly onto it.
nba["Name"].sort_values().head(3)

152      Aaron Brooks
356      Aaron Gordon
328    Aaron Harrison
Name: Name, dtype: object

### Select Two or More Columns from a DataFrame

In [30]:
# Passing a list of labels inside the brackets returns a new DataFrame.
# The columns appear in the order in which they are listed.
nba[["Name", "Team"]].head(3)

Unnamed: 0,Name,Team
0,Avery Bradley,Boston Celtics
1,Jae Crowder,Boston Celtics
2,John Holland,Boston Celtics


In [31]:
# Any combination of columns can be selected the same way.
nba[["Number", "College"]].head(3)

Unnamed: 0,Number,College
0,0.0,Texas
1,99.0,Marquette
2,30.0,Boston University


In [32]:
# The list of labels can be stored in a variable first; the column order in
# the result follows the list, not the order in the file.
select = ["Salary", "Team", "Name"]
nba[select].head(3)

Unnamed: 0,Salary,Team,Name
0,7730337.0,Boston Celtics,Avery Bradley
1,6796117.0,Boston Celtics,Jae Crowder
2,,Boston Celtics,John Holland


### The .astype() Method

In [33]:
# Null values have to be removed before a float column can be cast to int
# with .astype(). dropna(how="all") removes only the rows in which every
# column is missing.
nba = pd.read_csv("nba.csv").dropna(how="all")
# Assign the filled columns back instead of calling fillna(inplace=True) on
# nba["..."]: that form is chained assignment on an intermediate Series, which
# recent pandas warns about and which leaves nba unchanged under Copy-on-Write.
nba["Salary"] = nba["Salary"].fillna(0)
nba["College"] = nba["College"].fillna("None")
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0.0


In [34]:
# Every column now reports the same non-null count, because the all-null row
# was dropped and the gaps in Salary and College were filled.
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
Name        457 non-null object
Team        457 non-null object
Number      457 non-null float64
Position    457 non-null object
Age         457 non-null float64
Height      457 non-null object
Weight      457 non-null float64
College     457 non-null object
Salary      457 non-null float64
dtypes: float64(4), object(5)
memory usage: 35.7+ KB


In [35]:
# astype() returns a new Series; assigning it back replaces the float Salary
# column with its integer version.
nba["Salary"] = nba["Salary"].astype("int")
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0


In [36]:
# Convert Number and Age to integers the same way.
nba["Number"] = nba["Number"].astype("int")
nba["Age"] = nba["Age"].astype("int")

In [37]:
# A category is an ideal data type when a column holds only a small number of
# unique values, like gender (M or F).
# nunique() shows there are only 5 unique values in the Position column.
nba["Position"].nunique()

5

In [38]:
# This saves memory because only the distinct values (five for Position) are
# saved in memory, and each row of the DataFrame just points to one of them.
nba["Position"] = nba["Position"].astype("category")
nba["Team"] = nba["Team"].astype("category")

### Sort a DataFrame with the .sort_values() Method

In [39]:
# Preview the frame before sorting it.
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0,PG,25,6-2,180.0,Texas,7730337
1,Jae Crowder,Boston Celtics,99,SF,25,6-6,235.0,Marquette,6796117
2,John Holland,Boston Celtics,30,SG,27,6-5,205.0,Boston University,0


In [40]:
# Sorting a DataFrame requires the name of the column to sort by.
# Signature with its defaults, kept for reference:
# nba.sort_values(
#     by,
#     axis=0,
#     ascending=True,
#     inplace=False,
#     kind='quicksort',
#     na_position='last',
# )
# ascending=False puts the names in reverse alphabetical order.
nba.sort_values("Name", ascending=False).head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
237,Zaza Pachulia,Dallas Mavericks,27,C,32,6-11,275.0,,5200000
271,Zach Randolph,Memphis Grizzlies,50,PF,34,6-9,260.0,Michigan State,9638555
402,Zach LaVine,Minnesota Timberwolves,8,PG,21,6-5,189.0,UCLA,2148360


In [41]:
# Returns the frame sorted by Age, oldest first; the result is neither stored
# nor displayed because it is not the last expression in the cell.
nba.sort_values("Age", ascending=False)
# Keep the salary ordering by reassigning the sorted copy rather than sorting
# with inplace=True, so the existing object is not mutated in place.
nba = nba.sort_values("Salary", ascending=False)
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
109,Kobe Bryant,Los Angeles Lakers,24,SF,37,6-6,212.0,,25000000
169,LeBron James,Cleveland Cavaliers,23,SF,31,6-8,250.0,,22970500
33,Carmelo Anthony,New York Knicks,7,SF,32,6-8,240.0,Syracuse,22875000


In [42]:
# To sort by multiple columns, feed sort_values() a list.
# This sorts by Team alphabetically and then by Name alphabetically within
# each Team (the result is not displayed: it is not the last expression).
nba.sort_values(["Team", "Name"])
# To sort one column ascending and the other descending, pass a list of
# booleans to ascending, one per column.
nba.sort_values(["Team", "Name"], ascending=[True, False]).head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
322,Walter Tavares,Atlanta Hawks,22,C,24,7-3,260.0,,1000000
310,Tim Hardaway Jr.,Atlanta Hawks,10,SG,24,6-6,205.0,Michigan,1304520
321,Tiago Splitter,Atlanta Hawks,11,C,31,6-11,245.0,,9756250


In [45]:
# sort_index() method: reload the CSV to start from a freshly read frame.
nba = pd.read_csv("nba.csv")
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [46]:
# Sort by Number, then Salary, then Name, all in descending order. Rows whose
# sort key is missing are placed last (na_position defaults to 'last').
# The sorted copy is reassigned instead of sorting with inplace=True, so the
# freshly loaded object is not mutated in place.
nba = nba.sort_values(["Number", "Salary", "Name"], ascending=False)
nba.tail(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
248,Andrew Goudelock,Houston Rockets,0.0,PG,27.0,6-3,200.0,Charleston,200600.0
291,Orlando Johnson,New Orleans Pelicans,0.0,SG,27.0,6-5,220.0,UC Santa Barbara,55722.0
457,,,,,,,,,


In [48]:
# Return the DataFrame to its original row order by sorting on the index.
# NOTE(review): the result is only displayed, not assigned, so nba itself
# keeps the sorted order from the previous cell - confirm that is intended.
nba.sort_index()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
5,Amir Johnson,Boston Celtics,90.0,PF,29.0,6-9,240.0,,12000000.0
6,Jordan Mickey,Boston Celtics,55.0,PF,21.0,6-8,235.0,LSU,1170960.0
7,Kelly Olynyk,Boston Celtics,41.0,C,25.0,7-0,238.0,Gonzaga,2165160.0
8,Terry Rozier,Boston Celtics,12.0,PG,22.0,6-2,190.0,Louisville,1824360.0
9,Marcus Smart,Boston Celtics,36.0,PG,22.0,6-4,220.0,Oklahoma State,3431040.0


### Rank Values with the .rank() Method

In [50]:
# Rank on a single Series.
# Nulls must be taken care of first: drop the all-null row, then fill the
# missing salaries with 0 so the column can be cast to int.
nba = pd.read_csv("nba.csv").dropna(how="all")
nba["Salary"] = nba["Salary"].fillna(0).astype("int")
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0


In [86]:
# This ranks each salary relative to the entire Salary Series.
# ascending=False gives the highest salaries the smallest ranks.
# astype() converts the float ranks returned by rank() to integers.
# NOTE(review): rank() defaults to method="average", so an even-sized group of
# tied salaries gets a fractional rank (e.g. 2.5) that astype("int") truncates;
# pass method="min" or "dense" if whole-number ranks are intended - confirm.
nba["Salary Rank"] = nba["Salary"].rank(ascending=False).astype("int")
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Salary Rank
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337,97
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117,110
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0,452


In [89]:
# Sorting by salary shows the rank is working correctly: the highest salary
# carries rank 1.
# Identical salaries will have the same rank.
nba.sort_values(by="Salary", ascending=False).head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Salary Rank
109,Kobe Bryant,Los Angeles Lakers,24.0,SF,37.0,6-6,212.0,,25000000,1
169,LeBron James,Cleveland Cavaliers,23.0,SF,31.0,6-8,250.0,,22970500,2
33,Carmelo Anthony,New York Knicks,7.0,SF,32.0,6-8,240.0,Syracuse,22875000,3
