# DataFrames Introduction

In [1]:
import os
import pandas as pd
import jupyter_black

jupyter_black.load()

# Notebooks do not define __file__, so the data folder is located relative to the
# working directory. The previous dirname(abspath("__file__")) resolved a dummy
# file name against the working directory and stripped it again, which yields
# the same path; asking for the working directory directly states that intent.
current_dir = os.getcwd()

### Set file path to datasets

In [2]:
# Both CSV files are looked up in the "data" folder under current_dir.
nba_path, revenue_path = (
    os.path.join(current_dir, "data", csv_name)
    for csv_name in ("nba.csv", "revenue.csv")
)

### Methods and attributes shared between Series and DataFrames

In [3]:
# A small Series and the NBA DataFrame, used to compare the two APIs side by side.
s = pd.Series(range(1, 6))
nba = pd.read_csv(nba_path)

# First two rows of each object.
s.head(2)
nba.head(2)

# Row labels.
s.index
nba.index

# Underlying values.
s.values
nba.values

# Shape of each object.
s.shape
nba.shape

# A Series has one dtype; a DataFrame exposes `dtypes`, which returns a Series.
s.dtype
nba.dtypes

# `hasnans` exists on a Series only; nba.hasnans does not exist.
s.hasnans

# `columns` exists on a DataFrame only; s.columns does not exist.
nba.columns

# Axes of each object.
s.axes
nba.axes

# info() prints a summary of each object.
s.info()
nba.info()

<class 'pandas.core.series.Series'>
RangeIndex: 5 entries, 0 to 4
Series name: None
Non-Null Count  Dtype
--------------  -----
5 non-null      int64
dtypes: int64(1)
memory usage: 168.0 bytes
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      457 non-null    object 
 1   Team      457 non-null    object 
 2   Number    457 non-null    float64
 3   Position  457 non-null    object 
 4   Age       457 non-null    float64
 5   Height    457 non-null    object 
 6   Weight    457 non-null    float64
 7   College   373 non-null    object 
 8   Salary    446 non-null    float64
dtypes: float64(4), object(5)
memory usage: 32.3+ KB


### Difference between Shared Methods

In [4]:
s = pd.Series([1, 2, 3])
revenue = pd.read_csv(revenue_path, index_col=["Date"])

# Summing a Series adds up all of its values.
s.sum()

# For a DataFrame, these four spellings all give the same result.
revenue.sum()
revenue.sum(axis=0)
revenue.sum(axis="index")
revenue.sum(axis="rows")

# Summing across the columns instead gives one total per row (per Date).
# axis=1 and axis="columns" are the same.
revenue.sum(axis=1)
revenue.sum(axis="columns")

Date
1/1/16     1606
1/2/16     2060
1/3/16      967
1/4/16     2519
1/5/16      438
1/6/16     1935
1/7/16     1234
1/8/16     2313
1/9/16     2623
1/10/16     555
dtype: int64

### Select One Column from a DataFrame

In [5]:
nba = pd.read_csv(nba_path)

# Option 1: attribute access on the column name.
nba.Name
type(nba.Name)

# Option 2 (preferred): bracket access with the column label.
nba["Name"]
nba["Salary"].head(n=3)

0    7730337.0
1    6796117.0
2          NaN
Name: Salary, dtype: float64

### Select Two or More columns from a DataFrame

In [6]:
# The list passed in the brackets decides which columns come back and in what order.
colnames = ["Salary", "Team", "Name"]
nba[["Name", "Team"]]
nba[["Team", "Name"]]
nba[colnames].head(n=3)

Unnamed: 0,Salary,Team,Name
0,7730337.0,Boston Celtics,Avery Bradley
1,6796117.0,Boston Celtics,Jae Crowder
2,,Boston Celtics,John Holland


### Add New Column to DataFrame


In [7]:
nba = pd.read_csv(nba_path)
nba.head()

# Assigning a scalar to a new label adds a column at the end that holds
# that value in every row.
nba["Sport"] = "Basketball"
nba["League"] = "National Basektball Association"
display(nba.head(n=3))

# insert() places the new column at a chosen position instead: "Entertaining"
# becomes the column at position 3, filled with "No".
nba.insert(3, "Entertaining", "No")
nba.head(n=3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Sport,League
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,Basketball,National Basektball Association
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,Basketball,National Basektball Association
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,Basketball,National Basektball Association


Unnamed: 0,Name,Team,Number,Entertaining,Position,Age,Height,Weight,College,Salary,Sport,League
0,Avery Bradley,Boston Celtics,0.0,No,PG,25.0,6-2,180.0,Texas,7730337.0,Basketball,National Basektball Association
1,Jae Crowder,Boston Celtics,99.0,No,SF,25.0,6-6,235.0,Marquette,6796117.0,Basketball,National Basektball Association
2,John Holland,Boston Celtics,30.0,No,SG,27.0,6-5,205.0,Boston University,,Basketball,National Basektball Association


### Create New Column from Existing Column

In [8]:
nba = pd.read_csv(nba_path)

# Operator form: add 10 to every age and store the result as a new column.
nba["Age in a Decade"] = nba["Age"] + 10

# Method form of the same arithmetic, inserted at column position 5.
age_plus_ten = nba["Age"].add(10)
nba.insert(loc=5, column="Age in a Decade 2", value=age_plus_ten)

# 0.4535 is the multiplier used to derive "Weight (kg)" from "Weight"
# (presumably pounds to kilograms -- confirm the unit of the source column).
nba["Weight (kg)"] = nba["Weight"] * 0.4535
nba.head(n=3)

# Overwrite an existing column: replace Salary with half of its value.
nba["Salary"] = nba["Salary"].mul(0.5)

### `value_counts()` method

In [9]:
nba = pd.read_csv(nba_path)

# Called on the whole frame, value_counts() is not very useful.
nba.value_counts()

# Called on one column, it shows how often each value occurs.
positions = nba["Position"]
positions.value_counts()

SG    102
PF    100
PG     92
SF     85
C      78
Name: Position, dtype: int64

### Drop DataFrame Rows with Null Values with the dropna Method

In [10]:
# Load the data in this cell, like the other cells do, so it does not depend
# on state left behind by an earlier cell.
nba = pd.read_csv(nba_path)
nba.tail()

# Default: drop every row that has one or more missing values.
nba.dropna()

# how="all": drop only the rows in which all values are missing.
nba.dropna(how="all")

# subset: drop the rows (not the columns) whose "College" value is missing.
nba.dropna(subset=["College"])

# Several labels in subset: a row is dropped when any of them is missing.
nba.dropna(subset=["College", "Salary"]).head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0


### Fill in Missing DataFrame Values with the fillna method

In [11]:
nba = pd.read_csv(nba_path)

# Fill all missing values with zero. The result is not assigned, so nba
# itself keeps its missing values.
nba.fillna(0)

# Fill only the missing colleges and write the column back.
college_filled = nba["College"].fillna("Unknown")
nba["College"] = college_filled

nba.head(n=3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


### The `astype()` Method

In [12]:
# pandas cannot convert a column that contains NaNs to an integer dtype;
# such a column must stay float.
nba["Age"].hasnans

True

In [13]:
# Load the data and drop the rows in which all values are missing.
nba = pd.read_csv(nba_path).dropna(how="all")

# pandas cannot convert a column that contains NaNs to an integer dtype,
# so check for them first.
nba["Age"].hasnans

# Two equivalent spellings: the dtype name as a string (recommended) ...
nba["Age"].astype("int")
# ... or the Python built-in type object itself.
nba["Age"].astype(int)

# Keep the converted column.
age_as_int = nba["Age"].astype("int")
nba["Age"] = age_as_int

In [14]:
# Salary has NaNs, so converting it directly with astype("int") does not work.
# Fill the missing values with 0 first, then convert.
salary_filled = nba["Salary"].fillna(0)
nba["Salary"] = salary_filled.astype("int")

### The `category` dtype

In [15]:
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() adds a stray "None" line after each report.
nba.info()

# Convert the Position and Team columns to the category dtype, then show the
# report again to compare the dtypes before and after.
nba["Position"] = nba["Position"].astype("category")
nba["Team"] = nba["Team"].astype("category")
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      457 non-null    object 
 1   Team      457 non-null    object 
 2   Number    457 non-null    float64
 3   Position  457 non-null    object 
 4   Age       457 non-null    int64  
 5   Height    457 non-null    object 
 6   Weight    457 non-null    float64
 7   College   373 non-null    object 
 8   Salary    457 non-null    int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 35.7+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Name      457 non-null    object  
 1   Team      457 non-null    category
 2   Number    457 non-null    float64 
 3   Position  457 non-null    category
 4   Age       457 non-null    int64   
 5   Height    

### The `sort_values()` method

In [16]:
nba = pd.read_csv(nba_path)

# Sort a single column.
names = nba["Name"]
names.sort_values()

# Sort the frame by one column, then by several; ascending is the default.
nba.sort_values(by="Salary")
nba.sort_values(by=["Age", "Salary"])

# Descending by salary, with the missing salaries placed first.
nba.sort_values(by="Salary", ascending=False, na_position="first")

# One direction per sort key: Team ascending, Name descending.
# The sorted frame replaces nba.
nba = nba.sort_values(by=["Team", "Name"], ascending=[True, False])

### The `sort_index()` method

In [17]:
# Show the rows in index order again (ascending is the default direction).
nba.sort_index()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


### Rank Series Values with the rank method

In [18]:
# Load the data, drop the rows in which all values are missing, and turn
# Salary into an integer column (missing salaries become 0).
nba = pd.read_csv(nba_path).dropna(how="all")
nba["Salary"] = nba["Salary"].fillna(0).astype("int")

# Rank 1 is the highest salary. method="min" gives tied salaries the same
# whole-number rank. The default method="average" can produce fractional
# ranks such as 4.5 for ties, which astype("int") would silently truncate.
nba["Salary Rank"] = (
    nba["Salary"].rank(ascending=False, method="min").astype("int")
)

nba.sort_values(by="Salary Rank").head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Salary Rank
109,Kobe Bryant,Los Angeles Lakers,24.0,SF,37.0,6-6,212.0,,25000000,1
169,LeBron James,Cleveland Cavaliers,23.0,SF,31.0,6-8,250.0,,22970500,2
33,Carmelo Anthony,New York Knicks,7.0,SF,32.0,6-8,240.0,Syracuse,22875000,3
