# DataFrames Introduction

In [None]:
import os
import pandas as pd
import jupyter_black

# Activate the jupyter_black extension so code cells are auto-formatted with Black.
jupyter_black.load()

# Base directory used to locate the datasets: the kernel's working directory.
# `__file__` is not defined inside a notebook kernel, and the earlier
# `os.path.abspath("__file__")` was given the literal string "__file__", so it
# already resolved to the working directory -- this states that explicitly.
current_dir = os.getcwd()

### Set file path to datasets

In [None]:
# Both CSV files live in the "data" folder under `current_dir`; build the two
# paths in one pass so the folder layout is spelled out only once.
nba_path, revenue_path = (
    os.path.join(current_dir, "data", csv_name)
    for csv_name in ("nba.csv", "revenue.csv")
)

### Methods and Attributes Shared between Series and DataFrames

In [None]:
# Load the NBA dataset into a DataFrame and build a small Series to compare against.
nba = pd.read_csv(nba_path)
s = pd.Series(range(1, 6))

# NOTE: a notebook cell only renders its final expression. Run the pairs below
# one at a time (or wrap them in display()) to see each individual result.

# head(): the first n rows of either object.
s.head(n=2)
nba.head(n=2)

# index: the row labels.
s.index
nba.index

# values: the underlying data as an array (one-dimensional for the Series,
# two-dimensional for the DataFrame).
s.values
nba.values

# shape: a tuple of dimensions -- (rows,) for a Series, (rows, columns) for a DataFrame.
s.shape
nba.shape

# A Series holds a single dtype; a DataFrame has one dtype per column.
s.dtype
nba.dtypes  # returns a Series

# hasnans exists only on a Series.
s.hasnans
# nba.hasnans # Does not exist

# columns exists only on a DataFrame.
# s.columns # Does not exist
nba.columns

# axes: a list of the axis labels (index only for a Series; index and columns
# for a DataFrame).
s.axes
nba.axes

# info(): prints a summary (index, dtypes, non-null counts, memory usage).
s.info()
nba.info()

### Difference between Shared Methods

In [None]:
# Load the revenue dataset, using its "Date" column as the row index,
# and build a tiny Series for comparison.
revenue = pd.read_csv(revenue_path, index_col=["Date"])
s = pd.Series([1, 2, 3])

# A Series has a single axis, so sum() collapses it to one scalar.
s.sum()

# The four calls below are equivalent: each one adds down the rows (axis 0)
# and returns one total per column.
revenue.sum()
revenue.sum(axis="rows")
revenue.sum(axis="index")
revenue.sum(axis=0)

# Summing across the columns (axis 1) instead returns one total per row.
revenue.sum(axis="columns")
revenue.sum(axis=1)  # Same as axis="columns"

### Select One Column from a DataFrame

In [None]:
# Reload the dataset so this cell starts from the unmodified CSV contents.
nba = pd.read_csv(nba_path)

# First method: attribute access. A single column comes back as a Series.
# This only works when the column name is a valid Python identifier that does
# not collide with an existing DataFrame attribute or method.
nba.Name
type(nba.Name)

# Second method (preferred): bracket access works for any column name,
# including names that contain spaces.
nba["Name"]
nba["Salary"].head(3)

### Select Two or More columns from a DataFrame

In [None]:
# Passing a list of column names returns a DataFrame whose columns follow the
# order of the list, not the order in the original frame.
# (Uses the `nba` DataFrame loaded in the previous cell.)
nba[["Name", "Team"]]
nba[["Team", "Name"]]
# The list can be stored in a variable first, which keeps the selection readable.
colnames = ["Salary", "Team", "Name"]
nba[colnames].head(3)

### Add New Column to DataFrame


In [None]:
# Start from a fresh copy of the dataset so re-running this cell is safe.
nba = pd.read_csv(nba_path)
nba.head()

# Assigning a scalar to a new column label broadcasts the value to every row;
# columns created this way are appended at the right-hand end of the frame.
nba["Sport"] = "Basketball"
nba["League"] = "National Basketball Association"  # spelling fixed (was "Basektball")
display(nba.head(3))

# insert() puts the new column at a chosen position instead of the end:
# here position 3 (0-based), named "Entertaining", with "No" in every row.
# It modifies `nba` in place.
nba.insert(loc=3, column="Entertaining", value="No")
nba.head(3)

### Create New Column from Existing Column

In [None]:
# Start from a fresh copy of the dataset so re-running this cell is safe.
nba = pd.read_csv(nba_path)

# Arithmetic on a column is applied element-wise; missing values stay missing.
nba["Age in a Decade"] = nba["Age"] + 10

# The same computation using the method form, placed at column position 5.
nba.insert(loc=5, column="Age in a Decade 2", value=nba["Age"].add(10))

# One avoirdupois pound is exactly 0.45359237 kg. The previous factor (0.4535)
# was a truncated version of this constant.
# NOTE(review): assumes the "Weight" column is recorded in pounds -- confirm.
LB_TO_KG = 0.45359237
nba["Weight (kg)"] = nba["Weight"] * LB_TO_KG
nba.head(3)

# Overwrite an existing column by assigning back to the same label.
nba["Salary"] = nba["Salary"].mul(0.5)

### `value_counts()` method

In [None]:
# Reload the dataset so the counts reflect the unmodified CSV contents.
nba = pd.read_csv(nba_path)
# On a whole DataFrame, value_counts() counts identical *rows*; that is rarely
# informative when the rows are expected to be distinct.
nba.value_counts()  # Pretty useless
# On a single column it counts how often each distinct value occurs,
# most frequent first.
nba["Position"].value_counts()

### Drop DataFrame Rows with Null Values with the dropna Method

In [None]:
# Reload the dataset so this cell does not rely on state left behind by an
# earlier cell (the old commented-out load referenced an undefined `file_path`).
nba = pd.read_csv(nba_path)
nba.tail()

# None of the calls below modify `nba`; each returns a new DataFrame.

nba.dropna()  # Drops every row that has one or more missing values

nba.dropna(how="all")  # Drops only the rows in which all values are missing

nba.dropna(subset=["College"])  # Drops the rows where the College value is missing

# With several columns in `subset`, a row is dropped if any of them is missing.
nba.dropna(subset=["College", "Salary"]).head(3)

### Fill in Missing DataFrame Values with the fillna method

In [None]:
# Reload the dataset so the missing values are present again.
nba = pd.read_csv(nba_path)

# Returns a new DataFrame with every missing value replaced by 0.
# The result is not assigned, so `nba` itself is left unchanged.
nba.fillna(0)  # Fill all missing values with zero

# Filling one column and assigning it back does change `nba`.
nba["College"] = nba["College"].fillna("Unknown")

nba.head(3)

### The `astype()` Method

In [None]:
# Check for missing ages first: pandas cannot convert a column that contains
# NaNs to an integer dtype -- such a column has to stay float.
nba["Age"].hasnans

In [None]:
# Reload the dataset and drop the rows in which every value is missing.
nba = pd.read_csv(nba_path).dropna(how="all")

# Pandas cannot convert a column containing NaNs to an integer dtype (such a
# column must stay float), so confirm that Age has no missing values left.
nba["Age"].hasnans

# The two calls below are equivalent; neither one modifies `nba`.
nba["Age"].astype("int")  # Recommended
nba["Age"].astype(int)
# Passing the type object itself only works for types Python already knows by
# name (built-ins such as int); pandas-specific dtypes such as "category" have
# to be given as strings.

# Assign the converted column back to make the change permanent.
nba["Age"] = nba["Age"].astype("int")

In [None]:
# Calling nba["Salary"].astype("int") directly does not work because Salary
# still has NaNs, so replace the missing salaries with 0 before converting.
salary_filled = nba["Salary"].fillna(0)
nba["Salary"] = salary_filled.astype("int")

### The `category` dtype

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output. Call it directly.
nba.info()  # report before the conversion

# Convert two columns to the "category" dtype and assign them back.
nba["Position"] = nba["Position"].astype("category")
nba["Team"] = nba["Team"].astype("category")

nba.info()  # report after: compare the dtypes and memory usage with the one above

### The `sort_values()` method

In [None]:
# Reload the dataset so the rows start in their original file order.
nba = pd.read_csv(nba_path)

# On a Series no `by` is needed; the values themselves are sorted.
nba["Name"].sort_values()

# On a DataFrame, `by` names the column(s) to sort on. With a list, later
# columns break ties in the earlier ones.
nba.sort_values(by="Salary", ascending=True)
nba.sort_values(by=["Age", "Salary"], ascending=True)

# Missing values are placed last by default; na_position="first" puts them on top.
nba.sort_values(by="Salary", ascending=False, na_position="first")

# A list for `ascending` sets the direction per column: Team A-Z, then Name Z-A
# within each team. Only this call is assigned back, so only it changes `nba`.
nba = nba.sort_values(by=["Team", "Name"], ascending=[True, False])

### The `sort_index()` method

In [None]:
# Sort by the row labels rather than by a column. `nba` was reordered by Team
# and Name in the previous cell, so this returns the rows in index order again.
# The result is not assigned, so `nba` itself stays as it was.
nba.sort_index(ascending=True)

### Rank Series Values with the rank method

In [None]:
# Load the dataset and discard the rows in which every value is missing.
roster = pd.read_csv(nba_path)
nba = roster.dropna(how="all")

# Missing salaries become 0 so the column can be stored as integers.
salaries = nba["Salary"].fillna(0).astype("int")
nba["Salary"] = salaries

# Rank 1 goes to the highest salary; the ranks are then cast to integers.
salary_ranks = salaries.rank(ascending=False)
nba["Salary Rank"] = salary_ranks.astype("int")

# Show the three best-paid rows.
by_rank = nba.sort_values(by="Salary Rank")
by_rank.head(3)