# pandas: Intermediate (Part 3)

## Intro to NA Values

### NumPy NaN
NumPy `NaN` stands for "Not a Number". It is a special floating-point value used as a placeholder for missing numerical values in an array.

In [1]:
import pandas as pd
import numpy as np

In [None]:
# Load the sales table; index_col=0 uses the first CSV column as the row index.
sales = pd.read_csv("../Data/sales.csv", index_col = 0)

In [None]:
# Display the frame as loaded, before any missing values are introduced.
sales

In [None]:
# Baseline: per-column dtype and non-null counts before inserting missing values.
sales.info()

In [None]:
# Label-based lookup of a single cell: row "Steven", column "Thu".
sales.loc["Steven", "Thu"]

In [None]:
# Overwrite the cell at row position 1 / column position 1 with Python's None
# to simulate a missing value (mutates `sales` in place).
# NOTE(review): how None is stored depends on the column dtype — check the
# display and `sales.info()` in the following cells.
sales.iloc[1,1] = None

In [None]:
# Show how the None assigned at position [1, 1] is represented in the frame.
sales

In [None]:
# Same experiment with NumPy's NaN at row position 2 / column position 2
# (mutates `sales` in place).
sales.iloc[2,2] = np.nan

In [None]:
# Compare the representation of the np.nan cell with the None cell set earlier.
sales

In [None]:
# Re-check dtypes and non-null counts now that two cells are missing.
sales.info()

## Handling NA Values / Missing Values

In [None]:
# Load the Titanic dataset with the default integer row index.
titanic = pd.read_csv("../Data/titanic.csv")

In [None]:
# Preview the first five rows.
titanic.head()

In [None]:
# Non-null counts per column reveal which columns contain missing values.
titanic.info()

In [None]:
# isna() yields a boolean frame (True where a value is missing);
# summing it counts the missing values in each column.
titanic.isna().sum()

In [None]:
# notna() is the complement of isna(): this counts the present values per column.
titanic.notna().sum()

In [None]:
# Rows whose `embarked` value is missing, selected with a boolean mask.
titanic[titanic["embarked"].isna()]

In [None]:
# (rows, columns) of the full dataset — the reference point for the dropna() variants.
titanic.shape

### Remove missing values

`dropna()` lets you remove missing values in several ways; we will explore some of them below.

Doc: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dropna.html?highlight=dropna#pandas.DataFrame.dropna

In [None]:
# Default behaviour: returns a new frame without any row that contains at least
# one missing value. `titanic` itself is left unchanged.
titanic.dropna()

In [None]:
# How many rows survive when every row with any missing value is dropped.
titanic.dropna().shape

In [None]:
# how="all" drops a row only if every value in it is missing.
titanic.dropna(how = "all").shape

In [None]:
# axis=1 works on columns: drop every column that has at least one missing value.
titanic.dropna(axis = 1, how = "any").shape

In [None]:
# thresh=500 keeps only the columns with at least 500 non-missing values.
titanic.dropna(axis = 1, thresh = 500).shape

In [None]:
# Keep only the columns that have at least 500 non-missing values.
# The result is assigned back explicitly instead of using inplace=True, so the
# change to `titanic` is visible in the code and the call stays chainable.
titanic = titanic.dropna(axis=1, thresh=500)

In [None]:
# Confirm which columns remain after dropping the sparsely populated ones.
titanic.info()

In [None]:
# Passengers whose age is unknown, selected with a boolean mask.
titanic[titanic["age"].isna()]

In [None]:
# Average age; mean() skips missing values by default, so only known ages count.
mean_age = titanic["age"].mean()
mean_age

In [None]:
# Replace missing ages with the mean age and assign the result back to the column.
# Calling fillna(..., inplace=True) on `titanic.age` is chained assignment: it acts
# on an intermediate Series, emits a FutureWarning in pandas 2.x, and leaves
# `titanic` unchanged once Copy-on-Write is active (the default from pandas 3.0).
titanic["age"] = titanic["age"].fillna(mean_age)

In [None]:
# Inspect the age column after filling the gaps.
titanic["age"]

In [None]:
# Verify the non-null count of `age` after the fill.
titanic.info()

## Changing dtypes

In [3]:
# Load the transactions table; index_col=0 makes the first CSV column (`date`) the index.
transactions = pd.read_csv("../Data/transactions.csv", index_col = 0)

In [11]:
# Preview the first five rows; `date` is currently the row index, not a column.
transactions.head(5)

Unnamed: 0_level_0,store_nbr,transactions
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-01-01,25,770
2013-01-02,1,2111
2013-01-02,2,2358
2013-01-02,3,3487
2013-01-02,4,1922


In [12]:
# Move the named index (`date`) back into a regular column so that it appears in
# `.info()` and can be converted to another dtype.
# The guard makes the cell safe to re-run: after the first reset the index is an
# unnamed RangeIndex, and resetting again would insert a spurious `index` column.
if transactions.index.name is not None:
    transactions = transactions.reset_index()
transactions.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83488 entries, 0 to 83487
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          83488 non-null  object
 1   store_nbr     83488 non-null  int64 
 2   transactions  83488 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.9+ MB


In [14]:
# convert_dtypes() returns a copy whose columns use pandas' nullable extension
# dtypes where possible (here: object -> string, int64 -> Int64).
# Note that it does not parse `date` into a datetime.
transactions1 = transactions.convert_dtypes()
transactions1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83488 entries, 0 to 83487
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          83488 non-null  string
 1   store_nbr     83488 non-null  Int64 
 2   transactions  83488 non-null  Int64 
dtypes: Int64(2), string(1)
memory usage: 2.1 MB


In [19]:
# Cast several columns at once by passing a {column: dtype} mapping to astype().
# The datetime unit must be spelled out: pandas 2.x raises a TypeError for the
# unit-less "datetime64" and asks for e.g. "datetime64[ns]".
transactions2 = transactions.astype({"transactions": "float", "date": "datetime64[ns]"})
transactions2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83488 entries, 0 to 83487
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          83488 non-null  datetime64[ns]
 1   store_nbr     83488 non-null  int64         
 2   transactions  83488 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 1.9 MB


In [21]:
# Preview the converted copy: `transactions` now holds floats, `date` holds datetimes.
transactions2.head(5)

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770.0
1,2013-01-02,1,2111.0
2,2013-01-02,2,2358.0
3,2013-01-02,3,3487.0
4,2013-01-02,4,1922.0


In [24]:
# Alternative to astype(): pd.to_datetime() parses the strings into datetimes.
# The result is written back to the `date` column of the original frame.
transactions['date'] =  pd.to_datetime(transactions['date']) 
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83488 entries, 0 to 83487
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          83488 non-null  datetime64[ns]
 1   store_nbr     83488 non-null  int64         
 2   transactions  83488 non-null  int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 1.9 MB


## Summary Statistics and Accumulations

In [None]:
# Reload the dataset from disk so this section starts from the raw data again
# (earlier cells dropped columns and filled ages on the previous `titanic`).
titanic = pd.read_csv("../Data/titanic.csv")

In [None]:
# Preview the first five rows of the reloaded data.
titanic.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
titanic.describe()

In [None]:
# Count the non-missing values in each row; axis="columns" reduces across the columns.
titanic.count(axis = "columns")

In [None]:
# axis=1 is the numeric spelling of axis="columns": same per-row counts as above.
titanic.count(axis = 1)

In [None]:
# Mean of every numeric column (axis=0 reduces down the rows).
# numeric_only=True is required on pandas >= 2.0, where non-numeric columns are
# no longer dropped silently and a text column makes mean() raise a TypeError.
titanic.mean(axis=0, numeric_only=True)

In [None]:
# Mean across the numeric columns of each row (axis=1 reduces across columns).
# numeric_only=True skips text columns, which would otherwise make the
# row-wise mean raise a TypeError on pandas >= 2.0.
titanic.mean(axis=1, numeric_only=True)

In [None]:
# Column totals, restricted to the numeric columns.
# Without numeric_only=True, text columns are either concatenated into one long
# string or (when they contain missing values) may raise a TypeError — neither
# is a meaningful summary statistic.
titanic.sum(axis=0, numeric_only=True)

In [None]:
# Look at the first rows again to compare `fare` with its running total below.
titanic.head()

In [None]:
# Running total of the fares, accumulated row by row.
titanic["fare"].cumsum()

In [None]:
# Pairwise Pearson correlation between the numeric columns.
# numeric_only=True is needed on pandas >= 2.0: the default no longer drops text
# columns, and corr() fails when it tries to convert them to float.
titanic.corr(numeric_only=True)

In [None]:
# Correlation between survival and passenger class (Series-to-Series).
titanic["survived"].corr(titanic["pclass"])

In [None]:
# Correlation between survival and the fare paid (Series-to-Series).
titanic["survived"].corr(titanic["fare"])

## The agg() method

In [None]:
# Reload the dataset so the agg() examples start from the raw data.
titanic = pd.read_csv("../Data/titanic.csv")

In [None]:
# Preview the first five rows.
titanic.head()

In [None]:
# describe() bundles a fixed set of statistics; agg() below lets us choose our own.
titanic.describe()

In [None]:
# Column means of the numeric columns.
# numeric_only=True is required on pandas >= 2.0, where text columns are no
# longer skipped automatically and make mean() raise a TypeError.
titanic.mean(numeric_only=True)

In [None]:
# agg() with a single function name is equivalent to calling that method directly;
# extra keyword arguments are forwarded to it. numeric_only=True keeps the text
# columns out, which would otherwise raise a TypeError on pandas >= 2.0.
titanic.agg("mean", numeric_only=True)

In [None]:
# Passing a list of function names yields one row per statistic.
# The numeric columns are selected first: with a list, each function is applied
# column by column, and "mean"/"std"/"median" fail on text columns in pandas >= 2.0.
titanic.select_dtypes(include="number").agg(["mean", "std", "min", "max", "median"])

In [None]:
# A dict maps each column to the statistic(s) to compute for it:
# the mean of `survived`, and both the minimum and maximum of `age`.
titanic.agg(
    {
        "survived": "mean",
        "age": ["min", "max"],
    }
)