In [None]:
import pandas as pd

 I want to store passenger data of the Titanic. For a number
    of passengers, I know the name, age, and sex data.
    
        To manually store data in a table, create a DataFrame
        Each column in a DataFrame is a Series

In [None]:
# Build a small passenger table by hand: each dict key becomes a column
# label and each list supplies that column's values, one entry per row.
df = pd.DataFrame({
    "Name": [
        "Braund, Mr. Owen Harris",
        "Allen, Mr. William Henry",
        "Bonnell, Miss. Elizabeth",
    ],
    "Age": [22, 35, 58],
    "Sex": ["male", "male", "female"],
})
df

In [None]:
# I am only interested in the data in the "Age" column.
# Selecting a single column of a DataFrame with [] returns
# a pandas Series.
df["Age"]

In [None]:
# Basic summary statistics for the numerical data in the table.
# NOTE(review): with default arguments this should cover only "Age" here,
# since the other columns hold strings — confirm against the output.

df.describe()

### How do I read and write tabular data?

In [None]:
# Read the CSV file into a DataFrame and display it.
titanic = pd.read_csv("data/train.csv")
titanic

In [None]:
# Show the first 8 rows.
titanic.head(8)

In [None]:
# Data type pandas assigned to each column.
titanic.dtypes

In [None]:
# Write the table to the "passengers" sheet of an Excel file.
# index=False keeps the row index labels out of the spreadsheet.
titanic.to_excel("titanic_new.xlsx", sheet_name="passengers", index=False)

In [None]:
# Read the same sheet back in (this rebinds `titanic`) and preview
# the first 5 rows.
titanic = pd.read_excel("titanic_new.xlsx", sheet_name="passengers")
titanic.head(5)

In [None]:
# Technical summary of the DataFrame: index range, column names,
# non-null counts, dtypes and memory usage.
titanic.info()

## How do I select a subset of a DataFrame?
## How do I select specific columns from a DataFrame?

In [None]:
# Interested in the age of the Titanic passengers.
# A single column label inside [] returns a Series.
ages = titanic["Age"]
ages.head()

In [None]:
# Shape of the selection: a Series is one-dimensional, so the tuple
# holds only the number of rows.
titanic["Age"].shape

In [None]:
# Interested in the age and sex of the Titanic passengers.
# A list of column labels inside [] returns a DataFrame.
age_sex = titanic[["Age", "Sex"]]
age_sex.head()

## Learning more basic information on indexing
    Here we construct a simple time series data set and use it for illustrating the indexing functionality

In [None]:
# Eight consecutive timestamps beginning at 1 January 2000.
dates = pd.date_range(start="1/1/2000", periods=8)
dates

In [None]:
import numpy as np

# Seed NumPy's global random generator so the random demo data (and every
# output derived from it) is identical on each Restart & Run All.
np.random.seed(42)

# 8 rows x 4 columns of standard-normal noise, one row per entry in `dates`.
df = pd.DataFrame(
    np.random.randn(8, 4),
    index=dates,
    columns=["A", "B", "C", "D"],
)
df

In [None]:
# Pull column "A" out of the frame as a Series.
s = df["A"]
s

In [None]:
# Look up one value by its index label (the first date).
s[dates[0]]

In [None]:
# You can pass a list of columns to [] to select those columns in that order.
# Display the frame first so the column values can be compared afterwards.
df


In [None]:
# Assigning a list of columns to a list of columns with [] pairs them up
# by position, so this swaps the values of columns A and B.
df[["B", "A"]] = df[["A", "B"]]
df

In [None]:
# Show only columns A and B.
df[["A", "B"]]

In [None]:
# When assigning through .loc, pandas aligns the right-hand side on its
# column labels first, so a plain DataFrame would leave the values unchanged.
# The correct way to swap values via .loc is to pass the raw values.
df.loc[:, ["B", "A"]] = df[["A", "B"]].to_numpy()
df[["A", "B"]]

## Attribute access

In [None]:
# Small Series whose index labels are the letters a, b and c.
sa = pd.Series([1, 2, 3], index=["a", "b", "c"])
sa

In [None]:
# Work on a copy so the attribute-access experiments do not modify `df`.
dfa = df.copy()
dfa

In [None]:
# An index label can be read as an attribute of the Series.
sa.b

In [None]:
# A column can be read as an attribute of the DataFrame.
dfa.A

In [None]:
# Show the Series again for reference.
sa

In [None]:
# Attribute access also works for assignment when the label already
# exists in the index.
sa.a = 5
sa

In [None]:
# Attribute assignment replaces the column here because "A" already exists.
# The value is 0..n-1, one entry per row of the frame.
dfa.A = list(range(len(dfa.index)))  # ok if A already exists
dfa

In [None]:
# Use the [] form to create a new column. The value must be one-dimensional
# with one entry per row; an (8, 4) array cannot be stored under a single
# column label and raises a ValueError.
dfa["F"] = np.random.randn(len(dfa.index))
dfa

### Slicing ranges
        The most robust and consistent way of slicing ranges along arbitrary axes is to use loc and iloc

In [None]:
# The Series taken from column "A", used for the slicing examples.
s

In [None]:
# First five elements.
s[:5]

In [None]:
# Every second element, starting with the first.
s[::2]

In [None]:
# All elements in reverse order.
s[::-1]

In [None]:
# Note that setting works as well.
# Copy first so the original Series `s` stays untouched.
s2 = s.copy()
s2[:5]

In [None]:
# Overwrite the first five elements with 0.
s2[:5] = 0
s2

In [None]:
# With DataFrame, slicing inside of [] *slices the rows*. This is
# provided largely as a convenience since it is such a common operation.
df

In [None]:
# First three rows.
df[:3]

In [None]:
# All rows in reverse order.
df[::-1]

#### Selections by label

In [None]:
# 5 x 4 frame of random values, indexed by five consecutive dates
# starting 2013-01-01, with columns A-D.
dfl = pd.DataFrame(
    data=np.random.randn(5, 4),
    index=pd.date_range(start="20130101", periods=5),
    columns=["A", "B", "C", "D"],
)
dfl

In [None]:
# Label-based selection: with .loc the slice endpoints are index labels and
# BOTH endpoints are included in the result (unlike positional .iloc slices,
# which exclude the stop position).
dfl.loc["20130102":"20130104"]

### How do I filter specific rows from a DataFrame?

In [None]:
# Preview the first 6 rows before filtering.
titanic.head(6)

In [None]:
# Keep only the rows where Age is greater than 35: the boolean Series
# inside [] selects the rows for which it is True.
above_35 = titanic[titanic["Age"] > 35]
above_35.head()

In [None]:
# The condition on its own is a boolean Series with one True/False per row.
titanic["Age"]> 35

In [None]:
# (rows, columns) of the filtered table.
above_35.shape

In [None]:
# Interested in the Titanic passengers from cabin class 2 and 3.
# isin() is True for each row whose Pclass is one of the listed values.
class_23 = titanic[titanic["Pclass"].isin([2,3])]

In [None]:
# Preview the first rows of the class 2/3 selection.
class_23.head()

##### I want to work with passenger data for which the age is known

In [None]:
# notna() is True where Age is not missing, so this drops the rows
# without a known age.
age_no_na = titanic[titanic["Age"].notna()]
age_no_na.head()

In [None]:
# (rows, columns) left after removing rows with a missing age.
age_no_na.shape

## How do I select specific rows and columns from a DataFrame?

    When using loc/iloc, the part before the comma is the rows you want,
    and the part after the comma is the columns you want to select.

In [None]:
# Interested in the names of the passengers older than 35 years.
# Row condition before the comma, column label after it; a single
# column label yields a Series.
adult_names = titanic.loc[titanic["Age"]>35, "Name"]
adult_names.head()


In [None]:
# You are interested in rows 10 till 25 and columns 3 to 5.
# iloc positions are zero-based and the stop position is excluded,
# hence 9:25 for rows and 2:5 for columns.
titanic.iloc[9:25, 2:5]

# How to create plots in pandas

U

In [None]:
# Read the Lagos CSV. index_col=2 uses the third column as the index and
# parse_dates=True asks pandas to parse that index as dates.
# NOTE(review): the third column is presumably "Date (LT)" — confirm against the file.
# This rebinds `df`, replacing the frame used in the earlier sections.
df = pd.read_csv("data/Lagos_PM2.5_2021_YTD.csv", index_col=2, parse_dates=True)

df

In [None]:
# Keep only rows with a positive AQI, then reduce to the AQI column
# (as a one-column DataFrame) from 2021-02-12 21:00 onwards.
df = df.loc[df["AQI"] > 0]
air_quality_lagos = df[["AQI"]].loc["2021-02-12 21:00:00":]
air_quality_lagos





In [None]:
# Load the Abuja readings the same way as the Lagos file, but under their own
# names so the `df` used by the Lagos cells is not silently overwritten
# (re-running those cells would otherwise operate on Abuja data).
abuja_raw = pd.read_csv("data/Abuja_PM2.5_2021_YTD.csv", index_col=2, parse_dates=True)
# Keep only rows with a positive AQI.
# NOTE(review): non-positive values are presumably missing-data markers — confirm.
abuja_valid = abuja_raw[abuja_raw["AQI"] > 0]
# Double brackets keep the result a one-column DataFrame.
air_quality_abuja = abuja_valid[["AQI"]]
air_quality_abuja

In [None]:

# Combine the two one-column tables on "Date (LT)". merge() defaults to an
# inner join, so only timestamps present in both tables are kept.
# NOTE(review): "Date (LT)" is presumably the name of the index set by
# index_col=2 when the files were read — confirm.
# The suffixes tell the two "AQI" columns apart in the result.
air_quality_data = pd.merge(air_quality_lagos, air_quality_abuja, on="Date (LT)",
                      suffixes=('_lagos', '_abuja'))

In [None]:
# Inspect the merged table.
air_quality_data

In [None]:
# Quick visual check: by default plot() draws one line per column
# against the index.
air_quality_data.plot()

In [None]:
# I want to visually compare the air quality values in Lagos vs. Abuja.
# Display the merged table whose columns are plotted against each other next.
air_quality_data

In [None]:
# Scatter plot of the Lagos AQI (x) against the Abuja AQI (y);
# alpha=0.5 makes the markers semi-transparent so overlaps stay visible.
air_quality_data.plot.scatter(x="AQI_lagos", y="AQI_abuja", alpha=0.5)

#### Create a line chart

In [None]:
# Wide line plot of both AQI columns; the returned matplotlib Axes is then
# used to label the axes and switch the grid on.
ax = air_quality_data.plot(
    kind="line",
    figsize=(20, 7),
)
ax.set(xlabel="Dates (LT)", ylabel="AQI Lagos vs.AQI Abuja")
ax.grid()

### Subplot

In [None]:
# Area plot; subplots=True draws each column in its own subplot, so
# `axs` holds one Axes per column.
axs = air_quality_data.plot.area(figsize=(12, 4), subplots=True)

In [None]:
# Further customize, extend and save the resulting plot.
from matplotlib import pyplot as plt
# Create an empty matplotlib Figure and Axes (this rebinds `axs`).
fig, axs = plt.subplots(figsize=(12, 4))
# Use pandas to put the area plot on the prepared Figure/Axes.
air_quality_data.plot.area(ax=axs)
# Do any matplotlib customization you like.
axs.set_ylabel("AQI")
# Save the figure as a PNG file.
fig.savefig("AQI_lag_vs_abj.png")