In [None]:
# Run this first if you're on Google Colaboratory
# from google.colab import files
# files.upload() # data/banana.py

In [None]:
# files.upload() # data/train.csv

# Intro to Jupyter notebooks


## Markdown and REPL (read–eval–print loop) things ✎

In [None]:
# Run a statement and see what it prints directly below the cell
print("banana")

In [None]:
# Build a small list of one-letter strings to index and slice in the next cells
my_list = list("abcdef")

# A bare expression on the last line of a cell is shown as the cell's output
my_list

In [None]:
# Get the last item in the list (negative indexes count from the end)
my_list[-1]

In [None]:
# Slice it! The start index (1) is included, the stop index (3) is not
my_list[1:3]

In [None]:
# Get an end piece! Leaving out the start means "from the beginning"
my_list[:3]

In [None]:
# Get the other end piece! Leaving out the stop means "through the last item"
my_list[-3:]

## `%magic` tricks ✵彡

Magic commands give us some sweet features from IPython (Interactive Python). There are two types of magic:

- `%` prefix: line magic, which applies to a line
- `%%` prefix: cell magic, which applies to multiple lines

In [None]:
# Get more info on a magic: a trailing `?` brings up its help text
%pwd?

In [None]:
# Print the current working directory
%pwd

In [None]:
# We can assign %magic output to variables
cwd = %pwd 
# Display the value we just captured
cwd

In [None]:
# Trigger an exception ON PURPOSE: the key was never added, so the lookup
# raises KeyError and gives the `%debug` cell below a traceback to inspect.
# The variable is deliberately not called `dict` -- that name would shadow
# Python's built-in `dict` type for every later cell in this kernel session.
fruit_counts = {}
fruit_counts["banana"]

In [None]:
# Opens an interactive debugger at the bottom of the LAST exception traceback.
# Run this right after a cell that raised; leave the debugger with `q` (quit).
%debug

In [None]:
# Automatically enter debugger after ANY exception
%pdb

In [None]:
# Line magic: reports the execution time of a single statement, run once
%time "banana".count("a")

In [None]:
# Line magic: reports the average execution time of a single statement, run multiple times
%timeit "banana".count("a")

In [None]:
%%time
def count_a(string):
    """Return how many times the letter "a" occurs in `string`."""
    return string.count("a")
count_a("banana")

# A cell magic that reports the execution time of the whole multi-line cell.
# Warning: `%%time` has to be the very first line of the cell -- cell magics
# don't like leading blank/comment/code lines, which is why these comments
# sit at the bottom.

In [None]:
# Run an external script from within the notebook.
# The path is relative to the working directory shown by %pwd.
%run data/banana.py

# If you're in Google Colab, the uploaded file sits next to the notebook,
# so run this instead:
# %run banana.py

In [None]:
# View the documentation for all available %magic commands
%magic

In [None]:
# `cwd` still holds the value captured from %pwd earlier
cwd

In [None]:
# Reset all variables. This asks for confirmation before wiping the namespace.
# Use `%reset -f` to skip confirmation
%reset

In [None]:
# If the reset above was confirmed, `cwd` no longer exists and this lookup
# is expected to fail with a NameError -- that is the point of the demo.
cwd

More:
https://ipython.readthedocs.io/en/stable/interactive/magics.html

## Meet `pandas` ʕ •ᴥ•ʔ

`pandas` is a Python library for exploring and analysing tabular data.

We've already made `pandas` available in this environment with `pip install`. To use it in our notebook, we'll need to import it.

In [None]:
import pandas as pd

### `pandas` objects: Series ﾟﾟ┌┴oﾟﾟﾟﾟ°

A Series is a one-dimensional array-like data structure with indexing.

In [None]:
# The most basic Series: hand `pandas` a plain list and nothing else.
# The index is generated for us and the value type is inferred from the data.
series1 = pd.Series(data=[10, 5, 6, 7, 18, 10, 4, 13])

# Show the result: index labels alongside the values, dtype below them
series1

In [None]:
# Just the data of the Series, without its index
series1.values

In [None]:
# Just the index (the labels) of the Series
series1.index

In [None]:
# Check the shape of the Series (it has a single dimension)
series1.shape

In [None]:
# Produces a statistical summary of the Series.
series1.describe()

In [None]:
# Supply our own labels through `index=` instead of the automatic 0, 1, 2, ...
series2 = pd.Series(
    data=[1, 2, 50],
    index=["banana", "bananas", "much bananas"],
)

# Show the labelled Series
series2

In [None]:
# A Series is allowed to hold values of different types side by side
series3 = pd.Series(data=[1, "potato", 2, "potato"])

# Check the dtype reported at the bottom of the output
series3

**Note: When a Series (or a DataFrame column) has the `object` dtype, as the mixed-type Series above does, it is slower to process.**

In [None]:
# We can use array-style positions to fetch values.
# Go through `.iloc` for this: plain `series2[2]` on a label-indexed Series
# is deprecated since pandas 2.1, where an integer inside `[]` is meant to be
# treated as a label rather than a position.
series2.iloc[2]

In [None]:
# We can fetch ranges by slicing, just like with a list (here: the first two items)
series2[:2]

In [None]:
# We can fetch values by integer (positional) indexing with `.iloc`
series2.iloc[1]

In [None]:
# We can also use the index labels to fetch values
series2['much bananas']

# We can fetch values by label indexing with `.loc`.
# This is the explicit `pandas` way to write the previous command:
# series2.loc['much bananas']

In [None]:
# We can fetch values using multiple labels, called in a custom order.
# Note the double brackets: the inner pair is a list of labels.
series2[['much bananas', 'banana', 'bananas']]

In [None]:
# We can filter with conditions: `series1<8` is evaluated per element and
# used as a mask to pick the matching items.

# Flashback: series1 = pd.Series([10, 5, 6, 7, 18, 10, 4, 13])
# Note the indexes in the output.
series1[series1<8]

In [None]:
# A dictionary converts straight into a Series:
# its keys become the index labels, its values the data.
fruit_dict = {"banana": 3, "apple": 2, "lemon": 5}
fruit_series1 = pd.Series(data=fruit_dict)

# Show the Series built from the dictionary
fruit_series1

In [None]:
# Passing `index=` next to a dictionary selects and orders the entries by label.
# 'tomato' is deliberately a label that the dictionary does not contain.
fruit_labels = ["apple", "banana", "lemon", "tomato"]
fruit_series2 = pd.Series(data=fruit_dict, index=fruit_labels)

# Suddenly we get floats. Why?
fruit_series2

In [None]:
# Flag, per label, whether the value is missing
fruit_series2.isnull()

In [None]:
# The opposite check: flag, per label, whether a value is present
fruit_series2.notnull()

In [None]:
# Grab the Index object from one of our Series and keep it in its own variable
fruit_series2_index = fruit_series2.index

# Display the Index object
fruit_series2_index

In [None]:
# An Index is its own kind of `pandas` object, not a plain list
type(fruit_series2_index)

In [None]:
# Reading a single item from the Index by position works like a list
fruit_series2_index[1]

In [None]:
# `pandas` Index object items are immutable: assigning to one raises TypeError.
# The error is caught and printed so this cell still makes its point without
# aborting a "Run All" (or dropping into the debugger when %pdb is switched on).
try:
    fruit_series2_index[1] = 'no banana'
except TypeError as err:
    print(f"TypeError: {err}")

In [None]:
# Unlike dictionary keys, the labels of a `pandas` Index may repeat
# ('potato' appears twice here)
orphan_index = pd.Index(data=[1, "potato", 2, "potato"])

# Show the standalone Index
orphan_index

### ┏━━━━━━━ʕ•㉨•ʔ━━━━━━━━┓
### &nbsp;&nbsp; &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`pandas` objects: DataFrame
### ┗━━━━━━━━☆━━━━━━━━━┛

A `pandas` DataFrame is a table of data with row and column indexes. Wes McKinney (`pandas` creator) describes it as a dictionary of Series that share the same index. The data is stored as one or more two-dimensional blocks.

In [None]:
# Time for real data: point `file_path` at the CSV that ships with the workshop
file_path = "./data/train.csv"

# If you're in Google Colab, the uploaded file sits next to the notebook,
# so run this instead:
# file_path = "train.csv"

# The two keyword arguments only spell out what read_csv does by default:
# comma-separated values, column names taken from the first row.
df = pd.read_csv(file_path, sep=",", header=0)

In [None]:
# Check the data type of the object that read_csv gave us
type(df)

In [None]:
# Check the shape of the DataFrame: (number of rows, number of columns)
df.shape

In [None]:
# List the column labels (this is an Index object too)
df.columns

In [None]:
# Quick peek at the first five rows of the dataset via Python slicing
df[:5]

In [None]:
# This is the `pandas` way to do it
df.head()
# Pass a number to ask for a different amount of rows:
# df.head(8)

In [None]:
# The counterpart of head(): peek at the last rows
df.tail()

In [None]:
# Pick 3 rows at random (no seed is set, so each run shows different rows)
df.sample(3)

In [None]:
# Produces a statistical summary of the DataFrame.
# Columns with non-numerical values are left out of it.
# The numbers have no real meaning for this dataset; we're just experimenting.
df.describe()

In [None]:
# Examine a certain column: select it by label, then look at its first 10 values
df['Name'].head(10)

In [None]:
# Rename one column through a {old name: new name} mapping.
# Note if this changed the column name in the DataFrame (check `df.columns` afterwards).
df.rename({"PassengerId": "passenger_id"}, axis="columns")

In [None]:
# Different ways of selecting the first row.
# Uncomment one alternative at a time and compare the outputs:
# head(1), [:1] and .loc[[0], ...] give a one-row DataFrame, while
# .iloc[0], .iloc[0, :] and .loc[0] give that row as a Series.

df.head(1)
# df[:1]
# df.iloc[0]
# df.iloc[0, :]
# The .loc variants select by LABEL; they assume the default 0..n-1 row index
# df.loc[0]
# df.loc[[0], df.columns]

In [None]:
# Different ways of selecting a particular column (Name).
# Uncomment one alternative at a time to compare.

df['Name']
# By label:
# df.loc[:, 'Name']
# By position -- assumes Name is the 4th column (position 3); verify with df.columns
# df.iloc[:, 3]

![Accessing things in pandas](images/pandas_access.png)

### Exercise ᕦ(ò_óˇ)ᕤ

Use your newfound knowledge of `iloc` and `loc` to select the first 5 rows from the first 4 columns.

In [None]:
# Different ways of selecting (5 rows from) the first 4 columns

# Using head()

# Using iloc[]

# Using loc[]

## I can haz codez pls ( ᐕ)

To generate a `.py` out of this notebook:

`File > Download as > Python (.py)`

GitHub is nice and renders `.ipynb` files without needing to run Jupyter - great for quick sharing. E.g. https://github.com/ibm-et/jupyter-samples/blob/master/airline/Exploration%20of%20Airline%20On-Time%20Performance.ipynb

Bitbucket doesn't do this, so for the same effect, you can generate and commit a Markdown version of your notebook to your repo:

`File > Download as > Markdown (.md)`

## Questions? ʕ•ᴥ•ʔʃ

## What's next?  υ´• ﻌ •`υ

Extra credit: Try the Titanic tutorial on Kaggle: https://www.kaggle.com/c/titanic/overview/tutorials