# Pandas & Dataframe Basics

- API Reference: https://pandas.pydata.org/docs/reference/index.html

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

## 1. Dataframes

### 1a. Download and import dataset with pandas
- `pd.read_csv(file_path)`: read a CSV file into a dataframe
- Download data from : https://www.kaggle.com/c/titanic
    - `test.csv`
    - `train.csv`
    - `gender_submission.csv`

In [4]:
# Read the Titanic training set into a DataFrame.
# The path is relative, so train.csv is expected in the current working directory.
data = pd.read_csv("train.csv")

# A bare name on the last line of a cell displays the DataFrame.
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


### 1b. Show Data
- `dataframe.head(n)`: return the first n rows of a dataframe
- `dataframe.tail(n)`: return the last n rows of a dataframe
- by default (n omitted), 5 rows are returned

In [None]:
# pd.head()


In [None]:
# pd.tail()


### 1c. Count Values
- `series.value_counts()`
- Counts how many times each distinct value occurs in a **Series**
- Select a single column of the dataframe (e.g. `dataframe['column']`) to get a Series before calling it

In [None]:
# what is Series?
# A dataframe is composed of multiple Series


In [None]:
# number of occurrences


In [5]:
"""
Try this:
Count the number of occurrences of each value in 'Embarked'
"""

# data['Embarked'] selects one column as a Series; value_counts() then tallies
# each distinct value, most frequent first. Missing values (NaN) are left out
# of the tally unless dropna=False is passed.
values = data['Embarked'].value_counts()

values

S    644
C    168
Q     77
Name: Embarked, dtype: int64

### 1d. Creating a Dataframe with lists
- `pd.DataFrame(list or array, columns=column_names)`

In [None]:
# list shape


In [None]:
# creating a dataframe with list


### 1e. Creating a Dataframe with dictionaries
- `pd.DataFrame(dictionary)`
- Since dictionaries are mapped with key-values, no list input is needed
- key : column name
- value: data

## 2. Manipulating Dataframes

### 2a. Adding Columns to an existing Dataframe

In [None]:
# Fill new column with 0


In [None]:
# Fill new column with an empty string


In [None]:
# Creating new columns from an existing column


In [6]:
"""
Try This:
Create a new column 'Fare_DC' which holds the 'Fare' values discounted by 10%
"""
# Multiplying a Series by a scalar applies to every row, so each fare is scaled
# to 90% of its value. Assigning to a new key adds the column to `data` itself.
data['Fare_DC'] = data['Fare'] * 0.9
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Fare_DC
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,6.525
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,64.15497
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,7.1325
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,47.79
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,7.245


### 2b. Updating columns in an existing Dataframe
- original column = original column + something

### 2c. Deleting rows or columns in an existing Dataframe
- If deleting a column, axis = 1
- `dataframe.drop(column_name, axis = 1)`

In [None]:
# deleting column 'Age_new'


- If deleting a row, axis = 0 (default)
- `dataframe.drop(index_label, axis = 0)`

In [None]:
# deleting the first row


In [7]:
"""
Try This:
Delete the column 'Fare_DC'
"""

# axis = 1 tells drop() that 'Fare_DC' is a column label rather than a row label.
# drop() returns a new DataFrame; `data` keeps the column because the result is
# bound to a separate name.
new = data.drop('Fare_DC', axis = 1)

new.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 3. Indexing in Dataframe
- Index : Unique labels that identify each row (and each column) of a dataframe


In [None]:
# Reload the training set from the same relative path used by the other
# read_csv cells in this notebook, so that Restart & Run All finds the file.
data = pd.read_csv('train.csv')
data.head(5)

In [None]:
# get info of data


In [None]:
# get index


In [None]:
# get the element at a certain position (index) of a list
# list positions start at 0, so position 0 is the first element
lst = ['apple', 'banana', 'melon']


In [None]:
# get certain value from a certain row by indexing


- `dataframe.reset_index()`
- returns a new dataframe with a fresh 0, 1, 2, ... index; the old index is kept as a regular column

In [None]:
data.head()

In [None]:
# reset_index() returns a new DataFrame with a fresh 0..n-1 index and the old
# index kept as a column named 'index'. `data` itself is not modified here
# because the result is only displayed, not assigned back.
data.reset_index()

## 4. Data Selection / Filtering

In [9]:
"""
Try importing the titanic train.csv file with pandas
"""

# Re-reading the file rebinds `data` to a fresh DataFrame, discarding any
# columns that were added to the previous one (such as 'Fare_DC').
data = pd.read_csv('train.csv')

data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


- Data Selection with "[]" operator
- `dataframe[column_name]`

In [None]:
# Get certain column from a dataframe
# Returns a series type


In [None]:
# Get multiple columns from a dataframe
# Returns a dataframe type


- A column cannot be selected with `[]` using its position number

In [None]:
# With [] an integer is looked up as a column *label*, not as a position.
# No column is labelled 0, so pandas raises a KeyError. The error is caught and
# printed so the demonstration is visible without stopping a Run All.
try:
    data[0]
except KeyError as err:
    print(f"KeyError: {err}")

- However, slicing is possible
- `dataframe[0:3]`
    - Getting the first three rows of a dataframe

In [None]:
# Get the first three rows of columns 'Survived', 'Pclass'


- Boolean indexing is also possible
    - A condition such as `dataframe[column_name] == value` yields True or False for each row, and only the True rows are kept

In [None]:
# Get rows with Pclass having value of 3


In [11]:
data


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [14]:
"""
Try this:
Get First 10 rows where Pclass equals 2
"""

# Keep only the second-class passengers, then take the first ten matches.
data.loc[data['Pclass'] == 2].head(10)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C
15,16,1,2,"Hewlett, Mrs. (Mary D Kingcome)",female,55.0,0,0,248706,16.0,,S
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0,,S
20,21,0,2,"Fynney, Mr. Joseph J",male,35.0,0,0,239865,26.0,,S
21,22,1,2,"Beesley, Mr. Lawrence",male,34.0,0,0,248698,13.0,D56,S
33,34,0,2,"Wheadon, Mr. Edward H",male,66.0,0,0,C.A. 24579,10.5,,S
41,42,0,2,"Turpin, Mrs. William John Robert (Dorothy Ann ...",female,27.0,1,0,11668,21.0,,S
43,44,1,2,"Laroche, Miss. Simonne Marie Anne Andree",female,3.0,1,2,SC/Paris 2123,41.5792,,C
53,54,1,2,"Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkin...",female,29.0,1,0,2926,26.0,,S
56,57,1,2,"Rugg, Miss. Emily",female,21.0,0,0,C.A. 31026,10.5,,S


**Indexing functions**
- `loc` : Label based indexing (명칭 기반 인덱싱)
    - Refers to rows and columns by their labels (index label, column name)
- `iloc`: Position based indexing (위치 기반 인덱싱) 
    - Refers to rows and columns by their integer position in the dataframe
    - `dataframe.iloc[row, column]`

In [15]:
# iloc is purely position-based: both the row and the column selector must be
# integers (or integer slices, lists of integers, or boolean arrays).
# Passing the column label 'Name' raises a ValueError. It is caught and printed
# so the message is still shown without stopping a Run All.
try:
    data.iloc[0, 'Name']
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: Location based indexing can only have [integer, integer slice (START point is INCLUDED, END point is EXCLUDED), listlike of integers, boolean array] types

In [None]:
# loc accepts both integers and strings as inputs
# however, an integer is treated as a row's index label, not as its position



In [16]:
# Column-oriented sample data: each key becomes one column of the frame.
test = dict(
    Color=['Yellow', 'Red', 'Green'],
    Name=['one', 'two', 'three'],
    Year=[1999, 1909, 2011],
)

# Label the rows with names instead of the default 0..n-1 integers.
test_df = pd.DataFrame(data=test, index=['Sam', 'James', 'Kyle'])

test_df

Unnamed: 0,Color,Name,Year
Sam,Yellow,one,1999
James,Red,two,1909
Kyle,Green,three,2011


**Boolean indexing**

In [None]:
# Get passengers over age 60



**Comparison operators**
- <, >
    - less than / greater than
- <=, >=
    - less than or equal to / greater than or equal to

In [None]:
# Three sample ages to compare with the operators above (two of them are equal).
myage, yourage, hisage = 10, 20, 20



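- `&`, `and`
    - and operator (use `&` to combine dataframe conditions element-wise; `and` only works on single True/False values)
- `|`, `or`
    - or operator (use `|` to combine dataframe conditions element-wise; `or` only works on single True/False values)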

In [19]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
"""
Try this:
Get passengers with 'Age' older than 50 and 'Pclass' equal to 1
"""

In [None]:
# Each comparison yields a True/False Series; & combines them row by row.
# The parentheses are required because & binds more tightly than > and ==.
# Rows whose 'Age' is missing (NaN) compare as False and are left out.
data[(data['Age'] > 50) & (data['Pclass'] == 1)]

In [22]:
# Keep rows where both conditions hold: fare below 15 AND age below 30.
# Each condition is wrapped in parentheses because & binds more tightly than <.
data[(data['Fare'] < 15) & (data['Age'] < 30)]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
12,13,0,3,"Saundercock, Mr. William Henry",male,20.0,0,0,A/5. 2151,8.0500,,S
14,15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14.0,0,0,350406,7.8542,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
877,878,0,3,"Petroff, Mr. Nedelio",male,19.0,0,0,349212,7.8958,,S
882,883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22.0,0,0,7552,10.5167,,S
883,884,0,2,"Banfield, Mr. Frederick James",male,28.0,0,0,C.A./SOTON 34068,10.5000,,S
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S
