# Pandas crash course

### Library imports

In [1]:
import pandas as pd

In [2]:
import numpy as np # pandas and NumPy are usually used together

## The Series object

In [4]:
# Build a Series from a plain list; no index is given, so pandas supplies
# the default integer positions 0..3 as labels.
series = pd.Series(data=[0.25, 0.5, 0.75, 1.0])

print(series)

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64


In [5]:
series.values

array([0.25, 0.5 , 0.75, 1.  ])

In [6]:
series.index

RangeIndex(start=0, stop=4, step=1)

In [None]:
series[1:3]

In [7]:
# Same values as before, but keyed by explicit string labels: the mapping's
# keys become the index and its values become the data.
series = pd.Series({'a': 0.25, 'b': 0.5, 'c': 0.75, 'd': 1.0})

In [8]:
series.values

array([0.25, 0.5 , 0.75, 1.  ])

In [9]:
series.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [12]:
# Positional access on a label-indexed Series: use .iloc explicitly.
# Plain `series[0]` only works through a deprecated fallback that treats an
# integer key as a position when the index holds non-integer labels.
series.iloc[0]

0.25

In [11]:
series['a']

0.25

In [17]:
series['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [14]:
series[series>0.5]

c    0.75
d    1.00
dtype: float64

In [15]:
series[(series>0.5) & (series<1)]

c    0.75
dtype: float64

## The DataFrame object

In [18]:
# A 3x2 frame of uniform random numbers in [0, 1) with named columns and the
# default integer row index.
df = pd.DataFrame(
    data=np.random.random_sample((3, 2)),
    columns=['foo', 'bar'],
)

df.head()

Unnamed: 0,foo,bar
0,0.340209,0.854956
1,0.799967,0.280468
2,0.142582,0.317863


In [19]:
# Same shape of random data, this time with explicit string row labels.
df = pd.DataFrame(
    data=np.random.random_sample((3, 2)),
    index=list('abc'),
    columns=['foo', 'bar'],
)

df.head()

Unnamed: 0,foo,bar
a,0.185532,0.317516
b,0.110627,0.910568
c,0.928267,0.291139


In [20]:
df.values

array([[0.18553213, 0.31751558],
       [0.11062659, 0.91056779],
       [0.9282671 , 0.29113902]])

In [21]:
df.columns

Index(['foo', 'bar'], dtype='object')

In [22]:
df.index

Index(['a', 'b', 'c'], dtype='object')

## Data selection in DataFrame

In [None]:
df['foo']

In [None]:
# Row slice by label, written with the explicit label-based indexer.
# Label slices include the end label, so this returns rows up to and including 'a'.
df.loc[:'a']

## Load data

We're going to use the Titanic dataset.

In [None]:
data = pd.read_csv("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv")

In [None]:
data.head()

In [None]:
data.columns

In [None]:
data.index

## Dataset summaries

In [None]:
data.shape

In [None]:
len(data) # Count rows

In [None]:
data.info()

In [None]:
data.describe()

In [None]:
data.describe(include="all")

In [None]:
data.groupby("class").describe()

In [None]:
data.groupby("class").describe().T

In [None]:
data['class'].value_counts()

In [None]:
data['class'].nunique()

In [None]:
%matplotlib inline

import seaborn as sns

d = sns.load_dataset('planets')

d.summary()


## Data selection by column

In [None]:
data["fare"]

In [None]:
data.fare

In [None]:
data[["class", "sex", "fare"]]

In [None]:
data.loc[:,'age':'fare'].head()

In [None]:
data.iloc[:,[1,2,5]]; # Returns columns 1, 2 and 5

In [None]:
data.loc[data['age'] < 10, ['class','sex']];

In [None]:
data.head()

In [None]:
data.tail()

## Data selection by row

In [None]:
data[data['class'] == "First"];

In [None]:
data[data['age'] < 30];

In [None]:
data.drop_duplicates();

In [None]:
data.iloc[5:10]

In [None]:
data.nlargest(3, "age")

In [None]:
data.nsmallest(3, "age")

In [None]:
data.query("(age > 10) and (age < 15)")

## Missing data

In [None]:
data['deck'].isna().sum()

In [None]:
# Compare the row count before and after discarding every row that contains
# at least one missing value.
print("Rows: {}".format(data.shape[0]))
cleanData = data.dropna(axis=0, how='any')
print("Rows: {}".format(cleanData.shape[0]))

In [None]:
print(data['deck'].head())
cleanData = data.fillna("FILLED")
print(cleanData['deck'].head())

## Make new columns

In [None]:
# Add a derived column: the element-wise sum of the 'sibsp' and 'parch' columns.
# NOTE(review): the column name 'sibilings' is misspelled, and the same string
# is used by the drop cells further down; if it is corrected, rename every
# occurrence together.
data['sibilings'] = data['sibsp'] + data['parch']
# Preview the first rows whose new column is greater than 1.
data[data['sibilings'] > 1].head()

## Delete rows and columns

In [None]:
# drop() returns a new frame without the column; `data` itself is unchanged,
# which the two printed column lists make visible.
clean = data.drop(columns='sibilings')
print(data.columns, clean.columns, sep='\n')

In [None]:
# Remove the column from `data` itself instead of building a copy.
data.drop(columns=['sibilings'], inplace=True)
print(data.columns)

In [None]:
data.drop(['sibsp', 'parch'], axis=1); # Delete several columns

In [None]:
data.drop([10,12,20], axis=0); # Delete several rows

## Group data

In [None]:
data.groupby('class')

In [None]:
# Mean of every numeric column per passenger class.
# numeric_only=True skips the text columns (e.g. 'sex'), which recent pandas
# versions refuse to average and raise TypeError for. Calling .mean() directly
# also avoids the deprecated practice of passing np.mean to aggregate().
data.groupby('class').mean(numeric_only=True)

In [None]:
# Mean age per passenger class.
# Select 'age' before aggregating so that only this column is averaged;
# aggregating the whole frame first fails on its text columns in recent pandas.
data.groupby('class')['age'].mean()

## Plots

First we must prepare the graphical device.

In [None]:
# NOTE(review): with no backend argument this magic presumably selects the
# default GUI backend rather than inline rendering; an earlier cell already
# ran `%matplotlib inline` — confirm that switching backends here is intended.
%matplotlib 

We are ready to plot.

In [None]:
data['fare'].hist()

In [None]:
data.plot.scatter(x="age", y="fare")

## Exercises

## Solutions