A quick reference kernel for anyone who works with pandas often. 

This kernel will be updated with more of pandas' methods and techniques; for now it covers the most important ones.

## Contents

1. [Object Creation](#obj)
2. [Viewing Data](#vdt)
3. [Selection](#slt)
4. [Selection By Position](#sltp)
5. [Boolean Indexing](#bi)
6. [Missing data](#md)
7. [Operations](#opt)
8. [Histogramming](#hgm)
9. [JOINS](#jns)
10. [Grouping](#grp)
11. [Reshaping](#rsh)
12. [Pivot tables](#pvt)
13. [Time Series](#tst)
14. [Categoricals](#cts)
15. [Saving and Writing data](#swd)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
print(os.listdir("../input"))

In [None]:
data = pd.read_csv("../input/winemag-data_first150k.csv")

In [None]:
data.head(10)

<a id='obj'></a>

## Object Creation - Series

In [None]:
s = pd.Series(data['points'])
s
type(s)

In [None]:
# Six consecutive calendar days starting on 2013-01-01, used as a row index.
dates = pd.date_range(start='20130101', periods=6, freq='D')
dates

In [None]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df

### The columns of the DataFrame have different dtypes

In [None]:
data.dtypes

<a id='vdt'></a>

#  Viewing Data

#### Here is how to view the top and bottom rows of the frame:

In [None]:
data.head(5)

In [None]:
data.tail(5)

In [None]:
#### Display the index, columns, and the underlying NumPy data:

data.index

In [None]:
# columns

data.columns

In [None]:
# values

data.values

In [None]:
# check stats of the numerical columns

data.describe()

In [None]:
# Transposing your data:

data.T

In [None]:
# Sorting by an axis:

data.sort_index(axis=1, ascending=False)

In [None]:
# Sorting by values:

data.sort_values(by='country')

<a id='slt'></a>

# SELECTION

In [None]:
# Selecting a single column, which yields a Series, equivalent to data.province:

data['province'].head(5)  # head(5) to print only the top 5 rows

In [None]:
# Selecting via [], which slices the rows.

data[0:3] # the slice end is exclusive, so this returns rows 0, 1 and 2

In [None]:
# Selection by Label

data.loc[0]

In [None]:
# Selecting on a multi-axis by label:

data.loc[:,['province','winery']].head(5)

<a id='sltp'></a>

# Selection by Position

In [None]:
# Select via the position of the passed integers:

data.iloc[3]

In [None]:
# By integer slices, acting similar to numpy/python:

data.iloc[3:5,0:2]

In [None]:
# By lists of integer position locations, similar to the numpy/python style:

data.iloc[[1,2,4],[0,2]]

In [None]:
# For slicing rows explicitly:

data.iloc[1:3,:]

In [None]:
#For slicing columns explicitly:

data.iloc[:,1:3].head(4)

In [None]:
# For getting a value explicitly:

data.iloc[1,1]

In [None]:
# For getting fast access to a scalar (equivalent to the prior method):

data.iat[1,1]

<a id='bi'></a>

# Boolean Indexing

In [None]:
# Using a single column’s values to select data.

data[data.price > 100].head(5)

In [None]:
# Selecting values from a DataFrame where a boolean condition is met.
# Cells that fail the condition are replaced by NaN.
# The mask is built on the numeric columns only: ordering text columns
# (e.g. 'country', 'description') against 0 raises a TypeError.

numeric = data.select_dtypes(include='number')
numeric[numeric > 0]

In [None]:
# Using the isin() method for filtering:

data[data['country'].isin(['US', 'Spain'])].head(5)

<a id='md'></a>

# Missing Data

In [None]:
#  To drop any rows that have missing data.

data.dropna(how='any')

In [None]:
# Filling missing data.

data.fillna(value=5)

In [None]:
# To get the boolean mask where values are nan.

pd.isna(data)

<a id='opt'></a>

# Operations

In [None]:
# Stats operations in general exclude missing data.

# Performing a descriptive statistic (one mean per column).
# numeric_only=True skips the text columns, for which a mean is undefined:

data.mean(numeric_only=True)

# Same operation on the other axis (one mean per row):

data.mean(axis=1, numeric_only=True)

In [None]:
s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)
s

In [None]:
df.sub(s, axis='index')

In [None]:
## Apply
# Applying functions to the data.
# Series.apply calls the function once per element, so np.cumsum would only
# ever see single values and no running total would be produced.
# DataFrame.apply hands each whole column to the function instead, which
# yields the intended cumulative sum of 'price'.

data[['price']].apply(np.cumsum).head(5)

In [None]:
# Spread of the 'points' column: largest value minus smallest value.
a = data['points']

a.max() - a.min()

<a id='hgm'></a>

# Histogramming

In [None]:
data['price'].value_counts().head(10)

In [None]:
# String Methods

# Series is equipped with a set of string processing methods in the str attribute that make it easy to operate on each element of the array,
# Note that pattern-matching in str generally uses regular expressions by default (and in some cases always uses them)

strs = data['description']

In [None]:
# lower case

strs.str.lower().head(10)

In [None]:
## Merge

# Concat

# pandas provides various facilities for easily combining together Series and DataFrame objects with various kinds of set logic for the indexes and relational algebra functionality in the case of join / merge-type operations.
# Concatenating pandas objects together with concat():

# Break the frame into three row-wise pieces: rows 0-2, rows 3-6 and rows 7 onwards.
pieces = [data[:3], data[3:7], data[7:]]

# Stack the pieces back on top of each other, in the order listed.
pd.concat(pieces)

<a id='jns'></a>

# JOIN

In [None]:
# Two small frames that share a duplicated join key ('foo' appears twice in each).
left = pd.DataFrame(dict(key=['foo'] * 2, lval=[1, 2]))
right = pd.DataFrame(dict(key=['foo'] * 2, rval=[4, 5]))

In [None]:
pd.merge(left, right, on='key')

In [None]:
# Same frames as before, but every join key is unique this time.
left = pd.DataFrame([('foo', 1), ('bar', 2)], columns=['key', 'lval'])
right = pd.DataFrame([('foo', 4), ('bar', 5)], columns=['key', 'rval'])

In [None]:
pd.merge(left, right, on='key')

In [None]:
# Append

# Appending rows to a DataFrame.
# Start from an 8 x 4 frame of standard-normal samples with columns A-D.

df = pd.DataFrame(data=np.random.randn(8, 4), columns=list('ABCD'))
df

In [None]:
s = data.iloc[3]
s

In [None]:
## Appending
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the row
# is appended with pd.concat instead. The Series `s` is turned into a one-row
# frame first (to_frame().T); infer_objects() restores proper column dtypes
# after the transpose. ignore_index=True renumbers the rows 0..n.

pd.concat([data, s.to_frame().T.infer_objects()], ignore_index=True)

<a id='grp'></a>

# Grouping 

#### By “group by” we are referring to a process involving one or more of the following steps:

#### Splitting the data into groups based on some criteria
#### Applying a function to each group independently
#### Combining the results into a data structure

In [None]:
# group by single column

data.groupby('country')['price'].sum().head(10)

In [None]:
# group by multiple columns

data.groupby(['country', 'province'])['price'].sum().head(10)

<a id='rsh'></a>

# Reshaping

In [None]:
# Stack

# The stack() method “compresses” a level in the DataFrame’s columns.

stacked = data.stack()
stacked

In [None]:
## Unstack()

stacked.unstack()

In [None]:
## Change in axis

stacked.unstack(0)

<a id='pvt'></a>

#  Pivot tables

In [None]:
# We can produce pivot tables from this data very easily:

pd.pivot_table(data, values='price', index=['country', 'province'], columns=['points'])

<a id='tst'></a>

# Time Series

In [None]:
#  pandas has simple, powerful, and efficient functionality for performing resampling operations during frequency conversion (e.g., converting secondly data into 5-minutely data).
#  This is extremely common in, but not limited to, financial applications.

In [None]:
# 100 one-second samples of random integers, down-sampled into 5-minute bins.
# The lowercase aliases 's' and '5min' are used because the older spellings
# 'S' and '5Min' are deprecated in recent pandas releases.
rng = pd.date_range('1/1/2012', periods=100, freq='s')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts.resample('5min').sum()

In [None]:
# Time zone representation:
# five daily timestamps (created without a time zone) carrying random values.

rng = pd.date_range(start='3/6/2012 00:00', periods=5, freq='D')
ts = pd.Series(data=np.random.randn(rng.size), index=rng)

In [None]:
ts

In [None]:
# Attach a time zone: tz_localize marks the timestamps of `ts` as UTC

ts_utc = ts.tz_localize('UTC')
ts_utc

In [None]:
# Converting to another time zone:

ts_utc.tz_convert('US/Eastern')

In [None]:
# Converting between time span representations:

rng = pd.date_range('1/1/2012', periods=5, freq='M')

ts = pd.Series(np.random.randn(len(rng)), index=rng)

In [None]:
# Converting between period and timestamp makes some convenient arithmetic available.
# The example starts from quarterly periods whose fiscal year ends in November;
# they are later moved to 9am of the end of the month following the quarter end.

prng = pd.period_range(start='1990Q1', end='2000Q4', freq='Q-NOV')
prng

In [None]:
ts = pd.Series(np.random.randn(len(prng)), prng)
ts

In [None]:
# Re-index `ts`: take the last month of each quarter, step one month forward,
# convert to the first hour of that month and add 9 hours.
# 'h' replaces the deprecated hourly alias 'H'; `how` is spelled out so the
# end/start alignment of each conversion is explicit.
ts.index = (prng.asfreq('M', how='end') + 1).asfreq('h', how='start') + 9

ts.head()

<a id='cts'></a>

# Categoricals

In [None]:
# Convert the 'country' column to a categorical data type (the column is replaced in place).

data["country"] = data["country"].astype("category")
data["country"].head(10)

In [None]:
# Sorting is per order in the categories, not lexical order.

data.sort_values(by="country")

In [None]:
# Grouping by a categorical column also shows empty categories.

data.groupby("country").size()

In [None]:
# Plotting

In [None]:
#ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
ts = data['points']
ts = ts.cumsum()

ts.plot()

In [None]:
# Running total of every column, drawn as one line per column.
df = df.cumsum()

# DataFrame.plot() opens its own figure when no `ax` is given, so a preceding
# plt.figure() call would only leave an empty figure behind. The legend is set
# on the axes that plot() returns.
df.plot().legend(loc='best')

<a id='swd'></a>

# Saving and writing data

In [None]:
# CSV

# Writing to a csv file.

df.to_csv('foo.csv')

# Reading

pd.read_csv('foo.csv')

# Writing to a HDF5 Store.
# `key` names the group inside the file; it is passed by keyword because
# positional use of this argument is deprecated in recent pandas releases.

df.to_hdf('foo.h5', key='df')

# Reading from a HDF5 Store.

pd.read_hdf('foo.h5', key='df')

# Writing to an excel file.

df.to_excel('foo.xlsx', sheet_name='Sheet1')

# Reading from an excel file; the literal string 'NA' is treated as missing.

pd.read_excel('foo.xlsx', sheet_name='Sheet1', index_col=None, na_values=['NA'])

### More to be added to this