# Pandas - Basic Stuff

### Table of Contents

 - [Display output options](#options)
 - [Data structures](#datastructures)
     - [Series](#series)
     - [Data Frames](#dataframes)
 - [loc](#loc)
 - [Adding Data](#adddata)
 - [Concat](#concat)
 - [Comparisions](#comparisions)
 - [Binning](#binning)
 - [Random](#random)

### Style Sheet

In [2]:
from IPython.core.display import HTML
css = open('styles/style-table.css').read() + open('styles/style-notebook.css').read()
HTML('<style>{}</style>'.format(css))

### Imports

In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns


<a id='options'/> 
## Display output options 

In [4]:
pd.set_option("display.max_rows",1000)    # or pd.options.display.max_rows=1000
pd.set_option("display.max_columns",20)   # or pd.options.display.max_columns=20 
pd.set_option('precision',7)
pd.set_option('large_repr', 'truncate')

In [5]:
!ls 

[1m[34mdata[m[m                                 [1m[32mpandas - indexing.ipynb[m[m
[1m[32mlecture_15_pandas_transforming.ipynb[m[m pandas - merging.ipynb
[1m[32mlecture_21_pandas_processing.ipynb[m[m   pandas - pivot tables.ipynb
[1m[32mlecture_22_pandas_cleaning.ipynb[m[m     pandas - read n write.ipynb
pandas - basic.ipynb                 pandas - reshaping.ipynb
pandas - cleaning n processing.ipynb pandas - timeseries.ipynb
pandas - finance.ipynb               pandas - visualizations.ipynb
[1m[32mpandas - grouping.ipynb[m[m              [1m[34mstyles[m[m
pandas - grouping2.ipynb             [1m[34mtemp[m[m


<a id='datastructures'/>
## Data Structures

<a id="series"/>

### Series

In [6]:
# Create Series
s1 = pd.Series(np.arange(5))
s1

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [7]:
# Create Series with Index
s2 = pd.Series([2, 3, -5, 3], index=['d', 'b', 'a', 'c'])
print("SERIES:\n{}, \n INDEX:\n{}, \n VALUES:\n{}".format(s2, s2.index, s2.values))

SERIES:
d    2
b    3
a   -5
c    3
dtype: int64, 
 INDEX:
Index(['d', 'b', 'a', 'c'], dtype='object'), 
 VALUES:
[ 2  3 -5  3]


In [8]:
s2.value_counts(sort=True, ascending=True)

-5    1
 2    1
 3    2
dtype: int64

In [9]:
sdata = {'Ohio': 1000, 'Texas': 2000, 'Oregon': 3000, 'Utah': 4000}
s3 = pd.Series(sdata)
s3

Ohio      1000
Oregon    3000
Texas     2000
Utah      4000
dtype: int64

In [10]:
s3['Ohio']

1000

In [11]:
s3['CA'] = 5000
s3

Ohio      1000
Oregon    3000
Texas     2000
Utah      4000
CA        5000
dtype: int64

In [12]:
s3.name="States"
s3.index = ['OH', 'OR', 'TX', 'UT', 'CA']
s3[:]

OH    1000
OR    3000
TX    2000
UT    4000
CA    5000
Name: States, dtype: int64

In [13]:
'CA' in s3

True

In [14]:
s3*2

OH     2000
OR     6000
TX     4000
UT     8000
CA    10000
Name: States, dtype: int64

In [15]:
s3.mean()

3000.0

In [16]:
isUniq = s3.is_unique
isNull = s3.isnull()
isTS = s3.index.is_all_dates

print(" Unique:{}\n TimeSeries:{}\n IsNull:{}".format(isUniq, isTS, isNull))

 Unique:True
 TimeSeries:False
 IsNull:OH    False
OR    False
TX    False
UT    False
CA    False
Name: States, dtype: bool


<a id="dataframes" /a>

### Data Frame

In [17]:
df = pd.DataFrame({'group': ['a', 'a', 'a', 'b','b', 'b', 'c', 'c','c'],
                 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

print("dim:{}, shape:{}, rows:{}, columns:{}".format(df.ndim, df.shape, len(df), len(df.columns)))
df.head()

dim:2, shape:(9, 2), rows:9, columns:2


Unnamed: 0,group,ounces
0,a,4.0
1,a,3.0
2,a,12.0
3,b,6.0
4,b,7.5


In [18]:
df.columns

Index(['group', 'ounces'], dtype='object')

In [19]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [20]:
df.index
df.values

array([['a', 4.0],
       ['a', 3.0],
       ['a', 12.0],
       ['b', 6.0],
       ['b', 7.5],
       ['b', 8.0],
       ['c', 3.0],
       ['c', 5.0],
       ['c', 6.0]], dtype=object)

In [21]:
# Show num columns, rows, data type, memory usage etc 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 2 columns):
group     9 non-null object
ounces    9 non-null float64
dtypes: float64(1), object(1)
memory usage: 224.0+ bytes


In [22]:
df['group']
df.group.head()
#df.loc[:3]

0    a
1    a
2    a
3    b
4    b
Name: group, dtype: object

In [23]:
df[['group','ounces']].head()

Unnamed: 0,group,ounces
0,a,4.0
1,a,3.0
2,a,12.0
3,b,6.0
4,b,7.5


In [24]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
df1 = pd.DataFrame(pop, index=[2001, 2002, 2003])
df1.index.name = "years"
df1.columns.name = "states"
df1

states,Nevada,Ohio
years,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [25]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
df1 = pd.DataFrame(pop, index=[2001, 2002, 2003])
df1.index.name = "years"
df1.columns.name = "states"
df1.isnull()

states,Nevada,Ohio
years,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,False,False
2002,False,False
2003,True,True


In [26]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
df1 = pd.DataFrame(pop, index=[2001, 2002, 2003])
df1.index.name = "years"
df1.columns.name = "states"
df1.fillna(100)

states,Nevada,Ohio
years,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2003,100.0,100.0


<a id='loc' />

### Loc

In [27]:
df1.ix[[2001, 2002], ['Ohio', 'indebt']]

states,Ohio,indebt
years,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,1.7,
2002,3.6,


In [28]:
# Position based indexing
df1.iloc[:2][['Ohio','indebt']]

KeyError: "['indebt'] not in index"

In [None]:
# Label based indexing
df1.loc[[2001,2002]][['Ohio','indebt']]

<a id='adddata' />

### Adding Data

In [None]:
names = ['John', 'Matt', 'Sara', 'Jim', 'Ashley']
ids = [ 23, 34, 83, 86, 12]
debt= range(5)
balance = [10.2, 84.3, 72.9, 27.1, 223.1]

users = pd.DataFrame({'names':names, 'bal': balance, 'debt':debt}, index=ids)
users

In [None]:
#Add new columns
users['credit'] = np.arange(5)*10
users['indebt'] = users['debt'] > 0
users.head()

In [None]:
names = ['John', 'Matt', 'Sara', 'Jim', 'Ashley']
ids = [ 23, 34, 83, 86, 12]
balance = [10.2, 84.3, 72.9, 27.1, 223.1]

users = pd.DataFrame({'names':names, 'bal': balance}, index=ids)
users

In [None]:
# Appending a row
users.loc[5] = 6
users

In [None]:
# Appending a column
users.loc[:,'bal2'] = users.loc[:,'bal']*10
users

<a id="concat" />
----
### Concat

In [None]:
names = ['John', 'Matt', 'Sara', 'Jim', 'Ashley']
ids = [ 23, 34, 83, 86, 12]
debt= range(5)
balance = [10.2, 84.3, 72.9, 27.1, 223.1]
users = pd.DataFrame({'names':names, 'bal': balance, 'debt':debt}, index=ids)

n_users = users.copy()
df_list = [n_users, users]

all_users = pd.concat(df_list)
all_users

In [None]:
# Reset index
all_users.reset_index().head()

#### Concatenate by columns

In [None]:
names = ['John', 'Matt', 'Sara', 'Jim', 'Ashley']
ids = [ 23, 34, 83, 86, 12]
debt= range(5)
balance = [10.2, 84.3, 72.9, 27.1, 223.1]

users = pd.DataFrame({'names':names, 'bal': balance, 'debt':debt}, index=ids)
n_users = users.copy()

df_list = [n_users, users]

all_users = pd.concat(df_list, axis='columns')
all_users

#### Concatenate by Rows

In [None]:
names = ['John', 'Matt', 'Sara', 'Jim', 'Ashley']
ids = [ 23, 34, 83, 86, 12]
debt= range(5)
balance = [10.2, 84.3, 72.9, 27.1, 223.1]

users = pd.DataFrame({'names':names, 'bal': balance, 'debt':debt}, index=ids)
n_users = users.copy()
df_list = [n_users, users]

all_users = pd.concat(df_list, axis=0)
all_users.head(7)

In [None]:
# Create hierarchical index by key
all_users = pd.concat(df_list, keys=['x', 'y', 'z'])
all_users

In [None]:
# Access dataframe by key
all_users.ix['x']

In [None]:
# Inner Join ---> Intersection
# Outer Join ---> Union
pd.concat(df_list, axis=1, join='inner')

In [None]:
# Concatenation via append
users2 = users.copy()
users.append(users2, ignore_index=True)

In [None]:
# Concatenation of mixed dims
s1 = pd.Series(['X0', 'X1', 'X2', 'X3'], name='X')
pd.concat([users,s1], ignore_index=True, axis=0)

In [None]:
users.append(s1, ignore_index=True)

<a id='binning' />

### Binning

In [None]:
ages = [20,22,25,27,21,23,37,31,61,54,41,32]
bins = [18,25,35,60,100]
cats = pd.cut(ages ,bins)

cats

In [None]:
ages = [20,22,25,27,21,23,37,31,61,54,41,32]
bins = [18,25,35,60,100]
cats = pd.cut(ages ,bins)

cats.categories

In [None]:
cats.rename_categories(['a','b','c','d'], inplace=True)
cats

<a id="comparisions" />

### Comparisons

In [None]:
df1 = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
df2 = pd.DataFrame({'A': [2,3], 'B': ['b', 'g']})

In [None]:
#Which occurences does df.A have in common with df2.A?
df1.A.isin(df2.A)

In [None]:
# Which occurences does df.A NOT have in common with df2.A?
-df1.A.isin(df2.A)

In [None]:
subset = df1[-df1.A.isin(df2.A)]
subset

In [None]:
s = pd.DataFrame(np.random.randn(6).reshape([3,2]), columns=list('ab'))
s

In [None]:
s['rank']=s.b.rank()
s

In [None]:
s.rank(axis=0)

In [None]:
s.rank(axis=1)

### Random

In [None]:
countries = np.array(['US', 'UK', 'GR', 'JP'])
key = countries[np.random.randint(0, 4, 10)]
key

In [50]:
f = pd.cut(range(21), 5, retbins=True)
f

([(-0.02, 4], (-0.02, 4], (-0.02, 4], (-0.02, 4], (-0.02, 4], ..., (12, 16], (16, 20], (16, 20], (16, 20], (16, 20]]
 Length: 21
 Categories (5, object): [(-0.02, 4] < (4, 8] < (8, 12] < (12, 16] < (16, 20]],
 array([ -0.02,   4.  ,   8.  ,  12.  ,  16.  ,  20.  ]))

In [48]:
f=pd.qcut(range(21), 5)
f

[[0, 4], [0, 4], [0, 4], [0, 4], [0, 4], ..., (12, 16], (16, 20], (16, 20], (16, 20], (16, 20]]
Length: 21
Categories (5, object): [[0, 4] < (4, 8] < (8, 12] < (12, 16] < (16, 20]]