In [14]:
import numpy as np
import pandas as pd
from pandas import Series

## Series

In [3]:
Series?

In [4]:
animals = ['tiger','shetta','monkey']
capitals = {
    'Egypt' : 'Cairo',
    'UK' : 'London',
    'France' : 'Paris'
}
_series = Series([1,2,3,4,5])
_series

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [5]:
Series([1,2,3,4],index=['one','two','three','four'])

one      1
two      2
three    3
four     4
dtype: int64

In [6]:
animals = Series(animals)
animals

0     tiger
1    shetta
2    monkey
dtype: object

In [7]:
capitals = Series(capitals)
print (capitals)
print (capitals.index)

Egypt      Cairo
France     Paris
UK        London
dtype: object
Index(['Egypt', 'France', 'UK'], dtype='object')


In [8]:
capitals.name

In [9]:
animals.name

In [10]:
animals

0     tiger
1    shetta
2    monkey
dtype: object

In [11]:
capitals

Egypt      Cairo
France     Paris
UK        London
dtype: object

In [12]:
animals.append(capitals)

0          tiger
1         shetta
2         monkey
Egypt      Cairo
France     Paris
UK        London
dtype: object

In [13]:
animals

0     tiger
1    shetta
2    monkey
dtype: object

### Quering in Pandas

In [14]:
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports)
s

Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [15]:
s.index

Index(['Archery', 'Golf', 'Sumo', 'Taekwondo'], dtype='object')

In [16]:
s[0]

'Bhutan'

In [17]:
s['Sumo']

'Japan'

In [18]:
sports = {99: 'Bhutan',
          100: 'Scotland',
          101: 'Japan',
          102: 'South Korea'}
s = pd.Series(sports)

In [19]:
# s[0] will throw err 
s.iloc[0]

'Bhutan'

### Operation over series

In [15]:
l = (np.random.rand(10)*20).astype(int)

In [16]:
_series = Series(l)
# LOOPs ARE SLOW !!!!
sum = 0
for x in _series:
    sum += x
print (sum)

83


In [17]:
sum == l.sum()

True

### Using Vectorization

In [23]:
import numpy as np
np.sum(_series) # or use _series.sum()

55

#### Testing Speed

In [37]:
test = pd.Series(np.random.randint(0,10,1000))

In [38]:
test.head()

0    7
1    8
2    1
3    6
4    1
dtype: int64

In [39]:
len(test)

1000

In [68]:
%%timeit -n 100
sum = 0;
for item in test:
    sum += item

63.8 µs ± 2.29 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [65]:
%%timeit -n 100
np.sum(test)

74.8 µs ± 12 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### Broadcasting

In [69]:
# apply operation to every value to the series, and change it 

In [72]:
_series ** 2 # braodcasting -- exponentail by 2

0      1
1    289
2     25
3      4
4      9
5    144
6     16
7     49
8      9
9      1
dtype: int64

## DataFrame

In [1]:
from pandas import DataFrame

In [50]:
_df = DataFrame([
    {'Cost':1,'Name':2,'Total':3},
    {'Cost':5,'Name':3,'Total':9},
    {'Cost':3,'Name':5},
    {'Cost':4,'Name':5}],index=['Store 1','Store 1','Store 2','Store 3'])
_df

Unnamed: 0,Cost,Name,Total
Store 1,1,2,3.0
Store 1,5,3,9.0
Store 2,3,5,
Store 3,4,5,


In [132]:
df = DataFrame([_series.values,_series.values**2,_series.values**3],index=['x','x*2','x*3'])
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
x,3,7,8,0,3,8,17,10,15,12
x*2,9,49,64,0,9,64,289,100,225,144
x*3,27,343,512,0,27,512,4913,1000,3375,1728


In [133]:
df.T

Unnamed: 0,x,x*2,x*3
0,3,9,27
1,7,49,343
2,8,64,512
3,0,0,0
4,3,9,27
5,8,64,512
6,17,289,4913
7,10,100,1000
8,15,225,3375
9,12,144,1728


### Querying Dataframe

In [54]:
_df.loc['Store 2']

Cost     3.0
Name     5.0
Total    NaN
Name: Store 2, dtype: float64

In [134]:
_df.loc['Store 1','Cost']

Store 1     1
Store 1    21
Name: Cost, dtype: int64

#### Column Selection

In [136]:
_df.loc[:,['Total','Name']]

Unnamed: 0,Total,Name
Store 1,3.0,2
Store 1,9.0,3
Store 2,,5
Store 3,,5


### Transpose Dataframe

In [63]:
_df.T

Unnamed: 0,Store 1,Store 1.1,Store 2,Store 3
Cost,1.0,5.0,3.0,4.0
Name,2.0,3.0,5.0,5.0
Total,3.0,9.0,,


In [143]:
_df.loc[:,['Cost','Total']] # or we can directly get column from df df[['Cost']] return dataframe

Unnamed: 0,Cost,Total
Store 1,1,3.0
Store 1,21,9.0
Store 2,3,
Store 3,4,


In [144]:
_df[['Cost','Total']] # will return Series /  _df[['Cost']] will retrn DataFrame

Unnamed: 0,Cost,Total
Store 1,1,3.0
Store 1,21,9.0
Store 2,3,
Store 3,4,


In [76]:
_df.T.loc['Cost']

Store 1    1.0
Store 1    5.0
Store 2    3.0
Store 3    4.0
Name: Cost, dtype: float64

In [95]:
# modification in chanining is the original
_df.loc['Store 1']['Cost'].iloc[1] = 21

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [94]:
_df

Unnamed: 0,Cost,Name,Total
Store 1,1,2,3.0
Store 1,21,3,9.0
Store 2,3,5,
Store 3,4,5,


In [89]:
_df.drop('Store 3') # return a copy rather than change dataframe

Unnamed: 0,Cost,Name,Total
Store 1,1,2,3.0
Store 1,23,3,9.0
Store 2,3,5,


In [105]:
# make a copy 
copy_df = _df.copy()
# copy_df.drop(1,inplace=True,axis=1)

In [113]:
copy_df.drop('Cost',axis=1)

Unnamed: 0,Name,Total
Store 1,2,3.0
Store 1,3,9.0
Store 2,5,
Store 3,5,


In [120]:
# Appending new column with default value is None or List of Value
# calculated column for other df columns
_df['Revenue'] = [None, 2, 23, 5]
_df['Calculated Column'] = _df['Cost'] + _df['Total'] # Series from other Series
_df

Unnamed: 0,Cost,Name,Total,Revenue,Calculated Column
Store 1,1,2,3.0,,4.0
Store 1,21,3,9.0,2.0,30.0
Store 2,3,5,,23.0,
Store 3,4,5,,5.0,


### Querying the Columns

In [131]:
_df[['Cost','Total']]

Unnamed: 0,Cost,Total
Store 1,1,3.0
Store 1,21,9.0
Store 2,3,
Store 3,4,


### Quering the Rows

In [126]:
_df.loc['Store 1']

Unnamed: 0,Cost,Name,Total,Revenue,Calculated Column
Store 1,1,2,3.0,,4.0
Store 1,21,3,9.0,2.0,30.0
