* Pandas - High-performance open-source library for data analysis
* Processes datasets of different formats - time series, tabular data, matrix data
* Imports data from CSV, JSON & databases
* Provides extensive operations like slicing, subsetting, merging, groupby, reshaping etc.
* Handles missing data
* Supports statistical analysis
* Pandas objects are consumed by scikit-learn and TensorFlow

In [1]:
import numpy as np
import pandas as pd

#### Two Data Structures
* Series - 1D labeled array (backed by a NumPy array) with an index
* DataFrame - Tabular data with heterogeneous columns

In [2]:
ser1 = pd.Series(data=[1,2,3,3,4], index=['a','b','c','d','e'])

In [3]:
ser1

a    1
b    2
c    3
d    3
e    4
dtype: int64

In [4]:
ser1.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [5]:
ser1.values

array([1, 2, 3, 3, 4], dtype=int64)

In [6]:
ser1['a']

1

In [7]:
ser2 = pd.Series([1,2,3])

In [8]:
ser2

0    1
1    2
2    3
dtype: int64

#### Convert dictionary to series

In [9]:
db = {'abc':'hello','def':'yello','jkl':'good'}

In [10]:
pd.Series(db)

abc    hello
def    yello
jkl     good
dtype: object

#### Convert scalar value to series

In [11]:
pd.Series(0, index=['a','b','c'])

a    0
b    0
c    0
dtype: int64

#### Accessing series

In [16]:
ser1 = pd.Series(data=[5,6,6,7], index=['a','b','c','d'])

In [17]:
# Slice by index label.
# Label-based slices include the end label, so 'c' is part of the result.
ser1[:'c']

a    5
b    6
c    6
dtype: int64

In [18]:
# Slice by integer position.
# Positional slices exclude the stop position, so only positions 0 and 1 are returned.
ser1[:2]

a    5
b    6
dtype: int64

In [15]:
ser1

a    1
b    2
c    3
d    3
e    4
dtype: int64

In [19]:
ser1['b':'d']

b    6
c    6
d    7
dtype: int64

In [22]:
# Combine two Series end-to-end.
# Series.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported replacement. Index labels from both inputs are kept as-is
# (pass ignore_index=True to renumber the result 0..n-1 instead).
pd.concat([ser1, ser2])

a    5
b    6
c    6
d    7
0    1
1    2
2    3
dtype: int64

In [23]:
ser1.to_dict()

{'a': 5, 'b': 6, 'c': 6, 'd': 7}

### DataFrames
* Analogous to a spreadsheet
* Collection of Series
* mutable - contents changeable
* heterogeneous - different columns can hold different types of data

#### Create dataframe from multiple series

In [25]:
ser1 = pd.Series([100,200,300,400], index=['a','b','c','d'])

In [29]:
ser2 = pd.Series([222,333,444,555,666], index=['a','c','d','b','e'])

In [30]:
# Build a DataFrame from a dict of Series: keys become column names and rows
# are aligned on the index labels. Label 'e' exists only in ser2, so column
# 's1' gets NaN for that row (and is therefore displayed as float).
df = pd.DataFrame({
    's1':ser1,
    's2':ser2
})

In [31]:
df

Unnamed: 0,s1,s2
a,100.0,222
b,200.0,555
c,300.0,333
d,400.0,444
e,,666


In [32]:
ser3 = pd.Series(data=[1,2,3,4,5], index=['a','b','b','c','d'])

In [33]:
ser3

a    1
b    2
b    3
c    4
d    5
dtype: int64

In [35]:
# With duplicate index labels, a label slice ending at 'b' includes every 'b' entry
ser3[:'b']

a    1
b    2
b    3
dtype: int64

#### Access column

In [37]:
df['s1']

a    100.0
b    200.0
c    300.0
d    400.0
e      NaN
Name: s1, dtype: float64

In [40]:
# Selecting with a list of column names (double brackets) returns a DataFrame
df[['s1','s2']]

Unnamed: 0,s1,s2
a,100.0,222
b,200.0,555
c,300.0,333
d,400.0,444
e,,666


In [41]:
# Delete a column in place: df no longer has 's1' after this cell runs
del df['s1']

In [42]:
df

Unnamed: 0,s2
a,222
b,555
c,333
d,444
e,666


#### Add a new column

In [44]:
# Assigning to a new key adds a column; the arithmetic is applied element-wise to 's2'
df['s3'] = df.s2 + 100

In [45]:
df

Unnamed: 0,s2,s3
a,222,322
b,555,655
c,333,433
d,444,544
e,666,766


In [46]:
# A scalar plus an index yields a Series with that value repeated for every label of df's index
s4 = pd.Series('hello', index=df.index)

In [48]:
df['s4'] = s4

In [49]:
df

Unnamed: 0,s2,s3,s4
a,222,322,hello
b,555,655,hello
c,333,433,hello
d,444,544,hello
e,666,766,hello


#### Create dataframe from numpy

In [55]:
# Build a DataFrame from a 2-D NumPy array: each inner list is one row,
# `index` supplies the row labels and `columns` the column labels.
pd.DataFrame(
    np.array([[1, 2, 3, 4],
              [5, 6, 7, 8]]),
    index=['a', 'b'],
    columns=['u', 'v', 'w', 'x'],
)

Unnamed: 0,u,v,w,x
a,1,2,3,4
b,5,6,7,8


### Creating from different file formats

In [2]:
hr_data = pd.read_csv(r'HR_comma_sep.csv')

In [58]:
hr_data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [59]:
hr_data.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
       'promotion_last_5years', 'sales', 'salary'],
      dtype='object')

In [60]:
hr_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
satisfaction_level       14999 non-null float64
last_evaluation          14999 non-null float64
number_project           14999 non-null int64
average_montly_hours     14999 non-null int64
time_spend_company       14999 non-null int64
Work_accident            14999 non-null int64
left                     14999 non-null int64
promotion_last_5years    14999 non-null int64
sales                    14999 non-null object
salary                   14999 non-null object
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [61]:
hr_data.describe()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years
count,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0
mean,0.612834,0.716102,3.803054,201.050337,3.498233,0.14461,0.238083,0.021268
std,0.248631,0.171169,1.232592,49.943099,1.460136,0.351719,0.425924,0.144281
min,0.09,0.36,2.0,96.0,2.0,0.0,0.0,0.0
25%,0.44,0.56,3.0,156.0,3.0,0.0,0.0,0.0
50%,0.64,0.72,4.0,200.0,3.0,0.0,0.0,0.0
75%,0.82,0.87,5.0,245.0,4.0,0.0,0.0,0.0
max,1.0,1.0,7.0,310.0,10.0,1.0,1.0,1.0


#### Indexing & Selecting

In [63]:
hr_data[1:2]

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
1,0.8,0.86,5,262,6,0,1,0,sales,medium


#### loc - access rows by index label (slice end is inclusive); iloc - access rows by integer position (slice end is exclusive)

In [64]:
# Label-based slice: both end labels are included, so rows labelled 1 through 4 are returned
hr_data.loc[1:4]

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [65]:
# Position-based slice: the stop position is excluded, so only positions 1, 2 and 3 are returned
hr_data.iloc[1:4]

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low


In [3]:
movie_data = pd.read_json('movie.json.txt')

In [4]:
movie_data

Unnamed: 0,Adam Cohen,Bill Duffy,Brenda Peterson,Chris Duncan,Clarissa Jackson,David Smith,Julie Hammel,Samuel Miller
Goodfellas,4.5,4.5,2.0,,2.5,4.5,3.0,5.0
Raging Bull,,,1.0,4.5,4.0,3.0,,5.0
Roman Holiday,3.0,,4.5,,1.5,,4.5,1.0
Scarface,3.0,5.0,1.5,,4.5,4.5,2.5,3.5
The Apartment,1.0,1.0,5.0,1.5,1.0,1.0,,1.0
Vertigo,3.5,4.5,3.0,,5.0,4.0,,


In [68]:
movie_data.columns

Index(['Adam Cohen', 'Bill Duffy', 'Brenda Peterson', 'Chris Duncan',
       'Clarissa Jackson', 'David Smith', 'Julie Hammel', 'Samuel Miller'],
      dtype='object')

In [69]:
movie_data.index

Index(['Goodfellas', 'Raging Bull', 'Roman Holiday', 'Scarface',
       'The Apartment', 'Vertigo'],
      dtype='object')

In [71]:
movie_data.loc['Goodfellas':'Scarface']

Unnamed: 0,Adam Cohen,Bill Duffy,Brenda Peterson,Chris Duncan,Clarissa Jackson,David Smith,Julie Hammel,Samuel Miller
Goodfellas,4.5,4.5,2.0,,2.5,4.5,3.0,5.0
Raging Bull,,,1.0,4.5,4.0,3.0,,5.0
Roman Holiday,3.0,,4.5,,1.5,,4.5,1.0
Scarface,3.0,5.0,1.5,,4.5,4.5,2.5,3.5


In [72]:
movie_data.iloc[1:3]

Unnamed: 0,Adam Cohen,Bill Duffy,Brenda Peterson,Chris Duncan,Clarissa Jackson,David Smith,Julie Hammel,Samuel Miller
Raging Bull,,,1.0,4.5,4.0,3.0,,5.0
Roman Holiday,3.0,,4.5,,1.5,,4.5,1.0


### Filtering when accessing

In [74]:
# Boolean mask: keep only the rows (movies) where the 'Samuel Miller' rating is above 3
movie_data.loc[movie_data['Samuel Miller'] > 3]

Unnamed: 0,Adam Cohen,Bill Duffy,Brenda Peterson,Chris Duncan,Clarissa Jackson,David Smith,Julie Hammel,Samuel Miller
Goodfellas,4.5,4.5,2.0,,2.5,4.5,3.0,5.0
Raging Bull,,,1.0,4.5,4.0,3.0,,5.0
Scarface,3.0,5.0,1.5,,4.5,4.5,2.5,3.5


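* Filtering when accessing
* movie_data['Samuel Miller'] > 3 returns a boolean Series that keeps the original index labels
* movie_data.iloc[movie_data['Samuel Miller'] > 3] won't work - iloc is purely positional and rejects a boolean Series; convert the mask with .to_numpy() first if iloc is required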

In [79]:
# Filter rows and pick columns in a single .loc call: the boolean mask selects
# the rows and the list selects the columns. This avoids chained indexing
# (`.loc[mask][cols]`), which builds an intermediate frame first and is unsafe
# if the expression is ever used for assignment.
movie_data.loc[movie_data['Samuel Miller'] > 3, ['Samuel Miller', 'Julie Hammel']]

Unnamed: 0,Samuel Miller,Julie Hammel
Goodfellas,5.0,3.0
Raging Bull,5.0,
Scarface,3.5,2.5


In [80]:
# .at reads a single cell by row label and column label (this cell is missing, hence nan)
movie_data.at['Raging Bull','Julie Hammel']

nan

In [82]:
# .iat reads a single cell by integer position: row 2 ('Roman Holiday'), column 4 ('Clarissa Jackson')
movie_data.iat[2,4]

1.5