In [1]:
# Load the libraries used throughout this notebook (no data files are read in this cell).
import pandas as pd
import numpy as np
from numpy.random import randn
# Seed NumPy's global random generator so that frames built with randn are reproducible
# when the notebook is re-run from a fresh kernel.
np.random.seed(101)

In [2]:
# Build the small in-memory Series / DataFrames that the examples below operate on.
# Everything is constructed inline; nothing is read from disk.

# Labelled Series keyed by country name.
ser1 = pd.Series(
    data=[1, 2, 3, 4],
    index=['USA', 'Germany', 'USSR', 'Japan'],
)

# 5x4 frame of random draws; the values depend on the NumPy RNG state at call time.
df_basic = pd.DataFrame(
    data=randn(5, 4),
    index='A B C D E'.split(),
    columns='W X Y Z'.split(),
)

# Frame with deliberately missing entries, used by the NA-handling examples.
df_miss = pd.DataFrame(
    data={
        'A': [1, 2, np.nan],
        'B': [5, np.nan, np.nan],
        'C': [1, 2, 3],
    }
)

# Sales records per company / person; the raw dict is kept alongside the frame.
data_grp = {
    'Company': ['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB'],
    'Person': ['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah'],
    'Sales': [200, 120, 340, 124, 243, 350],
}
df_grp = pd.DataFrame(data=data_grp)

# Mixed numeric / string frame for the column-operation examples.
df_op = pd.DataFrame(
    data={
        'col1': [1, 2, 3, 4],
        'col2': [444, 555, 666, 444],
        'col3': ['abc', 'def', 'ghi', 'xyz'],
    }
)

# Frequently used Operations

## 0. Data Import

## 1. EDA - High Level

#### 1.1 Determine "type" of an object

In [6]:
# Show the concrete class of `ser1`; the cell's last expression is rendered as its output.
type(ser1)

pandas.core.series.Series

#### 1.2 Looking at summary stats

In [6]:
# Summary statistics per column, flipped so that each original column becomes one row.
df_basic.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
W,5.0,0.343858,1.681131,-2.018168,0.188695,0.190794,0.651118,2.70685
X,5.0,0.453764,1.061385,-0.758872,-0.319318,0.628133,0.740122,1.978757
Y,5.0,0.452287,1.454516,-0.933237,-0.848077,0.528813,0.907969,2.605967
Z,5.0,0.431871,0.594708,-0.589001,0.503826,0.605965,0.683509,0.955057


In [7]:
# Print a structural overview of the frame: index range, per-column non-null counts,
# dtypes and memory usage.
df_basic.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, A to E
Data columns (total 4 columns):
W    5 non-null float64
X    5 non-null float64
Y    5 non-null float64
Z    5 non-null float64
dtypes: float64(4)
memory usage: 200.0+ bytes


## 2. Conditional Selection

In [3]:
# Keep only the rows where W is positive AND Y is greater than 1
# (two boolean masks combined element-wise with &).
df_basic.loc[df_basic['W'].gt(0) & df_basic['Y'].gt(1)]

Unnamed: 0,W,X,Y,Z
E,0.190794,1.978757,2.605967,0.683509


## 3. Treating NAs

#### 3.1 Dropping NA (here: columns that contain any missing value)

In [4]:
# Return a copy without any column that holds at least one missing value.
df_miss.dropna(axis='columns')

Unnamed: 0,C
0,1
1,2
2,3


#### 3.2 Replacing NA values

In [21]:
# Fill the gaps in column 'A' with that column's own mean.
df_miss['A'].fillna(df_miss['A'].mean())

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64

## 4. EDA - 2: DataFrame Deep Dive

#### 4.1 Sum of a col

In [8]:
# Total of all values in 'col1'.
df_op['col1'].sum()
# Other operations that can be applied to a column in the same way:
# unique, nunique, value_counts, sort_values, isnull, dropna, fillna

10

#### 4.2 Unique values (and how often each occurs) for a given column

In [9]:
# value_counts() lists each distinct value of 'col1' together with the number of times it occurs;
# use .unique() / .nunique() when only the distinct values or their number are needed.
df_op['col1'].value_counts()

4    1
3    1
2    1
1    1
Name: col1, dtype: int64

# Misc 

### 1. %run command

You can execute any Python script from inside the notebook with the `%run` magic command:
> %run ipython_script_test.py

Once the script has finished, the variables and results it defined are available in your current notebook environment.

### 2. timeit:

In [11]:
# Build a 100x100 matrix of random samples so there is something non-trivial to time.
a = np.random.randn(100, 100)
# %timeit is an IPython line magic: it runs the statement many times and reports
# the mean and standard deviation of the run time.
%timeit np.dot(a, a)

17.2 µs ± 855 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


### 3. isinstance

In [12]:
# Two sample numbers: one float, one int.
a, b = 3.5, 4

# isinstance accepts a tuple of types and is True when the object matches any of them.
isinstance(a, (int, float))

True

### 4. Format

> template = '{0:.2f} {1:s} are worth US${2:d}'

> template.format(4.5560, 'Argentine Pesos', 1)

1. `{0:.2f}`: format the first argument as a float with two decimal places
2. `{1:s}`: format the second argument as a string
3. `{2:d}`: format the third argument as an integer

In [14]:
# Positional replacement fields: 0 -> float with two decimals, 1 -> string, 2 -> integer.
template = '{0:.2f} {1:s} are worth US${2:d}'
template.format(
    4.5560,
    'Argentine Pesos',
    1,
)

'4.56 Argentine Pesos are worth US$1'

### 5. List Comprehensions

In [26]:
# Sample words of increasing length; the name `string` is reused by the next example.
string = ['a', 'ba', 'cba', 'dcba']
# Lengths of only those words that are longer than two characters.
[len(word) for word in string if len(word) > 2]

[3, 4]

### 6. Dict Comprehensions

In [28]:
# Map each word longer than two characters to its length.
{word: len(word) for word in string if len(word) > 2}

{'cba': 3, 'dcba': 4}