#  Numpy
* Numerical Python
* Numpy is the fundamental package for scientific computing with Python
* Much more efficient data storage and operations capability
* Entire ecosystem of python data science tools depend on Numpy

In [1]:
import numpy as np
# Numpy is the linear algebra library for python
np.__version__


'1.15.0'

## 1. Creating Numpy Arrays

### 1.1 Creating Arrays from Scratch

**np.arange(start,stop,step)**

In [2]:
np.arange(10) # Creates numbers 0 to 9

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

**np.zeros()**

In [3]:
np.zeros(5) # creates 5 zeroes

array([0., 0., 0., 0., 0.])

**np.ones()**

In [4]:
np.ones(8) # creates an array of 1

array([1., 1., 1., 1., 1., 1., 1., 1.])

**np.eye()**

In [5]:
np.eye(3) # creates identity matrix

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

**np.full()**

In [6]:
np.full((2,3),1) # creates a 2 x 3 array filled with 1.  np.full((nrows,ncols),value)

array([[1, 1, 1],
       [1, 1, 1]])

**np.random.randint()**

In [7]:
np.random.randint(0,10,(3,4)) # creates a 3 x 4 array with random integers from 0 up to (but not including) 10

array([[5, 6, 0, 1],
       [0, 7, 7, 7],
       [8, 4, 2, 3]])

In [8]:
np.random.randint(9,20,5) # for just 1d array

array([15, 12, 11, 11, 12])

**np.random.random()**

In [9]:
np.random.random((3,4)) # creates a 3 x 4 array with random uniform values between 0 and 1

array([[0.65142196, 0.58728261, 0.23837999, 0.13046186],
       [0.83273382, 0.32960404, 0.79453488, 0.89816463],
       [0.91494949, 0.13269216, 0.69578919, 0.47254971]])

In [10]:
np.random.rand(3,4) # there is another version with just random.rand(), solves the same purpose

array([[0.98655587, 0.88257135, 0.71611761, 0.69501307],
       [0.51009317, 0.31912627, 0.60090113, 0.14113181],
       [0.9716085 , 0.58848492, 0.51008323, 0.80391459]])

**np.random.normal()**

In [11]:
np.random.normal(0,1,(4,5)) # creates a 4 x 5 array with normally distributed random values having mean 0 and SD 1

array([[-0.73319324, -0.81065069,  0.8302933 ,  0.23131167,  0.40585184],
       [ 0.28650179,  0.63268791,  1.17911648,  1.2129355 , -0.9787158 ],
       [-0.52948348, -1.31707082,  0.4193814 , -2.6182859 , -1.25491161],
       [ 0.66212997,  0.47053466, -0.30535339,  0.61859059, -0.90163269]])

In [12]:
np.random.randn(4,5) # here mean is by default 0 and SD is 1. Similar to the above method

array([[-0.27425485,  0.24060744, -0.97106879,  0.04474822, -0.13418899],
       [ 1.03165098,  0.19627808,  1.10381692, -0.27350812, -0.07648259],
       [-0.03797323,  0.40646168, -1.02701791, -0.17587186, -0.43656329],
       [ 0.11903881, -0.84032713,  0.43019523,  0.46920506,  0.5630843 ]])

**np.linspace()**

In [13]:
np.linspace(0,10,5) # creates an array from 0 to 10 with 5 equally spaced values

array([ 0. ,  2.5,  5. ,  7.5, 10. ])

### 1.2 Creating numpy arrays from lists or tuples

In [14]:
np.array([4,5,6])

array([4, 5, 6])

In [15]:
alist = [9, 7, 5, 6]
np.array(alist)

array([9, 7, 5, 6])

In [16]:
atuple = (4, 5, 6, 7)
np.array(atuple)

array([4, 5, 6, 7])

In [17]:
# List of lists: one [k, k + 2] pair for each k in 0..4
list_list = [[k, k + 2] for k in range(5)]
list_list

[[0, 2], [1, 3], [2, 4], [3, 5], [4, 6]]

In [22]:
my_arr = np.array(list_list) # creating a 2D numpy array from lists of lists.
my_arr

array([[0, 2],
       [1, 3],
       [2, 4],
       [3, 5],
       [4, 6]])

In [23]:
my_arr2 = np.arange(10)
my_arr2

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [24]:
list(my_arr2) # convert array back to list

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [25]:
tuple(my_arr2)

(0, 1, 2, 3, 4, 5, 6, 7, 8, 9)

## 2. Indexing, Modifying Arrays

### 2.1 Attributes of Arrays

**.shape** gives the dimensions of an array

In [26]:
my_arr

array([[0, 2],
       [1, 3],
       [2, 4],
       [3, 5],
       [4, 6]])

In [27]:
my_arr.shape # .shape gives the dimensions of the array.

(5, 2)

**.reshape()**

In [28]:
my_arr.reshape(2,5)

array([[0, 2, 1, 3, 2],
       [4, 3, 5, 4, 6]])

In [29]:
my_arr.reshape(1,10) # doesn't happen inplace. Original array remains the same.

array([[0, 2, 1, 3, 2, 4, 3, 5, 4, 6]])

In [30]:
my_arr

array([[0, 2],
       [1, 3],
       [2, 4],
       [3, 5],
       [4, 6]])

### 2.2 Indexing of numpy arrays (similar to lists) npa[start:stop:step]

In [31]:
my_arr2 = np.array([31, 89, 94, 56, 34, 69, 98, 41, 53, 83, 77])
my_arr2

array([31, 89, 94, 56, 34, 69, 98, 41, 53, 83, 77])

***Q. how to get 69***

69

In [32]:
my_arr2[0:5]

array([31, 89, 94, 56, 34])

***Q. How to get all the numbers in reverse order***

array([77, 83, 53, 41, 98, 69, 34, 56, 94, 89, 31])

***Q. Get numbers from 98 to 89 in reverse order***

**indexing 2D arrays, [start:stop:step, start:stop:step]**

In [36]:
my_arr = np.random.rand(4,5)

In [37]:
my_arr

array([[0.3552052 , 0.86555269, 0.24743397, 0.21179829, 0.07695315],
       [0.28515557, 0.93919112, 0.32235915, 0.23153487, 0.13760578],
       [0.61506175, 0.52388965, 0.90604249, 0.96142976, 0.46163786],
       [0.96001082, 0.36332802, 0.02485411, 0.62620367, 0.988038  ]])

In [38]:
my_arr[:,:] # npa[start:stop:step, start:stop:step]

array([[0.3552052 , 0.86555269, 0.24743397, 0.21179829, 0.07695315],
       [0.28515557, 0.93919112, 0.32235915, 0.23153487, 0.13760578],
       [0.61506175, 0.52388965, 0.90604249, 0.96142976, 0.46163786],
       [0.96001082, 0.36332802, 0.02485411, 0.62620367, 0.988038  ]])

In [39]:
my_arr[0:2,0:2] # get rows 0,1 and columns 0,1

array([[0.3552052 , 0.86555269],
       [0.28515557, 0.93919112]])

In [40]:
# in Python, we generally use ':' after ',' to get all rows or columns. in R just ',' is enough

In [43]:
my_arr[1,:] #get row 1, all columns. In R, just my_arr[2,] would have worked.

array([0.28515557, 0.93919112, 0.32235915, 0.23153487, 0.13760578])

In [44]:
my_arr[:,1] # Get all rows of the column at index 1 (i.e. the second column)

array([0.86555269, 0.93919112, 0.52388965, 0.36332802])

In [46]:
my_arr[1,2] # to get a particular element

0.3223591455570478

In [47]:
my_arr[1,1:3] # row 1 and columns from 1 to 2

array([0.93919112, 0.32235915])

In [48]:
my_arr

array([[0.3552052 , 0.86555269, 0.24743397, 0.21179829, 0.07695315],
       [0.28515557, 0.93919112, 0.32235915, 0.23153487, 0.13760578],
       [0.61506175, 0.52388965, 0.90604249, 0.96142976, 0.46163786],
       [0.96001082, 0.36332802, 0.02485411, 0.62620367, 0.988038  ]])

***Q. Get all rows, and columns 0, 2, 4 in reverse order***

array([[0.07695315, 0.24743397, 0.3552052 ],
       [0.13760578, 0.32235915, 0.28515557],
       [0.46163786, 0.90604249, 0.61506175],
       [0.988038  , 0.02485411, 0.96001082]])

In [52]:
my_arr

array([[0.3552052 , 0.86555269, 0.24743397, 0.21179829, 0.07695315],
       [0.28515557, 0.93919112, 0.32235915, 0.23153487, 0.13760578],
       [0.61506175, 0.52388965, 0.90604249, 0.96142976, 0.46163786],
       [0.96001082, 0.36332802, 0.02485411, 0.62620367, 0.988038  ]])

### 2.3 Boolean Masking

**Using conditionals in slicing**

In [149]:
my_arr = np.random.rand(10)
my_arr

array([0.73958717, 0.86308778, 0.67638278, 0.53214056, 0.96427992,
       0.39148806, 0.52278876, 0.27333603, 0.77236135, 0.26742307])

In [152]:
my_arr > 0.5

array([ True,  True,  True,  True,  True, False,  True, False,  True,
       False])

In [154]:
my_arr[my_arr > 0.5] # get only those array elements whose value is greater than 0.5

array([0.73958717, 0.86308778, 0.67638278, 0.53214056, 0.96427992,
       0.52278876, 0.77236135])

In [155]:
my_arr2D = np.random.rand(3,5)
my_arr2D

array([[0.65791455, 0.98967925, 0.18032624, 0.05335661, 0.00905516],
       [0.40338475, 0.61081025, 0.84343425, 0.0050766 , 0.509181  ],
       [0.7818519 , 0.3648195 , 0.20530122, 0.37290184, 0.81082579]])

In [156]:
my_arr2D > 0.5

array([[ True,  True, False, False, False],
       [False,  True,  True, False,  True],
       [ True, False, False, False,  True]])

In [163]:
my_arr2D[my_arr2D > 0.5]

array([0.65791455, 0.98967925, 0.61081025, 0.84343425, 0.509181  ,
       0.7818519 , 0.81082579])

In [165]:
my_arr2D[:,2] > 0.2

array([False,  True,  True])

In [167]:
my_arr2D[(my_arr2D[:,2] > 0.2), :] # Get all rows, for which column 2 values are greater than 0.2

array([[0.40338475, 0.61081025, 0.84343425, 0.0050766 , 0.509181  ],
       [0.7818519 , 0.3648195 , 0.20530122, 0.37290184, 0.81082579]])

In [200]:
# Two independent 1-D arrays of six random integers, each drawn from 0..9
my_arr1 = np.random.randint(0, 10, size=6)
my_arr2 = np.random.randint(0, 10, size=6)
# print() applies str() to every argument and joins them with single spaces,
# which yields the same "name is [values]" lines as manual concatenation
print("my_arr1", "is", my_arr1)
print("my_arr2", "is", my_arr2)

my_arr1 is [3 3 6 9 5 8]
my_arr2 is [0 2 6 3 0 8]


**Important to remember: `and` / `or` evaluate the truth value of the entire object. If we need element-wise comparison, use `&` and `|`.**

In [202]:
(my_arr1 > 5) & (my_arr2 > 5) 

array([False, False,  True, False, False,  True])

In [207]:
# `and` asks Python for the truth value of each whole array, which NumPy refuses
# to define for arrays with more than one element. Catch the resulting
# ValueError and display it, so the notebook still runs from top to bottom.
try:
    (my_arr1 > -1) and (my_arr2 > -1)
except ValueError as err:
    print("ValueError:", err)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

### 2.4 Modifying arrays

In [94]:
my_arr = np.random.rand(4,5)
my_arr

array([[0.66761271, 0.51202726, 0.69062597, 0.72773815, 0.39105291],
       [0.39506128, 0.64362656, 0.83643125, 0.86854254, 0.43166143],
       [0.83435932, 0.35779642, 0.73142938, 0.5568197 , 0.81854343],
       [0.37203736, 0.23131812, 0.6895064 , 0.25594383, 0.37799234]])

In [95]:
my_arr[1,2] = 0.9
my_arr

array([[0.66761271, 0.51202726, 0.69062597, 0.72773815, 0.39105291],
       [0.39506128, 0.64362656, 0.9       , 0.86854254, 0.43166143],
       [0.83435932, 0.35779642, 0.73142938, 0.5568197 , 0.81854343],
       [0.37203736, 0.23131812, 0.6895064 , 0.25594383, 0.37799234]])

In [96]:
my_arr[:,4] = np.array([8, 6, 9, 22]) # replacing all rows of the column at index 4 (the last column) with new values
my_arr

array([[ 0.66761271,  0.51202726,  0.69062597,  0.72773815,  8.        ],
       [ 0.39506128,  0.64362656,  0.9       ,  0.86854254,  6.        ],
       [ 0.83435932,  0.35779642,  0.73142938,  0.5568197 ,  9.        ],
       [ 0.37203736,  0.23131812,  0.6895064 ,  0.25594383, 22.        ]])

**Concatenation and splitting**

In [211]:
my_arr1 = np.array([1,2,3,4])
my_arr2 = np.array([9, 9, 9, 9])

In [212]:
np.vstack([my_arr1,my_arr2]) # vertical stacking

array([[1, 2, 3, 4],
       [9, 9, 9, 9]])

In [213]:
np.hstack([my_arr1,my_arr2]) # horizontal stacking; np.concatenate also does something similar

array([1, 2, 3, 4, 9, 9, 9, 9])

In [216]:
my_arr = np.random.randint(0,10,(4,5))
my_arr

array([[4, 7, 5, 2, 1],
       [1, 9, 1, 9, 7],
       [4, 9, 0, 2, 6],
       [5, 1, 6, 2, 7]])

In [224]:
usplit, lsplit = np.vsplit(my_arr,[2]) # row index, where the split should happen. also try np.hsplit()

In [225]:
usplit

array([[4, 7, 5, 2, 1],
       [1, 9, 1, 9, 7]])

In [226]:
lsplit

array([[4, 9, 0, 2, 6],
       [5, 1, 6, 2, 7]])

**Sub-arrays obtained by slicing are aliases (views) of the original array. If we mutate a sub-array, the original is also mutated. This is different from lists, where a slice is a copy.**

In [102]:
sub_arr = my_arr[:,1:2]
sub_arr

array([[0.51202726],
       [0.64362656],
       [0.35779642],
       [0.23131812]])

In [105]:
sub_arr[:,0] = np.array([9,9,9,9])

In [106]:
my_arr

array([[ 0.66761271,  9.        ,  0.69062597,  0.72773815,  8.        ],
       [ 0.39506128,  9.        ,  0.9       ,  0.86854254,  6.        ],
       [ 0.83435932,  9.        ,  0.73142938,  0.5568197 ,  9.        ],
       [ 0.37203736,  9.        ,  0.6895064 ,  0.25594383, 22.        ]])

### 2.5 UFuncs and Algebraic operations

In [108]:
al = [1,2,3,4] # With lists elementwise operations need for loops, 
# in R however, we can do algebraic operations on vectors, matrices, data frame directly.

In [109]:
al*4

[1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4]

In [110]:
al + [1,2,3,4]

[1, 2, 3, 4, 1, 2, 3, 4]

In [111]:
[4*i for i in al] # this is more tedious. (Ofcourse list comprehensions are better than traditional for loops)

[4, 8, 12, 16]

**See the power of numpy**

In [114]:
my_arr = np.arange(10)
my_arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [115]:
my_arr*4

array([ 0,  4,  8, 12, 16, 20, 24, 28, 32, 36])

In [116]:
my_arr-500

array([-500, -499, -498, -497, -496, -495, -494, -493, -492, -491])

In [117]:
my_arr/5

array([0. , 0.2, 0.4, 0.6, 0.8, 1. , 1.2, 1.4, 1.6, 1.8])

In [118]:
my_arr*5+3 # we can even do operations in an expression. 

array([ 3,  8, 13, 18, 23, 28, 33, 38, 43, 48])

**All these operations are basically wrappers for built in np. ufunc**

In [63]:
np.multiply(my_arr,4) # this is the same as my_arr*4; similarly we have np.add, np.subtract, np.divide, etc.

array([ 0,  4,  8, 12, 16, 20, 24, 28, 32, 36])

**Some more functions: element-wise ufuncs (sqrt, exp, sin, cos, log) and aggregations (max, min, mean, sum, argmin, argmax, std, var); just use np.function**
#### Go to this link https://docs.scipy.org/doc/numpy/reference/ufuncs.html for more functions.


In [119]:
np.mean(my_arr)

4.5

In [120]:
np.prod(my_arr)

0

In [122]:
np.sum(my_arr) # for several of these aggregating functions, we could also used methods of numpy array object itself (see below)

45

In [123]:
my_arr.sum()

45

In [124]:
np.sqrt(my_arr)

array([0.        , 1.        , 1.41421356, 1.73205081, 2.        ,
       2.23606798, 2.44948974, 2.64575131, 2.82842712, 3.        ])

In [125]:
np.exp(my_arr)

array([1.00000000e+00, 2.71828183e+00, 7.38905610e+00, 2.00855369e+01,
       5.45981500e+01, 1.48413159e+02, 4.03428793e+02, 1.09663316e+03,
       2.98095799e+03, 8.10308393e+03])

**multi-dimensional aggregation**

In [128]:
my_arr = np.random.randn(3,5)
my_arr

array([[ 1.03258195, -0.43920694, -1.61326002,  0.03563248,  1.08756442],
       [-1.33040224, -0.02773772, -0.31072508,  0.91715616, -0.68630621],
       [-0.10063422, -0.13960706, -0.68993765, -0.62683217,  1.31326813]])

In [133]:
np.mean(my_arr,0) # second argument is the axis: 0 averages down the rows (one result per column), 1 averages across the columns (one result per row)

array([-0.13281817, -0.20218391, -0.87130758,  0.10865216,  0.57150878])

In [134]:
np.sum(my_arr,1)

array([ 0.10331189, -1.43801508, -0.24374297])

**Few useful numpy methods**

In [3]:
# np.sum, np.any, np.all, np.prod, np.bincount, np.where
import numpy as np

In [4]:
my_arr = np.random.randint(0,10,20)
my_arr

array([2, 7, 9, 5, 5, 0, 2, 9, 7, 5, 4, 9, 0, 5, 5, 6, 8, 1, 2, 2])

**np.bincount**

In [5]:
np.bincount(my_arr) # bincount gives count (frequency) of each value starting from 0 to the largest value.

array([2, 1, 4, 0, 1, 5, 1, 2, 1, 3], dtype=int64)

**np.where**

In [6]:
np.where(my_arr == 5) # gives the index position where value = 5

(array([ 3,  4,  9, 13, 14], dtype=int64),)

**np.sort**

In [8]:
np.sort(my_arr) # this doesn't happen inplace. To mutate the original use my_arr.sort()

array([0, 0, 1, 2, 2, 2, 2, 4, 5, 5, 5, 5, 5, 6, 7, 7, 8, 9, 9, 9])

In [210]:
np.argsort(my_arr) # this will give indexes after sorting the array

array([ 0, 17, 14,  9,  3,  8, 12,  6, 18,  5,  1, 10, 15,  7,  2, 19,  4,
       16, 11, 13], dtype=int64)

In [10]:
my_arr2D = np.random.rand(4,5)
my_arr2D

array([[0.21042817, 0.3400031 , 0.83848833, 0.36725381, 0.89434823],
       [0.26941823, 0.06707691, 0.08454853, 0.32056739, 0.36157669],
       [0.10384941, 0.60477655, 0.38535028, 0.32545517, 0.56282364],
       [0.62393399, 0.54454186, 0.67488547, 0.90600023, 0.35552752]])

In [11]:
np.sort(my_arr2D, 0) # sort across each column

array([[0.10384941, 0.06707691, 0.08454853, 0.32056739, 0.35552752],
       [0.21042817, 0.3400031 , 0.38535028, 0.32545517, 0.36157669],
       [0.26941823, 0.54454186, 0.67488547, 0.36725381, 0.56282364],
       [0.62393399, 0.60477655, 0.83848833, 0.90600023, 0.89434823]])

In [191]:
my_arr.max() # similar to Ufuncs, numpy objects themselves have several methods

9

In [193]:
my_arr.sum()

95

In [195]:
my_arr.argmax() # index position where maximum value occurs.

11

# Pandas
* Overcomes the limitations of numpy arrays in dealing with labelled and missing data
* Built on top of Numpy
* Introduces the Series and DataFrame data structures, which cover most data mining tasks

## 1. Pandas Series
* One-dimensional array of indexed data.
* Built on top of numpy 1D array, and also has features similar to dictionaries.

In [12]:
# Pandas series differ from numpy arrays in that we can define labels for the series index

In [13]:
import numpy as np
import pandas as pd
pd.__version__

'0.23.4'

### 1.1 Creating pandas series objects
* Can be created from a list, numpy array, or tuple, or dictionary

In [17]:
my_series = pd.Series([0.5,0.8,0.9,1.3,0.4]) # similar to numpy array except now we have explicit index.
my_series

0    0.5
1    0.8
2    0.9
3    1.3
4    0.4
dtype: float64

In [18]:
my_series.values # gives us the values

array([0.5, 0.8, 0.9, 1.3, 0.4])

In [19]:
my_series.index # returns an index object.

RangeIndex(start=0, stop=5, step=1)

In [20]:
my_series = pd.Series([1,2,3,4], index = ["a","b","c","d"]) # index doesn't have to be 0,1,2,3

In [21]:
my_series

a    1
b    2
c    3
d    4
dtype: int64

In [22]:
my_series.shape # It can be seen that pandas series is one-dimensional array.

(4,)

In [24]:
my_arr = np.array([8,7,6,5]) # creating Series from a numpy array
pd.Series(my_arr)

0    8
1    7
2    6
3    5
dtype: int32

In [25]:
tp = (11,12,9,1)
pd.Series(tp) # Creating series from a tuple.

0    11
1    12
2     9
3     1
dtype: int64

In [26]:
my_dict = {'k1':1, 'k2':3, 'k4':5} # dictionary can be easily converted to series. keys become index, values are values.
pd.Series(my_dict)

k1    1
k2    3
k4    5
dtype: int64

In [27]:
my_series

a    1
b    2
c    3
d    4
dtype: int64

In [29]:
my_series2 = pd.Series([5,6,7,8], index = ["a","b","c","d"])
my_series2

a    5
b    6
c    7
d    8
dtype: int64

In [30]:
my_series+my_series2

a     6
b     8
c    10
d    12
dtype: int64

In [31]:
d3 = {"k1": 2, "k2":[1,2,3], "k3":("a","b"),"k4":{"sk1":5}}

In [32]:
d3

{'k1': 2, 'k2': [1, 2, 3], 'k3': ('a', 'b'), 'k4': {'sk1': 5}}

In [33]:
pd.Series(d3)

k1             2
k2     [1, 2, 3]
k3        (a, b)
k4    {'sk1': 5}
dtype: object

### 1.2 Indexing & Modifying Series Objects

#### 1.2.1 Indexing

In [34]:
my_series = pd.Series([1.5,0.2,1.3,0.4], index=['a', 'b', 'c', 'd'])
my_series

a    1.5
b    0.2
c    1.3
d    0.4
dtype: float64

In [36]:
my_series['a'] # explicit indexing by name of index

1.5

In [37]:
my_series[0:3] # implicit indexing from 0th row to 2nd row.

a    1.5
b    0.2
c    1.3
dtype: float64

**potential for confusion with this indexing**

In [38]:
my_series2 = pd.Series(['a', 'b', 'c', 'd', 'e'], index = [1,2,3,4,5])
my_series2

1    a
2    b
3    c
4    d
5    e
dtype: object

In [351]:
my_series2[1]

'a'

In [353]:
my_series2[1:4]

2    b
3    c
4    d
dtype: object

**best to use special indexing attributes**
* .loc - for explicit indexing with names
* .iloc - for implicit indexing with row numbers or column numbers
* .ix - for mixed indexing (deprecated since pandas 0.20 and removed in pandas 1.0; prefer .loc / .iloc)

In [41]:
my_series2

1    a
2    b
3    c
4    d
5    e
dtype: object

In [42]:
my_series2.loc[3] # with .loc, its explicit indexing, provide actual row or index names

'c'

In [43]:
my_series2.iloc[0] # with .iloc, its implicit indexing,  provide row position

'a'

In [45]:
my_series.iloc[1:]

b    0.2
c    1.3
d    0.4
dtype: float64

#### 1.2.2 Modifying Series

In [47]:
my_series2

1    a
2    b
3    c
4    d
5    e
dtype: object

In [48]:
my_series2.loc[5] = 'y'
my_series2

1    a
2    b
3    c
4    d
5    y
dtype: object

## 2. Pandas DataFrames
*  2D array with labelled data.
* Sequence of aligned Series objects.

In [1]:
import pandas as pd
# axis 0 refers to rows, axis 1 refers to columns

In [363]:
# Basically each row or column is a series

### 2.1 Creating Data Frames

In [364]:
pd.DataFrame

pandas.core.frame.DataFrame



#### Init signature: pd.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)

In [279]:
# DataFrames can be created by reading csv files, from a 2D numpy array, from a dictionary of series objects,
# or from a list of dictionary objects.

**Creating data frame from a 2D numpy array**

In [281]:
df = pd.DataFrame(np.random.rand(4,5))
df

Unnamed: 0,0,1,2,3,4
0,0.690206,0.860762,0.582045,0.56184,0.449062
1,0.785195,0.111243,0.470158,0.357822,0.820394
2,0.870707,0.802564,0.187139,0.797816,0.91443
3,0.177766,0.941014,0.264042,0.964489,0.043202


In [304]:
df = pd.DataFrame(np.random.rand(4,5), index = 'r1 r2 r3 r4'.split(), columns = 'c1 c2 c3 c4 c5'.split())
df

Unnamed: 0,c1,c2,c3,c4,c5
r1,0.328961,0.486862,0.177871,0.533091,0.527299
r2,0.40996,0.155901,0.672091,0.749919,0.444122
r3,0.805226,0.344257,0.446401,0.194288,0.519243
r4,0.775835,0.228127,0.267467,0.213011,0.038016


In [306]:
df.index #  We get the index (basically rownames)

Index(['r1', 'r2', 'r3', 'r4'], dtype='object')

In [307]:
df.columns # to obtain column names

Index(['c1', 'c2', 'c3', 'c4', 'c5'], dtype='object')

**Creating data frame from a dictionary of series objects**

In [287]:
d1 = {'Josh': 6, 'kevin': 5.5, 'kumar': 5.8, 'shelly': 4.9}
d2 = {'Josh': 180 , 'kevin': 150, 'kumar': 140, 'shelly': 120}
s1 = pd.Series(d1)
s2 = pd.Series(d2)

In [308]:
# Build the frame from the two Series s1 and s2 (not the raw dicts), matching
# the heading of this section: each dict key names a column, and the Series
# are aligned on their shared index (the person names).
df = pd.DataFrame({'height': s1, 'weight': s2})
df

Unnamed: 0,height,weight
Josh,6.0,180
kevin,5.5,150
kumar,5.8,140
shelly,4.9,120


**Creating data frame by reading data from a csv file**

In [311]:
pwd

'C:\\Users\\sridhar\\Google Drive\\Emory-ECE\\Python\\DataAnalysis-Python'

In [312]:
df = pd.read_csv('Wine.csv')

In [365]:
df.shape # to get the dimensions

(178, 15)

In [323]:
df.head() # show first few rows

Unnamed: 0.1,Unnamed: 0,Type,Alcohol,Malic,Ash,Alcalinity,Magnesium,Phenols,Flavanoids,Nonflavanoids,Proanthocyanins,Color,Hue,Dilution,Proline
0,1,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,2,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,3,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,4,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,5,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


**df.info()** will give a brief info on data frame columns and types. similar to str() in R 

**df.describe()** will give brief statistics on all numeric type columns. similar to summary() in R

In [325]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 15 columns):
Unnamed: 0         178 non-null int64
Type               178 non-null int64
Alcohol            178 non-null float64
Malic              178 non-null float64
Ash                178 non-null float64
Alcalinity         178 non-null float64
Magnesium          178 non-null int64
Phenols            178 non-null float64
Flavanoids         178 non-null float64
Nonflavanoids      178 non-null float64
Proanthocyanins    178 non-null float64
Color              178 non-null float64
Hue                178 non-null float64
Dilution           178 non-null float64
Proline            178 non-null int64
dtypes: float64(11), int64(4)
memory usage: 20.9 KB


In [326]:
df.describe() # brief statistics

Unnamed: 0.1,Unnamed: 0,Type,Alcohol,Malic,Ash,Alcalinity,Magnesium,Phenols,Flavanoids,Nonflavanoids,Proanthocyanins,Color,Hue,Dilution,Proline
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,89.5,1.938202,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258
std,51.528309,0.775035,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474
min,1.0,1.0,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0
25%,45.25,1.0,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5
50%,89.5,2.0,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5
75%,133.75,3.0,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0
max,178.0,3.0,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0


#### Some members are written with parentheses, df.method(), and others without, df.attribute. Methods perform an action and must be called; attributes just hold information about the data frame.

In [328]:
df.mean() # method. For method it does some kind of action. # attributes just state some info on the data frame.

Unnamed: 0          89.500000
Type                 1.938202
Alcohol             13.000618
Malic                2.336348
Ash                  2.366517
Alcalinity          19.494944
Magnesium           99.741573
Phenols              2.295112
Flavanoids           2.029270
Nonflavanoids        0.361854
Proanthocyanins      1.590899
Color                5.058090
Hue                  0.957449
Dilution             2.611685
Proline            746.893258
dtype: float64

### 2.2 Indexing and modifying a DataFrame
* rows
    - df.loc[] only for explicit indexing on rows and columns (with names)
    - df.iloc[] for implicit indexing on rows and columns (with row position)
* columns
    - df[] for explicit indexing with column names


In [385]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,Type,Alcohol,Malic,Ash,Alcalinity,Magnesium,Phenols,Flavanoids,Nonflavanoids,Proanthocyanins,Color,Hue,Dilution,Proline
0,1,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,2,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,3,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185


In [381]:
df['Alcohol'].head()

0    14.23
1    13.20
2    13.16
3    14.37
4    13.24
Name: Alcohol, dtype: float64

In [384]:
df.loc[1].head()

Unnamed: 0     2.00
Type           1.00
Alcohol       13.20
Malic          1.78
Ash            2.14
Name: 1, dtype: float64

In [386]:
df.iloc[3:5,1:4].head()

Unnamed: 0,Type,Alcohol,Malic
3,1,14.37,1.95
4,1,13.24,2.59


In [391]:
df.loc[[3,4,5],'Alcohol'].head()

3    14.37
4    13.24
5    14.20
Name: Alcohol, dtype: float64

In [21]:
df['Alcohol'].max()

14.83

In [25]:
df.index[df['Alcohol']==14.83] # index where value = 14.83

Int64Index([8], dtype='int64')

In [110]:
df['Alcohol'].idxmax()# this gives the index label of the row where the value is max, similar to which.max() in R

8

In [230]:
# From a 2d numpy array

In [8]:
df = pd.DataFrame(np.random.randn(4,5))

In [9]:
df

Unnamed: 0,0,1,2,3,4
0,0.589008,-0.752536,-0.210306,0.927043,-0.03881
1,0.465919,1.069543,-0.755772,1.332303,0.126986
2,0.168493,-1.259497,0.263725,0.203634,1.814273
3,0.091623,1.006079,-1.841253,0.482241,-0.028039


In [10]:
df = pd.DataFrame(np.random.randn(4,5), index=["r0", "r1", "r2", "r3"], columns = ["c1", "c2", "c3", "c4", "c5"])

In [11]:
df

Unnamed: 0,c1,c2,c3,c4,c5
r0,-0.581382,-0.631437,-1.08382,-0.82409,1.978655
r1,1.078916,-0.386725,-0.096208,0.324096,-1.049378
r2,0.113787,-1.560847,0.9725,0.481824,-0.063005
r3,0.285393,-0.022709,-1.08441,-0.900464,0.652875


In [12]:
# from a dictionary

In [13]:
d = {"k1":1, "k2":2, "k3":3}

In [14]:
df =pd.DataFrame(data=d, index=[0,1,2]) # if scalar values are given, then must specify an index

In [15]:
df

Unnamed: 0,k1,k2,k3
0,1,2,3
1,1,2,3
2,1,2,3


In [16]:
df = pd.DataFrame({"k1":[1,2,3], "k2":["a","b","c"]}) # no need to specify index.

In [17]:
df

Unnamed: 0,k1,k2
0,1,a
1,2,b
2,3,c


In [18]:
# from a dictionary of series objects

In [19]:
s1 = pd.Series([1,2,3])
s1

0    1
1    2
2    3
dtype: int64

In [20]:
s2 = pd.Series(["a","b","c"])
s2

0    a
1    b
2    c
dtype: object

In [21]:
pd.DataFrame({'s1':s1, 's2':s2})

Unnamed: 0,s1,s2
0,1,a
1,2,b
2,3,c


In [22]:
pd.DataFrame(s1) # from a single series object.

Unnamed: 0,0
0,1
1,2
2,3


In [23]:
# from lists directly

In [24]:
df = pd.DataFrame([[1,2,3,4]]) # we need to pass a list of lists

In [25]:
df

Unnamed: 0,0,1,2,3
0,1,2,3,4


In [26]:
df = pd.DataFrame([[1,2,3,4],["a","b","c","d"]], index='r1,r2'.split(","), columns='c1,c2,c3,c4'.split(","))

In [27]:
df

Unnamed: 0,c1,c2,c3,c4
r1,1,2,3,4
r2,a,b,c,d


In [28]:
# Modifying data frames

In [29]:
df = pd.DataFrame(np.random.randn(3,4), index=['r1', 'r2', 'r3'], columns=['c1','c2','c3','c4'])

In [30]:
df

Unnamed: 0,c1,c2,c3,c4
r1,-1.807483,-1.133142,-0.946658,1.026452
r2,0.882857,0.396357,-0.688555,-1.504458
r3,0.104798,-0.660211,1.429795,-0.77333


In [31]:
# two general methods to indexing. If names are available, use them directly for columns

In [32]:
df["c1"]

r1   -1.807483
r2    0.882857
r3    0.104798
Name: c1, dtype: float64

In [33]:
# for multiple columns

In [34]:
df[["c1","c4"]] # pass a list of columns.  Here in R we use c("c1","c4") 

Unnamed: 0,c1,c4
r1,-1.807483,1.026452
r2,0.882857,-1.504458
r3,0.104798,-0.77333


In [35]:
df.loc["r1"] # for rows, instead use .loc["row name"]

c1   -1.807483
c2   -1.133142
c3   -0.946658
c4    1.026452
Name: r1, dtype: float64

In [36]:
df.iloc[2] # for rows by integer position (0-based) use .iloc[position]

c1    0.104798
c2   -0.660211
c3    1.429795
c4   -0.773330
Name: r3, dtype: float64

In [37]:
df["c5"] = [4,5,6]

In [38]:
df

Unnamed: 0,c1,c2,c3,c4,c5
r1,-1.807483,-1.133142,-0.946658,1.026452,4
r2,0.882857,0.396357,-0.688555,-1.504458,5
r3,0.104798,-0.660211,1.429795,-0.77333,6


In [39]:
df.iloc[0,:] # 0 row and all columns

c1   -1.807483
c2   -1.133142
c3   -0.946658
c4    1.026452
c5    4.000000
Name: r1, dtype: float64

In [40]:
df.iloc[0, 1:3] # 0 row and columns 1, 2

c2   -1.133142
c3   -0.946658
Name: r1, dtype: float64

In [41]:
 df.loc[["r1","r2"],["c1","c3"]] # when selecting multiple columns or rows, pass as a list. Also use loc for indexing with names

Unnamed: 0,c1,c3
r1,-1.807483,-0.946658
r2,0.882857,-0.688555


In [42]:
df.loc[:,"c1"]

r1   -1.807483
r2    0.882857
r3    0.104798
Name: c1, dtype: float64

In [43]:
df

Unnamed: 0,c1,c2,c3,c4,c5
r1,-1.807483,-1.133142,-0.946658,1.026452,4
r2,0.882857,0.396357,-0.688555,-1.504458,5
r3,0.104798,-0.660211,1.429795,-0.77333,6


In [44]:

df2 = pd.DataFrame([[1,2,3,4,5]], columns = 'c1 c2 c3 c4 c5'.split())

In [45]:
df2

Unnamed: 0,c1,c2,c3,c4,c5
0,1,2,3,4,5


In [46]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# stacks the rows of df2 under df and returns a new frame (df is not modified)
pd.concat([df, df2])

Unnamed: 0,c1,c2,c3,c4,c5
r1,-1.807483,-1.133142,-0.946658,1.026452,4
r2,0.882857,0.396357,-0.688555,-1.504458,5
r3,0.104798,-0.660211,1.429795,-0.77333,6
0,1.0,2.0,3.0,4.0,5


In [47]:
df

Unnamed: 0,c1,c2,c3,c4,c5
r1,-1.807483,-1.133142,-0.946658,1.026452,4
r2,0.882857,0.396357,-0.688555,-1.504458,5
r3,0.104798,-0.660211,1.429795,-0.77333,6


In [48]:
df["c1"] > 0

r1    False
r2     True
r3     True
Name: c1, dtype: bool

In [49]:
df.loc[df["c1"] > 0,["c3","c5"]] # always remember to use .loc for rows when using names or booleans

Unnamed: 0,c3,c5
r2,-0.688555,5
r3,1.429795,6


In [50]:
#.ix allowed a hybrid approach to indexing (hybrid between loc and iloc); it is deprecated and was removed in pandas 1.0

In [51]:
# .ix is deprecated and was removed in pandas 1.0. Do the same mixed selection
# explicitly: rows by position with .iloc, then columns by label with .loc.
df.iloc[0:3].loc[:, ["c1","c2"]]


Unnamed: 0,c1,c2
r1,-1.807483,-1.133142
r2,0.882857,0.396357
r3,0.104798,-0.660211


In [52]:
# .ix is not available in pandas >= 1.0; with .loc the boolean mask picks the
# rows and df.columns[1] turns column position 1 into its label
df.loc[df["c1"]>0, df.columns[1]]

r2    0.396357
r3   -0.660211
Name: c2, dtype: float64

In [53]:
df.loc["r1"] >0

c1    False
c2    False
c3    False
c4     True
c5     True
Name: r1, dtype: bool

In [54]:
# when combining multiple conditions, do not use `and` / `or`; use & or | instead
df[(df["c1"]>0) & (df["c3"] >0)] # because `and` only works between two single booleans, not element-wise between two Series of boolean values
# in R we use & anyway, so this distinction does not arise.

Unnamed: 0,c1,c2,c3,c4,c5
r3,0.104798,-0.660211,1.429795,-0.77333,6


In [55]:
df.loc[(df["c1"]>0) & (df["c3"] >0), ["c1","c5"]] # if we want to use booleans in the row, use .loc 

Unnamed: 0,c1,c5
r3,0.104798,6


In [56]:
df

Unnamed: 0,c1,c2,c3,c4,c5
r1,-1.807483,-1.133142,-0.946658,1.026452,4
r2,0.882857,0.396357,-0.688555,-1.504458,5
r3,0.104798,-0.660211,1.429795,-0.77333,6


In [57]:
df.pop("c5") # returns the popped column and mutates the original df

r1    4
r2    5
r3    6
Name: c5, dtype: int64

In [58]:
df # display the frame again: after pop, column c5 is no longer present

In [59]:
df["c5"] = [4,5,6]

In [60]:
df.drop("c5", axis=1) # to remove a column; axis=1 means columns (axis=0 drops rows by label). axis is passed by keyword because the positional form was removed in pandas 2.0

Unnamed: 0,c1,c2,c3,c4
r1,-1.807483,-1.133142,-0.946658,1.026452
r2,0.882857,0.396357,-0.688555,-1.504458
r3,0.104798,-0.660211,1.429795,-0.77333


In [61]:
df["c5"] = [4,5,6]

### reset index

In [62]:
df.reset_index()

Unnamed: 0,index,c1,c2,c3,c4,c5
0,r1,-1.807483,-1.133142,-0.946658,1.026452,4
1,r2,0.882857,0.396357,-0.688555,-1.504458,5
2,r3,0.104798,-0.660211,1.429795,-0.77333,6


In [63]:
df.set_index("c5", inplace=True) # to set a different column as index. If inplace=False, the original df is unchanged and a new frame is returned

In [66]:
df

Unnamed: 0_level_0,c1,c2,c3,c4
c5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4,-1.807483,-1.133142,-0.946658,1.026452
5,0.882857,0.396357,-0.688555,-1.504458
6,0.104798,-0.660211,1.429795,-0.77333


In [68]:
df.index= ['a', 'b', 'c'] # to give a new index

In [69]:
df

Unnamed: 0,c1,c2,c3,c4
a,-1.807483,-1.133142,-0.946658,1.026452
b,0.882857,0.396357,-0.688555,-1.504458
c,0.104798,-0.660211,1.429795,-0.77333


In [65]:
list(df.columns) # to get the column names (in R this was colnames(df))

['c1', 'c2', 'c3', 'c4']

In [417]:
list(df.index)

[4, 5, 6]

In [1]:
# Two equal-length lists to pair up position by position
l1, l2 = [1, 2, 3], ["a", "b", "c"]
# zip walks both lists in step and yields one (l1 item, l2 item) tuple per position
[pair for pair in zip(l1, l2)]

[(1, 'a'), (2, 'b'), (3, 'c')]

In [2]:
dict(zip(l1,l2))

{1: 'a', 2: 'b', 3: 'c'}

## multilevel indexing

In [420]:
# it is important and advanced. we will get to it later.

# Missing Values

In [18]:
import pandas as pd
import numpy as np

In [19]:
# Column-oriented sample data; np.nan marks the missing entries in k2 and k3
d = dict(
    k1=[1, 2, 3, 7],
    k2=[4, np.nan, 5, 8],
    k3=[np.nan, np.nan, 9, 12],
    k4=[3, 6, 9, 11],
)

In [21]:
df = pd.DataFrame(d)

In [22]:
df

Unnamed: 0,k1,k2,k3,k4
0,1,4.0,,3
1,2,,,6
2,3,5.0,9.0,9
3,7,8.0,12.0,11


In [24]:
df.isna()

Unnamed: 0,k1,k2,k3,k4
0,False,False,True,False
1,False,True,True,False
2,False,False,False,False
3,False,False,False,False


In [26]:
df.isnull()

Unnamed: 0,k1,k2,k3,k4
0,False,False,True,False
1,False,True,True,False
2,False,False,False,False
3,False,False,False,False


In [27]:
df.notna()

Unnamed: 0,k1,k2,k3,k4
0,True,True,False,True
1,True,False,False,True
2,True,True,True,True
3,True,True,True,True


In [431]:
df.dropna()

Unnamed: 0,k1,k2,k3,k4
2,3,5.0,9.0,9
3,7,8.0,12.0,11


In [432]:
df.dropna(axis=1)

Unnamed: 0,k1,k4
0,1,3
1,2,6
2,3,9
3,7,11


In [442]:
df.fillna(0)

Unnamed: 0,k1,k2,k3,k4
0,1,4.0,0.0,3
1,2,0.0,0.0,6
2,3,5.0,9.0,9
3,7,8.0,12.0,11


In [443]:
df.fillna(df.mean(),axis=0) # fill na with mean values in each column. IN R BETTER TO USE HMISC, MICE PACKAGES.

Unnamed: 0,k1,k2,k3,k4
0,1,4.0,10.5,3
1,2,5.666667,10.5,6
2,3,5.0,9.0,9
3,7,8.0,12.0,11


In [16]:
from string import ascii_lowercase
import numpy as np
import pandas as pd

In [17]:
ascii_lowercase

'abcdefghijklmnopqrstuvwxyz'

## Groupby

In [18]:
# similar to lapply, sapply, tapply in R: we group rows by some column and apply an aggregate function to each group

In [33]:
# One (item, agent, sale price) record per sale, turned into the column-oriented dict pandas expects
_sales = [
    ("chair", "sally", 100),
    ("desk", "bob", 110),
    ("rug", "sally", 200),
    ("table", "amy", 100),
    ("chair", "bob", 150),
    ("couch", "amy", 800),
    ("couch", "sally", 1000),
    ("chair", "bob", 100),
    ("rug", "amy", 85),
    ("desk", "sally", 110),
]
d = {
    "Item": [item for item, _, _ in _sales],
    "Agent": [agent for _, agent, _ in _sales],
    "salep": [price for _, _, price in _sales],
}

In [34]:
df = pd.DataFrame(d)

In [35]:
df

Unnamed: 0,Agent,Item,salep
0,sally,chair,100
1,bob,desk,110
2,sally,rug,200
3,amy,table,100
4,bob,chair,150
5,amy,couch,800
6,sally,couch,1000
7,bob,chair,100
8,amy,rug,85
9,sally,desk,110


In [36]:
# Average the numeric columns per item. Only salep is returned because it is numeric and Agent is not;
# numeric_only=True states this explicitly (pandas >= 2.0 raises a TypeError on the string column otherwise).
df.groupby("Item").mean(numeric_only=True)

Unnamed: 0_level_0,salep
Item,Unnamed: 1_level_1
chair,116.666667
couch,900.0
desk,110.0
rug,142.5
table,100.0


In [37]:
df.groupby("Agent")["salep"].mean() # we can also specify the columns on which we want the mean.

Agent
amy      328.333333
bob      120.000000
sally    352.500000
Name: salep, dtype: float64

In [38]:
df.groupby("Agent")["salep"].count()

Agent
amy      3
bob      3
sally    4
Name: salep, dtype: int64

## Merging dataframes

In [40]:
# commands to combine dataframes are concat and merge
# use concat if we just want to stack dataframes by rows or by columns, as long as the index or column names line up

In [42]:
# 3 x 4 frame of standard-normal draws with explicit row and column labels
df1 = pd.DataFrame(
    np.random.randn(3, 4),
    index=['r0', 'r1', 'r2'],
    columns=['co', 'c1', 'c2', 'c3'],
)

In [43]:
df1

Unnamed: 0,co,c1,c2,c3
r0,-1.037857,0.908149,-1.037233,-0.316345
r1,0.413403,-1.627548,0.760591,-2.076801
r2,0.379228,-0.964468,-0.884444,0.206144


In [45]:
# Same row labels as df1, but a disjoint set of column names
df2 = pd.DataFrame(
    np.random.randn(3, 4),
    index=['r0', 'r1', 'r2'],
    columns=['c4', 'c5', 'c6', 'c7'],
)

In [46]:
df2 # same row indexes, but different column names. We can use concat to join the data frames together by column

Unnamed: 0,c4,c5,c6,c7
r0,-1.703675,0.631855,1.320687,-1.657205
r1,-1.028683,-0.717891,0.103742,-1.978568
r2,0.197764,-1.369439,2.014703,-0.811467


In [49]:
pd.concat([df1,df2], axis=1) # axis = 1 for column, default is 0

Unnamed: 0,co,c1,c2,c3,c4,c5,c6,c7
r0,-1.037857,0.908149,-1.037233,-0.316345,-1.703675,0.631855,1.320687,-1.657205
r1,0.413403,-1.627548,0.760591,-2.076801,-1.028683,-0.717891,0.103742,-1.978568
r2,0.379228,-0.964468,-0.884444,0.206144,0.197764,-1.369439,2.014703,-0.811467


In [51]:
# Stacking by rows aligns on column names; df1 and df2 share none, so every cell a frame lacks becomes NaN.
# sort=True orders the combined columns alphabetically, which is the layout shown below regardless of pandas version.
pd.concat([df1,df2], axis=0, sort=True)

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,co
r0,0.908149,-1.037233,-0.316345,,,,,-1.037857
r1,-1.627548,0.760591,-2.076801,,,,,0.413403
r2,-0.964468,-0.884444,0.206144,,,,,0.379228
r0,,,,-1.703675,0.631855,1.320687,-1.657205,
r1,,,,-1.028683,-0.717891,0.103742,-1.978568,
r2,,,,0.197764,-1.369439,2.014703,-0.811467,


In [59]:
# if we want to merge by some common column, then we use merge

In [60]:
df1

Unnamed: 0,co,c1,c2,c3
r0,-1.037857,0.908149,-1.037233,-0.316345
r1,0.413403,-1.627548,0.760591,-2.076801
r2,0.379228,-0.964468,-0.884444,0.206144


In [61]:
df2

Unnamed: 0,c4,c5,c6,c7
r0,-1.703675,0.631855,1.320687,-1.657205
r1,-1.028683,-0.717891,0.103742,-1.978568
r2,0.197764,-1.369439,2.014703,-0.811467


In [None]:
# there are no common columns to merge on, but if we do it anyway as shown below, we will not get anything back

In [63]:
pd.merge(df1,df2,how='inner', left_on = 'c1', right_on='c4') # no value in c1 matches a value in c4, so the result is empty (not an error).
# left_on and right_on are used when the column values represent the same thing but the column names are different.

Unnamed: 0,co,c1,c2,c3,c4,c5,c6,c7


In [64]:
pd.merge(df1,df2,how='left',left_on = 'c1', right_on='c4') # this is a left merge. All rows of the left data frame (df1) are returned; the df2
# columns are simply added with NaN values, because no c1 value matches a c4 value.

Unnamed: 0,co,c1,c2,c3,c4,c5,c6,c7
0,-1.037857,0.908149,-1.037233,-0.316345,,,,
1,0.413403,-1.627548,0.760591,-2.076801,,,,
2,0.379228,-0.964468,-0.884444,0.206144,,,,


In [65]:
# similarly there are right and full outer joins. A right join keeps all df2 rows; a full outer join returns all df1 rows
# and all df2 rows. Any non-matching values will be NaN

In [66]:
pd.merge(df1,df2,how='outer',left_on = 'c1', right_on='c4')

Unnamed: 0,co,c1,c2,c3,c4,c5,c6,c7
0,-1.037857,0.908149,-1.037233,-0.316345,,,,
1,0.413403,-1.627548,0.760591,-2.076801,,,,
2,0.379228,-0.964468,-0.884444,0.206144,,,,
3,,,,,-1.703675,0.631855,1.320687,-1.657205
4,,,,,-1.028683,-0.717891,0.103742,-1.978568
5,,,,,0.197764,-1.369439,2.014703,-0.811467


### we can also join the dataframes by matching on index values. 
### we use df1.join(df2) for that.

## Some important methods

In [1]:
import numpy as np
import pandas as pd

In [5]:
# 4 x 5 frame of uniform [0, 1) draws labelled r0..r3 by c0..c4
df = pd.DataFrame(
    np.random.rand(4, 5),
    index=['r{}'.format(i) for i in range(4)],
    columns=['c{}'.format(j) for j in range(5)],
)

In [6]:
df

Unnamed: 0,c0,c1,c2,c3,c4
r0,0.559593,0.370294,0.11188,0.779248,0.633542
r1,0.141408,0.205284,0.647226,0.784056,0.504624
r2,0.383341,0.104656,0.468537,0.520382,0.571355
r3,0.795961,0.466227,0.76424,0.662658,0.200032


In [7]:
df['c1'] > 0.3

r0     True
r1    False
r2    False
r3     True
Name: c1, dtype: bool

In [14]:
df.loc[df['c1'] > 0.3] # keep only the rows where the boolean mask is True

Unnamed: 0,c0,c1,c2,c3,c4
r0,0.559593,0.370294,0.11188,0.779248,0.633542
r3,0.795961,0.466227,0.76424,0.662658,0.200032


In [15]:
df.loc[df['c2'] < 0.2] # .ix was deprecated in pandas 0.20 and removed in 1.0; use .loc (labels / boolean masks) or .iloc (positions)

Unnamed: 0,c0,c1,c2,c3,c4
r0,0.559593,0.370294,0.11188,0.779248,0.633542


In [18]:
df['c1'].mean()

0.28661522324013045

In [21]:
df.apply(sum, axis=0)  # apply runs a function across all columns or rows, similar to apply() in R. axis=0 gives one result per column.

c0    1.880303
c1    1.146461
c2    1.991883
c3    2.746344
c4    1.909553
dtype: float64

In [32]:
df.apply(np.mean, axis=1) # axis=1 gives one result per row

r0    0.490912
r1    0.456520
r2    0.409654
r3    0.577824
dtype: float64

In [None]:
# functions to pass to apply can also be user defined functions.

In [26]:
df['c1'].unique() # all the elements are unique in this column

array([0.37029392, 0.20528401, 0.10465592, 0.46622704])

In [28]:
df['c1'].nunique()

4

In [33]:
df['c1'].value_counts() # number of occurrences of each unique value. In R we simply use table(df['c1']) to get the same counts

0.466227    1
0.104656    1
0.205284    1
0.370294    1
Name: c1, dtype: int64

### deleting or dropping columns

In [49]:
df = df.drop('c0', axis=1) # c0 must be dropped here: the frame displayed and used from this point on has only c1..c4
df

Unnamed: 0,c1,c2,c3,c4
r0,0.370294,0.11188,0.779248,0.633542
r1,0.205284,0.647226,0.784056,0.504624
r2,0.104656,0.468537,0.520382,0.571355
r3,0.466227,0.76424,0.662658,0.200032


In [51]:
df.drop('c1', axis=1) # in R: df['c1'] = NULL
# remember, this does not mutate the original df. inplace=True (or reassigning the result) will take care of that

Unnamed: 0,c2,c3,c4
r0,0.11188,0.779248,0.633542
r1,0.647226,0.784056,0.504624
r2,0.468537,0.520382,0.571355
r3,0.76424,0.662658,0.200032


In [52]:
df

Unnamed: 0,c1,c2,c3,c4
r0,0.370294,0.11188,0.779248,0.633542
r1,0.205284,0.647226,0.784056,0.504624
r2,0.104656,0.468537,0.520382,0.571355
r3,0.466227,0.76424,0.662658,0.200032


In [53]:
df.drop('r0', axis=0) # we can also delete rows (axis=0 means the row index)

Unnamed: 0,c1,c2,c3,c4
r1,0.205284,0.647226,0.784056,0.504624
r2,0.104656,0.468537,0.520382,0.571355
r3,0.466227,0.76424,0.662658,0.200032


In [54]:
# another way to delete a column is the del statement (unlike drop, it modifies df itself)

In [55]:
del df['c1']

In [56]:
df

Unnamed: 0,c2,c3,c4
r0,0.11188,0.779248,0.633542
r1,0.647226,0.784056,0.504624
r2,0.468537,0.520382,0.571355
r3,0.76424,0.662658,0.200032


In [57]:
npa = np.random.rand(4,5)

In [58]:
npa

array([[0.48776618, 0.15425156, 0.33342979, 0.17013809, 0.87705742],
       [0.91458224, 0.34088094, 0.32272415, 0.42838838, 0.03662541],
       [0.47008211, 0.21498352, 0.63847015, 0.67712505, 0.35748363],
       [0.15016948, 0.36995678, 0.52115732, 0.16370203, 0.98617901]])

In [59]:
npa.drop(0,1) # no drop method for numpy object.

AttributeError: 'numpy.ndarray' object has no attribute 'drop'

In [60]:
np.delete(npa, 0, axis=1) # returns a copy of npa without column 0 (axis=1 selects columns)

array([[0.15425156, 0.33342979, 0.17013809, 0.87705742],
       [0.34088094, 0.32272415, 0.42838838, 0.03662541],
       [0.21498352, 0.63847015, 0.67712505, 0.35748363],
       [0.36995678, 0.52115732, 0.16370203, 0.98617901]])

In [61]:
### sorting

In [62]:
df

Unnamed: 0,c2,c3,c4
r0,0.11188,0.779248,0.633542
r1,0.647226,0.784056,0.504624
r2,0.468537,0.520382,0.571355
r3,0.76424,0.662658,0.200032


In [64]:
df.sort_values('c2', axis=0) # sort the rows by column c2. In R we can use order(), which gives sorted indices that can then be used in df[order(),]

Unnamed: 0,c2,c3,c4
r0,0.11188,0.779248,0.633542
r2,0.468537,0.520382,0.571355
r1,0.647226,0.784056,0.504624
r3,0.76424,0.662658,0.200032


## reading and writing

In [65]:
# typically pd.read_ will give many options to read and df.to_ many options to write
# pd.read_csv, pd.read_excel, df.to_csv etc..

In [None]:
#