[View in Colaboratory](https://colab.research.google.com/github/datasciencemastery/pandas-numpy-tutorial/blob/master/Numpy_and_Pandas_Tutorial.ipynb)

# Numpy

A numpy array is a grid of values, all of the same type, and is indexed by a tuple of nonnegative integers. The number of dimensions is the rank of the array; the shape of an array is a tuple of integers giving the size of the array along each dimension.

We can initialize numpy arrays from nested Python lists, and access elements using square brackets:

In [0]:
import numpy as np

a = np.array([100, 200, 300])   #Rank 1 array
print(type(a))            # Prints "<class 'numpy.ndarray'>"
print(a.shape)            # Prints "(3,)"
print(a.ndim)             # Prints 1

print(a[0], a[1], a[2])   # Prints "100 200 300"

a[0] = 500                  # Change an element of the array
print(a)                  # Prints "[500, 200, 300]"

b = np.array([[1,2,3],[4,5,6]])    # Create a rank 2 array
print(b.shape)   # Prints "(2, 3)"
print (b.ndim)   # Prints 2
print(b[0, 0], b[0, 1], b[1, 0])   # Prints "1 2 4"

<class 'numpy.ndarray'>
(3,)
1
100 200 300
[500 200 300]
(2, 3)
2
1 2 4


https://www.quora.com/In-Python-NumPy-what-is-a-dimension-and-axis


In [0]:
# Numpy Matrix Operations

import numpy as np


#   1 2      5 6      6  8
#         +        =    
#   3 4      7 8      10 12 

x = np.array([[1,2],[3,4]], dtype=np.float64)
y = np.array([[5,6],[7,8]], dtype=np.float64)

# Elementwise sum; both produce the array
# [[ 6.0  8.0]
#  [10.0 12.0]]
print(x + y)
print(np.add(x, y))


#   1 2      5 6      -4  -4
#         -        =    
#   3 4      7 8      -4  -4 


# Elementwise difference; both produce the array
# [[-4.0 -4.0]
#  [-4.0 -4.0]]
print(x - y)
print(np.subtract(x, y))


#   1 2      5 6       5  12
#         *        =    
#   3 4      7 8      21  32 


# Elementwise product; both produce the array
# [[ 5.0 12.0]
#  [21.0 32.0]]
print(x * y)
print(np.multiply(x, y))

# Elementwise division; both produce the array
# [[ 0.2         0.33333333]
#  [ 0.42857143  0.5       ]]

#   1 2      5 6       0.2  0.3333
#         /        =    
#   3 4      7 8       0.428  0.5 

print(x / y)
print(np.divide(x, y))

# Elementwise square root; produces the array
# [[ 1.          1.41421356]
#  [ 1.73205081  2.        ]]
print(np.sqrt(x))

[[ 6.  8.]
 [10. 12.]]
[[ 6.  8.]
 [10. 12.]]
[[-4. -4.]
 [-4. -4.]]
[[-4. -4.]
 [-4. -4.]]
[[ 5. 12.]
 [21. 32.]]
[[ 5. 12.]
 [21. 32.]]
[[0.2        0.33333333]
 [0.42857143 0.5       ]]
[[0.2        0.33333333]
 [0.42857143 0.5       ]]
[[1.         1.41421356]
 [1.73205081 2.        ]]


[Google SpreadSheet Link](https://docs.google.com/spreadsheets/d/1yim_nrrNpWK0D_oBeP2aD94Z6mgzDVv8SXDT5AeoZvg/edit?usp=sharing)

In [0]:
import numpy as np

x = np.array([[1,2],[3,4]])
y = np.array([[5,6],[7,8]])

v = np.array([9,10])
w = np.array([11, 12])


# 9 10  dot product   11 12



# Inner product of vectors; both produce 219
print(v.dot(w))
print(np.dot(v, w))

# Matrix / vector product; both produce the rank 1 array [29 67]
print(x.dot(v))
print(np.dot(x, v))

# Matrix / matrix product; both produce the rank 2 array
# [[19 22]
#  [43 50]]
print(x.dot(y))
print(np.dot(x, y))

219
219
[29 67]
[29 67]
[[19 22]
 [43 50]]
[[19 22]
 [43 50]]


In [0]:
import numpy as np

a = np.zeros((2,2))   # Create an array of all zeros
print(a,'\n')              # Prints "[[ 0.  0.]
                      #          [ 0.  0.]]"

b = np.ones((1,2))    # Create an array of all ones
print(b,'\n')              # Prints "[[ 1.  1.]]"

c = np.full((2,2), 7)  # Create a constant array
print(c,'\n')               # Prints "[[ 7.  7.]
                       #          [ 7.  7.]]"

d = np.eye(2)         # Create a 2x2 identity matrix
print(d,'\n')              # Prints "[[ 1.  0.]
                      #          [ 0.  1.]]"

e = np.random.random((2,2))  # Create an array filled with random values
print(e,'\n')                     # Might print "[[ 0.91940167  0.08143941]
                             #               [ 0.68744134  0.87236687]]"

[[0. 0.]
 [0. 0.]] 

[[1. 1.]] 

[[7 7]
 [7 7]] 

[[1. 0.]
 [0. 1.]] 

[[0.65673667 0.94189513]
 [0.6360079  0.0879803 ]] 



In [0]:
# Similar to Python lists, numpy arrays can be sliced. Since arrays may be multidimensional, 
# you must specify a slice for each dimension of the array:


import numpy as np

# Create the following rank 2 array with shape (3, 4)
# [[ 1  2  3  4]
#  [ 5  6  7  8]
#  [ 9 10 11 12]]
a = np.array([[1,2,3,4], [5,6,7,8], [9,10,11,12]])

# Use slicing to pull out the subarray consisting of the first 2 rows
# and columns 1 and 2; b is the following array of shape (2, 2):
# [[2 3]
#  [6 7]]
b = a[:2, 1:3]

print (b,'\n')

# A slice of an array is a view into the same data, so modifying it
# will modify the original array.
print(a[0, 1])   # Prints "2"
b[0, 0] = 77     # b[0, 0] is the same piece of data as a[0, 1]
print(a[0, 1],'\n')   # Prints "77"

print (b,'\n')

print (a,'\n')

[[2 3]
 [6 7]] 

2
77 

[[77  3]
 [ 6  7]] 

[[ 1 77  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]] 



In [0]:
# One can also mix integer indexing with slice indexing. 
# However, doing so will yield an array of lower rank than the original array


import numpy as np

# Create the following rank 2 array with shape (3, 4)
# [[ 1  2  3  4]
#  [ 5  6  7  8]
#  [ 9 10 11 12]]
a = np.array([[1,2,3,4], [5,6,7,8], [9,10,11,12]])

# Two ways of accessing the data in the middle row of the array.
# Mixing integer indexing with slices yields an array of lower rank,
# while using only slices yields an array of the same rank as the
# original array:
row_r1 = a[1, :]    # Rank 1 view of the second row of a
row_r2 = a[1:2, :]  # Rank 2 view of the second row of a
print(row_r1, row_r1.shape)  # Prints "[5 6 7 8] (4,)"
print(row_r2, row_r2.shape)  # Prints "[[5 6 7 8]] (1, 4)"

# We can make the same distinction when accessing columns of an array:
col_r1 = a[:, 1]
col_r2 = a[:, 1:2]
print(col_r1, col_r1.shape)  # Prints "[ 2  6 10] (3,)"
print(col_r2, col_r2.shape)  # Prints "[[ 2]
                             #          [ 6]
                             #          [10]] (3, 1)"

[5 6 7 8] (4,)
[[5 6 7 8]] (1, 4)
[ 2  6 10] (3,)
[[ 2]
 [ 6]
 [10]] (3, 1)


In [0]:

# Integer array indexing: When you index into numpy arrays using slicing, 
# the resulting array view will always be a subarray of the original array. 
# In contrast, integer array indexing allows you to construct arbitrary arrays using the data from another array. 
# Here is an example:
  
import numpy as np

a = np.array([[1,2], [3, 4], [5, 6]])

# An example of integer array indexing.
# The returned array will have shape (3,) and
print(a[[0, 1, 2], [0, 1, 0]])  # Prints "[1 4 5]"

# The above example of integer array indexing is equivalent to this:
print(np.array([a[0, 0], a[1, 1], a[2, 0]]))  # Prints "[1 4 5]"

# When using integer array indexing, you can reuse the same
# element from the source array:
print(a[[0, 0], [1, 1]])  # Prints "[2 2]"

# Equivalent to the previous integer array indexing example
print(np.array([a[0, 1], a[0, 1]]))  # Prints "[2 2]"

[1 4 5]
[1 4 5]
[2 2]
[2 2]


In [0]:
#Boolean array indexing: Boolean array indexing lets you pick out arbitrary elements of an array.
#Frequently this type of indexing is used to select the elements of an array that satisfy some condition. 


import numpy as np

a = np.array([[1,2], [3, 4], [5, 6]])

bool_idx = (a > 2)   # Find the elements of a that are bigger than 2;
                     # this returns a numpy array of Booleans of the same
                     # shape as a, where each slot of bool_idx tells
                     # whether that element of a is > 2.

print(bool_idx)      # Prints "[[False False]
                     #          [ True  True]
                     #          [ True  True]]"

# We use boolean array indexing to construct a rank 1 array
# consisting of the elements of a corresponding to the True values
# of bool_idx
print(a[bool_idx])  # Prints "[3 4 5 6]"

# We can do all of the above in a single concise statement:
print(a[a > 2])     # Prints "[3 4 5 6]"

[[False False]
 [ True  True]
 [ True  True]]
[3 4 5 6]
[3 4 5 6]


In [0]:
#One useful trick with integer array indexing is selecting or mutating one element from each row of a matrix:

import numpy as np

# Create a new array from which we will select elements
a = np.array([[1,2,3], [4,5,6], [7,8,9], [10, 11, 12]])

print(a)  # prints "array([[ 1,  2,  3],
          #                [ 4,  5,  6],
          #                [ 7,  8,  9],
          #                [10, 11, 12]])"

      
print(a.shape)      
# Create an array of indices
b = np.array([0, 2, 0, 1])

# Select one element from each row of a using the indices in b
# (0,1,2,3) ([0, 2, 0, 1])
print(a[np.arange(4), b])  # Prints "[ 1  6  7 11]"

# Mutate one element from each row of a using the indices in b
a[np.arange(4), b] += 10

print(a)  # prints "array([[11,  2,  3],
          #                [ 4,  5, 16],
          #                [17,  8,  9],
          #                [10, 21, 12]])

[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]]
(4, 3)
[ 1  6  7 11]
[[11  2  3]
 [ 4  5 16]
 [17  8  9]
 [10 21 12]]


In [0]:
#Numpy provides many useful functions for performing computations on arrays; one of the most useful is sum

import numpy as np


x = np.array([[1,2],[3,4]])


# 1,2
# 3,4

print(np.sum(x))  # Compute sum of all elements; prints "10"
print(np.sum(x, axis=0))  # Compute sum of each column; prints "[4 6]"
print(np.sum(x, axis=1))  # Compute sum of each row; prints "[3 7]"

10
[4 6]
[3 7]


In [0]:
# Python Program illustrating
# numpy.reshape() method
 
import numpy as np
 
array = np.arange(8)
print("Original array : \n", array)
 
# shape array with 2 rows and 2 columns
array = np.arange(4).reshape(2, 2)
print("\narray reshaped with 2 rows and 2 columns : \n", array)

# shape array with 4 rows and 2 columns
array = np.arange(8).reshape(4, 2)
print("\narray reshaped with 4 rows and 2 columns : \n", array)
 
 
# shape array with 2 rows and 4 columns
array = np.arange(8).reshape(4 ,2)
print("\narray reshaped with 4 rows and 2 columns : \n", array)
 
# Constructs 3D array
array = np.arange(8).reshape(2, 2, 2)
print("\nOriginal array reshaped to 3D : \n", array)


Original array : 
 [0 1 2 3 4 5 6 7]

array reshaped with 2 rows and 2 columns : 
 [[0 1]
 [2 3]]

array reshaped with 4 rows and 2 columns : 
 [[0 1]
 [2 3]
 [4 5]
 [6 7]]

array reshaped with 4 rows and 2 columns : 
 [[0 1]
 [2 3]
 [4 5]
 [6 7]]

Original array reshaped to 3D : 
 [[[0 1]
  [2 3]]

 [[4 5]
  [6 7]]]


In [0]:
from numpy import array
# list of data
data = [[11, 22],
		[33, 44],
		[55, 66]]
# array of data
data = array(data)
print('Rows: %d' % data.shape[0])
print('Cols: %d' % data.shape[1])

Rows: 3
Cols: 2


In [0]:

# reshape 1D array to 2D Array
from numpy import array
from numpy import reshape
# define array
data = array([11, 22, 33, 44, 55])
print(data.shape)
# reshape
print (data.shape[0])
data = data.reshape((data.shape[0], 1))
print(data.shape)

print (data)

(5,)
5
(5, 1)
[[11]
 [22]
 [33]
 [44]
 [55]]


In [0]:
# A tensor that contains only one number is called a scalar (or scalar tensor, or 0-dimensional tensor, or 0D tensor).
# In Numpy, a float32 or float64 number is a scalar tensor (or scalar array).

import numpy as np
x = np.array(12)
print (x.ndim)

0


In [0]:
#An array of numbers is called a vector, or 1D tensor. 
# A 1D tensor is said to have exactly one axis. Following is a Numpy vector:

a = np.array([100, 200, 300])   #Rank 1 array
print(a.ndim)             # Prints 1

1


In [0]:
# Matrices (2D tensors)
# An array of vectors is a matrix, or 2D tensor.
# A matrix has two axes (often referred to rows and columns).
# You can visually interpret a matrix as a rectangular grid of numbers. This is a Numpy matrix:

x = np.array([[5, 78, 2, 34, 0],
                  [6, 79, 3, 35, 1],
                  [7, 80, 4, 36, 2]])

print (x.ndim)

2


In [0]:
# If you pack such matrices in a new array, you obtain a 3D tensor, 
# which you can visually interpret as a cube of numbers. Following is a Numpy 3D tensor

x = np.array([[[5, 78, 2, 34, 0],
                   [6, 79, 3, 35, 1],
                   [7, 80, 4, 36, 2]],
                  [[5, 78, 2, 34, 0],
                   [6, 79, 3, 35, 1],
                   [7, 80, 4, 36, 2]],
                  [[5, 78, 2, 34, 0],
                   [6, 79, 3, 35, 1],
                   [7, 80, 4, 36, 2]]])

print (x)

print (x.ndim)

[[[ 5 78  2 34  0]
  [ 6 79  3 35  1]
  [ 7 80  4 36  2]]

 [[ 5 78  2 34  0]
  [ 6 79  3 35  1]
  [ 7 80  4 36  2]]

 [[ 5 78  2 34  0]
  [ 6 79  3 35  1]
  [ 7 80  4 36  2]]]
3


In [0]:
# In deep learning, you’ll generally manipulate tensors that are 0D to 4D

In [0]:
from keras.datasets import mnist
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()
print(train_images.ndim)
print(train_images.shape)
print(train_images.dtype)

#So what we have here is a 3D tensor of 8-bit integers. 
#More precisely, it’s an array of 60,000 matrices of 28 × 28 integers. 
#Each such matrix is a grayscale image, with coefficients between 0 and 255.

3
(60000, 28, 28)
uint8


Real-world examples of data tensors

**Vector data**— 2D tensors of shape (samples, features)

**Timeseries data or sequence data**— 3D tensors of shape (samples, timesteps, features)

**Images**— 4D tensors of shape (samples, height, width, channels) or (samples, channels, height, width)

**Video**— 5D tensors of shape (samples, frames, height, width, channels)


In [0]:
# The simplest example of this type of operation is transposing a matrix; 
# to transpose a matrix, simply use the T attribute of an array object:


import numpy as np

x = np.array([[1,2], [3,4]])
print(x)    # Prints "[[1 2]
            #          [3 4]]"
print(x.T)  # Prints "[[1 3]
            #          [2 4]]"

# Note that taking the transpose of a rank 1 array does nothing:
v = np.array([1,2,3])
print(v)    # Prints "[1 2 3]"
print(v.T)  # Prints "[1 2 3]"

[[1 2]
 [3 4]]
[[1 3]
 [2 4]]
[1 2 3]
[1 2 3]


Broadcasting is a powerful mechanism that allows numpy to work with arrays of different shapes 
when performing arithmetic operations. 
Frequently we have a smaller array and a larger array, 
and we want to use the smaller array multiple times to perform some operation on the larger array.

For example, suppose that we want to add a constant vector to each row of a matrix. 

In [0]:
import numpy as np

# We will add the vector v to each row of the matrix x,
# storing the result in the matrix y
x = np.array([[1,2,3], [4,5,6], [7,8,9], [10, 11, 12]])
v = np.array([1, 0, 1])
y = np.empty_like(x)   # Create an empty matrix with the same shape as x

print (y)

# Add the vector v to each row of the matrix x with an explicit loop
for i in range(4):
    y[i, :] = x[i, :] + v

# Now y is the following
# [[ 2  2  4]
#  [ 5  5  7]
#  [ 8  8 10]
#  [11 11 13]]
print(y)



[[       93583360               0             833]
 [       94290128 140340098204040 140340098204040]
 [140340098204040 140340098204040               0]
 [             13     34359738369               0]]
[[ 2  2  4]
 [ 5  5  7]
 [ 8  8 10]
 [11 11 13]]


In [0]:

import numpy as np

# We will add the vector v to each row of the matrix x,
# storing the result in the matrix y
x = np.array([[1,2,3], [4,5,6], [7,8,9], [10, 11, 12]])
v = np.array([1, 0, 1])
y = x + v  # Add v to each row of x using broadcasting
print(y)  # Prints "[[ 2  2  4]
          #          [ 5  5  7]
          #          [ 8  8 10]
          #          [11 11 13]]"
      
#The line y = x + v works even though x has shape (4, 3) 
#and v has shape (3,) due to broadcasting; this line works as if v actually had shape (4, 3), 
#where each row was a copy of v, and the sum was performed elementwise.      

# Pandas

A pandas series is similar to numpy arrays but with more functionality. 


*   It has functions like describe which will print mean, standard deviation and other statistics function. This function is not available in numpy arrays

*   One can access elements just like numpy arrays using slice notation like 
                pandaseries[0], 
                pandaseries[0:7]
                
*  One can use loops like for i in pandaseries
*  All  the convenient functions like mean, sum, max are also availabe in Pandas.
*  All the vectorized operations are also available in  Panda series.
* They are implemented using C just like Numpy Arrays and hence are fast.





In [30]:
import numpy as np
import pandas as pd

a = np.array([1,2,3,4])
series = pd.Series([1,2,3,4])


print (series.describe(),'\n')  # These functions are not available in numpy arrays






count    4.000000
mean     2.500000
std      1.290994
min      1.000000
25%      1.750000
50%      2.500000
75%      3.250000
max      4.000000
dtype: float64 



In [31]:
print (series[0],'\n')  # One can access using the same way like numpy array
print (series[:2],'\n')  # same slice notation.




1 

0    1
1    2
dtype: int64 



In [32]:
for d in series:
  print (d)
  


1
2
3
4


In [33]:
print ('\n')  

print ('mean', series.mean())  
print ('std',  series.std())
print ('max', series.max())



mean 2.5
std 1.2909944487358056
max 4


In [35]:
# Vectorized operations and index arrays
a = pd.Series([1, 2, 3, 4])
b = pd.Series([1, 2, 1, 2])
  
print (a + b)
print (a * 2)
print (a >= 3)
print (a[a >= 3])

0    2
1    4
2    4
3    6
dtype: int64
0    2
1    4
2    6
3    8
dtype: int64
0    False
1    False
2     True
3     True
dtype: bool
2    3
3    4
dtype: int64


In [43]:
# Panda Series Index

population = pd.Series([1415045928,1354051854,326766748], index = ["China", "India", "US"])

print (population,'\n')

# Numpy arrays are like superman version of list
# A Panda Series is like a mix of list and dictionary

print ('Population[0] =', population [0])
print ('Population[\'India\']=',population['India'] )

China    1415045928
India    1354051854
US        326766748
dtype: int64 

Population[0] = 1415045928
Population['India']= 1354051854


In [47]:
# If no indexes are specified Pandas creates Index
numseries = pd.Series([200,400,800])
print (numseries)

0    200
1    400
2    800
dtype: int64


In [50]:
# iloc
population = pd.Series([1415045928,1354051854,326766748], index = ["China", "India", "US"])
print (population.iloc[0]) # Accessing using index


1415045928


In [73]:


# Finding out maximum population
import pandas as pd


print (pd.__version__)

population = pd.Series(data = [1415045928,1354051854,326766748], index = ["China", "India", "US"])


print ("Country with maximum population =",population.values.argmax())
print ("The max population is =",population.iloc[population.values.argmax()])




0.22.0
Country with maximum population = 0
The max population is = 1415045928


In [76]:
import pandas as pd


# Addition when indexes are the same
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([100, 200, 300, 400], index=['a', 'b', 'c', 'd'])
print (s1 + s2)



a    101
b    202
c    303
d    404
dtype: int64


In [77]:
# Indexes have same elements in a different order

s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([100, 200, 300, 400], index=['b', 'd', 'a', 'c'])
print (s1 + s2)



a    301
b    102
c    403
d    204
dtype: int64


In [78]:
# Indexes overlap, but do not have exactly the same elements

s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([100, 200, 300, 400], index=['c', 'd', 'e', 'f'])
print (s1 + s2)



a      NaN
b      NaN
c    103.0
d    204.0
e      NaN
f      NaN
dtype: float64


In [79]:
# Indexes do not overlap

s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['e', 'f', 'g', 'h'])
print (s1 + s2)

a   NaN
b   NaN
c   NaN
d   NaN
e   NaN
f   NaN
g   NaN
h   NaN
dtype: float64


In [83]:
# Using dropna

s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'g', 'h'])
result  =  s1 + s2
print (result.dropna())

a    11.0
b    22.0
dtype: float64


In [84]:
# Using dropna

s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'g', 'h'])
result  =  s1.add(s2,fill_value=0)
print (result.dropna())

a    11.0
b    22.0
c     3.0
d     4.0
g    30.0
h    40.0
dtype: float64


In [91]:
# Using Pandas Apply


def make_capital(str):
  return str.capitalize()


s1 = pd.Series(['india', 'china', 'brazil'], index=['a', 'b', 'c'])
s2 = s1.apply(make_capital)

print (s2)
                
                
                
                


a     India
b     China
c    Brazil
dtype: object


In [0]:
# Plotting values
population = pd.Series(data = [1415045928,1354051854,326766748], index = ["China", "India", "US"])
population.plot()


In [106]:
# Pandas DataFrame

import pandas as pd

country_df = pd.DataFrame({
    'country':['India','China', 'USA'],
    'population':[1415045928,1354051854,326766748], 
    'capital':['Delhi','Bejing','Washington']
    
})

print (country_df,'\n')

print (country_df.mean())


      capital country  population
0       Delhi   India  1415045928
1      Bejing   China  1354051854
2  Washington     USA   326766748 

population    1.031955e+09
dtype: float64


In [113]:
# Pandas DataFrame

import pandas as pd

country_df = pd.DataFrame({
    'population':[1415045928,1354051854,326766748], 
    'capital':['Delhi','Bejing','Washington'],
    'gdp':[2848231,14092514,20412870]
    
},
index = ['India','China', 'USA'],

)

print (country_df,'\n')
print (country_df.loc['India'],'\n')

print (country_df.iloc[0])


          capital       gdp  population
India       Delhi   2848231  1415045928
China      Bejing  14092514  1354051854
USA    Washington  20412870   326766748 

capital            Delhi
gdp              2848231
population    1415045928
Name: India, dtype: object 

capital            Delhi
gdp              2848231
population    1415045928
Name: India, dtype: object


In [120]:
# Pandas DataFrame

import pandas as pd

country_df = pd.DataFrame({
    'population':[1415045928,1354051854,326766748], 
    'capital':['Delhi','Bejing','Washington'],
    'gdp':[2848231,14092514,20412870]
    
},
index = ['India','China', 'USA'],

)


print (country_df.loc['India','gdp'],'\n')
print (country_df.iloc[0,1],'\n')

2848231 

2848231 



In [122]:
# Pandas DataFrame

import pandas as pd

country_df = pd.DataFrame({
    'population':[1415045928,1354051854,326766748], 
    'capital':['Delhi','Bejing','Washington'],
    'gdp':[2848231,14092514,20412870]
    
},
index = ['India','China', 'USA'],

)

print (country_df['gdp'],'\n')



India     2848231
China    14092514
USA      20412870
Name: gdp, dtype: int64 

capital            Delhi
gdp              2848231
population    1415045928
Name: India, dtype: object 



In [123]:
# Pandas DataFrame output as numpy values

import pandas as pd

country_df = pd.DataFrame({
    'population':[1415045928,1354051854,326766748], 
    'capital':['Delhi','Bejing','Washington'],
    'gdp':[2848231,14092514,20412870]
    
},
index = ['India','China', 'USA'],

)

print (country_df.values) # output as numpy values


[['Delhi' 2848231 1415045928]
 ['Bejing' 14092514 1354051854]
 ['Washington' 20412870 326766748]]


In [130]:
# Pandas axis

import pandas as pd

df = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})

print (df,'\n')

print (df.sum(),'\n')
print (df.sum(axis=1),'\n')
print (df.values.sum(),'\n')

   A  B
0  0  3
1  1  4
2  2  5 

A     3
B    12
dtype: int64 

0    3
1    5
2    7
dtype: int64 

15 



In [132]:
# Vectoroperations for data frames
import pandas as pd

#Examples of vectorized operations on DataFrames:
# Adding DataFrames with the column names

df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
df2 = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60], 'c': [70, 80, 90]})
print (df1 + df2)
    
# Adding DataFrames with overlapping column names 
df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
df2 = pd.DataFrame({'d': [10, 20, 30], 'c': [40, 50, 60], 'b': [70, 80, 90]})
print (df1 + df2)

# Adding DataFrames with overlapping row indexes
df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]},
                   index=['row1', 'row2', 'row3'])
df2 = pd.DataFrame({'a': [10, 20, 30], 'b': [40, 50, 60], 'c': [70, 80, 90]},
                   index=['row4', 'row3', 'row2'])
print (df1 + df2)

    a   b   c
0  11  44  77
1  22  55  88
2  33  66  99
    a   b   c   d
0 NaN  74  47 NaN
1 NaN  85  58 NaN
2 NaN  96  69 NaN
         a     b     c
row1   NaN   NaN   NaN
row2  32.0  65.0  98.0
row3  23.0  56.0  89.0
row4   NaN   NaN   NaN


In [134]:
import pandas as pd


# DataFrame applymap()

df = pd.DataFrame({
    'a': [1, 2, 3],
    'b': [10, 20, 30],
    'c': [5, 10, 15]
})

def add_one(x):
    return x + 1

print (df.applymap(add_one))
    

   a   b   c
0  2  11   6
1  3  21  11
2  4  31  16


In [0]:
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'a': [4, 5, 3, 1, 2],
    'b': [20, 10, 40, 50, 30],
    'c': [25, 20, 5, 15, 10]
})

# Change False to True for this block of code to see what it does

# DataFrame apply() - use case 2
if False:   
    print df.apply(np.mean)
    print df.apply(np.max)
    
def second_largest(df):
    '''
    Fill in this function to return the second-largest value of each 
    column of the input DataFrame.
    '''
    return None

In [0]:
#Adding a DataFrame to a Series

import pandas as pd

# Change False to True for each block of code to see what it does

# Adding a Series to a square DataFrame
if False:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({
        0: [10, 20, 30, 40],
        1: [50, 60, 70, 80],
        2: [90, 100, 110, 120],
        3: [130, 140, 150, 160]
    })
    
    print df
    print '' # Create a blank line between outputs
    print df + s
    
# Adding a Series to a one-row DataFrame 
if False:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({0: [10], 1: [20], 2: [30], 3: [40]})
    
    print df
    print '' # Create a blank line between outputs
    print df + s

# Adding a Series to a one-column DataFrame
if False:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({0: [10, 20, 30, 40]})
    
    print df
    print '' # Create a blank line between outputs
    print df + s
    

    
# Adding when DataFrame column names match Series index
if False:
    s = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    df = pd.DataFrame({
        'a': [10, 20, 30, 40],
        'b': [50, 60, 70, 80],
        'c': [90, 100, 110, 120],
        'd': [130, 140, 150, 160]
    })
    
    print df
    print '' # Create a blank line between outputs
    print df + s
    
# Adding when DataFrame column names don't match Series index
if False:
    s = pd.Series([1, 2, 3, 4])
    df = pd.DataFrame({
        'a': [10, 20, 30, 40],
        'b': [50, 60, 70, 80],
        'c': [90, 100, 110, 120],
        'd': [130, 140, 150, 160]
    })
    
    print df
    print '' # Create a blank line between outputs
    print df + s

In [152]:
!pip install gspread

from google.colab import auth
auth.authenticate_user()

import gspread
from oauth2client.client import GoogleCredentials

gc = gspread.authorize(GoogleCredentials.get_application_default())

# Open our new sheet and read some data.
worksheet = gc.open('Pandas DataSheet').sheet1

# get_all_values gives a list of rows.
rows = worksheet.get_all_values()
#print(rows)

# Convert to a DataFrame and render.
import pandas as pd
df = pd.DataFrame.from_records(rows)

header = df.iloc[0] #Replace the header value with the first row’s values
df = df[1:]    # Replace the dataframe with a new one which does not contain the first row


# Rename the dataframe's column values with the header variable
df = df.rename(columns = header)

print ("\n",df)



   itemcode           date amount
1      MAG  01-April-2018   1000
2      MAG  02-April-2018    100
3      ALP  01-April-2018   1000


In [3]:
# Reading from a CSV.

import pandas as pd

df = pd.read_csv('https://raw.githubusercontent.com/datasciencemastery/data-sets/master/data_sheet.csv',sep = ',', error_bad_lines=False)
print(df)

  itemcode           date  amount
0      MAG  01-April-2018    1000
1      MAG  02-April-2018     100
2      ALP  01-April-2018    1000
