http://www.dataquest.io/blog/numpy-tutorial-python/?utm_source=mybridge&utm_medium=blog&utm_campaign=read_more

# NumPy Tutorial: Data analysis with Python

## Lists Of Lists for CSV Data

In [1]:
import csv

# Read the semicolon-delimited CSV into a list of rows; every field is kept as a string.
# newline="" lets the csv module handle line endings itself, as the csv docs require
# for files passed to csv.reader.
with open("winequality-red.csv", "r", newline="") as f:
    wines = list(csv.reader(f, delimiter=";"))

In [2]:
# Show the first three rows: the header row followed by two data rows.
print(wines[:3])

[['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality'], ['7.4', '0.7', '0', '1.9', '0.076', '11', '34', '0.9978', '3.51', '0.56', '9.4', '5'], ['7.8', '0.88', '0', '2.6', '0.098', '25', '67', '0.9968', '3.2', '0.68', '9.8', '5']]


In [3]:
# Row 0 is the header, so wines[1] is the first data row; [-1] picks its last field (quality).
float(wines[1][-1])

5.0

In [4]:
# Last field (quality) of every data row, converted to float; wines[0] is the header row.
qualities = [float(row[-1]) for row in wines[1:]]

# Arithmetic mean of the quality scores.
sum(qualities) / len(qualities)

5.6360225140712945

## NumPy 2-Dimensional Arrays

### Creating A NumPy Array

In [5]:
import numpy as np

# One limitation of NumPy is that all the elements in an array have to be of the same type,
# so every field is converted to a float here. wines[1:] skips the header row.
# The builtin float is used because the np.float alias was deprecated in NumPy 1.20 and
# removed in 1.24; both select float64.
wines = np.array(wines[1:], dtype=float)

In [6]:
# Display the array; NumPy abbreviates large arrays with "...".
wines

array([[  7.4  ,   0.7  ,   0.   , ...,   0.56 ,   9.4  ,   5.   ],
       [  7.8  ,   0.88 ,   0.   , ...,   0.68 ,   9.8  ,   5.   ],
       [  7.8  ,   0.76 ,   0.04 , ...,   0.65 ,   9.8  ,   5.   ],
       ..., 
       [  6.3  ,   0.51 ,   0.13 , ...,   0.75 ,  11.   ,   6.   ],
       [  5.9  ,   0.645,   0.12 , ...,   0.71 ,  10.2  ,   5.   ],
       [  6.   ,   0.31 ,   0.47 , ...,   0.66 ,  11.   ,   6.   ]])

In [7]:
# (number of rows, number of columns).
wines.shape

(1599, 12)

### Alternative NumPy Array Creation Methods

In [8]:
# A 3x4 array in which every element is 0.
empty_array = np.zeros(shape=(3, 4))
empty_array

array([[ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.]])

In [9]:
# 3x4 array of random floats in [0, 1); no seed is set, so the values differ on every run.
np.random.rand(3,4)

array([[ 0.30795395,  0.06409003,  0.98019489,  0.57032971],
       [ 0.66432613,  0.64819354,  0.39555937,  0.96409974],
       [ 0.85488269,  0.52542442,  0.68580823,  0.85955421]])

### Using NumPy To Read In Files

In [10]:
# Re-read the file straight into a NumPy array, replacing the earlier wines value;
# skip_header=1 drops the header row.
wines = np.genfromtxt("winequality-red.csv", delimiter=";", skip_header=1)

### Indexing NumPy Arrays

In [11]:
# Element at row index 2, column index 3 (indices are zero-based).
wines[2,3]

2.2999999999999998

### Slicing NumPy Arrays

In [12]:
# Rows 0 through 2 of column 3 (the stop index 3 is excluded).
wines[0:3,3]

array([ 1.9,  2.6,  2.3])

In [13]:
# Same selection as wines[0:3,3]: an omitted start defaults to 0.
wines[:3,3]

array([ 1.9,  2.6,  2.3])

In [14]:
# Every row of column 3.
wines[:,3]

array([ 1.9,  2.6,  2.3, ...,  2.3,  2. ,  3.6])

In [15]:
# Every column of the row at index 3.
wines[3,:]

array([ 11.2  ,   0.28 ,   0.56 ,   1.9  ,   0.075,  17.   ,  60.   ,
         0.998,   3.16 ,   0.58 ,   9.8  ,   6.   ])

In [16]:
# Both axes sliced in full: the entire array.
wines[:,:]

array([[  7.4  ,   0.7  ,   0.   , ...,   0.56 ,   9.4  ,   5.   ],
       [  7.8  ,   0.88 ,   0.   , ...,   0.68 ,   9.8  ,   5.   ],
       [  7.8  ,   0.76 ,   0.04 , ...,   0.65 ,   9.8  ,   5.   ],
       ..., 
       [  6.3  ,   0.51 ,   0.13 , ...,   0.75 ,  11.   ,   6.   ],
       [  5.9  ,   0.645,   0.12 , ...,   0.71 ,  10.2  ,   5.   ],
       [  6.   ,   0.31 ,   0.47 , ...,   0.66 ,  11.   ,   6.   ]])

### Assigning Values To NumPy Arrays

In [17]:
# Overwrite a single element (row index 1, column index 5) in place.
wines[1,5] = 10

In [18]:
# Overwrite all of column 10 in place; the scalar 50 is assigned to every row.
wines[:,10] = 50

## 1-Dimensional NumPy Arrays

In [19]:
# Take the row at index 3 as a 1-dimensional array.
# NOTE(review): with zero-based indexing this is the fourth row, despite the variable name.
third_wine = wines[3,:]

In [20]:
# A 1-dimensional array takes a single index; this is the element at index 1.
third_wine[1]

0.28000000000000003

In [21]:
# 1-dimensional array of 3 random floats; no seed is set, so the values differ on every run.
np.random.rand(3)

array([ 0.19035389,  0.42359777,  0.8918197 ])

## N-Dimensional NumPy Arrays

In [22]:
# One year of earnings as a list of lists: 4 rows of 3 values each.
year_one = [
    [500, 505, 490],
    [810, 450, 678],
    [234, 897, 430],
    [560, 1023, 640],
]

In [23]:
# Two years of earnings as a three-level nested list: 2 years x 4 rows x 3 values.
earnings = [
    # first year
    [
        [500, 505, 490],
        [810, 450, 678],
        [234, 897, 430],
        [560, 1023, 640],
    ],
    # second year
    [
        [600, 605, 490],
        [345, 900, 1000],
        [780, 730, 710],
        [670, 540, 324],
    ],
]

In [24]:
# Convert the nested list into a NumPy array (rebinds the name earnings).
earnings = np.array(earnings)
earnings[0,0,0] # The earnings for January of the first year

500

In [25]:
# Length of each of the three axes.
earnings.shape

(2, 4, 3)

In [26]:
# ":" on the first axis keeps every year; the two 0 indices pick the first entry on the other axes.
earnings[:,0,0] # the earnings for January of all years

array([500, 600])

In [27]:
# Index 0 on the middle axis, everything on the first and last axes.
earnings[:,0,:] # the first quarter of every year

array([[500, 505, 490],
       [600, 605, 490]])

## NumPy Data Types

In [28]:
# Data type shared by every element of the array.
wines.dtype

dtype('float64')

## Converting Data Types

In [29]:
# astype returns a converted copy; converting float to int drops the fractional part.
wines.astype(int)

array([[ 7,  0,  0, ...,  0, 50,  5],
       [ 7,  0,  0, ...,  0, 50,  5],
       [ 7,  0,  0, ...,  0, 50,  5],
       ..., 
       [ 6,  0,  0, ...,  0, 50,  6],
       [ 5,  0,  0, ...,  0, 50,  5],
       [ 6,  0,  0, ...,  0, 50,  6]])

In [30]:
# Keep the integer copy and look up the name of its dtype.
# The default integer width depends on the platform; it is 'int64' in the output shown here.
int_wines = wines.astype(int)
int_wines.dtype.name

'int64'

In [31]:
# Request an explicit 32-bit integer type instead of the default int.
wines.astype(np.int32)

array([[ 7,  0,  0, ...,  0, 50,  5],
       [ 7,  0,  0, ...,  0, 50,  5],
       [ 7,  0,  0, ...,  0, 50,  5],
       ..., 
       [ 6,  0,  0, ...,  0, 50,  6],
       [ 5,  0,  0, ...,  0, 50,  5],
       [ 6,  0,  0, ...,  0, 50,  6]], dtype=int32)

## NumPy Array Operations

### Single Array Math

In [32]:
wines[:,11] + 10 # apply the operation to each of the elements in the array
# Note that the above operation won't change the wines array; the result is a new array

array([ 15.,  15.,  15., ...,  16.,  15.,  16.])

In [33]:
# NOTE: this cell is not idempotent; every re-run adds another 10 to column 11.
wines[:,11] += 10 # modify the array in place
wines[:,11]

array([ 15.,  15.,  15., ...,  16.,  15.,  16.])

In [34]:
# Multiply every element of column 11 by 2; wines itself is not modified.
wines[:,11] * 2

array([ 30.,  30.,  30., ...,  32.,  30.,  32.])

### Multiple Array Math

In [35]:
# Element-wise addition of two arrays of the same shape.
wines[:,11] + wines[:,11]

array([ 30.,  30.,  30., ...,  32.,  30.,  32.])

In [36]:
# Element-wise product of column 10 and column 11.
wines[:,10] * wines[:,11]

array([ 750.,  750.,  750., ...,  800.,  750.,  800.])

### Broadcasting

In [37]:
# Deliberate failure: a (1599, 12) array and a (2,) array cannot be broadcast together
# because their trailing dimensions (12 and 2) differ and neither is 1.
# The error is caught and printed so the notebook still survives "Restart & Run All".
try:
    wines * np.array([1,2])
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: operands could not be broadcast together with shapes (1599,12) (2,) 

In [38]:
# A 2x2 array and a 2-element array: the trailing dimensions match (2 and 2),
# so array_two is broadcast across each row of array_one.
array_one = np.array([[1, 2], [3, 4]])
array_two = np.array([4, 5])

array_one + array_two

array([[5, 7],
       [7, 9]])

In [39]:
# 12 random values, one per column of wines; broadcasting adds them to every row.
rand_array = np.random.rand(12)
wines + rand_array

array([[  8.08080979,   0.80627018,   0.60814501, ...,   1.00059724,
         50.40074141,  15.43005918],
       [  8.48080979,   0.98627018,   0.60814501, ...,   1.12059724,
         50.40074141,  15.43005918],
       [  8.48080979,   0.86627018,   0.64814501, ...,   1.09059724,
         50.40074141,  15.43005918],
       ..., 
       [  6.98080979,   0.61627018,   0.73814501, ...,   1.19059724,
         50.40074141,  16.43005918],
       [  6.58080979,   0.75127018,   0.72814501, ...,   1.15059724,
         50.40074141,  15.43005918],
       [  6.68080979,   0.41627018,   1.07814501, ...,   1.10059724,
         50.40074141,  16.43005918]])

## NumPy Array Methods

In [40]:
# Sum of all elements in column 11.
wines[:,11].sum()

25002.0

In [41]:
# axis=0 sums down the rows, giving one total per column.
wines.sum(axis=0)

array([ 13303.1    ,    843.985  ,    433.29   ,   4059.55   ,
          139.859  ,  25369.     ,  74302.     ,   1593.79794,
         5294.47   ,   1052.38   ,  79950.     ,  25002.     ])

In [42]:
# One total per column, so the result has 12 entries.
wines.sum(axis=0).shape

(12,)

In [43]:
# axis=1 sums across the columns, giving one total per row.
wines.sum(axis=1)

array([ 125.1438 ,  158.2548 ,  149.899  , ...,  149.48174,  155.01547,
        141.49249])

## NumPy Array Comparisons


In [44]:
# Element-wise comparison; the result is a boolean array.
wines[:,11] > 5

array([ True,  True,  True, ...,  True,  True,  True], dtype=bool)

In [45]:
# Element-wise equality test against 10.
wines[:,11] == 10

array([False, False, False, ..., False, False, False], dtype=bool)

## Subsetting

In [46]:
# Build a boolean mask over the rows, select the matching rows, then show the first three.
# NOTE(review): column 11 had 10 added in place by the earlier `+=` cell, so the
# threshold of 7 is applied to the shifted values, not the original scores.
high_quality = wines[:,11] > 7
wines[high_quality,:][:3,:]

array([[  7.40000000e+00,   7.00000000e-01,   0.00000000e+00,
          1.90000000e+00,   7.60000000e-02,   1.10000000e+01,
          3.40000000e+01,   9.97800000e-01,   3.51000000e+00,
          5.60000000e-01,   5.00000000e+01,   1.50000000e+01],
       [  7.80000000e+00,   8.80000000e-01,   0.00000000e+00,
          2.60000000e+00,   9.80000000e-02,   1.00000000e+01,
          6.70000000e+01,   9.96800000e-01,   3.20000000e+00,
          6.80000000e-01,   5.00000000e+01,   1.50000000e+01],
       [  7.80000000e+00,   7.60000000e-01,   4.00000000e-02,
          2.30000000e+00,   9.20000000e-02,   1.50000000e+01,
          5.40000000e+01,   9.97000000e-01,   3.26000000e+00,
          6.50000000e-01,   5.00000000e+01,   1.50000000e+01]])

In [47]:
# Combine two conditions element-wise with &; each comparison needs its own
# parentheses because & binds more tightly than >.
high_quality_and_alcohol = (wines[:,10] > 10) & (wines[:,11] > 7)
# Columns 10 onward of the rows where the mask is True.
wines[high_quality_and_alcohol,10:]

array([[ 50.,  15.],
       [ 50.,  15.],
       [ 50.,  15.],
       ..., 
       [ 50.,  16.],
       [ 50.,  15.],
       [ 50.,  16.]])

In [48]:
# Recompute the same mask, then assign 20 in place to columns 10 onward of every matching row.
high_quality_and_alcohol = (wines[:,10] > 10) & (wines[:,11] > 7)
wines[high_quality_and_alcohol,10:] = 20

## Reshaping NumPy Arrays

In [49]:
# Transposing swaps the two axes, so rows become columns.
np.transpose(wines).shape

(12, 1599)

In [50]:
# Flatten the 2-dimensional array into 1 dimension, row by row.
wines.ravel()

array([  7.4 ,   0.7 ,   0.  , ...,   0.66,  20.  ,  20.  ])

In [51]:

# Small 2x4 example that makes the flattening order easy to see.
array_one = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])

array_one.ravel()

array([1, 2, 3, 4, 5, 6, 7, 8])

In [52]:
# Rearrange the 12 values of the row at index 1 into 2 rows of 6.
wines[1,:].reshape((2,6))

array([[  7.8   ,   0.88  ,   0.    ,   2.6   ,   0.098 ,  10.    ],
       [ 67.    ,   0.9968,   3.2   ,   0.68  ,  20.    ,  20.    ]])

## Combining NumPy Arrays

In [53]:
# Load the white-wine file with the same settings used for the red-wine file, then check its shape.
white_wines = np.genfromtxt("winequality-white.csv", delimiter=";", skip_header=1)
white_wines.shape

(4898, 12)

In [54]:
# Stack the two arrays vertically: the rows of white_wines are appended below the rows of wines.
all_wines = np.vstack((wines, white_wines))
all_wines.shape

(6497, 12)

In [55]:
# Concatenating along the first axis (axis=0) is similar to vstack;
# concatenating along the second axis (axis=1) is similar to hstack.
np.concatenate((wines, white_wines), axis=0).shape

(6497, 12)