In [1]:
# Matrix Processing and NumPy

In [2]:
import numpy as np
import json

In [5]:
# Location of the review dump that the following cells read line by line.
# NOTE(review): absolute, machine-specific path -- adjust for your environment.
path = '/home/review.json'
# Pass the encoding explicitly instead of relying on the platform default
# (presumably the file is UTF-8, the standard encoding for JSON -- confirm).
# The handle is intentionally left open: a later cell calls f.readline() on it.
f = open(path, encoding='utf-8')

In [6]:
# Accumulator for the parsed review records (filled by the reading loop below).
dataset = []

In [7]:
# Parse up to 50000 reviews, one JSON document per line of the open file.
while len(dataset) < 50000:
    line = f.readline()
    if not line:
        # readline() returns '' at end of file; stop here instead of letting
        # json.loads('') raise when the file has fewer than 50000 records.
        break
    dataset.append(json.loads(line))

In [8]:
# Inspect the first parsed record to see which fields a review carries.
dataset[0]

{'review_id': 'Q1sbwvVQXV2734tPgoKj4Q',
 'user_id': 'hG7b0MtEbXx5QzbzE6C_VA',
 'business_id': 'ujmEBvifdJM6h6RLv4wQIg',
 'stars': 1.0,
 'useful': 6,
 'funny': 1,
 'cool': 0,
 'text': 'Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.',
 'date': '2013-05-07 04:34:36'}

In [9]:
# First extract a few simple numerical features from the dataset

In [10]:
# Star rating of every review, in dataset order.
ratings = [review['stars'] for review in dataset]

In [12]:
# 'cool' value of every review, in dataset order.
cool = [review['cool'] for review in dataset]

In [13]:
# 'funny' value of every review, in dataset order.
funny = [review['funny'] for review in dataset]

In [14]:
# 'useful' value of every review, in dataset order.
useful = [review['useful'] for review in dataset]

In [15]:
# Now we will convert these lists into NumPy arrays

In [16]:
# Rebind each feature name from its Python list to an equivalent NumPy array.
ratings, cool, funny, useful = (
    np.array(values) for values in (ratings, cool, funny, useful)
)

In [17]:
# Display the converted array; NumPy abbreviates long arrays with '...'.
ratings

array([1., 5., 5., ..., 4., 2., 5.])

In [18]:
# NumPy arrays can be treated much like regular Python lists but support many more operations

In [19]:
# Arithmetic mean of all star ratings (ndarray method form).
ratings.mean()

3.74318

In [20]:
# Variance of the 'cool' values (ndarray method form).
cool.var()

3.81934111

In [21]:
# We can compose vectors to build ND-arrays

In [23]:
# Stack the three 1-D feature vectors as rows of a single 2-D array.
np.stack((cool, funny, useful), axis=0)

array([[0, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [6, 0, 3, ..., 2, 1, 0]])

In [26]:
# We can perform matrix operations like computing the transpose to get a feature matrix (X)

In [24]:
# Stack the vectors as rows, then transpose so that each row is one review
# and each column is one feature (cool, funny, useful).
features = np.transpose(np.stack((cool, funny, useful)))

In [25]:
# Display the feature matrix: one row per review, one column per feature.
features

array([[0, 1, 6],
       [0, 0, 0],
       [0, 0, 3],
       ...,
       [1, 0, 2],
       [0, 0, 1],
       [0, 0, 0]])

In [28]:
# Wrap the 2-D array in np.matrix so that `*` below means matrix product.
# NOTE(review): the NumPy documentation no longer recommends np.matrix; plain
# ndarrays with the `@` operator are the preferred replacement. Switching here
# would also require changing the later cells that rely on `*`.
features = np.matrix(features)

In [29]:
# Same values as before; the repr now reports `matrix` instead of `array`.
features

matrix([[0, 1, 6],
        [0, 0, 0],
        [0, 0, 3],
        ...,
        [1, 0, 2],
        [0, 0, 1],
        [0, 0, 0]])

In [30]:
# This supports operations like standard matrix multiplication

In [31]:
# 3x3 product X^T X of the feature matrix with itself.
# `@` always means matrix multiplication, whereas `*` only does so for
# np.matrix (on a plain ndarray it is element-wise); the result is the same here.
features.T @ features

matrix([[205639, 124459, 240154],
        [124459, 129252, 170947],
        [240154, 170947, 472608]])

In [32]:
# Compute the matrix inverse

In [33]:
# Inverse of X^T X. `@` is used for the product so the expression does not
# depend on np.matrix giving `*` matrix-multiplication semantics.
np.linalg.inv(features.T @ features)

matrix([[ 1.59632945e-05, -8.90118225e-06, -4.89204294e-06],
        [-8.90118225e-06,  1.97959901e-05, -2.63730322e-06],
        [-4.89204294e-06, -2.63730322e-06,  5.55573066e-06]])

In [34]:
# NumPy overloads primitive operations on matrices, allowing matrices to be used within
# complex mathematical expressions, in order to perform transformations of our data

In [35]:
# Element-wise transform: take the sine of every entry, double it, then add 3.
np.sin(features) * 2 + 3

matrix([[3.        , 4.68294197, 2.441169  ],
        [3.        , 3.        , 3.        ],
        [3.        , 3.        , 3.28224002],
        ...,
        [4.68294197, 3.        , 4.81859485],
        [3.        , 3.        , 4.68294197],
        [3.        , 3.        , 3.        ]])

In [36]:
# Boolean mask marking the entries whose transformed value exceeds 4.
(2 * np.sin(features) + 3) > 4

matrix([[False,  True, False],
        [False, False, False],
        [False, False, False],
        ...,
        [ True, False,  True],
        [False, False,  True],
        [False, False, False]])

In [37]:
# Other Numpy features

# ndarray.shape: Get the shape of an array
# reshape: Change the dimensions of an array/matrix
# arange: Create an array containing a range of numbers
# numpy.random: Generate (arrays of) random numbers
# sum, min, max, etc.: Reduction operations on matrices
# eye: Identity matrix
# trace, eig, etc.: Linear algebra operations
# see https://numpy.org/doc/stable/user/quickstart.html for more