# Getting and Cleaning Data

In [None]:
This notebook gives an introduction to getting raw data in JSON format and converting it to a format that is easily
understandable by libraries such as Pandas and NumPy. We clean the data and demonstrate how to deal with missing values.

In [1]:
# Get beer reviews data from Prof. Julian McAuley's website (UC San Diego)
# http://jmcauley.ucsd.edu/cse190/data/beer/beer_50000.json
import numpy as np
import urllib.request

def parseData(fname):
  """Stream the records stored one-per-line at the given URL.

  Every non-blank line is expected to hold a single Python literal
  (for this notebook: one dict per beer review).

  Args:
    fname: URL accepted by ``urllib.request.urlopen``.

  Yields:
    The object parsed from each non-blank line, in file order.

  Raises:
    ValueError, SyntaxError: a line is not a plain Python literal.
    urllib.error.URLError: the resource cannot be opened.
  """
  import ast  # local import: nothing else in the notebook needs it

  # The context manager releases the connection even when the consumer
  # stops iterating early or a line fails to parse.
  with urllib.request.urlopen(fname) as response:
    for l in response:
      line = l.decode("utf-8").strip()
      if not line:
        continue  # tolerate blank / trailing empty lines
      # SECURITY: the text comes from the network. ast.literal_eval only
      # accepts literals (dicts, lists, strings, numbers, ...), whereas
      # eval() would execute any expression the server sends.
      yield ast.literal_eval(line)

# Download the review dump and materialise every parsed record in a list.
BEER_REVIEWS_URL = "http://jmcauley.ucsd.edu/cse190/data/beer/beer_50000.json"

print("Reading data...")
data = [record for record in parseData(BEER_REVIEWS_URL)]
print("done")

Reading data...
done


In [2]:
# 'data' is now a list holding one Python dict per review (parsed from the downloaded file).
# Display the second record to see which fields a review carries.
data[1]

{'beer/ABV': 6.2,
 'beer/beerId': '48213',
 'beer/brewerId': '10325',
 'beer/name': 'Red Moon',
 'beer/style': 'English Strong Ale',
 'review/appearance': 3.0,
 'review/aroma': 2.5,
 'review/overall': 3.0,
 'review/palate': 3.0,
 'review/taste': 3.0,
 'review/text': 'Dark red color, light beige foam, average.\tIn the smell malt and caramel, not really light.\tAgain malt and caramel in the taste, not bad in the end.\tMaybe a note of honey in teh back, and a light fruitiness.\tAverage body.\tIn the aftertaste a light bitterness, with the malt and red fruit.\tNothing exceptional, but not bad, drinkable beer.',
 'review/timeStruct': {'hour': 13,
  'isdst': 0,
  'mday': 1,
  'min': 44,
  'mon': 3,
  'sec': 57,
  'wday': 6,
  'yday': 60,
  'year': 2009},
 'review/timeUnix': 1235915097,
 'user/profileName': 'stcules'}

In [3]:
# Number of beer reviews that we have (one list entry per review)
len(data)

50000

In [4]:
# Number of features, i.e. top-level keys of the first review
# Does not count the number of sub-features (or nested features). E.g. - mday, min, mon, sec etc.
# NOTE(review): only data[0] is inspected; other reviews may carry a different set of keys - confirm
len(data[0])

14

# Visualize Data

In [5]:
# Restricting number of samples for simplicity and quick execution
import random

SAMPLE_SEED = 42    # fixed seed: a fresh "Restart & Run All" selects the same reviews
SAMPLE_SIZE = 5000  # number of reviews kept for the rest of the notebook

# Draw a random subset so that we are more likely to get a true picture of the data
# than by taking the first rows of the file. A private Random instance keeps the draw
# reproducible without touching the global random state, and min() keeps this working
# when fewer than SAMPLE_SIZE reviews were loaded.
data = random.Random(SAMPLE_SEED).sample(data, min(SAMPLE_SIZE, len(data)))

In [None]:
# Taking just two features and plotting them
# This will also give us an idea if the two features taken are somehow correlated

import matplotlib.pyplot as plt

x = [d['beer/ABV'] for d in data]
y = [d['review/overall'] for d in data]

# Use a scatter plot: plt.plot() would join consecutive reviews with line segments,
# and since the reviews are in random order those lines carry no meaning.
# alpha makes overlapping points show up as darker spots.
fig, ax = plt.subplots()
ax.scatter(x, y, alpha=0.2)
ax.set_xlabel('beer/ABV')
ax.set_ylabel('review/overall')
ax.set_title('Overall rating vs. ABV')
plt.show()

In [None]:
# Okay, so they were not correlated much.
# Taste should be a better indication of the overall rating, right?

x = [d['review/taste'] for d in data]
y = [d['review/overall'] for d in data]

# Use a scatter plot: plt.plot() would join consecutive reviews with line segments,
# and since the reviews are in random order those lines carry no meaning.
# alpha makes overlapping points show up as darker spots.
fig, ax = plt.subplots()
ax.scatter(x, y, alpha=0.2)
ax.set_xlabel('review/taste')
ax.set_ylabel('review/overall')
ax.set_title('Overall rating vs. taste rating')
plt.show()
# Bingo! More positive correlation than the last case for sure!

Let's assume that we need just ABV (Alcohol By Volume) and the overall beer rating

In [None]:
# How does scikit-learn expect the data?
# X = input - set of features that we have
# y = output - the prediction that we need to make (can be real valued or discrete)
# e.g. - Given an email, predict spam (1) or not spam (0). Here X = email, y = spam/not spam: Classification Problem
# e.g. - Given area of the house in square feet, predict the price of the house
# X = area in square feet, y = Price: Regression Problem

In [None]:
# What if the ABV value is missing from some reviews?
# dict.get supplies the fallback: 5 is used wherever a review has no 'beer/ABV' key.
X = [review.get('beer/ABV', 5) for review in data]
y = [review['review/overall'] for review in data]

In [None]:
# Better would be to substitute by mean

# find the mean over the reviews that actually report an ABV.
# Dividing by len(data) instead would count the reviews without an ABV in the
# denominator and bias the mean towards zero.
known_abv = [d['beer/ABV'] for d in data if 'beer/ABV' in d]
sum_abv = sum(known_abv, 0.0)
# With no ABV value at all there is nothing to average; fall back to 0.0
# rather than dividing by zero.
mean_abv = sum_abv / len(known_abv) if known_abv else 0.0

X = [d['beer/ABV'] if 'beer/ABV' in d else mean_abv for d in data]
y = [d['review/overall'] for d in data]

# Load Data into a pandas DataFrame

In [None]:
import pandas

# Build the frame from the list of review dicts and keep at most the first 5000 rows.
pd_df = pandas.DataFrame(data).head(5000)

In [None]:
# Count the reviews whose ABV is null/empty/NaN:
# isnull() flags them with True, and summing the flags counts the True entries.
null_cols = pd_df['beer/ABV'].isnull()
print(null_cols.sum())

In [None]:
# No review with invalid ABV value!

In [None]:
# What about reviewer's age?
# Same check as for ABV: flag the missing entries, then count the flags.
# NOTE(review): this lookup raises KeyError when no review in pd_df carries the
# 'user/ageInSeconds' key at all - the sample record displayed earlier has none.
null_cols = pd_df['user/ageInSeconds'].isnull()
print(null_cols.sum())

In [None]:
# Fill NA/NaN values with a constant (here 0)
# This method works in most cases, but should be avoided. Better ways to deal with null values are:
# -> Fill with that column's mean
# -> Drop the row altogether (provided null values constitute a small subset of the total sample size)

# fillna returns a NEW frame and leaves pd_df untouched, so the result is kept under
# its own name; only the first rows are displayed instead of the whole frame.
pd_df_zero_filled = pd_df.fillna(0)
pd_df_zero_filled.head()

In [None]:
# Filling with column's mean
# numeric_only=True restricts the averages to numeric columns: the frame also holds
# text and nested-dict columns (e.g. 'review/text', 'review/timeStruct'), and
# pandas >= 2.0 raises a TypeError when asked to average those.
# Columns without a mean are simply left as they are by fillna.
pd_df_mean_filled = pd_df.fillna(pd_df.mean(numeric_only=True))
pd_df_mean_filled.head()

In [None]:
# Seaborn is an interesting library which allows us to plot confusion matrices and heatmaps
import seaborn as sns

# Get all correlations between the numeric columns.
# The non-numeric columns (names, review text, nested time struct, ...) are dropped
# explicitly: pandas >= 2.0 no longer skips them silently in corr() and raises instead.
corr = pd_df.select_dtypes(include='number').corr()
# Plot the heatmap
corr_plot = sns.heatmap(corr, xticklabels=corr.columns.values, yticklabels=corr.columns.values).get_figure()
# Save the figure on disk and view it
corr_plot.savefig('./correlation_matrix.png', dpi=1024, bbox_inches='tight')