BerkeleyX: Data8.3x

Foundations of Data Science: Prediction and Machine Learning

In [None]:
from datascience import *
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Lab 4: Song Classification, Part 1

In [None]:
# Load the lyrics table used throughout this lab.
# The dataset was extracted from the Million Song Dataset:
# http://labrosa.ee.columbia.edu/millionsong/
# NOTE(review): the path is relative to the notebook's working directory.
lyrics = Table.read_table('../../data/lyrics.csv')

In [None]:
# "In Your Eyes" contains 168 words, so its word columns hold proportions of 168:
#   "like" appears twice: 2/168 = .0119...
#   "love":              10/168 = .059...
# Show the first six columns plus the "like" and "love" columns for that song.
(
    lyrics
    .where("Title", "In Your Eyes")
    .select(*range(6), "like", "love")
)

In [None]:
# Index the lyrics table by title once, so row_for_title can look a song up
# without filtering the whole table on every call.
title_index = lyrics.index_by('Title')
def row_for_title(title):
    """Look up the first row stored under `title` in the prebuilt title index.

    Gives the same row as the expression below, only faster:

    lyrics.where('Title', title).row(0)
    """
    matching_rows = title_index.get(title)
    return matching_rows[0]

In [None]:
# Read the 'love' entry from the row for "In Your Eyes".
row_for_title('In Your Eyes').item('love')

In [None]:
# Question 1.1
# Expected total of all proportions in each row, excluding the first three columns.
# NOTE(review): the first three columns are presumably Title, Artist and Genre — confirm.
expected_row_sum = 1

In [None]:
# Histogram of each row's total over every column except the first three.
# NOTE(review): passing normed=None together with density=True looks like a workaround
# for a datascience/matplotlib version mismatch — confirm before simplifying.
Table().with_column('sums', lyrics.drop([0, 1, 2]).apply(sum)).hist(0, normed=None, density=True)

In [None]:
# Report the size of the dataset: the number of columns left after dropping
# Title, Artist and Genre, and the number of rows in the table.
print('Words with frequencies:', lyrics.drop('Title', 'Artist', 'Genre').num_columns)
print('Songs with genres:', lyrics.num_rows)

1.1: Word Stemming

In [None]:
# Table read from the reverse-mapping CSV; it is joined on 'Stem' below.
vocab_mapping = Table.read_table('../../data/mxm_reverse_mapping_safe.csv')

# Every lyrics label after the first three columns is treated as a stem.
stemmed = np.asarray(lyrics.labels)[3:]

# Join the stems with the mapping table on the shared 'Stem' column.
vocab_table = (
    Table()
    .with_column('Stem', stemmed)
    .join('Stem', vocab_mapping)
)

# Peek at a small slice of the joined table.
vocab_table.take(np.arange(1100, 1106))

In [None]:
# Question 1.1.1
# Percentage of words in vocab_table that are the same as their stemmed form.
# The same quantity is computed twice below, in two different ways.

# Approach 1: compare the first two columns element-wise and count the matches.
# NOTE(review): assumes column 0 is 'Stem' and column 1 is 'Word' — confirm.
percent_unchanged = np.count_nonzero(vocab_table.column(0) == vocab_table.column(1)) / vocab_table.num_rows * 100
print(round(percent_unchanged, 2))

# Approach 2: keep only the rows where 'Stem' equals 'Word' and count them.
percent_unchanged = vocab_table.where('Stem', are.equal_to, 'Word').num_rows / vocab_table.num_rows * 100
print(round(percent_unchanged, 2), 'percent are unchanged')

In [None]:
# Question 1.1.2
# Assign stemmed_message to the stemmed version of the word "message":
# the first-column entry of the first row whose 'Word' is "message".
stemmed_message = vocab_table.where('Word', 'message').column(0).item(0)
stemmed_message

In [None]:
# Question 1.1.3
# Assign unstemmed_singl to the word in vocab_table that has "singl" as its stemmed form:
# the second-column entry of the first row whose 'Stem' is "singl".
unstemmed_singl = vocab_table.where('Stem', 'singl').column(1).item(0)
unstemmed_singl

1.2: Splitting the dataset

In [None]:
# 11/16ths of the dataset is designated for training;
# the remaining 5/16ths is reserved for testing.
training_proportion = 11/16

num_songs = lyrics.num_rows
num_train = int(num_songs * training_proportion)
num_valid = num_songs - num_train  # number of rows left over after the training rows

# The first num_train rows form the training set; every later row is a test row.
train_lyrics = lyrics.take(np.arange(0, num_train))
test_lyrics = lyrics.take(np.arange(num_train, num_songs))

print("Training: ", train_lyrics.num_rows, ";", "Test: ", test_lyrics.num_rows)

In [None]:
def country_proportion(table):
    """Fraction of the rows in `table` whose 'Genre' equals 'Country'."""
    country_songs = table.where('Genre', are.equal_to('Country'))
    return country_songs.num_rows / table.num_rows

# Bar chart comparing the Country proportion of the training and test sets.
(
    Table()
    .with_columns(
        'Dataset', make_array('Training', 'Test'),
        'Proportion of Country', make_array(
            country_proportion(train_lyrics),
            country_proportion(test_lyrics),
        ),
    )
    .barh('Dataset')
)

2: K-Nearest Neighbors - a Guided Example

2.1: Classifying a song

In [None]:
def plot_with_two_features(test_song, training_songs, x_feature, y_feature):
    """Scatter one test song together with training songs along two features.

    The test song is given the color label 'Unknown'; each training song is
    given its 'Genre' as the color label. Points are labeled by title.
    """
    unknown = row_for_title(test_song)
    points = Table().with_columns(
        x_feature, [unknown.item(x_feature)],
        y_feature, [unknown.item(y_feature)],
        'Color',   ['Unknown'],
        'Title',   [test_song]
    )
    # Add one point per training song, in the order the titles were given.
    for title in training_songs:
        known = row_for_title(title)
        new_point = [known.item(x_feature), known.item(y_feature), known.item('Genre'), title]
        points.append(new_point)
    points.scatter(x_feature, y_feature, colors='Color', labels='Title', s=200)

training = ["Sangria Wine", "Insane In The Brain"]
plot_with_two_features("In Your Eyes", training, "like", "love")

In [None]:
# Question 2.1.1
# Distance between the two country songs, In Your Eyes and Sangria Wine,
# measured on the like and love features only; stored as country_distance.
in_your_eyes = row_for_title("In Your Eyes")
sangria_wine = row_for_title("Sangria Wine")
# Square root of the summed squared differences over the two features.
country_distance = sum(
    (in_your_eyes.item(feature) - sangria_wine.item(feature)) ** 2
    for feature in ('like', 'love')
) ** .5
country_distance

In [None]:
# Redraw the "like"/"love" plot with "Lookin' for Love" added to the training songs.
training = ["Sangria Wine", "Lookin' for Love", "Insane In The Brain"]
plot_with_two_features("In Your Eyes", training, "like", "love")

In [None]:
# Question 2.1.2 
# Complete the function distance_two_features
def distance_two_features(title0, title1, x_feature, y_feature):
    """Compute the distance between two songs with titles title0 and title1.

    Only the features named x_feature and y_feature are used when computing the
    distance: the result is the square root of the sum of the squared differences
    of those two features.

    Arguments:
      title0, title1: Strings, the titles of the two songs (looked up with row_for_title).
      x_feature, y_feature: Strings, the labels of the two feature columns to compare.

    Returns the distance as a number.
    """
    row0 = row_for_title(title0)
    row1 = row_for_title(title1)
    # Use the requested feature columns rather than fixed 'like'/'love' labels,
    # so the function honors its x_feature and y_feature arguments.
    return (
        (row0.item(x_feature) - row1.item(x_feature)) ** 2 +
        (row0.item(y_feature) - row1.item(y_feature)) ** 2) ** .5

# Print the distance from each of these songs to "In Your Eyes",
# passing "like" and "love" as the two features.
for song in make_array("Lookin' for Love", "Insane In The Brain"):
    song_distance = distance_two_features(song, "In Your Eyes", "like", "love")
    print(song, 'distance:\t', song_distance)

In [None]:
# Question 2.1.3 Define the function
def distance_from_in_your_eyes(title):
    """Distance from the song named `title` to "In Your Eyes" on "like" and "love".

    Takes one argument:
      title: A string, the name of a song.

    Returns the square root of the summed squared differences of the two features.
    """
    song = row_for_title(title)
    reference = row_for_title('In Your Eyes')
    squared_gaps = (
        (song.item(feature) - reference.item(feature)) ** 2
        for feature in ('like', 'love')
    )
    return sum(squared_gaps) ** .5

In [None]:
# Preview of the 7 training songs closest to "In Your Eyes" on "like" and "love".
# Everything is derived from train_lyrics so this cell runs on a fresh kernel;
# no close_songs table has been defined at this point in the notebook.
train_lyrics.select(0, 1, 2, 'like', 'love') \
    .with_column('distance', train_lyrics.apply(distance_from_in_your_eyes, 'Title')) \
    .sort('distance').take(range(7))

In [None]:
# Question 2.1.4
# Using the features "like" and "love",
# what are the names and genres of the 7 songs in the training set closest to "In Your Eyes"?
# To answer this question, make a table named close_songs containing those 7 songs
# with columns "Title", "Artist", "Genre", "like", and "love",
# as well as a column called "distance" that contains the distance from "In Your Eyes".
# The table should be sorted in ascending order by distance.
#
# The distances are computed from train_lyrics (one per training song), not from
# close_songs itself: close_songs does not exist until this assignment finishes,
# and the new column must have exactly one entry per row of the selected table.
close_songs = train_lyrics.select(0, 1, 2, 'like', 'love') \
    .with_column('distance', train_lyrics.apply(distance_from_in_your_eyes, 'Title')) \
    .sort('distance').take(range(7))

close_songs

In [None]:
# Question 2.1.5 
# Define the function most_common

def most_common(label, table):
    """Return the value that occurs most often in one column of a table.

    Arguments:
      label: The label of a column, a string.
      table: A table.

    The table is grouped by that column, and the first value whose group count
    equals the largest count is returned. When several values tie for the
    largest count, any one of them is an acceptable answer.
    """
    counts = table.group(label)
    highest = counts.column('count').max()
    winners = counts.where('count', highest)
    # Column 0 of the grouped table holds the distinct values of `label`.
    return winners.column(0)[0]


# Take the most common 'Genre' among the rows of close_songs as the classification.
# Calling most_common on your table of 7 nearest neighbors classifies
# "In Your Eyes" as a country song, 4 votes to 3.
most_common('Genre', close_songs)