# Roadmap

### Data preparation
* Data import
* Remove stop words, non english words
* Document frequency threshold, term length threshold

### Term weightings
* Tf
* TfIdf (sklearn.feature_extraction.text.TfidfTransformer)
* Binary

### Dimensionality reduction (PCA)

* sklearn.decomposition.PCA

### Clustering

* KMeans with sklearn.cluster.MiniBatchKMeans
* Kmeans with sklearn.cluster.KMeans

In [16]:
save_load_path = '/home/eolus/Dropbox/MA755 Public/pynotes/Danny-Eole-Yuchen/Pickles'
# save_load_path = '/Users/YuchenZhou/Dropbox (Personal)/MA755 Public/pynotes/Danny-Eole-Yuchen/Pickles'

In [2]:
import pickle
import pandas as pd
import numpy as np

### Import lyrics data

In [3]:
import sqlite3

# Define path
# lyrics_path = '/home/eolus/Documents/MA755_data/LyricsData'
lyrics_path = '/Users/YuchenZhou/Documents/FILES/STUDY/755Assignments/LyricsData'

# Put `lyrics` table from `mxm_dataset.db` into lyrics_df pandas df
con = sqlite3.connect(lyrics_path +'/mxm_dataset.db')
lyrics_df = pd.read_sql_query("SELECT track_id, word, count from lyrics limit 1000000", con)
con.close()

lyrics_df.shape

(1000000, 3)

In [4]:
lyrics_df.head()

Unnamed: 0,track_id,word,count
0,TRAAAAV128F421A322,i,6
1,TRAAAAV128F421A322,the,4
2,TRAAAAV128F421A322,you,2
3,TRAAAAV128F421A322,to,2
4,TRAAAAV128F421A322,and,5


## Remove stop words
#### Remove `nltk` stop words

In [5]:
# Next, load the stopwords from the `nltk` package.
import nltk 
stopword_set = set(nltk.corpus.stopwords.words('english'))
is_not_stopword = [word not in stopword_set for word in lyrics_df.word.ravel()]

# Remove nltk stopwords.
lyrics_df = lyrics_df[is_not_stopword]

# Unique words remaining
count_unique_w = len(pd.unique(lyrics_df.word.ravel()))
print('COUNT UNIQUE WORDS = {count_unique_w}'.format(count_unique_w = count_unique_w))

COUNT UNIQUE WORDS = 4883


#### Remove custom stop words

In [6]:
import csv
from itertools import chain

list_csv = []

with open(save_load_path+'/StopWords/SMART_stop_words.csv', 'r') as f:
    reader = csv.reader(f)
    list_csv = list(reader)
    
smart_stop_words = list(chain.from_iterable(list_csv))

print("Number of new stop words: {len_stop} \n".format(len_stop = len(smart_stop_words)))

Number of new stop words: 593 



In [7]:
# Create list of word not in smart_stop_words
is_not_smart_stopword = [word not in smart_stop_words for word in lyrics_df.word.ravel()]

# Remove nsmart stopwords.
lyrics_df = lyrics_df[is_not_smart_stopword]

# Unique words remaining
count_unique_w = len(pd.unique(lyrics_df.word.ravel()))
print('COUNT UNIQUE WORDS = {count_unique_w}'.format(count_unique_w = count_unique_w))

COUNT UNIQUE WORDS = 4683


### Pivot ----> [index=track_id | columns=word | values=count]

In [8]:
# Pivot `lyrics_df`
final_df = lyrics_df.pivot(index='track_id', columns='word', values='count')

# Fill NaN values with 0
NaN_locations = np.isnan(final_df)
final_df[NaN_locations] = 0

# Display
final_df.head()

word,aan,ab,abandon,aber,abl,aboard,abov,abr,absenc,absolut,...,è,é,él,és,était,être,ô,über,–,‘caus
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TRAAAAV128F421A322,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TRAAABD128F429CF47,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TRAAAED128E0783FAB,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TRAAAEF128F4273421,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TRAAAEW128F42930C0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
print('SONGS COUNT: {song_count}'.format(song_count = final_df.shape[0]))

SONGS COUNT: 12460


### Remove non english terms / Documents
#### Load collection of non-english terms

In [10]:
# Load list of foreign words in memory
list_csv = []
with open(save_load_path+'/StopWords/foreign_words.csv', 'r') as f:
    reader = csv.reader(f)
    list_csv = list(reader)
    
foreign_words = list(chain.from_iterable(list_csv))

print('Sample:\n{sample}'.format(sample = foreign_words[:9]))

Sample:
['acaba', 'acabar', 'alegria', 'alguna', 'algún', 'alibi', 'ando', 'aquel', 'aquella']


#### List all track_id with non zero amount of foreign words

In [11]:
existing_terms = final_df.columns.values.tolist()

foreign_songs = []
associated_words = []

for i, row in final_df.iterrows():
    foreign = False
    foreign_word = None
    
    for foreign_word in foreign_words:
        if foreign_word in existing_terms:
            if(row[foreign_word] > 0):
                foreign = True
                associated_words.append(foreign_word)
                break
    if foreign:
        foreign_songs.append(i)
    
print("Number of foreign songs detected: {num_foreign_songs}"\
      .format(num_foreign_songs = len(foreign_songs)))

Number of foreign songs detected: 2108


#### Review foreign songs detected by song title

In [18]:
# Load look-up table for song title; `reference.df`
reference_df = pd.read_pickle(save_load_path+'/mss_df.pkl')
reference_df = reference_df.filter(items = ['track','title'])
reference_df.columns = ['TRACK_ID','TITLE']

In [None]:
# Subset `reference.df` based to the elements of `foreign_songs`
reference_df[reference_df['TRACK_ID'].isin(foreign_songs)]

#### Remove foreign songs from `final.df` dataframe

In [13]:
# Create list of english tracks
is_english_track = [track not in foreign_songs for track in final_df.index.values]

# Subset `final.df` to keep only index in `is_english_track`
final_df = final_df.ix[is_english_track]

# Count remaining tracks (english)
print('COUNT ENGLISH TRACKS: {english_tracks}'.format(english_tracks = final_df.shape[0]))

COUNT ENGLISH TRACKS: 10352


#### Drop terms where freq==0 for all documents

In [14]:
# List terms with 0 freq on all documents
word_no_doc = pd.concat([final_df.sum()], axis=1)
word_no_doc = word_no_doc[word_no_doc[0] == 0]
drop_terms = word_no_doc.index.values.tolist()
drop_terms

['abr',
 'acaba',
 'acabar',
 'adentro',
 'adesso',
 'adio',
 'adió',
 'agua',
 'ahi',
 'ahí',
 'al',
 'alegr',
 'alegria',
 'alegría',
 'algo',
 'alguien',
 'alguna',
 'alguém',
 'algún',
 'alibi',
 'aller',
 'alltid',
 'allá',
 'allí',
 'alma',
 'alor',
 'altro',
 'amant',
 'amart',
 'amiga',
 'anch',
 'ancora',
 'anda',
 'andar',
 'ando',
 'aprend',
 'aprè',
 'aquel',
 'aquella',
 'aquello',
 'aqui',
 'aquí',
 'asi',
 'así',
 'atrá',
 'até',
 'aujourdhui',
 'aun',
 'aunqu',
 'aussi',
 'autr',
 'aux',
 'avait',
 'avant',
 'avec',
 'ayer',
 'azul',
 'año',
 'aún',
 'baila',
 'bailando',
 'bailar',
 'bajo',
 'barrio',
 'basta',
 'bella',
 'bello',
 'bene',
 'beso',
 'besoin',
 'besser',
 'bien',
 'bij',
 'bist',
 'bitt',
 'blanca',
 'blanco',
 'bli',
 'blick',
 'boca',
 'bonheur',
 'bonita',
 'bonito',
 'bord',
 'brauch',
 'brazo',
 'buen',
 'buena',
 'bueno',
 'busca',
 'buscando',
 'buscar',
 'busco',
 'cabeza',
 'cada',
 'cae',
 'caer',
 'calma',
 'calor',
 'cama',
 'cambia',
 'camb

In [15]:
# Drop the columns of `final_df` if column name in `drop_terms`
current_terms = final_df.columns

for word in drop_terms:
    if word in current_terms:
        final_df = final_df.drop(word, 1)


# Count remaining terms
print('COUNT REMAINING TERMS: {term_count}'.format(term_count = final_df.shape[1]))

COUNT REMAINING TERMS: 3893


### Min document threshold (==3) + Min term-length threshold (==3)

    + Drops terms found in strictly less than `doc_count_threshold` documents
    + Drops terms if term length is strictly inferior to `term_length_threshold`

In [16]:
def termThreshold(dtm_df, doc_count_threshold, term_length_threshold):
    
    # Build array for the count of non-0 term-scores for each term
    sum_vec = dtm_df[ dtm_df > 0 ].count(axis=0)
    
    # Initialize list containing position of term columns to drop from dtm
    term_drop = []

    c = 0
    for column in dtm_df:
        if (len(column) < term_length_threshold) or (sum_vec[c] < doc_count_threshold) : 
            term_drop.append(column)
        c +=1
    
    dtm_df.drop(dtm_df[term_drop], axis=1, inplace=True)
    return(dtm_df)

In [17]:
# Apply `termThreshold()` to `final_df`
final_df = termThreshold(final_df, 3, 3)

print('SUMMARY \n\tNumber of Terms:\t{term_count}\n\tNumber of Documents:\t{doc_count}'\
      .format(term_count = final_df.shape[1], doc_count = final_df.shape[0]))

SUMMARY 
	Number of Terms:	3423
	Number of Documents:	10352


In [18]:
final_df.to_pickle('/home/eolus/Documents/MA755_data/myPickle/final_df')
# final_df.to_pickle('/Users/YuchenZhou/Documents/FILES/STUDY/755Assignments/myPickles/final_df')


In [8]:
import pandas as pd
final_df = pd.read_pickle('/home/eolus/Documents/MA755_data/myPickle/final_df')
# final_df = pd.read_pickle('/Users/YuchenZhou/Documents/FILES/STUDY/755Assignments/myPickles/final_df')

### Document Term Matrix

In [9]:
# Folder path where DTM matrices are saved (input for clustering analysis)
dtm_path = '/home/eolus/Dropbox/MA755 Public/pynotes/Danny-Eole-Yuchen/Pickles/DTM'
# dtm_path = '/Users/YuchenZhou/Dropbox (Personal)/MA755 Public/pynotes/Danny-Eole-Yuchen/Pickles/DTM'

#### Document-term weighting (output is np.array)


##### TFIDF

We weight the term frequency with the inverse document frequency. The document frequency is the number of documents a term appears in. By doing this, we scale down the impact of terms that occur very frequently across documents to have a more balance weight on term frequency.

**Assumption: high frequency terms across documents are less informative than rare frequency terms.**

##### Binary

Binary method aims to convert the term weight to either 1 if the term appears in a document or 0 if it does not.
This scales down the impact of not informative term repetition in documents. For example a long document is more likely to contains repetition but can still be close to a shorter document containing the same terms.

**Assumption: term repetition is not relevant to represent document distance.**

In [10]:
def genTfMatrix(dtm_matrix):
    # Convert dataframe to numpy array
    dtm_matrix_tf_np = dtm_matrix.as_matrix(columns=None)
    return(dtm_matrix_tf_np)    
        
def genBinaryMatrix(dtm_matrix):
    dtm_matrix[dtm_matrix > 1] = 1
    dtm_matrix_binary_np = dtm_matrix.as_matrix()
    return(dtm_matrix_binary_np)

from sklearn.feature_extraction.text import TfidfTransformer
def genTfIdfMatrix(dtm_matrix):
    tfidf = TfidfTransformer(norm="l2") # l2 for Euclidean dist
    tfidf.fit(dtm_matrix)
    dtm_tfidf_matrix = tfidf.transform(dtm_matrix)
    dtm_tfidf_matrix_dense = dtm_tfidf_matrix.todense() # Sparse -> dense
    return(dtm_tfidf_matrix_dense)

### Dimensionality reduction using `PCA` from `sklearn.decomposition`

In [11]:
from sklearn.decomposition import PCA

def doPCA(data, component_dim = 1000):
    pca = PCA(n_components = component_dim)
    pca.fit(data)
    return(pca)

#### Apply PC on TF-weighted DTM

In [23]:
# Convert to TF weighted np.array matrix
tf_matrix = genTfMatrix(final_df)

# Fit the PCA model
pca = doPCA(tf_matrix, 800)

# Total variance explained
print('TOTAL VARIANCE EXPLAINED: {var}%'\
      .format(var = round(sum(pca.explained_variance_ratio_) * 100,1)))

TOTAL VARIANCE EXPLAINED: 89.9%


In [24]:
# Create PCA transformed matrix
tf_transformed = pca.transform(tf_matrix)

# Save as `.npy` file
np.save(dtm_path+ '/tf_matrix', tf_transformed)

#### Apply PC on TFIDF-weighted DTM

In [12]:
# Convert to TF weighted np.array matrix
tfidf_matrix = genTfIdfMatrix(final_df)

# Fit the PCA model
pca = doPCA(tfidf_matrix, 1200)

# Total variance explained
print('TOTAL VARIANCE EXPLAINED: {var}%'\
      .format(var = round(sum(pca.explained_variance_ratio_) * 100,1)))

TOTAL VARIANCE EXPLAINED: 82.2%


In [13]:
# Create PCA transformed matrix
tfidf_transformed = pca.transform(tfidf_matrix)

# Save as `.npy` file
np.save(dtm_path+ '/tfidf_matrix', tfidf_transformed)

#### Apply PC on binary-weighted DTM

In [27]:
# Convert to TF weighted np.array matrix
binary_matrix = genBinaryMatrix(final_df)

# Fit the PCA model
pca = doPCA(binary_matrix, 1000)

# Total variance explained
print('TOTAL VARIANCE EXPLAINED: {var}%'\
      .format(var = round(sum(pca.explained_variance_ratio_) * 100,1)))

TOTAL VARIANCE EXPLAINED: 84.7%


In [28]:
# Create PCA transformed matrix
binary_transformed = pca.transform(binary_matrix)

# Save as `.npy` file
np.save(dtm_path+ '/binary_matrix', binary_transformed)

### Reference Dataframe
#### Split data into a training and a test set

I merge with a left-join:
* On left: `final_df` containing `TRACK_ID` of a subset of the 237,000 songs with lyrics
* On right: `reference.df` containing `TRACK_ID` and `TITLE` from the 10,000 songs subset

Logically, the left-join will see some missing `TITLE` from records in `final.df` that are not in `reference.df`.
I will use the 2,700 songs with a `TITLE` as my test set while the remaining 7,300 with no `TITLE` makes for a great training set.

In [19]:
left = pd.DataFrame(final_df.index.values.tolist(), columns = ['TRACK_ID'])
right = reference_df

# Left join
track_ref_df = pd.merge(left, right, on='TRACK_ID', how='left')

# Identify cases no song title, with title == 'No title'
track_ref_df.fillna('No title', inplace = True)

# All songs without title information are put in the training set
track_ref_df['isTest'] = track_ref_df['TITLE'].apply(lambda x: 0 if x == 'No title' else 1)

track_ref_df.head()

Unnamed: 0,TRACK_ID,TITLE,isTest
0,TRAAAAV128F421A322,No title,0
1,TRAAABD128F429CF47,b'Soul Deep',1
2,TRAAAED128E0783FAB,No title,0
3,TRAAAEF128F4273421,b'Something Girls',1
4,TRAAAEW128F42930C0,No title,0


### Clustering


#### KMeans

Clustering technique based on centroids

1. Select initial cluster centers: The algorithm arbitrarily selects k points as the initial cluster centers (“means”).
        - Note 1: Non deterministic
        - Note 2: init = Kmeans++ provides a smart initialization of the centroids (converge fast)
2. Each point in the dataset is assigned to the closest cluster.
3. Each cluster center is recomputed as the average of the points in that cluster.
4. Repeat 2 and 3

**Why Kmeans?**
- Fast, robust, easy to understand
- Efficient

#### MiniBatchKMeans

The MiniBatchKMeans is a variant of the KMeans algorithm which runs on subsets of the input data to reduce the computation time.

1. ’form a mini-batch’: Draw n samples randomly from the dataset
2. Update centroids found during previous iterations of the algorithm

*These steps are performed until convergence or a predetermined number of iterations is reached.*

In [20]:
import pandas as pd
import numpy as np

In [21]:
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn import datasets

#### Load data

In [30]:
clustering_input_path = '/home/eolus/Dropbox/MA755 Public/pynotes/Danny-Eole-Yuchen/Pickles/DTM'
# clustering_input_path = '/Users/YuchenZhou/Dropbox (Personal)/MA755 Public/pynotes/Danny-Eole-Yuchen/Pickles/DTM'


tfidf_data_np = np.load(clustering_input_path+'/tfidf_matrix.npy')
binary_data_np = np.load(clustering_input_path+'/binary_matrix.npy')

#### Minibatch kmeans on TFIDF weighted + PC DTM

In [31]:
N_CLUSTERS = 70

In [32]:
# Perform clustering on entire data set
tfidf_mini_k_means = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=10)
tfidf_labels = tfidf_mini_k_means.fit_predict(tfidf_data_np)

In [33]:
# Concatenate cluster labels to reference data frame
track_ref_df['tfidf_miniKmeans_label'] = tfidf_labels
track_ref_df.head()

Unnamed: 0,TRACK_ID,TITLE,isTest,tfidf_miniKmeans_label
0,TRAAAAV128F421A322,No title,0,35
1,TRAAABD128F429CF47,b'Soul Deep',1,10
2,TRAAAED128E0783FAB,No title,0,5
3,TRAAAEF128F4273421,b'Something Girls',1,10
4,TRAAAEW128F42930C0,No title,0,35


#### Minibatch kmeans on Binary weighted + PC DTM

In [34]:
# Perform clustering on entire data set
binary_mini_k_means = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=10)
binary_labels = binary_mini_k_means.fit_predict(binary_data_np)

In [35]:
# Concatenate cluster labels to reference data frame
track_ref_df['binary_miniKmeans_label'] = binary_labels
track_ref_df.head()

Unnamed: 0,TRACK_ID,TITLE,isTest,tfidf_miniKmeans_label,binary_miniKmeans_label
0,TRAAAAV128F421A322,No title,0,35,65
1,TRAAABD128F429CF47,b'Soul Deep',1,10,65
2,TRAAAED128E0783FAB,No title,0,5,3
3,TRAAAEF128F4273421,b'Something Girls',1,10,14
4,TRAAAEW128F42930C0,No title,0,35,65


### Cluster evaluation

In [36]:
# Inspect cluster sizes
def cluster_size_eval(labels, min_size = 10, max_size = 900):
    from collections import Counter
    cluster_size = Counter(labels.tolist())

    top_clusters = {k: v for k, v in cluster_size.items() \
                if v >= min_size and v <= max_size}
    
    return(top_clusters)

In [37]:
tfidf_top_clusters = cluster_size_eval(tfidf_labels)
print('TFIDF CLUSTER SIZE:')
tfidf_top_clusters

TFIDF CLUSTER SIZE:


{1: 108,
 5: 405,
 8: 296,
 9: 304,
 11: 99,
 13: 92,
 19: 37,
 20: 121,
 23: 16,
 26: 15,
 27: 10,
 29: 14,
 31: 60,
 46: 16,
 47: 12,
 55: 27,
 57: 63,
 58: 236,
 61: 12,
 62: 69}

In [38]:
from collections import Counter

Counter(tfidf_labels.tolist()) 

Counter({0: 5,
         1: 108,
         2: 2,
         3: 2,
         4: 9,
         5: 405,
         6: 1,
         7: 5,
         8: 296,
         9: 304,
         10: 2535,
         11: 99,
         12: 1,
         13: 92,
         14: 1,
         15: 1,
         16: 1,
         17: 1,
         18: 1,
         19: 37,
         20: 121,
         21: 1,
         22: 1,
         23: 16,
         24: 1,
         25: 4,
         26: 15,
         27: 10,
         28: 1,
         29: 14,
         30: 1,
         31: 60,
         32: 1,
         33: 1,
         34: 2,
         35: 5714,
         36: 2,
         37: 1,
         38: 2,
         39: 1,
         40: 1,
         41: 4,
         42: 1,
         43: 4,
         44: 1,
         45: 1,
         46: 16,
         47: 12,
         48: 1,
         49: 1,
         50: 1,
         51: 1,
         52: 1,
         53: 7,
         54: 1,
         55: 27,
         56: 1,
         57: 63,
         58: 236,
         59: 2,
         60: 1,
    

In [39]:
binary_top_clusters = cluster_size_eval(binary_labels, 0, 1000000)
print('BINARY CLUSTER SIZE:')
binary_top_clusters

BINARY CLUSTER SIZE:


{0: 1,
 1: 1,
 2: 2,
 3: 1075,
 4: 1,
 5: 1,
 6: 1,
 7: 2,
 8: 1,
 9: 1,
 10: 1,
 11: 1,
 12: 1,
 13: 1,
 14: 3,
 15: 1,
 16: 1,
 17: 1,
 18: 1,
 19: 1,
 20: 1,
 21: 1,
 22: 1,
 23: 1,
 24: 1,
 25: 1,
 26: 1,
 27: 1,
 28: 1,
 29: 1,
 30: 1,
 31: 1,
 32: 2,
 33: 1,
 34: 1,
 35: 1,
 36: 1,
 37: 1,
 38: 1,
 39: 1,
 40: 1,
 41: 1,
 42: 1,
 43: 1,
 44: 1,
 45: 1,
 46: 1,
 47: 1,
 48: 1,
 49: 1,
 50: 1,
 51: 1,
 52: 1,
 53: 1,
 54: 2,
 55: 1,
 56: 1,
 57: 1,
 58: 1,
 59: 1,
 60: 1,
 61: 1,
 62: 1,
 63: 1,
 64: 1,
 65: 9203,
 66: 1,
 67: 1,
 68: 1,
 69: 1}

### Analyze top words per song in clusters

In [40]:
def topWords(track_id, number_top_w = 9):
    term_scores = final_df.ix[track_id]
    term_scores.sort_values(ascending = False, inplace = True)
    doc_terms = term_scores[term_scores > 0].index.tolist()

    num_top = min(len(doc_terms), number_top_w)
    res = doc_terms[:num_top]
                  
    return(res)

In [41]:
print('TOP WORDS FOR [{song}]:\n{topw}'.format(song = 'TRAIFGH12903CDB164', topw = topWords('TRAIFGH12903CDB164')))

TOP WORDS FOR [TRAIFGH12903CDB164]:
['night', 'shelter', 'find', 'give', 'hold', 'shadow', 'onc', 'death', 'unfold']


In [42]:
tfidf_cluster_dict = {}

for labs in iter(tfidf_top_clusters.keys()):
    cluster_subset = track_ref_df[track_ref_df['tfidf_miniKmeans_label'] == labs]
    cluster_tracks = cluster_subset['TRACK_ID'].tolist()
    
    tfidf_cluster_dict[labs] = []
    for track in cluster_tracks:
        new_dict = {track : topWords(track)}  
        tfidf_cluster_dict[labs].append(new_dict)

for k, v in  tfidf_cluster_dict.items():
    print('\n\nCLUSTER: {clu}'.format(clu = k))
    for track_dict in v:
        for track_id, top_w in track_dict.items():
            print(top_w)



CLUSTER: 1
['night', 'men', 'ladi', 'cheek', 'chorus', 'shorti', 'floor', 'eye', 'ass']
['kill', 'night', 'dream', 'yeah', 'rememb', 'stage', 'sky', 'swore', 'set']
['morn', 'sleep', 'night', 'smile', 'bodi', 'feet', 'show', 'anoth', 'local']
['guid', 'night', 'light', 'onli', 'face', 'time', 'darl', 'hear', 'make']
['danc', 'wrong', 'spell', 'sway', 'ecstasi', 'day', 'night', 'bodi', 'feel']
['day', 'night', 'hell', 'punish', 'ahead', 'road', 'straight', 'pave', 'good']
['night', 'reason', 'danc', 'floor', 'long', 'moon', 'babi', 'sun', 'feel']
['night', 'eye', 'make', 'soul', 'girl', 'time', 'goe', 'princ', 'life']
['light', 'night', 'shown', 'star', 'made', 'day', 'street', 'yea', 'fate']
['night', 'lone', 'light', 'guid', 'anoth', 'babi', 'onli', 'forev', 'blame']
['jesus', 'heaven', 'night', 'shine', 'love', 'made', 'child', 'make', 'window']
['walk', 'fli', 'night', 'day', 'moonlight', 'moon', 'dawn', 'share', 'watch']
['yeah', 'night', 'turn', 'head', 'begin', 'gonna', 'found'

Comment:

The result of this clustering suggests an overwhelming importance of most common terms.
As a result, clusters are not extremely relevant.

For instance, cluster1 grouped songs in which the term 'night' is predominant. However, songs in this cluster do not hold much term overlap besides the term 'night'.
Similarly, cluster5 groups song containing the term 'love' and cluster8 songs with the term 'baby'. Cluster9 groups songs containing 'hey', cluster11 songs containing 'remember' and cluster20 the term 'back'.

Cluster19 is our 'christmas' cluster. The term 'christmas' is convenient because unlike 'hey', it is a meaningful concept that enables to group songs conveying a similar theme.

Interestingly cluster9 is an homogeneous 'strong language' cluster grouping songs not only around one common term but around a comprehensive lexical field - here, 'ass','shit','nigga','fuck',...- which was the original objective of this research.

Future efforts to better the clusters could include adding new stop words to the stop word filter as well as changing the weighting scheme to mitigate the weight of common terms. Even if the TFIDF weighting scheme already scales down the impact of high document frequency terms we probably want to reinforce this effect further to avoid clusters dominated by a single term.

### KMEANS with training and testing sets

#### Initialize test and training sets

In [84]:
train_ref = track_ref_df[track_ref_df['isTest'] == 0]
test_ref =  track_ref_df[track_ref_df['isTest'] == 1]

# Define list of TRACK_ID for test and train set
train_ref_list = train_ref['TRACK_ID'].tolist()
test_ref_list = test_ref['TRACK_ID'].tolist()

# `sample_df_train` and `sample_df_test` are subsets of `final.df`
sample_df_train = final_df.ix[train_ref_list]
sample_df_test = final_df.ix[test_ref_list]

#### Run KMEANS analysis

In [85]:
cluster = KMeans(n_clusters=N_CLUSTERS, \
                                 init='k-means++', \
                                 n_init=10, \
                                 max_iter=300, \
                                 tol=0.0001, \
                                 precompute_distances='auto', \
                                 verbose=0, \
                                 random_state=None, \
                                 copy_x=True, \
                                 n_jobs=1)

# Compute k-means clustering
cluster.fit(sample_df_train)

# Predict the closest cluster each sample in X belongs to
result = cluster.predict(sample_df_test)

In [87]:
test_ref['kmeans_labels'] = result
test_ref.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,TRACK_ID,TITLE,isTest,tfidf_miniKmeans_label,kmeans_labels
1,TRAAABD128F429CF47,b'Soul Deep',1,10,47
3,TRAAAEF128F4273421,b'Something Girls',1,10,10
5,TRAAAFD128F92F423A,b'Face the Ashes',1,35,10
10,TRAAARJ128F9320760,b'Pink World',1,23,7
18,TRAABJV128F1460C49,b'Tonight Will Be Alright',1,59,47


In [92]:
tfidf_top_clusters2 = cluster_size_eval(result)
tfidf_top_clusters2

{1: 162,
 3: 88,
 7: 188,
 10: 337,
 16: 79,
 22: 173,
 28: 52,
 37: 207,
 40: 77,
 43: 97,
 45: 244,
 47: 102,
 66: 173}

In [100]:
for clu in list(tfidf_top_clusters2.keys()):
    temp_df = test_ref[test_ref['kmeans_labels'] == clu]
    print(temp_df.head)

<bound method NDFrame.head of                 TRACK_ID                                           TITLE  \
26    TRAACFV128F935E50B               b'James (Hold The Ladder Steady)'   
103   TRAAIII128F427D5D8                     b'Little Girl (LP Version)'   
147   TRAAMPA128F92E7D0D                                     b'Orgofart'   
151   TRAANBH128F9345A6C                              b'Cooler Than You'   
317   TRABBXU128F92FEF48                            b'Jenny Take a Ride'   
389   TRABJIX128C7196953                             b'Stronger Than Me'   
434   TRABNEX128F92C9DEA                                   b'Knocked Up'   
467   TRABPGO128F931B50C                              b"You're My World"   
536   TRABVWG128F9359323                              b'Get It For Free'   
569   TRABZRZ128F42628AD                            b'Steamroller Blues'   
699   TRACLRS12903CE9386                            b'Stand By Your Man'   
700   TRACLRV128EF33FAB8          b'Young Blood (2007 Digi