# Project: Sentiment analysis

In [None]:
### Notes

- Recommend songs to a user based on the songs they listen to
- Pick the song that is closest to the geometric mean of the 
    songs which the user listens to. 
- Create the sentiment variable for each song    
    
- Looking at "profile" dataset to get the songs the user listens to


## Goals:
1. Sentiment analysis using text mining technique on lyrics data
2. Cluster analysis of song sentiment using sentiment data determined in previous step

## Data: 

###  Opinion Lexicon: Positive & Negative

`positive-words.txt` contains a list of POSITIVE opinion words (or sentiment words).
`negative-words.txt` contains a list of NEGATIVE opinion words (or sentiment words).

This file and the papers can all be downloaded from 
http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html

Citation:
Minqing Hu and Bing Liu. "Mining and Summarizing Customer Reviews." 
Proceedings of the ACM SIGKDD International Conference on Knowledge 
Discovery and Data Mining (KDD-2004), Aug 22-25, 2004, Seattle, Washington, USA.
Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and Comparing Opinions on the Web."
Proceedings of the 14th International World Wide Web conference (WWW-2005), May 10-14, 2005, Chiba, Japan.

### Mapping {Stemmed word : Unstemmed word}

`mxm_reverse_mapping.txt` contains the mapping for Stemmed word -> Unstemmed word

This file can be downloaded from 
http://labrosa.ee.columbia.edu/millionsong/sites/default/files/mxm_reverse_mapping.txt

Citation:
musiXmatch dataset, the official lyrics collection for the Million Song Dataset
author: Thierry Bertin-Mahieux and Daniel P.W. Ellis and Brian Whitman and Paul Lamere
title: The Million Song Dataset

## Modules and functions:

In [1]:
import pandas as pd
import numpy as np
import re
import pickle
import sqlite3
import itertools

### 1 - Preparation of the 'mood' data using Pandas

In [2]:
path = '/home/eolus/Documents/MA755_data/LyricsData/'

# Put text content stemmed/unstemmed entries in list
f_1 = open(path+'mxm_reverse_mapping.txt', 'r')
lines_1 = [line.rstrip('\n').split('<SEP>') for line in f_1.readlines()]

# Put list in pandas df
df_1 = pd.DataFrame(lines_1, columns=['Stemmed', 'Unstemmed'])

# Remove non letter terms
df_stem_mapping = df_1[df_1.Stemmed.str.match("^[a-zA-Z]+$") == True]
df_stem_mapping.head()

Unnamed: 0,Stemmed,Unstemmed
1,pido,pido
2,hatr,hatred
3,pide,pide
4,yellow,yellow
5,four,four


In [3]:
# Put text content positive & negative unstemmed entries in list
f_2 = open(path+'positive-words.txt', 'r', encoding='ISO-8859-1')
lines_2 = [line.rstrip('\n') for line in f_2.readlines()]

f_3 = open(path+'negative-words.txt', 'r', encoding='ISO-8859-1')
lines_3 = [line.rstrip('\n') for line in f_3.readlines()]

In [4]:
# Put list positive in pandas df
df_2 = pd.DataFrame(lines_2, columns=['Unstemmed'])
df_2['Mood'] = pd.Series([1] * len(df_2.index) )
df_2.head()

Unnamed: 0,Unstemmed,Mood
0,a+,1
1,abound,1
2,abounds,1
3,abundance,1
4,abundant,1


In [5]:
# Put list negative in pandas df
df_3 = pd.DataFrame(lines_3, columns=['Unstemmed'])
df_3['Mood'] = pd.Series([-1] * len(df_3.index) )
df_3.head()

Unnamed: 0,Unstemmed,Mood
0,2-faced,-1
1,2-faces,-1
2,abnormal,-1
3,abolish,-1
4,abominable,-1


In [6]:
# Stack the positive and negative df on top of each other
df_mood = pd.concat([df_2, df_3], axis=0)
df_mood.head()
df_mood.tail()

Unnamed: 0,Unstemmed,Mood
4778,zaps,-1
4779,zealot,-1
4780,zealous,-1
4781,zealously,-1
4782,zombie,-1


In [7]:
# Define outter join data.frame to JOIN stem_mapping and df_mood on `Unstemmed` column
df_outter_join = pd.merge(df_stem_mapping, df_mood, on='Unstemmed', how='outer')
df_outter_join.head()

Unnamed: 0,Stemmed,Unstemmed,Mood
0,pido,pido,
1,hatr,hatred,-1.0
2,pide,pide,
3,yellow,yellow,
4,four,four,


In [8]:
# Filter out Nan values in `Stemmed` (meaning is not in Lyrics bag of word) and from `Mood`(meaning not in sentiment lexicon docs)
df_stem = df_outter_join[df_outter_join.Stemmed.notnull() & df_outter_join.Mood.notnull()]
print('Stemmed words tagged with a mood value: %d' %(len(df_stem.index)))
df_stem.head()

Stemmed words tagged with a mood value: 737


Unnamed: 0,Stemmed,Unstemmed,Mood
1,hatr,hatred,-1
7,thirst,thirst,-1
10,hate,hate,-1
17,pardon,pardon,1
20,sorri,sorry,-1


In [9]:
# Save `df_stem` to pickle file:
save_load_path = '/home/eolus/Documents/MA755_data/myPickles'
df_stem.to_pickle(save_load_path+'/df_stem.pkl')







### 2 - Preparation of the 'lyrics' data using SQLite and Pandas

In [None]:
import pandas as pd
import numpy as np
import pickle
import sqlite3
import itertools

In [20]:
# Define path
lyrics_path = '/home/eolus/Documents/MA755_data/LyricsData'
pickle_path = '/home/eolus/Documents/MA755_data/myPickles'

# Access MXM data stored in SQLite using Python and Pandas
con = sqlite3.connect(lyrics_path +'/mxm_dataset.db')
c = con.cursor()

# Get set of track_id
c.execute('SELECT DISTINCT track_id FROM lyrics')
data_track_id = c.fetchall()
set_track_id = set(itertools.chain.from_iterable(data_track_id))

# Convert to list of track_id to use as DataFrame index
list_track_id = []
list_track_id.extend(set_track_id)

for track_id in list_track_id[0:9]:
    print(track_id)

TREHRKF128F423349E
TRWGKJQ128F9347978
TRHUJRK12903CB4638
TRCZSUV128F42816A1
TRFNAIW128F421F3DD
TRKHUIZ128F934435A
TRYYAHV128F4277393
TRFXFMA128F42652C7
TRPJCAN128F1459DE2


In [21]:
# Load df_stem from the pickle file (STEM | MOOD)
df_stem = pd.read_pickle(pickle_path+'/df_stem.pkl')# Put unique stemmed words into a set for faster look-ups
stemmed_set = set(pd.unique(df_stem.Stemmed.ravel()))

list_stemmed_set = []
list_stemmed_set.extend(stemmed_set)
for stem in list_stemmed_set[0:9]:
    print(stem)

skinni
suav
naughti
leer
protect
kill
stranger
guilt
beg


In [22]:
# Create list of `lyrics_df` column names [<WORD#1>, ... ,<WORD#N>, 'COUNT_OTHER']
column_names = []
for word in stemmed_set:
    column_names.append(word.upper())
column_names.append('COUNT_OTHER')

# Initialize pd.dataframe `lyrics_df` : INDEX = TRACK_ID | <WORD#1> | ... | <WORD#N> | COUNT_OTHER
lyrics_df = pd.DataFrame(columns = column_names, index = list_track_id)
lyrics_df = lyrics_df.fillna(0)
lyrics_df[column_names] = lyrics_df[column_names].astype('int32')
print(lyrics_df)

# Save initialized `lyrics_df` in pkl file
lyrics_df.to_pickle(pickle_path+'/df_lyrics.pkl')

                    SKINNI  SUAV  NAUGHTI  LEER  PROTECT  KILL  STRANGER  \
TREHRKF128F423349E       0     0        0     0        0     0         0   
TRWGKJQ128F9347978       0     0        0     0        0     0         0   
TRHUJRK12903CB4638       0     0        0     0        0     0         0   
TRCZSUV128F42816A1       0     0        0     0        0     0         0   
TRFNAIW128F421F3DD       0     0        0     0        0     0         0   
TRKHUIZ128F934435A       0     0        0     0        0     0         0   
TRYYAHV128F4277393       0     0        0     0        0     0         0   
TRFXFMA128F42652C7       0     0        0     0        0     0         0   
TRPJCAN128F1459DE2       0     0        0     0        0     0         0   
TRTEPVI128F427ED34       0     0        0     0        0     0         0   
TRYAATV128F93009B4       0     0        0     0        0     0         0   
TRIWSZK128F934B34B       0     0        0     0        0     0         0   
TRDHFBL128F4

### 3 - Fill lyrics.df

In [None]:
import pandas as pd
import numpy as np
import pickle
import sqlite3
import itertools

In [None]:
lyrics_path = '/home/eolus/Documents/MA755_data/LyricsData'
pickle_path = '/home/eolus/Documents/MA755_data/myPickles'


# Open output DataFrame
try:
    # Open `full_df_lyrics.pkl` if exists
    lyrics_df = pd.read_pickle(pickle_path+'/full_df_lyrics.pkl')
    print("Retrieving work in progress, frame `full_lyrics_df`...\n")
except:
    # Open initialized `lyrics_df` otherwise
    lyrics_df = pd.read_pickle(pickle_path+'/df_lyrics.pkl')
    print("Retrieving empty frame `lyrics_df`...\n")

In [28]:
# Access MXM data stored in SQLite using Python and Pandas
con = sqlite3.connect(lyrics_path +'/mxm_dataset.db')
c = con.cursor()


# Get the remaining list of track_id to process
try:
    # Retrieve from pickle file if exists
    track_list = pickle.load(open(pickle_path + '/track_stack.pkl', 'rb'))
    print("Retrieving stack of tracks to process...\n")
    
except:
    # Generate from .db query otherwise
    c.execute('SELECT DISTINCT track_id FROM lyrics')
    data_track_id = c.fetchall()

    #Get set of unique track_id's
    track_set = set(itertools.chain.from_iterable(data_track_id))

    # Convert to list for slicing (create a buffer..)
    track_list = []
    track_list.extend(track_set)
    
    # Save to pickle
    pickle.dump(track_list, open(pickle_path + '/track_stack.pkl', 'wb'))
    print("Generating new stack of tracks to process...\n")
    
for track_id in track_list[0:5]:
    print(track_id)
print('...')
for track_id in track_list[-5:]:
    print(track_id)

Retrieving stack of tracks to process...

TRMWRJM128F427611C
TRBHHZX128F14AE20E
TRGALDW128F92D0CDE
TRAUTTP128F425BDEB
TRSKWNE128F42A1FC7
...
TRHUGGQ128F423844F
TRPBTBJ128F92D4A2A
TROEQUU128E0795673
TRXWLNP128F933C86A
TRIDHXQ128F9346553


In [None]:
iterations_count = 10
track_buffer_size = 10

for i in range(0,iterations_count-1):

    # Select a bunch of track_id for processing: track_buffer
    if (len(track_list) >= track_buffer_size):
        track_buffer = track_list[0:track_buffer_size-1]
    else:
        track_buffer = track_list

    print('')
    print('Buffering tracks:')
    [print(track) for track in track_buffer]
    print('')

    # Query for track data in track_buffer
    placeholder= '?' # For SQLite. See DBAPI paramstyle.
    placeholders= ', '.join(placeholder for unused in track_buffer)
    query= 'SELECT * FROM lyrics WHERE track_id IN (%s)' % placeholders

    # Iterate through tokenized row object (tuple)
    #   [0]          [1]         [2]         [3]       [4]
    #   track_id    |mxm_id     |word       |count     |is_test
    for row in c.execute(query, track_buffer):
        word = row[2].upper()
        track_id = row[0]
        print('ROW = {row}'.format(row = row))
        print('track_id = {track_id}'.format(track_id = track_id))
        print('word = {word}'.format(word = word))
        print('')

        try:
            lyrics_df.loc[track_id][word] += row[3]
        except:
            lyrics_df.loc[track_id]['COUNT_OTHER'] += row[3]


    # Save complete `lyrics_df` in pkl file
    lyrics_df.to_pickle(pickle_path+'/full_df_lyrics.pkl')

    # Remove buffered value from track_list
    track_list = track_list[len(track_buffer):]
    print("Updating track list to process")

    # Save to pickle
    pickle.dump(track_list, open(pickle_path + '/track_stack.pkl', 'wb'))
    print("Saving updated track stack...\n")


Buffering tracks:
TRMWRJM128F427611C
TRBHHZX128F14AE20E
TRGALDW128F92D0CDE
TRAUTTP128F425BDEB
TRSKWNE128F42A1FC7
TRUDGYR12903CCD8CE
TREAMHV128F92DC7EA
TRBKUIX12903CEB628
TRDTUOK128F428D064

ROW = ('TRAUTTP128F425BDEB', 707069, 'of', 11, 0)
track_id =TRAUTTP128F425BDEB
word =OF

ROW = ('TRAUTTP128F425BDEB', 707069, 'say', 2, 0)
track_id =TRAUTTP128F425BDEB
word =SAY

ROW = ('TRAUTTP128F425BDEB', 707069, 'littl', 11, 0)
track_id =TRAUTTP128F425BDEB
word =LITTL

ROW = ('TRAUTTP128F425BDEB', 707069, 'bit', 11, 0)
track_id =TRAUTTP128F425BDEB
word =BIT

ROW = ('TRAUTTP128F425BDEB', 707069, 'bass', 9, 0)
track_id =TRAUTTP128F425BDEB
word =BASS

ROW = ('TRBHHZX128F14AE20E', 3706327, 'i', 11, 0)
track_id =TRBHHZX128F14AE20E
word =I

ROW = ('TRBHHZX128F14AE20E', 3706327, 'the', 19, 0)
track_id =TRBHHZX128F14AE20E
word =THE

ROW = ('TRBHHZX128F14AE20E', 3706327, 'you', 1, 0)
track_id =TRBHHZX128F14AE20E
word =YOU

ROW = ('TRBHHZX128F14AE20E', 3706327, 'to', 7, 0)
track_id =TRBHHZX128F14AE20E
wo

### TESTING : ONE - TWO...

In [None]:
import pandas as pd
import numpy as np
import pickle

In [None]:
lyrics_path = '/home/eolus/Documents/MA755_data/LyricsData'
pickle_path = '/home/eolus/Documents/MA755_data/myPickles'

# Retrieve `lyrics_df`
lyrics_df = pd.read_pickle(pickle_path+'/full_df_lyrics.pkl')

# Deactivate Pandas warning on chained assignment
pd.options.mode.chained_assignment = None

# Create filtered version of the lyrics DataFrame for reporting purpose
lyrics_df_filtered = lyrics_df[(lyrics_df.COUNT_OTHER > 0)]
lyrics_df_filtered['COUNT_MOOD_RATED'] = ''
lyrics_df_filtered['%_MOOD_RATED'] = ''

# Create `count_mood` and `count_no_mood` for reporting purpose.
sum_columns = lyrics_df.columns[:(len(lyrics_df.columns)-2)]

for index, row in lyrics_df_filtered.iterrows():

    count_mood = sum([lyrics_df_filtered.get_value(index, col) for col in sum_columns])
    count_no_mood = lyrics_df_filtered.get_value(index, 'COUNT_OTHER')

    lyrics_df_filtered['COUNT_MOOD_RATED'][index] = count_mood
    lyrics_df_filtered['%_MOOD_RATED'][index] =
    '{percent}%'.format(percent = round((count_mood / (count_mood + count_no_mood) * 100),2))

lyrics_df_reporting = lyrics_df_filtered[['COUNT_OTHER', 'COUNT_MOOD_RATED', '%_MOOD_RATED']]
lyrics_df_reporting.columns = ['NOT_MOOD_RATED', 'MOOD_RATED', '%_MOOD_RATED']

print(lyrics_df_reporting)