# Imports

In [24]:
# Data processing and functions
import pandas as pd
import numpy as np
import scipy as sp

# Analytics and modeling
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


# Graphing and visualizing
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from matplotlib import cm
from pylab import savefig

from scipy import sparse, io

# Setting graphing preferences
sns.set(style="darkgrid", color_codes=True)

# Printing
import locale

# Show plots locally
locale.setlocale( locale.LC_ALL, '' )

%matplotlib inline

# Read-in

#### Features

In [2]:
#read in full feature data

path = r'C:\Users\Duncan\Desktop\School\lyric_analysis\feature_engineering\\'
file = r'full_feature_df.csv'

df = pd.read_csv(path + file)
df.head()

Unnamed: 0.1,Unnamed: 0,title,artist,chart,achievement,affection,aggression,air_travel,alcohol,ancient,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,0,0 to 100 the catch up,drake,1,0.001119,0.002237,0.001119,0.0,0.001119,0.0,...,6.4e-05,0.013941,0.12908,0.375114,0.047032,0.123831,6.4e-05,0.053529,6.4e-05,0.0453
1,1,1 2 3 4 sumpin new,coolio,1,0.0,0.001493,0.0,0.001493,0.001493,0.0,...,0.003029,8.8e-05,0.519424,0.163619,8.8e-05,8.8e-05,8.8e-05,0.017569,0.049457,0.116211
2,2,1 2 3 red light,1910 fruitgum company,1,0.0,0.027586,0.0,0.0,0.0,0.0,...,0.053761,0.000368,0.000368,0.000368,0.000368,0.053659,0.000368,0.077738,0.204193,0.023555
3,3,1 2 step,ciara and missy elliott,1,0.002288,0.011442,0.0,0.0,0.0,0.0,...,0.020948,0.000132,0.485826,0.077888,0.094944,0.031137,0.000132,0.021016,0.008769,0.025581
4,4,1 thing,amerie,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0001,0.0001,0.0001,0.127557,0.012391,0.147871,0.0001,0.0001,0.0001,0.117694


In [3]:
# Drop unwanted columns

df.drop('Unnamed: 0', axis=1, inplace=True)
df.reset_index(inplace=True, drop=True)

In [4]:
df.shape

(46574, 225)

#### TF-IDF

In [32]:
tfidf = io.mmread(r"C:\Users\Duncan\Desktop\School\Data Mining\data\tfidf\tfidf.mtx")

In [30]:
import sys

# These are the usual ipython objects, including this one you are creating
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# Get a sorted list of the objects and their sizes
sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_')
        and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

[('df', 89712768),
 ('train', 4968340),
 ('test', 1656516),
 ('DecisionTreeClassifier', 1056),
 ('GridSearchCV', 1056),
 ('PolynomialFeatures', 1056),
 ('RandomForestClassifier', 1056),
 ('StandardScaler', 1056),
 ('NamespaceMagics', 888),
 ('accuracy_score', 136),
 ('classification_report', 136),
 ('confusion_matrix', 136),
 ('cross_val_predict', 136),
 ('cross_val_score', 136),
 ('roc_auc_score', 136),
 ('roc_curve', 136),
 ('savefig', 136),
 ('train_test_split', 136),
 ('var_dic_list', 136),
 ('path', 98),
 ('cm', 80),
 ('colors', 80),
 ('linear_model', 80),
 ('mpl', 80),
 ('np', 80),
 ('pd', 80),
 ('plt', 80),
 ('sns', 80),
 ('sp', 80),
 ('sparse', 80),
 ('getsizeof', 72),
 ('file', 57),
 ('tfidf', 56)]

In [33]:
tfidf_df = pd.DataFrame(tfidf.toarray())

MemoryError: 

#### Training Data

In [8]:
#read in train split

path = r'C:\Users\Duncan\Desktop\School\Data Mining\data\\'
file = r'train.csv'

train = pd.read_csv(path + file)
train.head()

Unnamed: 0.1,Unnamed: 0,title,artist,full_lyrics,chart
0,17599,poinciana,steve lawrence,blow tropic wind sing a song to the trees tree...,0
1,2383,live like you were dying,tim mcgraw,he said i was in my early forties with a lot o...,1
2,40051,living legends,da band,dylan yo blaze the fire and watch the enemies ...,0
3,18917,in the night side of eden,h i m,divided we stand in the light of a frozen sun ...,0
4,19625,how insensitive,rosemary clooney,how insensitive i must have seemed when she to...,0


In [9]:
# Drop unwanted columns

train.drop('Unnamed: 0', axis=1, inplace=True)
train.drop('full_lyrics', axis=1, inplace=True)
train.drop('chart', axis=1, inplace=True)

train.reset_index(inplace=True, drop=True)

In [17]:
train.shape

(34930, 2)

In [13]:
# Join features

full_train = pd.merge(train, df, how='left', on=['title', 'artist'])

In [22]:
full_train.head()

Unnamed: 0,title,artist,chart,achievement,affection,aggression,air_travel,alcohol,ancient,anger,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,poinciana,steve lawrence,0,0.0,0.048544,0.009709,0.0,0.0,0.0,0.0,...,0.634067,0.000581,0.049407,0.000581,0.000581,0.000581,0.049197,0.000581,0.000581,0.000581
1,live like you were dying,tim mcgraw,1,0.0,0.002809,0.0,0.0,0.0,0.0,0.0,...,0.021545,0.030394,0.032153,0.000167,0.000167,0.35898,0.000167,0.000167,0.024599,0.000167
2,living legends,da band,0,0.0,0.0,0.0,0.0,0.001748,0.0,0.0,...,0.000104,0.000104,0.090056,0.285902,0.000104,0.080269,0.000104,0.049208,0.052042,0.109604
3,in the night side of eden,h i m,0,0.0,0.008197,0.008197,0.0,0.008197,0.0,0.008197,...,0.085937,0.000556,0.000556,0.000556,0.192639,0.000556,0.158833,0.000556,0.000556,0.000556
4,how insensitive,rosemary clooney,0,0.0,0.023438,0.0,0.0,0.0,0.0,0.0,...,0.078585,0.000424,0.000424,0.085921,0.000424,0.341022,0.000424,0.000424,0.000424,0.000424


#### Testing Data

In [18]:
#read in test split

path = r'C:\Users\Duncan\Desktop\School\Data Mining\data\\'
file = r'test.csv'

test = pd.read_csv(path + file)
test.head()

Unnamed: 0.1,Unnamed: 0,title,artist,full_lyrics,chart
0,4672,make me a song radio edit,kiley dean,what you hear is not a test wha wha wha wha wh...,0
1,45569,rest of your life,n leo pilimehana,what are you doing for the rest of your life w...,0
2,7682,children of the damned,therion,hes walking like a small child but watch his e...,0
3,6541,the girl from ipanema,king curtis,tall and tan and young and lovely the girl fro...,0
4,23690,bad blood,neil sedaka,it coulda been me but it was you who went and ...,0


In [19]:
# Drop unwanted columns

test.drop('Unnamed: 0', axis=1, inplace=True)
test.drop('full_lyrics', axis=1, inplace=True)
test.drop('chart', axis=1, inplace=True)

test.reset_index(inplace=True, drop=True)

In [20]:
test.shape

(11644, 2)

In [21]:
# Join features

full_test = pd.merge(test, df, how='left', on=['title', 'artist'])

In [23]:
full_test.head()

Unnamed: 0,title,artist,chart,achievement,affection,aggression,air_travel,alcohol,ancient,anger,...,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,topic_16,topic_17,topic_18,topic_19
0,make me a song radio edit,kiley dean,0,0.0,0.001953,0.0,0.0,0.0,0.0,0.0,...,0.000112,0.040377,0.322445,0.021671,0.000112,0.089884,0.015813,0.000112,0.034163,0.070532
1,rest of your life,n leo pilimehana,0,0.0,0.019355,0.0,0.0,0.0,0.0,0.0,...,0.13603,0.00041,0.00041,0.00041,0.296379,0.115288,0.00041,0.00041,0.00041,0.00041
2,children of the damned,therion,0,0.0,0.0,0.0,0.0,0.0,0.0,0.006211,...,0.000385,0.205092,0.000385,0.000385,0.000385,0.000385,0.028882,0.000385,0.000385,0.000385
3,the girl from ipanema,king curtis,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000472,0.509808,0.000472,0.152921,0.000472,0.052697,0.000472,0.000472,0.000472,0.000472
4,bad blood,neil sedaka,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000227,0.000227,0.000227,0.091859,0.000227,0.000227,0.000227,0.011791,0.057769,0.000227


# Random Forest