In [1]:
import pandas as pd
import numpy as np

## Load dataset we used for Logistic Regression.

In [2]:
df = pd.read_csv('pf_readyforlogistic1.csv', encoding="ISO-8859-1")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10986 entries, 0 to 10985
Data columns (total 32 columns):
reviewid              10986 non-null int64
Unnamed: 0            10986 non-null int64
Unnamed: 0.1          10986 non-null int64
content               10986 non-null object
genre                 9674 non-null object
label                 10964 non-null object
title                 10986 non-null object
artist                10986 non-null object
url                   10986 non-null object
score                 10986 non-null float64
best_new_music        10986 non-null int64
author                10986 non-null object
author_type           8561 non-null object
pub_date              10986 non-null object
pub_weekday           10986 non-null int64
pub_day               10986 non-null int64
pub_month             10986 non-null int64
pub_year              10986 non-null int64
year                  10705 non-null float64
content_words         10986 non-null int64
subjectivity        

In [3]:
df.columns

Index(['reviewid', 'Unnamed: 0', 'Unnamed: 0.1', 'content', 'genre', 'label',
       'title', 'artist', 'url', 'score', 'best_new_music', 'author',
       'author_type', 'pub_date', 'pub_weekday', 'pub_day', 'pub_month',
       'pub_year', 'year', 'content_words', 'subjectivity', 'polarity',
       'score_bin', 'genre_experimental', 'genre_folk/country', 'genre_global',
       'genre_jazz', 'genre_metal', 'genre_pop/r&b', 'genre_rap', 'genre_rock',
       'sum_genres'],
      dtype='object')

In [4]:
df.drop(['reviewid', 'Unnamed: 0', 'Unnamed: 0.1', 'content', 'genre', 'url', 'score', 'best_new_music', 'content_words', 'subjectivity', 'polarity', 'sum_genres'], axis = 1, inplace = True)

In [5]:
sum(df.pub_year != df.year)

1706

## Try a neural net on the currently numeric variables, to build a quick model without preprocessing.

In [6]:
df1 = df.select_dtypes(include=[np.number])
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10986 entries, 0 to 10985
Data columns (total 14 columns):
pub_weekday           10986 non-null int64
pub_day               10986 non-null int64
pub_month             10986 non-null int64
pub_year              10986 non-null int64
year                  10705 non-null float64
score_bin             10986 non-null int64
genre_experimental    10986 non-null int64
genre_folk/country    10986 non-null int64
genre_global          10986 non-null int64
genre_jazz            10986 non-null int64
genre_metal           10986 non-null int64
genre_pop/r&b         10986 non-null int64
genre_rap             10986 non-null int64
genre_rock            10986 non-null int64
dtypes: float64(1), int64(13)
memory usage: 1.2 MB


In [7]:
df1['year'].fillna(df1['pub_year'], inplace = True)
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10986 entries, 0 to 10985
Data columns (total 14 columns):
pub_weekday           10986 non-null int64
pub_day               10986 non-null int64
pub_month             10986 non-null int64
pub_year              10986 non-null int64
year                  10986 non-null float64
score_bin             10986 non-null int64
genre_experimental    10986 non-null int64
genre_folk/country    10986 non-null int64
genre_global          10986 non-null int64
genre_jazz            10986 non-null int64
genre_metal           10986 non-null int64
genre_pop/r&b         10986 non-null int64
genre_rap             10986 non-null int64
genre_rock            10986 non-null int64
dtypes: float64(1), int64(13)
memory usage: 1.2 MB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [8]:
import keras

Using TensorFlow backend.


In [9]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.datasets import mnist
from keras.utils import np_utils

def build_logistic_model(input_dim, output_dim):
    model = Sequential()
    model.add(Dense(output_dim, input_dim=input_dim, activation='softmax'))

    return model

In [10]:
X = df1.drop('score_bin', axis = 1)
y = df1['score_bin']

In [11]:
X.shape

(10986, 13)

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify = y)
batch_size = 128
nb_classes = 1
nb_epoch = 10
input_dim = 13

# the data, shuffled and split between train and test sets

X_train = X_train.astype('int64')
X_test = X_test.astype('int64')
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

model = build_logistic_model(input_dim, nb_classes)

model.summary()

# compile the model
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy', 'mae'])
history = model.fit(X_train, y_train,
                    batch_size=batch_size, nb_epoch=nb_epoch,
                    verbose=1, validation_data=(X_test, y_test))
score = model.evaluate(X_test, y_test, verbose=0)

print('Test score:', score[0])
print('Test accuracy:', score[1])
print(score)

8788 train samples
2198 test samples
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1)                 14        
Total params: 14
Trainable params: 14
Non-trainable params: 0
_________________________________________________________________
Train on 8788 samples, validate on 2198 samples
Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test score: 10.393739137137555
Test accuracy: 0.3480436760420361
[10.393739137137555, 0.3480436760420361, 0.6519563236596695]


## Try making y vector into a matrix.

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42, stratify = y)
batch_size = 32
nb_classes = 2
nb_epoch = 10
input_dim = 13

# the data, shuffled and split between train and test sets

X_train = X_train.astype('int64')
X_test = X_test.astype('int64')
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
Y_train = np_utils.to_categorical(y_train, nb_classes)
Y_test = np_utils.to_categorical(y_test, nb_classes)

model = build_logistic_model(input_dim, nb_classes)

model.summary()

# compile the model
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(X_train, Y_train,
                    batch_size=batch_size, nb_epoch=nb_epoch,
                    verbose=1, validation_data=(X_test, Y_test))
score = model.evaluate(X_test, Y_test, verbose=0)

print('Test score:', score[0])
print('Test accuracy:', score[1])
print(score)

7690 train samples
3296 test samples
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 2)                 28        
Total params: 28
Trainable params: 28
Non-trainable params: 0
_________________________________________________________________




Train on 7690 samples, validate on 3296 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test score: 5.583347915445716
Test accuracy: 0.6516990291262136
[5.583347915445716, 0.6516990291262136]


## Let's try adding more independent variables. 
## Add label and author variables. 
## The high rank music labels are the ones that most often receive high scores, high rank authors on average give high scores.

In [16]:
high_rank_labels = ['rhino', 'light in the attic', 'profound lore', 'matador', 'emi',
       'editions mego', 'epitaph', 'constellation', 'relapse', '4ad']
high_rank_authors = ['jenn pelly', 'seth colter walls', 'mark richardson',
       'grayson haver currin', 'david drake', 'philip sherburne',
       "andy o'connor", 'andy beta', 'amanda petrusich', 'marc masters']
df.dropna(subset = ['label'], inplace = True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10964 entries, 0 to 10985
Data columns (total 20 columns):
label                 10964 non-null object
title                 10964 non-null object
artist                10964 non-null object
author                10964 non-null object
author_type           8541 non-null object
pub_date              10964 non-null object
pub_weekday           10964 non-null int64
pub_day               10964 non-null int64
pub_month             10964 non-null int64
pub_year              10964 non-null int64
year                  10683 non-null float64
score_bin             10964 non-null int64
genre_experimental    10964 non-null int64
genre_folk/country    10964 non-null int64
genre_global          10964 non-null int64
genre_jazz            10964 non-null int64
genre_metal           10964 non-null int64
genre_pop/r&b         10964 non-null int64
genre_rap             10964 non-null int64
genre_rock            10964 non-null int64
dtypes: float64(1), int6

In [17]:
df2 = df.drop(['title', 'artist', 'author_type'], axis = 1)

In [18]:
df2['is_author_kind'] = df2['author'].isin(high_rank_authors)

In [19]:
df2['is_label_favored'] = df2['label'].isin(high_rank_labels)
df2['year'].fillna(df2['pub_year'], inplace = True)
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10964 entries, 0 to 10985
Data columns (total 19 columns):
label                 10964 non-null object
author                10964 non-null object
pub_date              10964 non-null object
pub_weekday           10964 non-null int64
pub_day               10964 non-null int64
pub_month             10964 non-null int64
pub_year              10964 non-null int64
year                  10964 non-null float64
score_bin             10964 non-null int64
genre_experimental    10964 non-null int64
genre_folk/country    10964 non-null int64
genre_global          10964 non-null int64
genre_jazz            10964 non-null int64
genre_metal           10964 non-null int64
genre_pop/r&b         10964 non-null int64
genre_rap             10964 non-null int64
genre_rock            10964 non-null int64
is_author_kind        10964 non-null bool
is_label_favored      10964 non-null bool
dtypes: bool(2), float64(1), int64(13), object(3)
memory usage: 1.5+ MB

In [20]:
X = df2.drop(['label', 'author', 'pub_date', 'score_bin'], axis = 1)
y = df2['score_bin']

In [21]:
X.shape

(10964, 15)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify = y)
batch_size = 32
nb_classes = 2
nb_epoch = 10
input_dim = 15

# the data, shuffled and split between train and test sets

X_train = X_train.astype('int64')
X_test = X_test.astype('int64')
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
Y_train = np_utils.to_categorical(y_train, nb_classes)
Y_test = np_utils.to_categorical(y_test, nb_classes)

model = build_logistic_model(input_dim, nb_classes)

model.summary()

# compile the model
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(X_train, Y_train,
                    batch_size=batch_size, epochs=nb_epoch,
                    verbose=1, validation_data=(X_test, Y_test))
score = model.evaluate(X_test, Y_test, verbose=0)

print('Test score:', score[0])
print('Test accuracy:', score[1])
print(score)

8771 train samples
2193 test samples
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 2)                 32        
Total params: 32
Trainable params: 32
Non-trainable params: 0
_________________________________________________________________
Train on 8771 samples, validate on 2193 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test score: 10.460224631275155
Test accuracy: 0.3474692202870073
[10.460224631275155, 0.3474692202870073]


## Try a model with two hidden layers.

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify = y)
batch_size = 32
nb_classes = 2
nb_epoch = 10
input_dim = 15

# the data, shuffled and split between train and test sets

X_train = X_train.astype('int64')
X_test = X_test.astype('int64')
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
Y_train = np_utils.to_categorical(y_train, nb_classes)
Y_test = np_utils.to_categorical(y_test, nb_classes)

model = Sequential()
model.add(Dense(64, input_dim=input_dim, activation='tanh'))
model.add(Dense(nb_classes, input_dim=64, activation='softmax'))

model.summary()

# compile the model
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(X_train, Y_train,
                    batch_size=batch_size, epochs=nb_epoch,
                    verbose=1, validation_data=(X_test, Y_test))
score = model.evaluate(X_test, Y_test, verbose=0)

print('Test score:', score[0])
print('Test accuracy:', score[1])
print(score)

8771 train samples
2193 test samples
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 64)                1024      
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 130       
Total params: 1,154
Trainable params: 1,154
Non-trainable params: 0
_________________________________________________________________
Train on 8771 samples, validate on 2193 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test score: 0.6773259057826883
Test accuracy: 0.6525307799440184
[0.6773259057826883, 0.6525307799440184]


## Try ReLu as activation function for first layer.

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify = y)
batch_size = 32
nb_classes = 2
nb_epoch = 10
input_dim = 15

# the data, shuffled and split between train and test sets

X_train = X_train.astype('int64')
X_test = X_test.astype('int64')
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
Y_train = np_utils.to_categorical(y_train, nb_classes)
Y_test = np_utils.to_categorical(y_test, nb_classes)

model = Sequential()
model.add(Dense(64, input_dim=input_dim, activation='relu'))
model.add(Dense(nb_classes, input_dim=64, activation='softmax'))

model.summary()

# compile the model
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(X_train, Y_train,
                    batch_size=batch_size, epochs=nb_epoch,
                    verbose=1, validation_data=(X_test, Y_test))
score = model.evaluate(X_test, Y_test, verbose=0)

print('Test score:', score[0])
print('Test accuracy:', score[1])
print(score)

8771 train samples
2193 test samples
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 64)                1024      
_________________________________________________________________
dense_7 (Dense)              (None, 2)                 130       
Total params: 1,154
Trainable params: 1,154
Non-trainable params: 0
_________________________________________________________________
Train on 8771 samples, validate on 2193 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test score: 10.460224631275155
Test accuracy: 0.3474692202870073
[10.460224631275155, 0.3474692202870073]


## Try smaller batches with tanh activation function in first layer.

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify = y)
batch_size = 16
nb_classes = 2
nb_epoch = 10
input_dim = 15

# the data, shuffled and split between train and test sets

X_train = X_train.astype('int64')
X_test = X_test.astype('int64')
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
Y_train = np_utils.to_categorical(y_train, nb_classes)
Y_test = np_utils.to_categorical(y_test, nb_classes)

model = Sequential()
model.add(Dense(64, input_dim=input_dim, activation='tanh'))
model.add(Dense(nb_classes, input_dim=64, activation='softmax'))

model.summary()

# compile the model
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(X_train, Y_train,
                    batch_size=batch_size, epochs=nb_epoch,
                    verbose=1, validation_data=(X_test, Y_test))
score = model.evaluate(X_test, Y_test, verbose=0)

print('Test score:', score[0])
print('Test accuracy:', score[1])
print(score)

8771 train samples
2193 test samples
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 64)                1024      
_________________________________________________________________
dense_9 (Dense)              (None, 2)                 130       
Total params: 1,154
Trainable params: 1,154
Non-trainable params: 0
_________________________________________________________________
Train on 8771 samples, validate on 2193 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test score: 0.6515959793717906
Test accuracy: 0.6525307799440184
[0.6515959793717906, 0.6525307799440184]
