In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot
from scipy.sparse import csr_matrix, hstack
from scipy.special import expit, logit
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline

#from numpy import hstack

In [2]:
%run FeatureExtractor.ipynb

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/romitbarua/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/romitbarua/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/romitbarua/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Importing Clean DF -> Completed Import


# Functions to Create the Models

In [3]:
def buildOlsModel():
    """Return a new, unfitted ``LinearRegression`` with default settings."""
    return LinearRegression()

In [4]:
class LogitRegression(LinearRegression):
    """Linear regression fitted on the logit of a target lying in [0, 1].

    ``fit`` transforms the target ``p`` with ``log(p / (1 - p))`` and then
    delegates to ``LinearRegression.fit``; ``predict`` maps the linear output
    back through the logistic function, so predictions lie in [0, 1].
    """

    # Targets are clipped into [_EPS, 1 - _EPS] before the logit transform.
    # Without this, a target of exactly 0 or 1 becomes -inf / +inf and the
    # underlying least-squares fit rejects the data.
    _EPS = 1e-6

    def fit(self, x, p):
        """Fit the linear model to ``logit(p)``.

        Parameters
        ----------
        x : feature matrix passed through unchanged to ``LinearRegression.fit``.
        p : array-like of targets, each expected to lie in [0, 1].

        Returns
        -------
        Whatever ``LinearRegression.fit`` returns.

        Raises
        ------
        ValueError
            If any target is outside [0, 1]; the logit is undefined there.
        """
        p = np.asarray(p, dtype=float)
        # NaN compares False on both sides, so missing values are left for the
        # parent estimator's own input validation to report.
        if np.any((p < 0) | (p > 1)):
            raise ValueError(
                "LogitRegression targets must lie in [0, 1]; "
                "got values in [{}, {}]".format(np.nanmin(p), np.nanmax(p))
            )
        p = np.clip(p, self._EPS, 1 - self._EPS)
        return super().fit(x, logit(p))

    def predict(self, x):
        """Predict targets in [0, 1] by applying the logistic function to the linear output."""
        # expit is the overflow-safe form of 1 / (1 + exp(-y)).
        return expit(super().predict(x))

In [5]:
def buildLogitModel():
    """Return a new, unfitted ``LogitRegression`` that fits an intercept."""
    return LogitRegression(fit_intercept=True)

In [6]:
def buildGradientBoostedRegressor():
    """Return a new, unfitted ``GradientBoostingRegressor`` with default settings."""
    return GradientBoostingRegressor()

In [7]:
def buildRandomForestModel():
    """Return a new, unfitted ``RandomForestRegressor`` with default settings."""
    return RandomForestRegressor()

# Utility

In [8]:
def combine_features(train_feature_list, test_feature_list):
    """Column-stack the per-feature matrices into one train and one test matrix.

    Parameters
    ----------
    train_feature_list : sequence of feature blocks for the training rows.
    test_feature_list : sequence of feature blocks for the test rows, in the
        same order as ``train_feature_list``.

    Returns
    -------
    (final_trainX, final_testX)
        With a single block per list, the blocks are returned as-is.
        Otherwise each list is stacked horizontally with
        ``scipy.sparse.hstack``.

    Raises
    ------
    ValueError
        If either list is empty, or if the two lists differ in length.

    Notes
    -----
    The input lists are not modified.
    """
    n_train = len(train_feature_list)
    n_test = len(test_feature_list)

    if n_train == 0 or n_test == 0:
        raise ValueError("combine_features needs at least one feature block per split")
    if n_train != n_test:
        # A mismatch would give train and test matrices with different columns.
        raise ValueError(
            "train/test feature lists differ in length: {} vs {}".format(n_train, n_test)
        )

    if n_train == 1:
        # Nothing to stack; hand back the lone block untouched.
        return train_feature_list[0], test_feature_list[0]

    # One hstack call per split instead of re-stacking pairwise.
    final_trainX = hstack(list(train_feature_list))
    final_testX = hstack(list(test_feature_list))
    return final_trainX, final_testX

In [9]:
def runAndEvalModel(model, trainX, testX, trainy, testy):
    """Fit ``model`` on the training split and score it.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``.
    trainX, testX : feature matrices for the two splits.
    trainy, testy : targets for the two splits.

    Returns
    -------
    tuple
        ``(mae, mape, mse, train_r2, test_r2)``. MAE, MAPE, MSE and
        ``test_r2`` are computed on the test split; ``train_r2`` is computed
        on the training split.
    """
    model.fit(trainX, trainy)
    fitted_train = model.predict(trainX)
    fitted_test = model.predict(testX)

    # Error metrics on the held-out split, in the order they are returned.
    error_metrics = (
        mean_absolute_error,
        mean_absolute_percentage_error,
        mean_squared_error,
    )
    test_errors = [metric(testy, fitted_test) for metric in error_metrics]

    r2_on_train = r2_score(trainy, fitted_train)
    r2_on_test = r2_score(testy, fitted_test)

    return (*test_errors, r2_on_train, r2_on_test)
    

# Run the models configured by the parameters in the input file

In [10]:
def runModels(inputCsv, outputCsv):
    """Run one experiment per row of ``inputCsv`` and write the scores to ``outputCsv``.

    Each row selects a model type, the feature groups to build, and the
    attribute to predict. The row's model is fitted and scored with
    ``runAndEvalModel``, and the scores are stored in the ``MAE``, ``MAPE``,
    ``MSE``, ``train_R2`` and ``test_R2`` columns of a copy of the input
    table. Progress and the table so far are printed after every run; the
    table is written to ``outputCsv`` once all runs have finished.

    Parameters
    ----------
    inputCsv : path of the CSV describing the runs.
    outputCsv : path the result table is written to.
    """
    metric_columns = ('MAE', 'MAPE', 'MSE', 'train_R2', 'test_R2')

    input_df = pd.read_csv(inputCsv)
    final_df = input_df.copy()
    for column in metric_columns:
        final_df[column] = None

    def option(row, column):
        # Single configuration value for the given run.
        return input_df.loc[row, column]

    for run in range(len(input_df)):

        train_parts = []
        test_parts = []

        # Pick the estimator; any unrecognised type falls back to gradient boosting.
        model_type = option(run, 'ModelType')
        if model_type == 'OLS':
            model = buildOlsModel()
        elif model_type == 'RandomForest':
            model = buildRandomForestModel()
        elif model_type == 'Logit':
            model = buildLogitModel()
        else:
            model = buildGradientBoostedRegressor()

        # Bag-of-words features
        if option(run, 'BowFeature') == 1:
            bow_type = option(run, 'BowFeatureType')
            bow_svd = option(run, 'BowFeatureSVD')
            bow_tfidf = option(run, 'BowFeatureTfidf')

            _, bow_train, bow_test = buildBowFeatures(bow_type, bow_tfidf, bow_svd)
            train_parts.append(bow_train)
            test_parts.append(bow_test)

        # Chord-progression features
        if option(run, 'chordProgFeature') == 1:
            key_column = option(run, 'keyColumn')
            norm_type = option(run, 'chordNormType')
            low_gram = int(option(run, 'chordLowGram'))
            high_gram = int(option(run, 'chordHighGram'))
            binary = option(run, 'chordBinary')
            use_svd = option(run, 'chordProgSVD')
            svd_components = option(run, 'chordProgSvdNumComp')
            use_tfidf = option(run, 'chordProgTfidf')
            doc_freq = option(run, 'chordProgDocFreq')

            _, chord_train, chord_test = buildChordProgressionFeatures(
                key_column,
                norm_type,
                low_gram,
                high_gram,
                binary,
                use_svd,
                use_tfidf,
                num_components=svd_components,
                min_df=doc_freq,
            )
            train_parts.append(chord_train)
            test_parts.append(chord_test)

        # LDA topic features
        if option(run, 'ldaFeature') == 1:
            _, lda_train, lda_test = buildLdaFeatures()
            train_parts.append(lda_train)
            test_parts.append(lda_test)

        # Emotion features
        if option(run, 'emotionFeature') == 1:
            emotion_train, emotion_test = buildEmotionFeatures(option(run, 'emotionFeatureType'))
            train_parts.append(emotion_train)
            test_parts.append(emotion_test)

        # Text-complexity features
        if option(run, 'textComplexFeature') == 1:
            complexity_train, complexity_test = buildTextComplexityFeatures()
            train_parts.append(complexity_train)
            test_parts.append(complexity_test)

        final_trainX, final_testX = combine_features(train_parts, test_parts)

        # Targets for the attribute this run predicts.
        attribute = option(run, 'PredAttribute')
        final_trainy, final_testy = buildY(attribute, randomY=option(run, 'RandomY'))

        mae, mape, mse, train_r2, test_r2 = runAndEvalModel(
            model, final_trainX, final_testX, final_trainy, final_testy
        )
        for column, score in zip(metric_columns, (mae, mape, mse, train_r2, test_r2)):
            final_df.loc[run, column] = score

        print('Completed Run #{} out of {}'.format(run + 1, len(input_df)))
        print()
        print(final_df)

    final_df.to_csv(outputCsv)
    print(final_df)
    
        

In [11]:
# Execute every run described in Input.csv and write the scored table to FinalResults.csv.
runModels('Input.csv', 'FinalResults.csv')

Building BoW Features -> BoW Features Built
Building LDA Features -> LDA Features Built
Completed Run #1 out of 3

  PredAttribute     ModelType  RandomY  BowFeature BowFeatureType  \
0        energy  RandomForest    False           1         Lyrics   
1        energy  RandomForest    False           0            NaN   
2        energy  RandomForest    False           0            NaN   

   BowFeatureSVD  BowFeatureTfidf  chordProgFeature chordNormType  \
0            NaN              NaN                 0           NaN   
1            NaN              NaN                 1          Full   
2            NaN              NaN                 0           NaN   

   chordLowGram  ...  chordProgDocFreq ldaFeature emotionFeature  \
0           NaN  ...               NaN          1              0   
1           2.0  ...               1.0          1              0   
2           NaN  ...               NaN          0              1   

   emotionFeatureType  textComplexFeature      MAE      MA

In [12]:
# Build the train/test targets for 'acousticness'.
# buildY is not defined in this notebook's visible cells - presumably it comes from
# FeatureExtractor.ipynb; confirm there what randomY=False selects.
final_trainy, final_testy = buildY('acousticness', randomY = False)

In [13]:
# Constant baseline: predict the same average value for every test example.
# NOTE(review): the average is taken over final_testy itself, not final_trainy,
# so this baseline is computed from the test labels - confirm this is intended.
avg = np.average(final_testy)
preds = [avg]*len(final_testy)
# Displays how many baseline predictions were produced (one per test target).
len(preds)

2085

In [14]:
# Mean absolute error of the constant-average baseline on the test targets.
mae = mean_absolute_error(y_true = final_testy, y_pred = preds)
mae

0.26615040891367253

In [15]:
# List the columns of df. df is not created in this notebook's visible cells -
# presumably it is defined by `%run FeatureExtractor.ipynb`; verify.
df.columns

Index(['track_name', 'track_id', 'playlist_name', 'playlist_id',
       'playlist_genre', 'track_name', 'track_artist_name', 'track_artist_id',
       'danceability', 'energy', 'spotify_key', 'loudness', 'mode',
       'acousticness', 'valence', 'tempo', 'song_name', 'dirty_tabs', 'capo',
       'dirty_lyrics', 'clean_lyrics', 'words_for_LDA', 'clean_tabs',
       'greer_key', 'firstNote_key', 'lastNote_key', 'bestChoice_key',
       'text2emotion', 'vader_emotion'],
      dtype='object')