In [1]:
import numpy as np
import requests 
from bs4 import BeautifulSoup
import pandas as pd
import datetime

import warnings

#from scrapefork import *
from cleaning_functions import *
from create_pitchfork_lineups_data import *
from other_utils import *

In [2]:
warnings.simplefilter('ignore')

utils files:
1. scrapefork.py (borrowed)
2. scrape_pitchfork_reviews.py
3. scrape_pitchfork_features.py
4. create_pitchfork_lineups_data.py
5. cleaning_functions.py - all functions used too clean/munge/aggregate data for the model
6. other_utils.py - functions to print model performance metrics
    
data files:
1. data/raw_p4k_lineups.txt
2. data/pitchfork_reviews.csv
3. data/pitchfork_features_with_article_txt.csv

In [3]:
# import & clean datasets

# lineups
filepath = 'data/raw_p4k_lineups.txt'
lineups = get_lineups_data(filepath)

# reviews
reviews_df = pd.read_csv('data/pitchfork_reviews.csv')
reviews = clean_reviews_df(reviews_df)

# features
articles_df = pd.read_csv('data/pitchfork_features_with_article_txt.csv')
articles = clean_features_df(articles_df)

### first model - reviews

In [4]:
# first the reviews model
# formatting - 
reviews_cumul = gather_cumulative_review_data_by_artist(reviews)
reviews_cumul_final = add_review_features(reviews_cumul)
reviews_dfs = group_cumul_reviews_by_year(reviews_cumul_final)

# normalize numeric variables
reviews_full_df = join_year_dfs(reviews_dfs)

reviews_full_df['score_norm'] \
    = normalize_variable(reviews_full_df, 'score')
    
reviews_full_df['avg_cumul_score_norm'] \
    = normalize_variable(reviews_full_df, 'avg_cumul_score')

reviews_full_df['days_to_announcement_norm'] \
    = normalize_variable(reviews_full_df, 'days_to_announcement')

# interaction terms
reviews_full_df['score_x_days'] = reviews_full_df.score_norm  \
                    * reviews_full_df.days_to_announcement_norm
reviews_full_df['reissue_x_days'] = reviews_full_df.reissue \
                    * reviews_full_df.days_to_announcement_norm
reviews_full_df['cumul_score_x_days'] = reviews_full_df.avg_cumul_score_norm \
                    * reviews_full_df.days_to_announcement_norm
reviews_full_df['chicago_x_days'] = reviews_full_df.chicago_based \
                    * reviews_full_df.days_to_announcement_norm
reviews_full_df['performance_x_days'] = reviews_full_df.performance_mention \
                    * reviews_full_df.days_to_announcement_norm
reviews_full_df['best_x_days'] = reviews_full_df.best \
                    * reviews_full_df.days_to_announcement_norm
reviews_full_df['chicago_x_score'] = reviews_full_df.chicago_based \
                    * reviews_full_df.score_norm
reviews_full_df['performance_x_score'] = reviews_full_df.performance_mention \
                    * reviews_full_df.score_norm
reviews_full_df['chicago_x_performance'] = reviews_full_df.chicago_based \
                    * reviews_full_df.performance_mention


In [5]:
#from sklearn.feature_extraction.text import TfidfVectorizer 

#tfidf_vectorizer = TfidfVectorizer(stop_words = 'english')

#X_reviews = tfidf_vectorizer.fit_transform(reviews_full_df['review']) \
#                            .toarray()

#reviews_frequency_matrix = pd.DataFrame(X_reviews, columns 
#                                = tfidf_vectorizer.get_feature_names())

#reviews_frequency_matrix = pd.read_csv('saved_features/reviews_frequeny_matrix.csv')

In [6]:
#reviews_full_full_df = reviews_full_df.reset_index(drop=True) \
#                            .join(reviews_frequency_matrix, 
#                                  lsuffix='_review')

In [7]:
#reviews_model_df = join_lineups(reviews_full_full_df, lineups, 'left')
reviews_model_df = join_lineups(reviews_full_df, lineups, 'left')

In [8]:
# train reviews sub-model
reviews_training_set = reviews_model_df.loc[
                    reviews_model_df.fest_date != '2020-07-15']
reviews_test_set = reviews_model_df.loc[
                    reviews_model_df.fest_date == '2020-07-15']

reviews_inputs = ['score_norm', 'avg_cumul_score_norm', 
                  'best', 'previous_reviews_count', 
                  'performance_mention', 'chicago_based', 'reissue',
                  'days_to_announcement_norm', 
                  'Rock', 'Rap', 'Jazz', 'Experimental', 'Pop/R&B',
                  'Electronic', 'Metal', 'Global', 'Folk/Country',
                  'score_x_days', 'reissue_x_days',
                  'cumul_score_x_days', 'chicago_x_days',
                  'performance_x_days', 'best_x_days',
                  'chicago_x_score', 'performance_x_score',
                  'chicago_x_performance'] #\
                   # + tfidf_vectorizer.get_feature_names()

output = ['played_fest']

In [25]:
from sklearn.ensemble import RandomForestClassifier

X_train = reviews_training_set[reviews_inputs]
y_train = reviews_training_set[output].astype('int')
X_test = reviews_test_set[reviews_inputs]
y_test = reviews_test_set[output].astype('int')

reviews_model = RandomForestClassifier()
reviews_model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [20]:
# performance LOL
from sklearn.metrics import f1_score, accuracy_score, precision_score, \
                            recall_score, confusion_matrix

reviews_preds = reviews_model.predict(X_test)
print_performance_metrics(y_test, reviews_preds)

f1:  0.0
accuracy:  0.996
precision:  0.0
recall:  0.0
              predicted false  predicted true
actual false             8800               4
actual true                34               0


In [26]:
reviews_training_preds = reviews_model.predict(X_train)
print_performance_metrics(y_train, reviews_training_preds)

f1:  0.992
accuracy:  1.0
precision:  0.998
recall:  0.985
              predicted false  predicted true
actual false            67905               1
actual true                 7             469


In [21]:
# trying out grid search to tune model
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

reviews_model = RandomForestClassifier()

parameters = {'max_depth':[10, 12, 15],
              'n_estimators':[100, 200],
              'min_samples_split':[6, 8],
              'min_samples_leaf':[2, 4, 6]}

scorer = make_scorer(f1_score)
grid_obj = GridSearchCV(reviews_model, parameters, scoring=scorer)
grid_fit = grid_obj.fit(X_train, y_train)

best_reviews_model = grid_fit.best_estimator_
best_reviews_model.fit(X_train, y_train)

best_train_predictions = best_reviews_model.predict(X_train)
best_test_predictions = best_reviews_model.predict(X_test)

print('training f1: ', f1_score(best_train_predictions, y_train))
print('test f1: ', f1_score(best_test_predictions, y_test))

best_reviews_model

training f1:  0.28468468468468466
test f1:  0.0


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=15, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=6,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [22]:
# performance on test set
reviews_preds = best_reviews_model.predict(X_test)
reviews_prob_preds = best_reviews_model.predict_proba(X_test)

print_performance_metrics(y_test, reviews_preds)

f1:  0.0
accuracy:  0.996
precision:  0.0
recall:  0.0
              predicted false  predicted true
actual false             8803               1
actual true                34               0


In [23]:
# performance on training set
reviews_training_preds = best_reviews_model.predict(X_train)
reviews_training_prob_preds = best_reviews_model.predict_proba(X_train)

print_performance_metrics(y_train, 
                          reviews_training_preds)

f1:  0.285
accuracy:  0.994
precision:  1.0
recall:  0.166
              predicted false  predicted true
actual false            67906               0
actual true               397              79


In [24]:
# look at feature importances
pd.DataFrame(zip(reviews_inputs, best_reviews_model.feature_importances_))

Unnamed: 0,0,1
0,score_norm,0.082309
1,avg_cumul_score_norm,0.095033
2,best,0.029118
3,previous_reviews_count,0.040512
4,performance_mention,0.003918
5,chicago_based,0.002999
6,reissue,0.025346
7,days_to_announcement_norm,0.142615
8,Rock,0.017544
9,Rap,0.010387


### second model - articles

In [27]:
# now the articles model
# formatting - 
articles_dfs = group_articles_by_year(articles)

# join into full df
articles_full_df = join_year_dfs(articles_dfs)
# normalize date var
articles_full_df['days_to_announcement_norm'] \
    = normalize_variable(articles_full_df, 'days_to_announcement')

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer 

tfidf_vectorizer = TfidfVectorizer(stop_words = 'english')

X_article = tfidf_vectorizer.fit_transform(articles_full_df['article']).toarray()

articles_frequency_matrix = pd.DataFrame(X_article, columns 
                                = tfidf_vectorizer.get_feature_names())

In [29]:
articles_full_full_df = articles_full_df.reset_index(drop=True) \
                    .join(articles_frequency_matrix, lsuffix='_article')

In [30]:
# join y col
articles_model_df = join_lineups(articles_full_full_df, 
                                 lineups, 'left')

In [37]:
# train articles sub-model
articles_training_set = articles_model_df.loc[
                        articles_model_df.fest_date != '2020-07-15']
articles_test_set = articles_model_df.loc[
                        articles_model_df.fest_date == '2020-07-15']

articles_inputs = ['days_to_announcement_norm', 'artist_count',
                   'Interview', 'Moodboard', 'Rising', 'Song by Song',
                   '5-10-15-20', 'Longform', 'Profile', 'Lists & Guides', 
                   'Photo Gallery', 'Podcast', 'Family Matters', 
                   'Overtones', 'Festival Report', 'Cover Story',
                   'Afterword', 'Situation Critical', 'Director\'s Cut'] \
                    + tfidf_vectorizer.get_feature_names()

output = ['played_fest']

In [38]:
X_train = articles_training_set[articles_inputs]
y_train = articles_training_set[output].astype('int')
X_test = articles_test_set[articles_inputs]
y_test = articles_test_set[output].astype('int')

articles_model = RandomForestClassifier()
articles_model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [39]:
# basic performance
articles_preds = articles_model.predict(X_test)
print_performance_metrics(y_test, articles_preds)

f1:  0.0
accuracy:  0.952
precision:  0.0
recall:  0.0
              predicted false  predicted true
actual false              295               0
actual true                15               0


In [40]:
# basic performance on training
articles_training_preds = articles_model.predict(X_train)
print_performance_metrics(y_train, articles_training_preds)

f1:  0.263
accuracy:  0.974
precision:  0.952
recall:  0.152
              predicted false  predicted true
actual false            12414               3
actual true               334              60


In [41]:
article_feature_importances = pd.DataFrame(
    zip(articles_inputs, articles_model.feature_importances_)
            )

In [42]:
article_feature_importances.sort_values(1, ascending=False)

Unnamed: 0,0,1
7,Longform,0.012349
0,days_to_announcement_norm,0.007861
6668,best,0.004781
5,Song by Song,0.003200
10,Photo Gallery,0.002717
...,...,...
27486,hannett,0.000000
27487,hanni,0.000000
27489,hannoda,0.000000
27490,hannon,0.000000


### output probability predictions from first layer

In [43]:
# output dataframes for next model layer
reviews_layer_output = format_next_layer_df(reviews_model_df,
                                            reviews_model,
                                            reviews_inputs)

articles_layer_output = format_next_layer_df(articles_model_df,
                                             articles_model,
                                             articles_inputs)

In [44]:
# then join them together & join in lineups
model_df = reviews_layer_output \
                .set_index(['artist_clean', 'fest_date']) \
            .join(articles_layer_output
                      .set_index(['artist_clean', 'fest_date']),
                  how = 'outer', lsuffix = '_reviews',
                  rsuffix = '_articles').reset_index()

full_model_df = join_lineups(model_df, lineups, 'outer')
full_model_df.fillna(0, inplace=True)

In [45]:
full_model_df['review_x_article'] = full_model_df.prob_reviews \
                                        * full_model_df.prob_articles

### final model

In [46]:
training_set = full_model_df.loc[
                    full_model_df.fest_date != '2020-07-15']
test_set = full_model_df.loc[
                    full_model_df.fest_date == '2020-07-15']

inputs = ['prob_reviews', 'prob_articles', 'played_previous_fest', 
          'review_x_article']
ycol = ['played_fest']

In [47]:
full_model = RandomForestClassifier()
full_model.fit(training_set[inputs], training_set[ycol])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [48]:
preds = full_model.predict(test_set[inputs])
prob_preds = full_model.predict_proba(test_set[inputs])

In [49]:
print_performance_metrics(test_set[ycol], preds)

f1:  0.113
accuracy:  0.995
precision:  0.273
recall:  0.071
              predicted false  predicted true
actual false             8846               8
actual true                39               3


#### lol

In [50]:
training_preds = full_model.predict(training_set[inputs])
training_preds_prob = full_model.predict_proba(training_set[inputs])

In [51]:
print_performance_metrics(training_set[ycol], training_preds)

f1:  0.896
accuracy:  0.998
precision:  0.996
recall:  0.814
              predicted false  predicted true
actual false            74051               2
actual true               110             480


In [52]:
results = training_set.reset_index() \
                .join(pd.DataFrame(training_preds_prob)[1]) \
                .join(pd.DataFrame(training_preds)) \
                .sort_values(1, ascending=False)

In [53]:
results.groupby(round(results['prob_articles'], 1)).agg({
    'played_fest':'mean', 0:'mean', 'artist_clean':'count'
})

Unnamed: 0_level_0,played_fest,0,artist_clean
prob_articles,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.003184,0.002021,72239
0.1,0.089791,0.080566,1626
0.2,0.193811,0.180782,614
0.3,0.294118,0.294118,68
0.4,0.444444,0.407407,27
0.5,0.571429,0.571429,7
0.6,0.969697,0.969697,33
0.7,0.923077,0.923077,26
0.8,1.0,1.0,3


In [55]:
pd.DataFrame(zip(inputs, full_model.feature_importances_))

Unnamed: 0,0,1
0,prob_reviews,0.718991
1,prob_articles,0.029993
2,played_previous_fest,0.041913
3,review_x_article,0.209103
