In [None]:
import pandas as pd
import numpy as np
import pickle
import operator
import copy

from dateutil.parser import parse
from datetime import datetime

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import statsmodels.api as sm

from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

from sklearn import metrics

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, LassoCV
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.model_selection import cross_val_score

In [None]:
#load a pickled object (here: the cleaned dataframe) from disk
def open_pickle(pkl_file):
    '''
    Deserialize and return the object stored in a pickle file.

    pkl_file = path of the pickle file to read (string)

    NOTE: pickle.load can execute arbitrary code while loading, so only
    call this on files that come from a trusted source.
    '''
    with open(pkl_file, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

In [None]:
#authors = open_pickle('authors_cleaned.pkl')
#load the cleaned books data; the path is relative to the notebook's working directory
books = open_pickle('books_cleaned.pkl')

In [None]:
#drop the columns that are not used in the rest of the notebook
#reassigning (instead of inplace=True) together with errors='ignore' keeps
#this cell safe to re-run after the columns have already been removed
books = books.drop(['a_id', 'b_id', 'b_ratings_count'], axis=1, errors='ignore')

In [None]:
#show the column names that remain after the drop
books.columns

### Preprocessing for sklearn

Add dummy variables

In [None]:
#hometown in NY: 1.0 when the hometown text contains 'New York', else 0.0
#na=False counts a missing hometown as "not New York" (without it the mask
#contains NaN and cannot be used for selection); regex=False matches the
#literal text. The column is built in a single assignment so no chained
#fillna(inplace=True) is needed, and it stays float like the other flags.
books['a_ny_hometown'] = (books['hometown']
                          .str.contains('New York', na=False, regex=False)
                          .astype(float))

In [None]:
def make_add_cat_dummies(df, prefix, column):
    '''
    Append indicator (dummy) columns for one categorical column.

    df = data frame that contains the categorical column
    prefix = string placed in front of every new indicator column name
    column = name of the categorical column to encode (string)

    The first category level is left out (drop_first=True) and the
    original column is kept. Returns a new data frame; df is not modified.
    '''
    dummies = pd.get_dummies(df[column], prefix=prefix, drop_first=True)
    #the dummies carry df's index, so an index-on-index merge lines the rows up
    return pd.merge(df, dummies, left_index=True, right_index=True)

In [None]:
#decade, gender, publisher (the decade dummies are currently switched off)
#books = make_add_cat_dummies(books, 'b', 'decade')
#NOTE(review): gender and publisher share the prefix 'a', so a value that
#occurs in both columns would produce two dummy columns with the same name
#(the merge would then suffix them _x/_y) - confirm the values never overlap
#NOTE(review): each call re-assigns books, so re-running this cell adds the
#dummy columns a second time
books = make_add_cat_dummies(books, 'a', 'gender')
books = make_add_cat_dummies(books, 'a', 'publisher')

In [None]:
def make_add_value_dummies(df, old_column, new_column, value):
    '''
    Add a 0/1 indicator column flagging rows above a threshold.

    df = data frame that receives the indicator (modified in place)
    old_column = name of the column to compare (string)
    new_column = name of the indicator column to create (string)
    value = threshold; rows with df[old_column] > value get 1.0

    Rows at or below the threshold, and rows where old_column is missing,
    get 0.0. An existing new_column is overwritten. Returns the same df
    object so the call can be used in an assignment.
    '''
    #build the whole column in one assignment: the earlier
    #df[new_column].fillna(0, inplace=True) wrote through a chained
    #selection, which pandas does not guarantee to propagate back to df
    #astype(float) keeps the 1.0/0.0 float dtype the column had before
    df[new_column] = (df[old_column] > value).astype(float)
    return df

In [None]:
#each call below adds a 1/0 flag column to books; the flag is 1 when the
#source column is strictly greater than the threshold (1) and 0 otherwise

#books with more than 1 week on list
books = make_add_value_dummies(books, 'b_wks_on_list', 'b_repeat', 1)

#authors with more than 1 week on list
books = make_add_value_dummies(books, 'a_wks_on_list', 'a_repeat', 1)

#authors with more than 1 book on list
books = make_add_value_dummies(books, 'a_books_on_list', 'a_b_repeat', 1)

Create feature and target dataframes

In [None]:
def make_target_features(df, target):
    '''
    Split a data frame into standardized numeric features and a target.

    df = data frame with all features and target as columns
    target = name of column with target (string)

    Returns (X_scaled, y). X_scaled holds every numeric or boolean column
    of df except the target, standardized with preprocessing.scale and
    carrying df's row index. y is the unchanged target column. df itself
    is not modified.
    '''
    #create target and features dfs (work on a copy so that popping the
    #target does not alter the caller's frame)
    X = df.copy()
    y = X.pop(target)

    #only keep numerical features
    #select by dtype family instead of testing for exactly float64/int64:
    #the indicator columns made by pd.get_dummies are uint8 or bool
    #(depending on the pandas version) and were silently dropped before
    X = X.select_dtypes(include=[np.number, 'bool']).astype(float)

    #standardize features
    #keep the original index so the rows of X_scaled stay aligned with y,
    #also when df is a filtered subset with a non-contiguous index
    X_scaled = pd.DataFrame(preprocessing.scale(X),
                            index=X.index,
                            columns=X.columns)

    #return feature and target dfs
    return X_scaled, y

### Make and compare models

For each regularized model (lasso, ridge, elastic net), find the optimal regularization strength (lambda, called `alpha` in scikit-learn) by cross-validation, then create a model using that value, calculate its average cross-validated RMSE, and compare the RMSEs across models.

In [None]:
def calc_avg_RMSE(model, features=None, target=None, cv=10):
    '''
    Cross-validated root mean squared error of a model.

    model = unfitted sklearn estimator
    features = feature data frame; when None the notebook-level X is used
    target = target column; when None the notebook-level y is used
    cv = number of cross-validation folds (default 10)

    Returns the square root of the mean per-fold MSE.
    '''
    #fall back to the notebook globals so existing calc_avg_RMSE(model)
    #calls keep working; pass the data explicitly to avoid hidden state
    if features is None:
        features = X
    if target is None:
        target = y
    #sklearn reports the MSE negated, hence the sign flip before the root
    scores = cross_val_score(model, features, target, cv=cv,
                             scoring='neg_mean_squared_error')
    return np.sqrt(-scores.mean())

In [None]:
def find_best_model(X_scaled, y):
    '''
    Compare lasso, ridge and elastic net and report the best one.

    X_scaled = standardized feature data frame
    y = target column

    For each model type the regularization strength is chosen with the
    matching *CV estimator (5-fold), a fresh model is built with that
    strength, and its RMSE is estimated with 10-fold cross-validation.
    Prints the name and average RMSE of the model with the lowest RMSE;
    returns nothing.
    '''

    def _cv_rmse(model):
        #score on the data passed to find_best_model rather than on the
        #notebook-level X and y, so the arguments are actually used
        scores = cross_val_score(model, X_scaled, y, cv=10,
                                 scoring='neg_mean_squared_error')
        return np.sqrt(-scores.mean())

    #normalize=False is no longer passed to the *CV estimators: it was the
    #default and the parameter has since been removed from scikit-learn

    #lasso
    model_LassoCV = LassoCV(cv=5)
    model_LassoCV.fit(X_scaled, y)
    model_Lasso = Lasso(alpha=model_LassoCV.alpha_)

    #ridge
    model_RidgeCV = RidgeCV(cv=5)
    model_RidgeCV.fit(X_scaled, y)
    model_Ridge = Ridge(alpha=model_RidgeCV.alpha_)

    #elasticnet (carry over the selected l1_ratio_ as well, so the final
    #model matches the one the search picked)
    model_ElasticCV = ElasticNetCV(cv=5)
    model_ElasticCV.fit(X_scaled, y)
    model_Elastic = ElasticNet(alpha=model_ElasticCV.alpha_,
                               l1_ratio=model_ElasticCV.l1_ratio_)

    models = {'lasso': model_Lasso,
              'ridge': model_Ridge,
              'elasticnet': model_Elastic}

    avg_RMSEs = dict((name, _cv_rmse(model)) for name, model in models.items())

    #name with the smallest average RMSE (works on Python 2 and 3)
    best_model_name = min(avg_RMSEs, key=avg_RMSEs.get)

    #single-argument print calls behave the same on Python 2 and 3
    print('best model:  {}'.format(best_model_name))
    print('avg RMSE:  {}'.format(avg_RMSEs[best_model_name]))

In [None]:
#correlation heatmap of the numeric and boolean columns of books
#(seaborn is already imported as sns in the import cell at the top)
#the non-numeric columns are removed explicitly because DataFrame.corr()
#raises on them in recent pandas versions instead of skipping them
sns.heatmap(books.select_dtypes(include=[np.number, 'bool']).corr());

### _Model 1: All features_

In [None]:
#NOTE(review): model1 holds only the 'b_avg_rating' column, while model2 and
#model3 below are filtered data frames, and no later cell in this notebook
#reads model1 - confirm whether it is still needed
model1 = books['b_avg_rating']

In [None]:
#Model 1 uses every row of books.
#The target is 'b_avg_rating' so that it matches Models 2 and 3; this cell
#used 'a_avg_rating' before, which made its RMSE not comparable with theirs.
#X and y are notebook-level names; calc_avg_RMSE looks them up as globals.
X, y = make_target_features(books, 'b_avg_rating')

In [None]:
#OLS baseline on the standardized features
#sm.OLS does not add an intercept by itself; since every column of X is
#centred at zero, the constant has to be added explicitly, otherwise the
#fit is forced through the origin and cannot capture the mean of y
lsm = sm.OLS(y, sm.add_constant(X))
fit = lsm.fit()
fit.summary()

In [None]:
#scatter of the OLS residuals, one point per row; the trailing ';' hides the
#text repr of the returned plot object
fit.resid.plot(style='o', figsize=(12,8));

In [None]:
#compare lasso, ridge and elastic net on the Model 1 features and target
find_best_model(X, y)

### _Model 2: only books with at least 1 week on NYT list_

In [None]:
#keep the rows whose b_repeat flag is 0 (b_repeat is 1 when b_wks_on_list > 1)
#NOTE(review): this selects books with at most 1 week on the list, while the
#section heading says "at least 1 week" - confirm which subset is intended
model2 = books.loc[books['b_repeat'] == 0]

In [None]:
#rebinds the notebook-level X and y (calc_avg_RMSE looks them up as globals)
#to the standardized features and target of the Model 2 subset
X, y = make_target_features(model2, 'b_avg_rating')

In [None]:
#compare lasso, ridge and elastic net on the Model 2 subset
find_best_model(X, y)

### _Model 3: only authors with at least 1 book on NYT list_

In [None]:
#keep the rows whose a_b_repeat flag is 0 (a_b_repeat is 1 when
#a_books_on_list > 1)
#NOTE(review): this selects authors with at most 1 book on the list, while
#the original comment said "only include authors with at least 1 book on
#NYT list" - confirm which subset is intended
model3 = books.loc[books['a_b_repeat'] == 0]

In [None]:
#rebinds the notebook-level X and y (calc_avg_RMSE looks them up as globals)
#to the standardized features and target of the Model 3 subset
X, y = make_target_features(model3, 'b_avg_rating')

In [None]:
#compare lasso, ridge and elastic net on the Model 3 subset
find_best_model(X, y)