In [None]:
import os
import pickle
import warnings
from math import sqrt
from multiprocessing import Pool

import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# Report each distinct warning only the first time it occurs
# (presumably to keep the output of the repeated model fits readable).
warnings.simplefilter("once")

In [None]:
# Each judgment domain sits next to the pair of dimensions rated within it.
# `domains` and `dims` are index-aligned: they are zipped together when the
# models are run.
_domain_dim_pairs = [
    ('brand',      ('sincere', 'exciting')),
    ('good',       ('hedonic', 'utilitarian')),
    ('trait',      ('masculine', 'feminine')),
    ('food',       ('tasty', 'nutritious')),
    ('occupation', ('significance', 'autonomy')),
    ('risk',       ('dread-inducing', 'unknowable')),
    ('people',     ('warm', 'competent')),
]
domains = [domain_name for domain_name, _ in _domain_dim_pairs]
dims = [dimension_pair for _, dimension_pair in _domain_dim_pairs]

In [None]:
def _load_pickle(path):
    """Unpickle and return the object stored in the file at `path`."""
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Predictors (looked up by domain) and ratings (looked up by dimension).
# Exactly one pair of files should be active at a time.
dict_of_Xs = _load_pickle("data/dict_of_Xs.pickle")
dict_of_ys = _load_pickle("data/dict_of_ys.pickle")

# Alternative input files ("glove"):
# dict_of_Xs = _load_pickle("data/dict_of_glove_Xs.pickle")
# dict_of_ys = _load_pickle("data/dict_of_glove_ys.pickle")

# Alternative input files ("swow"):
# dict_of_Xs = _load_pickle("data/dict_of_swow_Xs.pickle")
# dict_of_ys = _load_pickle("data/dict_of_swow_ys.pickle")

In [None]:
# Peek at the first rows of the predictor table stored for the 'brand' domain.
dict_of_Xs['brand'].head()

In [None]:
# Display the ratings stored for the 'sincere' dimension.
dict_of_ys['sincere']

# Check, for each dimension, how many of the 200 items were not rated

In [None]:
# One row per rated dimension: how many rows its ratings table falls short of 200
# (assumes 200 is the number of items per domain — TODO confirm).
mean_missing = pd.DataFrame({
    'dim': list(dict_of_ys.keys()),
    'missing_items': [200 - ratings.shape[0] for ratings in dict_of_ys.values()],
})
mean_missing

In [None]:
# Average number of unrated items across dimensions.
# `numeric_only=True` keeps the string-valued 'dim' column out of the reduction;
# without it, DataFrame.mean() raises a TypeError on pandas >= 2.0.
mean_missing.mean(numeric_only=True)

From the preregistration:

> We will use the 300-dimensional word2vec vectors for each judgment target to predict the mean judgment ratings with ridge, lasso, support vector (SVR, with radial basis function, polynomial, and sigmoid kernels), and k-nearest neighbors (KNN) regression (all techniques as implemented in Scikit-Learn).

> For each judgment target, model fit will be assessed by training each model on a random 90% of the data, computing r-squared on the remaining 10%, and repeating this procedure 1000 times, to obtain an average out-of-sample r-squared on different test-train splits. For support vector, lasso, and ridge regressions, we will perform the above procedure once for each of the following values of the hyperparameter c: {10^-2, 10^-1, 10^0, 10^1, 10^2, 10^3, 10^4, 10^5, 10^6, 10^7}. For KNN regression, we will perform the above procedure once for each of the following values of the hyperparameter k: {1,2,3,...10}.

In [None]:
# Estimator classes to evaluate, each paired with the short name used in the
# output file names. SVR is built without a `kernel` argument here, so it uses
# scikit-learn's default kernel.
# For an OLS baseline, prepend:  (LinearRegression, 'ols'),
model_list = [
    (SVR, 'svr'),
    (Ridge, 'ridge'),
    (Lasso, 'lasso'),
    (KNeighborsRegressor, 'knn'),
]

# One list of constructor-keyword dicts per entry of `model_list`, in the same order.
# For the OLS baseline, prepend:  [{'fit_intercept': True}],
arg_dicts_by_model = [
    [{'C': 10**exponent} for exponent in range(-2, 8)],
    [{'alpha': 10**exponent} for exponent in range(-2, 8)],
    [{'alpha': 10**exponent} for exponent in range(-2, 8)],
    # [{'alpha': 10**x} for x in np.linspace(-2, 0, 10)],     # for additional exploration of lasso
    # [{'alpha': 10**x} for x in np.linspace(-.9, -.4, 10)],  # for additional exploration of lasso
    [{'n_neighbors': n_neighbors} for n_neighbors in range(1, 11)],
]

# If you just want to run ridge:
# model_list = [(Ridge, 'ridge')]

# arg_dicts_by_model = [
#     [{'alpha': 10}]
# ]


# SVR with the polynomial and sigmoid kernels (these were not part of the initial run):

# model_list = [(SVR, 'svr'),
#               (SVR, 'svr')]

# arg_dicts_by_model = [
#     [{'kernel': 'poly',    'C': 10**x} for x in range(-2, 8)],
#     [{'kernel': 'sigmoid', 'C': 10**x} for x in range(-2, 8)],
# ]

# kernels = ['poly', 'sigmoid']

In [None]:
# Number of random train/test splits averaged for every model configuration.
# NOTE(review): the preregistration text quoted above specifies 1000 repetitions —
# confirm that 100 is intentional.
n_test_train_splits = 100 # TO TEST THIS CODE MORE QUICKLY, SET THIS VALUE CLOSER TO 5 OR 10

In [None]:
# Flatten the dimension pairs into a single list, preserving their order;
# these names label the results tables.
dim_names_flattened = [dim_name for pair in dims for dim_name in pair]

In [None]:
def rmse_score(y_true, y_pred):
    """Return the root-mean-squared error between `y_true` and `y_pred`."""
    mse = mean_squared_error(y_true, y_pred)
    return sqrt(mse)

In [None]:
def many_test_train_splits(arg_dict):
    """Average out-of-sample R^2 and RMSE of one model configuration over repeated splits.

    The function takes a single argument so it can be handed to ``Pool.map``.
    Everything else is read from notebook-level globals set by the calling cell:

    * ``model`` -- the estimator class to instantiate,
    * ``vecs_and_judgment`` -- a DataFrame whose last column is ``'judgment'``
      (the target) and whose remaining columns are the predictors,
    * ``n_test_train_splits`` -- how many random 90/10 splits to average over.

    Parameters
    ----------
    arg_dict : dict
        Keyword arguments used to construct the estimator, e.g. ``{'C': 0.01}``.

    Returns
    -------
    tuple
        ``(mean_rsquared, mean_rmse)`` computed on the held-out 10% and averaged
        over all splits.
    """
    # Draw splits from a generator private to this call, seeded from OS entropy.
    # Forked pool workers start with a copy of the parent's *global* NumPy RNG
    # state, so relying on the global state (random_state=None) makes every
    # worker replay the same sequence of "random" splits.
    rng = np.random.RandomState()

    # train_test_split shuffles the rows itself and never modifies its inputs, so
    # no defensive copy or manual reshuffle of the shared frame is needed and the
    # predictor/target views can be taken once, outside the loop.
    X = vecs_and_judgment.iloc[:, :-1]
    y = vecs_and_judgment['judgment']

    rsquareds = []
    rmses = []
    for _ in range(n_test_train_splits):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.1, random_state=rng
        )
        regression = model(**arg_dict)  # e.g. SVR(C=.01)
        regression.fit(X=X_train, y=y_train)
        y_pred = regression.predict(X=X_test)

        rsquareds.append(r2_score(y_test, y_pred))
        rmses.append(rmse_score(y_test, y_pred))

    mean_rsquared = np.mean(rsquareds)
    mean_rmses = np.mean(rmses)
    return mean_rsquared, mean_rmses

I forgot to include the last two SVR kernels (polynomial and sigmoid) when I initially wrote this code, and adding them afterwards required saving the result files under different names. So, to run SVR-poly and SVR-sigmoid, swap which `for` line and which `to_csv` lines are commented out in the cell below, and use the commented-out `model_list`, `arg_dicts_by_model`, and `kernels` definitions from the model-setup cell above.

In [None]:
# Root folder for the score tables. Switch to the matching line when the GloVe
# or SWOW input files are loaded instead.
results_root = 'results/preregistered_models'
# results_root = 'results/preregistered_models_glove'
# results_root = 'results/preregistered_models_swow'

# Create the output folders up front, so a missing directory is caught before
# the long-running fits rather than when their results are written.
for metric_name in ('rsquared', 'rmse'):
    os.makedirs(f'{results_root}/{metric_name}', exist_ok=True)


def _metric_table(scores_by_dim, position, arg_dicts):
    """Build a (hyperparameter setting x dimension) table of one metric.

    `scores_by_dim` holds, for each dimension, one `(mean_rsquared, mean_rmse)`
    tuple per hyperparameter setting; `position` selects which element of each
    tuple to keep (0 = R^2, 1 = RMSE).
    """
    return pd.DataFrame(
        data=[[scores[position] for scores in dim_scores] for dim_scores in scores_by_dim],
        index=dim_names_flattened,
        columns=arg_dicts,
    ).T


# `model` and `vecs_and_judgment` must keep exactly these names:
# many_test_train_splits reads them as globals inside the worker processes.
# To run SVR-poly / SVR-sigmoid, use the commented-out loop header and the
# commented-out `to_csv` lines (which add the kernel to the file name) instead.
for (model, model_name), list_of_arg_dicts in zip(model_list, arg_dicts_by_model):
# for (model, model_name), list_of_arg_dicts, kernel in zip(model_list, arg_dicts_by_model, kernels):
    print(model_name)
    scores_by_dim = []
    for domain, dim_pair in zip(domains, dims):
        for dim in dim_pair:
            print('\t', dim)
            X = dict_of_Xs[domain]
            y = dict_of_ys[dim]
            vecs_and_judgment = pd.concat([X, y], axis=1)
            # A fresh pool is created for every dimension, after
            # `vecs_and_judgment` has been assigned; this relies on the workers
            # inheriting the notebook's globals (as with the fork start method).
            with Pool() as p:
                scores_by_hyperparam = p.map(many_test_train_splits, list_of_arg_dicts)
            scores_by_dim.append(scores_by_hyperparam)

    # Table of (mean_rsquared, mean_rmse) tuples: rows are hyperparameter
    # settings, columns are dimensions.
    model_results = pd.DataFrame(data=scores_by_dim, index=dim_names_flattened, columns=list_of_arg_dicts).T
    # The per-metric tables are built directly from the raw scores, avoiding
    # DataFrame.applymap, which is deprecated since pandas 2.1.
    rsquared_df = _metric_table(scores_by_dim, 0, list_of_arg_dicts)
    rmse_df     = _metric_table(scores_by_dim, 1, list_of_arg_dicts)

    rsquared_df.to_csv(f'{results_root}/rsquared/{model_name}_all_judgments.csv', float_format='%.2f')
    rmse_df.to_csv(    f'{results_root}/rmse/{model_name}_all_judgments.csv',     float_format='%.2f')
#     rsquared_df.to_csv(f'{results_root}/rsquared/{model_name}_{kernel}_all_judgments.csv', float_format='%.2f')
#     rmse_df.to_csv(    f'{results_root}/rmse/{model_name}_{kernel}_all_judgments.csv',     float_format='%.2f')