In [2]:
cd ..

/Users/amiyaguchi/wikipedia-retention


In [26]:
# Model for neural net regression on all 
import numpy as np
import pandas as pd
import sklearn

# load features
# All inputs are read relative to the current working directory.
base = "data/processed/"

# The files are read with header=None, so every frame gets integer column
# labels (0, 1, 2, ...). Column 0 is used as the join key in later cells.
user_df = pd.read_csv(base + 'base_features_reg.csv', header=None)
article_df = pd.read_csv(base + 'all_article_features.csv', header=None)
#role_df = pd.read_csv(base + 'roles/2007-1-nmf-G.csv', header=None, skiprows=1)
role_df = pd.read_csv(base + 'roles/2007-1-averaged.csv', header=None, sep="\t")

# Boolean row mask: rows whose column 1 (a string column) starts with "2007-01".
idx = user_df[1].str.startswith("2007-01")
# Target vector: the last column of the user table, restricted to the masked rows.
y = user_df.loc[idx, user_df.columns[-1]].values
# Remove column 1 and the target column so only the key and features remain.
user_df = user_df.drop(labels=[1, user_df.columns[-1]], axis=1)

In [27]:
# process joined data
# Left-join the article table and then the role table onto the user table
# (key: column 0), replace missing values from unmatched rows with 0.0, and
# keep only the rows selected by the boolean mask `idx`.
X = pd.merge(
    pd.merge(user_df, article_df, how='left', on=0),
    role_df,
    how='left',
    on=0,
).fillna(0.0)[idx].values

In [29]:
# only role features
# (The original label read "only articles", but the table joined here is
# role_df.) Left-join the role table onto the bare key column (column 0) of
# the user table, zero-fill unmatched rows, and keep the rows selected by `idx`.
# This overwrites the X built from the full join.
X = pd.merge(
    user_df.iloc[:, [0]],
    role_df,
    how='left',
    on=0,
).fillna(0.0)[idx].values

In [35]:
from sklearn.preprocessing import MinMaxScaler

# Drop the first column (user_id) and cast the remaining features to float.
# NOTE: this rebinds X, so re-running this cell strips one more column;
# rebuild X with the cell that creates it before re-running.
X = X[:, 1:].astype(float) # remove user_id
X[np.isnan(X)] = 0 # clear NaNs
# add new column for log(sum(log(textdata)))
#lslt = np.array([np.log(X[:,5])+1]).T
#X = np.append(X, lslt, 1)

# min-max scaling

# Scale each feature column to [0, 1] using statistics of the full matrix,
# keep the per-column min/max that were used, then centre every column on
# its mean.
scalar = MinMaxScaler(feature_range=(0,1))
Xnorm = scalar.fit_transform(X)
dmin = scalar.data_min_
dmax = scalar.data_max_
Xnorm -= Xnorm.mean(axis=0)

# Regress on log(y + 1) rather than y for a smoother fit; zeros map to 0.
yl = np.log(1 + y)

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.metrics import log_loss, r2_score
import matplotlib.pyplot as plt


# train model
def fit_model(model, X, y, test_size=0.2, random_state=115):
    """Fit `model` on a random train split and score it on the held-out rest.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``score(X, y)``
        In this notebook a ``GridSearchCV`` wrapping an MLP estimator.
    X, y : array-like
        Feature matrix and target values, split row-wise together.
    test_size : float or int, default 0.2
        Passed to ``train_test_split``; share (or count) of rows held out.
    random_state : int, default 115
        Seed for the split, so repeated calls use the same partition.

    Returns
    -------
    (model, score)
        The same, now fitted, ``model`` object and the value of
        ``model.score`` on the held-out rows.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test) # can have sample weight here
    return model, score

def plot_preds(preds, y, xlab='prediction', ylab='actual contribution'):
    """Scatter predictions against actual values on log-log axes.

    Both inputs are expected on the ``log(contribution + 1)`` scale used for
    the targets in this notebook. They are mapped back with ``exp(v)``, i.e.
    plotted as ``contribution + 1``, so a zero contribution sits at 1 and
    stays visible on the logarithmic axes. The red ``y = x`` reference line
    runs from 1 to ``exp(min(max(preds), max(y)))`` in the same coordinates.

    The previous ``exp(v - 1)`` was not the inverse of ``log(v + 1)``: it
    scaled every point by 1/e, away from the reference line's coordinates.

    Parameters
    ----------
    preds, y : array-like of float
        Predicted and actual values on the log scale.
    xlab, ylab : str
        Axis labels.
    """
    preds = np.asarray(preds)
    y = np.asarray(y)
    plt.plot(np.exp(preds), np.exp(y), '.')
    # Upper end of the reference line: the smaller of the two maxima.
    mx = np.exp(min(np.max(preds), np.max(y)))
    plt.plot([1, mx], [1, mx], color='red')
    plt.xscale('log')
    plt.yscale('log')
    plt.xlabel(xlab)
    plt.ylabel(ylab)
    plt.show()

def plot_yl(yl):
    """Show a histogram of `yl` with a logarithmic count axis.

    `yl` is the array of values to bin (in this notebook, the log-transformed
    contribution); the figure is displayed immediately.
    """
    axes = plt.gca()
    axes.hist(yl, log=True)
    axes.set_xlabel('log contribution')
    axes.set_ylabel('bin count')
    plt.show()

# NOTE: Below is the full 2-model (class * reg): a classifier estimates the
# probability that yl > 0, a regressor fitted only on rows with yl > 0
# estimates yl, and the final prediction is the product of the two.

# Hyper-parameter grid shared by both grid searches.
alphas = [0.1, 1, 10, 100]
# Every entry is a tuple of layer widths; "(3)" without the trailing comma is
# just the int 3, which made the grid a mix of ints and tuples.
nodes = [(3,), (5,), (7,), (9,), (8, 3), (10, 3), (12, 4), (14, 5)]
#alphas = [0.1]
# nodes = [(8,3)]
params = {"alpha" : alphas, "hidden_layer_sizes" : nodes}
theta_idx = yl > 0  # boolean mask: rows with a non-zero target
yl_theta = yl[theta_idx]
X_theta = Xnorm[theta_idx] # note: don't rescale since we need to combine models

# run regression model
MLPR = MLPRegressor(activation = 'relu', solver = 'adam', random_state = 112358)
GSR = GridSearchCV(MLPR, params, return_train_score = True, n_jobs=4)
# reg_score is measured on the held-out split of the yl > 0 rows only.
reg_model, reg_score = fit_model(GSR, X_theta, yl_theta)
rm = reg_model.best_estimator_
reg_preds = rm.predict(Xnorm) # predict on all

# run classification model
MLPC = MLPClassifier(activation='relu', solver='adam', random_state = 112358)
GSC = GridSearchCV(MLPC, params, return_train_score = True, n_jobs=4)
theta = theta_idx.astype(int)  # 1 where yl > 0, else 0
class_model, class_score = fit_model(GSC, Xnorm, theta)
cm = class_model.best_estimator_
class_preds = cm.predict_proba(Xnorm)[:,1]  # probability of class 1 (yl > 0)

# now combine
combined_preds = class_preds * reg_preds
# Caveat: this R^2 is computed over every row, including the rows both models
# were trained on, so it is not a purely held-out score like the two above.
combined_score = r2_score(yl, combined_preds)

# plot class hist
def plot_class_hist(theta_idx, class_preds):
    """Histogram of classification error, split by true class.

    `theta_idx` is a boolean mask marking the rows whose true class is 1 and
    `class_preds` holds the predicted probability of class 1 for every row.
    The error is ``1 - p`` for true-class-1 rows and ``p`` for the others.
    Both groups are drawn in 20 bins with a logarithmic count axis.
    """
    err_when_one = 1 - class_preds[theta_idx]
    err_when_zero = class_preds[~theta_idx]
    axes = plt.gca()
    axes.hist(
        [err_when_one, err_when_zero],
        label=['theta=1', 'theta=0'],
        log=True,
        bins=20,
    )
    axes.set_xlabel("Classification error")
    axes.set_ylabel("Bin Count")
    axes.legend()
    plt.show()




In [32]:
# without roles
# Prints three numbers in this order:
#   reg_score      - held-out score of the regressor (rows with yl > 0 only)
#   class_score    - held-out score of the yl > 0 classifier
#   combined_score - R^2 of class_preds * reg_preds against yl on all rows
# The commented lines below record earlier runs ("full" data vs. 2007 only).
# NOTE(review): this cell's execution count (32) predates the training cell
# (38), so the stored output presumably comes from an earlier run -- confirm.
print(reg_score, class_score, combined_score)
# full: 0.3644307322834125 0.8623086114977545 0.37620858386813083
# 2007: 0.34289857834953696 0.8675161538113958 0.29584123721214706

0.34289857834953696 0.8675161538113958 0.29584123721214706


In [39]:
# with roles
# Prints three numbers in this order:
#   reg_score      - held-out score of the regressor (rows with yl > 0 only)
#   class_score    - held-out score of the yl > 0 classifier
#   combined_score - R^2 of class_preds * reg_preds against yl on all rows
# The commented lines below record earlier runs with different role inputs.
# NOTE(review): the stored output of this cell (about -0.0003, 0.849, -6e-06)
# matches none of the runs listed; by execution order X was last built by the
# role-features-only cell (In [29]) -- confirm which feature set produced it.
print(reg_score, class_score, combined_score)
# full: 0.589164057706867 0.8864176537017097 0.5713924962545209
# 2007 averaged roles: 0.3413120348830906 0.8683520853101984 0.2988905580821656
# 2007 weighted total contribution: 0.34328552018276004 0.8688943111472595 0.29611516400833926

-0.0002852585513886119 0.8491256608377389 -6.2388374810762315e-06
