In [1]:
cd ..

/home/amiyaguchi/cs224w/wikipedia-retention


In [4]:
! head -n2 data/processed/role-features

,role_-1,role_-1,role_0,role_1,role_2,role_3,role_4,role_5,role_6,role_7,role_8,role_9,role_10,role_11,role_12,role_13,role_14,role_15,role_16,role_17,role_18,role_19,role_20,role_21,role_22,role_23,role_24,role_25,role_26,role_27,role_28,role_29,role_30
0,0.0,0.0,3.385962476763866e-06,7.593783790784927e-05,0.0,5.112422163371725e-06,6.250467324660294e-05,2.5913449080068567e-06,3.781486786067123e-06,1.2189651473482883e-06,0.0,0.0,3.8132232413406166e-06,2.830387035209037e-06,1.0047484411328175e-06,0.0,1.1455070825746656e-06,7.518268179048739e-06,0.0,1.9660844809643654e-06,3.240578019407499e-06,0.0,0.0,2.044498092073995e-06,2.5600785666519907e-06,6.661282512800984e-06,7.815884136577666e-06,1.3817765777815095e-06,3.245648475330179e-06,3.5783210994510185e-06,5.420547778876492e-06,3.603760080559225e-06,0.0


In [7]:
# Model for neural net regression on 2007-Q1
import numpy as np
import pandas as pd
import sklearn

# scaler used by the normalisation step at the end of this cell
from sklearn.preprocessing import MinMaxScaler

# load features
base = "data/processed/"
uf_name = base + 'base_features_reg.csv'
af_name = base + 'all_article_features.csv'
rolef_name = base + 'role-features'
cf_name = base + 'community_norm_features.csv'

# process roles: the file's own header row is skipped so that its columns are
# numbered 0..n like the other (header-less) feature files
role_df = pd.read_csv(rolef_name, header=None, skiprows=1)

user_df = pd.read_csv(uf_name, header=None)
# mark rows that are 2007-q1 (column 1 holds the period timestamp)
time_str = '2007-01-01T00:00:00.000-08:00'
time_idx = np.where(user_df[1] == time_str)[0]
y = user_df.values[:, -1].astype(int)  # last column is the target
n_user_rows = len(user_df)
user_df = user_df.drop([1, user_df.columns[-1]], axis=1)  # drop time and y column
article_df = pd.read_csv(af_name, header=None)
community_norm_df = pd.read_csv(cf_name, header=None)

# process joined data (column 0 is the join key in every file)
ua_df = user_df.merge(article_df, on=0)
uac_df = ua_df.merge(community_norm_df, on=0)
X_df = uac_df.merge(role_df, how='left', on=0)
# time_idx and y are row positions in user_df; they are only valid for the
# merged frame if the merges neither dropped nor duplicated any rows.
assert len(X_df) == n_user_rows, (
    "merges changed the row count (%d -> %d); time_idx and y no longer "
    "line up with the feature rows" % (n_user_rows, len(X_df)))
# DataFrame.as_matrix() is deprecated/removed in pandas; .values is equivalent
X = X_df.values
# strip by time_idx
X = X[time_idx, :]
y = y[time_idx]
# set missing roles to 1 (in what used to be role_id)
# NOTE(review): column 40 is assumed to be that role column -- confirm
# against the current feature layout.
X[np.isnan(X[:, 40]), 40] = 1
X = X[:, 1:].astype(float)  # remove user_id
X[np.isnan(X)] = 0  # clear NaNs
# add new column for log(sum(log(textdata)))
# NOTE(review): this computes log(x) + 1, not log(x + 1). The constant +1 is
# removed again by the min-max scaling below, and a zero in column 5 would
# give -inf here. Confirm which form was intended before changing it.
lslt = (np.log(X[:, 5]) + 1)[:, np.newaxis]
X = np.append(X, lslt, axis=1)
# min-max scaling to [0, 1], then centre every column on its mean
scalar = MinMaxScaler(feature_range=(0, 1))
scalar.fit(X)
dmin = scalar.data_min_
dmax = scalar.data_max_
Xnorm = scalar.transform(X)
Xnorm = Xnorm - Xnorm.mean(axis=0)

# NOTE: Above is mostly copy paste from logreg, after here is NN.
# We should probably try to standardize our feature sets to make joining easier

# setup train-test split. might want train-dev-test for final model testing
# could also rebalance (since so many 0 examples)
yl = np.log(y + 1)  # run on log y for smoother fit



In [8]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.metrics import log_loss, r2_score
import matplotlib.pyplot as plt


# train model
def fit_model(model, X, y, test_size=0.2, random_state=115):
    """Fit ``model`` on a training split and score it on the held-out split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``score(X, y)``.
    X, y : features and targets, split together by ``train_test_split``.
    test_size : fraction of rows held out for scoring (default 0.2).
    random_state : seed for the split, so repeated calls give the same split
        (default 115).

    Returns
    -------
    (model, score) : the fitted model and ``model.score`` on the held-out rows.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)  # can have sample weight here
    return model, score

def plot_preds(preds, y, xlab='prediction', ylab='actual contribution'):
    """Scatter predictions against actual values on log-log axes.

    Both inputs are assumed to be on the ``log(value + 1)`` scale used for
    ``yl`` in this notebook; they are mapped back with ``expm1`` before
    plotting. A red reference line marks the diagonal.

    Parameters
    ----------
    preds : array of predictions on the log(value + 1) scale.
    y : array of actual values on the same scale.
    xlab, ylab : axis labels.

    Notes
    -----
    The axes are logarithmic, so points whose back-transformed value is not
    positive (e.g. an actual value of 0) are not drawn.
    """
    # inverse of log(v + 1) is exp(.) - 1, not exp(. - 1)
    plt.plot(np.expm1(preds), np.expm1(y), '.')
    # end the diagonal at the smaller of the two maxima so it stays in range
    mx = np.expm1(min(np.max(preds), np.max(y)))
    plt.plot([1, mx], [1, mx], color='red')
    plt.xscale('log')
    plt.yscale('log')
    plt.xlabel(xlab)
    plt.ylabel(ylab)
    plt.show()

def plot_yl(yl):
    """Draw a histogram of ``yl`` with a log-scaled count axis and show it."""
    axes = plt.gca()
    axes.hist(yl, log=True)
    axes.set_xlabel('log contribution')
    axes.set_ylabel('bin count')
    plt.show()

# NOTE: Below is the full 2-model (class * reg)
# A classifier estimates P(yl > 0), a regressor fitted only on rows with
# yl > 0 estimates yl, and the combined prediction is their product.
alphas = [0.1, 1, 10, 100]
# hidden_layer_sizes is one entry per hidden layer; a single layer needs the
# trailing comma, since `(3)` is just the int 3 rather than a 1-tuple.
nodes = [(3,), (5,), (7,), (9,), (8, 3), (10, 3), (12, 4), (14, 5)]
# quick single-configuration run:
#alphas = [0.1]
#nodes = [(8,3)]
params = {"alpha": alphas, "hidden_layer_sizes": nodes}
theta_idx = yl > 0
yl_theta = yl[theta_idx]
X_theta = Xnorm[theta_idx]  # note: don't rescale since we need to combine models
# run regression model (positive rows only)
MLPR = MLPRegressor(activation='relu', solver='adam', random_state=112358)
GSR = GridSearchCV(MLPR, params, return_train_score=True)
reg_model, reg_score = fit_model(GSR, X_theta, yl_theta)
rm = reg_model.best_estimator_
reg_preds = rm.predict(Xnorm)  # predict on all
# run classification model (all rows, target is the yl > 0 indicator)
MLPC = MLPClassifier(activation='relu', solver='adam', random_state=112358)
GSC = GridSearchCV(MLPC, params, return_train_score=True)
theta = theta_idx.astype(int)
class_model, class_score = fit_model(GSC, Xnorm, theta)
cm = class_model.best_estimator_
class_preds = cm.predict_proba(Xnorm)[:, 1]  # probability of class 1
# now combine
combined_preds = class_preds * reg_preds
# NOTE(review): this R^2 is computed over every row of Xnorm, including the
# rows both models were fitted on, so it is not a held-out estimate.
combined_score = r2_score(yl, combined_preds)

# plot class hist
def plot_class_hist(theta_idx, class_preds):
    """Histogram the classifier's error separately for each true class.

    ``theta_idx`` is a boolean mask selecting the theta=1 rows of
    ``class_preds``. For those rows the error is ``1 - class_preds``; for the
    remaining rows it is ``class_preds`` itself. Counts are on a log axis.
    """
    err_theta1 = 1 - class_preds[theta_idx]
    err_theta0 = class_preds[~theta_idx]
    axes = plt.gca()
    axes.hist([err_theta1, err_theta0],
              label=['theta=1', 'theta=0'],
              log=True,
              bins=20)
    axes.set_xlabel("Classification error")
    axes.set_ylabel("Bin Count")
    axes.legend()
    plt.show()


In [10]:
combined_score

0.29616916808554106