In [1]:
#load needed python library
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import StratifiedKFold
from scipy.stats import pearsonr
from sklearn.metrics import matthews_corrcoef,recall_score, precision_score



In [2]:
# Load the raw review export into a DataFrame (path is relative to the notebook's working directory).
dt = pd.read_csv(filepath_or_buffer="all_ter_data.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Drop the rows whose provider_id is null: they cannot be grouped by provider later.
dt = dt[~pd.isnull(dt.provider_id)]
# Keep only the columns used downstream. The explicit .copy() makes `v` an
# independent frame, so the column assignments below modify `v` itself rather
# than a temporary slice (chained `df[col].fillna(..., inplace=True)` triggers
# SettingWithCopy warnings and has no effect under pandas copy-on-write).
v = dt[['provider_id','general_details','juicy_details','ht']].copy()
del dt
# A missing `ht` value is treated as 0, i.e. False after the bool cast.
v['ht'] = v['ht'].fillna(0).astype('bool')
# Missing text becomes the empty string so the two text columns can be concatenated safely.
v['general_details'] = v['general_details'].fillna("")
v['juicy_details'] = v['juicy_details'].fillna("")
# English stop words from NLTK, later handed to the vectorizer.
stopWords = set(stopwords.words('english'))

In [4]:
# Aggregate the reviews per provider_id:
#   pid      -> the provider ids, in groupby order
#   comments -> one document per provider: all of its reviews joined by spaces
#   label    -> True when at least one of the provider's reviews has ht set
pid, comments, label = [], [], []
for name, group in v.groupby("provider_id"):
    pid.append(name)
    label.append(bool(group.ht.any()))
    # Join the two text fields with a space; plain `+` would glue the last word
    # of general_details to the first word of juicy_details and create a bogus token.
    compound = group.general_details + " " + group.juicy_details
    comments.append(" ".join(compound))
# Boolean numpy array, aligned with `pid` and `comments`.
label = np.array(label, dtype=bool)

In [6]:
# Turn the per-provider documents into a bag-of-words count matrix that the
# classifiers can consume. Settings can be changed here; for example,
# ngram_range = (1,2) would also include bi-grams.
vect_count = CountVectorizer(
    stop_words = list(stopWords),
    ngram_range = (1,1),
    binary = False,
)
sp_mat = vect_count.fit_transform(comments)

In [7]:
# Materialise the 4-fold stratified (train, test) index pairs once, so that every
# call to cross_val_boot iterates over the same folds.
# NOTE(review): this is the legacy sklearn.cross_validation calling convention
# (labels passed to the constructor); newer scikit-learn releases provide
# StratifiedKFold through sklearn.model_selection with a different signature.
ss_index = [(train_idx, test_idx) for train_idx, test_idx in StratifiedKFold(label,4)]

In [8]:
### The ensemble models
#create the bootstrap (positive class) and subsampling (negative class) indices for our data
def boot_model(trainX,trainY,repeatn,prop = 1, bootp = None):
    """Draw balanced bootstrap index sets for an ensemble of classifiers.

    Every row of the result holds ``psize`` index labels of positive examples
    plus ``int(psize*prop)`` index labels of negative examples, in random
    order, where ``psize`` is the number of positives in ``trainY``.

    Parameters
    ----------
    trainX : any
        Not used; kept so existing callers do not have to change.
    trainY : pandas.Series
        Training labels; entries equal to 1 (or True) are positives and
        entries equal to 0 (or False) are negatives. The returned values are
        taken from ``trainY.index``, so that index must hold integers.
    repeatn : int
        Number of index sets (rows) to draw.
    prop : float, default 1
        Number of negatives drawn per set, as a multiple of ``psize``.
        Negatives are drawn without replacement.
    bootp : float or None, default None
        If None, positives are drawn with replacement from all positives.
        Otherwise ``int(psize*bootp)`` distinct positives are drawn first and
        the remaining slots are filled by resampling from those.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(repeatn, psize + int(psize*prop))``.

    Raises
    ------
    ValueError
        Propagated from ``numpy.random.choice`` when more items are requested
        without replacement than are available.
    """
    p_index = trainY.index[trainY == 1]
    n_index = trainY.index[trainY == 0]
    psize = p_index.size
    n_neg = int(psize*prop)
    # Integer dtype on purpose: the rows are used as indexers by the caller,
    # and a float array (the np.zeros default) is not a valid index.
    boot_ind = np.zeros((repeatn,psize+n_neg),dtype = int)
    for i in range(repeatn):
        if bootp is None:
            pb = np.random.choice(p_index,psize)
        else:
            pb = np.random.choice(p_index,int(psize*bootp),replace = False)
            # top the positives back up to psize by resampling the chosen subset
            pb = np.concatenate((pb,np.random.choice(pb,psize-pb.size)))
        nb = np.random.choice(n_index,n_neg,replace = False)
        boot_ind[i,:] = np.random.permutation(np.concatenate((pb,nb)))
    return boot_ind
 
    
#cross-validate a bootstrap ensemble of a predefined sklearn model (logistic regression, SVM, naive Bayes, etc.)
#prop is the number of class-0 rows subsampled per bootstrap, as a multiple of the number of class-1 rows
def cross_val_boot(X,Y,n_splits,n_repeat,model, prop = 1, bootp = None):
    """Cross-validate a bootstrap ensemble of ``model`` over the folds in ``ss_index``.

    For every (train, test) pair in the module-level ``ss_index``, ``n_repeat``
    balanced bootstrap samples are drawn with ``boot_model``; ``model`` is
    refitted on each sample and applied to the test fold. Class predictions are
    combined by majority vote (more than ``n_repeat//2`` positive votes) and
    the positive-class probabilities by their maximum over the ensemble.

    Parameters
    ----------
    X : matrix supporting ``X[rows,:]`` indexing
        Feature matrix for all samples.
    Y : pandas.Series
        Labels aligned with the rows of ``X``. The index labels returned by
        ``boot_model`` are used to index ``X`` directly, so ``Y`` is expected
        to carry the default 0..n-1 index.
    n_splits : int
        Not used; the folds come from ``ss_index``. Kept so existing calls
        keep working.
    n_repeat : int
        Number of bootstrap models fitted per fold.
    model : estimator with ``fit``, ``predict`` and ``predict_proba``
        Refitted in place for every bootstrap sample.
    prop, bootp :
        Forwarded to ``boot_model``.

    Returns
    -------
    numpy.ndarray
        One row per fold with the columns
        (accuracy, precision, recall, Matthews correlation, ROC AUC).
    """
    r_result = []
    for train_ind, test_ind in ss_index:
        trainX, testX = X[train_ind,:], X[test_ind,:]
        trainY, testY = Y[train_ind], Y[test_ind]
        # bootp is forwarded too; it used to be accepted but silently ignored
        boot_ind = boot_model(trainX,trainY,n_repeat,prop,bootp)
        n_test = testX.shape[0]
        pre_testY = np.zeros((n_repeat,n_test))
        pre_testY_auc = np.zeros((n_repeat,n_test))
        for i in range(n_repeat):
            # cast so the row is a valid indexer even if it arrives as floats
            bd = np.asarray(boot_ind[i,:]).astype(int)
            bootX, bootY = X[bd,:], trainY[bd]
            try:
                model.fit(bootX,bootY)
            except Exception:
                # show the offending labels, then fail loudly instead of
                # predicting with an unfitted or stale model
                print(bootY)
                raise
            pre_testY[i,:] = model.predict(testX)
            pre_testY_auc[i,:] = model.predict_proba(testX)[:,1]
        # majority vote over the ensemble
        preY = pre_testY.sum(axis = 0) > (n_repeat//2)
        # ensemble score used for the ROC curve: highest positive-class probability
        pre_testY_auc = pre_testY_auc.max(axis = 0)
        # np.mean avoids integer division on Python 2
        accuracy = float(np.mean(preY == np.asarray(testY)))
        fpr, tpr, thresholds = roc_curve(testY, pre_testY_auc)
        s = auc(fpr, tpr)
        r_result.append((accuracy, precision_score(testY,preY),recall_score(testY,preY),matthews_corrcoef(testY,preY),s))
    # one row per fold, whatever n_splits says
    return np.array(r_result)

In [9]:
# Base classifiers for the two ensembles evaluated below, both with default settings.
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
model1, model2 = MultinomialNB(), LogisticRegression()

In [None]:
#evaluation of the naive Bayes ensemble, using 10 models per fold (the parameter can be adjusted)
cross_val_boot(X = sp_mat, Y = pd.Series(label), n_splits = 4, n_repeat = 10, model = model1)

In [None]:
#evaluation of the logistic regression ensemble, using 10 models per fold (the parameter can be adjusted)
cross_val_boot(X = sp_mat, Y = pd.Series(label), n_splits = 4, n_repeat = 10, model = model2)