warm start parameter search supported
 -add functions for parameter search
 -add init_sol in struct parameter
 -modify function train to support initial solution for L2R_LR and L2R_L2LOSS_SVC
 -interface and train.c update: option -C is added
boyu committed Jun 29, 2015
1 parent 25fb1ce commit 621b867
Showing 12 changed files with 422 additions and 35 deletions.
2 changes: 1 addition & 1 deletion Makefile
@@ -2,7 +2,7 @@ CXX ?= g++
CC ?= gcc
CFLAGS = -Wall -Wconversion -O3 -fPIC
LIBS = blas/blas.a
SHVER = 2
SHVER = 3
OS = $(shell uname)
#LIBS = -lblas

27 changes: 27 additions & 0 deletions README
@@ -131,11 +131,16 @@ options:
-B bias : if bias >= 0, instance x becomes [x; bias]; if < 0, no bias term added (default -1)
-wi weight: weights adjust the parameter C of different classes (see README for details)
-v n: n-fold cross validation mode
-C : find parameter C (only for -s 0 and 2)
-q : quiet mode (no outputs)

Option -v randomly splits the data into n parts and calculates cross
validation accuracy on them.

Option -C conducts cross validation under different C values and finds
the best one. This option is supported only by -s 0 and -s 2. If
the solver is not specified, -s 2 is used.

Formulations:

For L2-regularized logistic regression (-s 0), we solve
@@ -245,6 +250,12 @@ Do five-fold cross-validation using L2-loss svm.
Use a smaller stopping tolerance 0.001 than the default
0.1 if you want more accurate solutions.

> train -C -s 0 data_file

Conduct cross validation many times with logistic regression
and find the parameter C that achieves the best cross
validation accuracy.
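
After the search finishes, the best C reported on screen can be used
for a final training run on the same data. A minimal sketch (the value
2.0 below is only a placeholder for whatever value -C reports):

> train -c 2.0 -s 0 data_file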

> train -c 10 -w1 2 -w2 5 -w3 2 four_class_data_file

Train four classifiers:
@@ -407,6 +418,22 @@ Library Usage

The format of prob is the same as that for train().

- Function: void find_parameter_C(const struct problem *prob,
const struct parameter *param, int nr_fold, double start_C,
double max_C, double *best_C, double *best_rate);

This function is similar to cross_validation. However, instead of
conducting cross validation under a specified parameter C, it
conducts cross validation many times under parameters C = start_C,
2*start_C, 4*start_C, 8*start_C, ..., and finds the one with
the highest cross validation accuracy.

If start_C <= 0, then this procedure calculates a small enough C
for prob as the start_C. The procedure stops when the models of
all folds become stable or C reaches max_C. The best C and the
corresponding accuracy are assigned to *best_C and *best_rate,
respectively.
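
A minimal C usage sketch (assuming prob has already been read in, as
for train(); the solver, tolerance, fold count, and max_C values below
are illustrative choices, not defaults taken from this function):

    struct parameter param;
    param.solver_type = L2R_LR;
    param.eps = 0.01;
    param.C = 1;              /* overwritten during the search */
    param.nr_weight = 0;
    param.weight_label = NULL;
    param.weight = NULL;
    param.p = 0.1;
    param.init_sol = NULL;    /* the search manages warm starts itself */

    double best_C, best_rate;
    /* start_C <= 0 lets the routine pick a small enough starting C */
    find_parameter_C(&prob, &param, 5, -1, 1024, &best_C, &best_rate);

    param.C = best_C;         /* retrain on the full data with the chosen C */
    struct model *model_ = train(&prob, &param);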

- Function: double predict(const model *model_, const feature_node *x);

For a classification model, the predicted class for x is returned.
209 changes: 203 additions & 6 deletions linear.cpp
@@ -27,6 +27,7 @@ static void print_string_stdout(const char *s)
fputs(s,stdout);
fflush(stdout);
}
static void print_null(const char *s) {}

static void (*liblinear_print_string) (const char *) = &print_string_stdout;

@@ -2180,14 +2181,18 @@ static void group_classes(const problem *prob, int *nr_class_ret, int **label_re

static void train_one(const problem *prob, const parameter *param, double *w, double Cp, double Cn)
{
double eps=param->eps;
//inner and outer tolerances for TRON
double eps = param->eps;
double eps_cg = 0.1;
if(param->init_sol != NULL)
eps_cg = 0.5;

int pos = 0;
int neg = 0;
for(int i=0;i<prob->l;i++)
if(prob->y[i] > 0)
pos++;
neg = prob->l - pos;

double primal_solver_tol = eps*max(min(pos,neg), 1)/prob->l;

function *fun_obj=NULL;
@@ -2204,7 +2209,7 @@ static void train_one(const problem *prob, const parameter *param, double *w, do
C[i] = Cn;
}
fun_obj=new l2r_lr_fun(prob, C);
TRON tron_obj(fun_obj, primal_solver_tol);
TRON tron_obj(fun_obj, primal_solver_tol, eps_cg);
tron_obj.set_print_string(liblinear_print_string);
tron_obj.tron(w);
delete fun_obj;
@@ -2222,7 +2227,7 @@ static void train_one(const problem *prob, const parameter *param, double *w, do
C[i] = Cn;
}
fun_obj=new l2r_l2_svc_fun(prob, C);
TRON tron_obj(fun_obj, primal_solver_tol);
TRON tron_obj(fun_obj, primal_solver_tol, eps_cg);
tron_obj.set_print_string(liblinear_print_string);
tron_obj.tron(w);
delete fun_obj;
@@ -2287,6 +2292,36 @@ static void train_one(const problem *prob, const parameter *param, double *w, do
}
}

// Calculate the initial C for parameter selection
static double calc_start_C(const problem *prob, const parameter *param)
{
int i;
double xTx,max_xTx;
max_xTx = 0;
for(i=0; i<prob->l; i++)
{
xTx = 0;
feature_node *xi=prob->x[i];
while(xi->index != -1)
{
double val = xi->value;
xTx += val*val;
xi++;
}
if(xTx > max_xTx)
max_xTx = xTx;
}

double min_C = 1.0;
if(param->solver_type == L2R_LR)
min_C = 1.0 / (prob->l * max_xTx);
else if(param->solver_type == L2R_L2LOSS_SVC)
min_C = 1.0 / (2 * prob->l * max_xTx);

return pow( 2, floor(log(min_C) / log(2.0)) );
}


//
// Interface functions
//
@@ -2310,7 +2345,7 @@ model* train(const problem *prob, const parameter *param)
model_->w = Malloc(double, w_size);
model_->nr_class = 2;
model_->label = NULL;
train_one(prob, param, &model_->w[0], 0, 0);
train_one(prob, param, model_->w, 0, 0);
}
else
{
@@ -2380,8 +2415,15 @@ model* train(const problem *prob, const parameter *param)
sub_prob.y[k] = +1;
for(; k<sub_prob.l; k++)
sub_prob.y[k] = -1;

if(param->init_sol != NULL)
for(i=0;i<w_size;i++)
model_->w[i] = param->init_sol[i];
else
for(i=0;i<w_size;i++)
model_->w[i] = 0;

train_one(&sub_prob, param, &model_->w[0], weighted_C[0], weighted_C[1]);
train_one(&sub_prob, param, model_->w, weighted_C[0], weighted_C[1]);
}
else
{
@@ -2400,6 +2442,13 @@
for(; k<sub_prob.l; k++)
sub_prob.y[k] = -1;

if(param->init_sol != NULL)
for(j=0;j<w_size;j++)
w[j] = param->init_sol[j*nr_class+i];
else
for(j=0;j<w_size;j++)
w[j] = 0;

train_one(&sub_prob, param, w, weighted_C[i], param->C);

for(int j=0;j<w_size;j++)
@@ -2480,6 +2529,148 @@ void cross_validation(const problem *prob, const parameter *param, int nr_fold,
free(perm);
}

void find_parameter_C(const problem *prob, const parameter *param, int nr_fold, double start_C, double max_C, double *best_C, double *best_rate)
{
// variables for CV
int i;
int *fold_start;
int l = prob->l;
int *perm = Malloc(int, l);
double *target = Malloc(double, prob->l);
struct problem *subprob = Malloc(problem,nr_fold);

// variables for warm start
double ratio = 2;
double **prev_w = Malloc(double*, nr_fold);
for(i = 0; i < nr_fold; i++)
prev_w[i] = NULL;
int num_unchanged_w = 0;
struct parameter param1 = *param;
void (*default_print_string) (const char *) = liblinear_print_string;

if (nr_fold > l)
{
nr_fold = l;
fprintf(stderr,"WARNING: # folds > # data. Will use # folds = # data instead (i.e., leave-one-out cross validation)\n");
}
fold_start = Malloc(int,nr_fold+1);
for(i=0;i<l;i++) perm[i]=i;
for(i=0;i<l;i++)
{
int j = i+rand()%(l-i);
swap(perm[i],perm[j]);
}
for(i=0;i<=nr_fold;i++)
fold_start[i]=i*l/nr_fold;

for(i=0;i<nr_fold;i++)
{
int begin = fold_start[i];
int end = fold_start[i+1];
int j,k;

subprob[i].bias = prob->bias;
subprob[i].n = prob->n;
subprob[i].l = l-(end-begin);
subprob[i].x = Malloc(struct feature_node*,subprob[i].l);
subprob[i].y = Malloc(double,subprob[i].l);

k=0;
for(j=0;j<begin;j++)
{
subprob[i].x[k] = prob->x[perm[j]];
subprob[i].y[k] = prob->y[perm[j]];
++k;
}
for(j=end;j<l;j++)
{
subprob[i].x[k] = prob->x[perm[j]];
subprob[i].y[k] = prob->y[perm[j]];
++k;
}

}

*best_rate = 0;
if(start_C <= 0)
start_C = calc_start_C(prob,param);
param1.C = start_C;

while(param1.C <= max_C)
{
//Output disabled for running CV at a particular C
set_print_string_function(&print_null);

for(i=0; i<nr_fold; i++)
{
int j;
int begin = fold_start[i];
int end = fold_start[i+1];

param1.init_sol = prev_w[i];
struct model *submodel = train(&subprob[i],&param1);

int total_w_size;
if(submodel->nr_class == 2)
total_w_size = subprob[i].n;
else
total_w_size = subprob[i].n * submodel->nr_class;

if(prev_w[i] != NULL && num_unchanged_w >= 0)
{
double norm_w_diff = 0;
for(j=0; j<total_w_size; j++)
{
norm_w_diff += (submodel->w[j] - prev_w[i][j])*(submodel->w[j] - prev_w[i][j]);
prev_w[i][j] = submodel->w[j];
}
norm_w_diff = sqrt(norm_w_diff);

if(norm_w_diff > 1e-15)
num_unchanged_w = -1;
}
else
{
prev_w[i] = Malloc(double, total_w_size);
for(j=0; j<total_w_size; j++)
prev_w[i][j] = submodel->w[j];
}

for(j=begin; j<end; j++)
target[perm[j]] = predict(submodel,prob->x[perm[j]]);

free_and_destroy_model(&submodel);
}
set_print_string_function(default_print_string);

int total_correct = 0;
for(i=0; i<prob->l; i++)
if(target[i] == prob->y[i])
++total_correct;
double current_rate = (double)total_correct/prob->l;
if(current_rate > *best_rate)
{
*best_C = param1.C;
*best_rate = current_rate;
}

info("log2c=%7.2f\trate=%g\n",log(param1.C)/log(2.0),100.0*current_rate);
num_unchanged_w++;
if(num_unchanged_w == 3)
break;
param1.C = param1.C*ratio;
}

if(param1.C > max_C && max_C > start_C)
info("warning: maximum C reached.\n");
free(fold_start);
free(perm);
free(target);
for(i=0; i<nr_fold; i++)
free(prev_w[i]);
free(prev_w);
}

double predict_values(const struct model *model_, const struct feature_node *x, double *dec_values)
{
int idx;
@@ -2839,6 +3030,8 @@ void destroy_param(parameter* param)
free(param->weight_label);
if(param->weight != NULL)
free(param->weight);
if(param->init_sol != NULL)
free(param->init_sol);
}

const char *check_parameter(const problem *prob, const parameter *param)
@@ -2865,6 +3058,10 @@ const char *check_parameter(const problem *prob, const parameter *param)
&& param->solver_type != L2R_L1LOSS_SVR_DUAL)
return "unknown solver type";

if(param->init_sol != NULL
&& param->solver_type != L2R_LR && param->solver_type != L2R_L2LOSS_SVC)
return "Initial-solution specification supported only for solver L2R_LR and L2R_L2LOSS_SVC";

return NULL;
}

2 changes: 2 additions & 0 deletions linear.h
@@ -32,6 +32,7 @@ struct parameter
int *weight_label;
double* weight;
double p;
double *init_sol;
};

struct model
@@ -46,6 +47,7 @@

struct model* train(const struct problem *prob, const struct parameter *param);
void cross_validation(const struct problem *prob, const struct parameter *param, int nr_fold, double *target);
void find_parameter_C(const struct problem *prob, const struct parameter *param, int nr_fold, double start_C, double max_C, double *best_C, double *best_rate);

double predict_values(const struct model *model_, const struct feature_node *x, double* dec_values);
double predict(const struct model *model_, const struct feature_node *x);
12 changes: 11 additions & 1 deletion matlab/README
@@ -131,7 +131,12 @@ nr_feature, bias, Label, w]:

If the '-v' option is specified, cross validation is conducted and the
returned model is just a scalar: cross-validation accuracy for
classification and mean-squared error for regression.
classification and mean-squared error for regression. If the '-C' option
is specified, the best parameter C is found by cross validation. The
returned model is a two-dimensional vector, where the first value is
the best C and the second value is the corresponding cross-validation
accuracy. The parameter selection utility is supported only by -s 0
and -s 2.

Result of Prediction
====================
@@ -184,6 +189,11 @@ For probability estimates, you need '-b 1' only in the testing phase:

matlab> [predict_label, accuracy, prob_estimates] = predict(heart_scale_label, heart_scale_inst, model, '-b 1');

Use the best parameter to train (only supported by -s 0 and -s 2):

matlab> best = train(heart_scale_label, heart_scale_inst, '-C -s 0');
matlab> model = train(heart_scale_label, heart_scale_inst, sprintf('-c %f -s 0', best(1))); % use the same solver: -s 0

Additional Information
======================

