Using the Smile framework for text classification and additional preprocessing
Text Classify

Build Status

An out-of-the-box text classification library in Java, built on top of the Smile framework.

Sentiment analysis

      // Example: build a text pipeline, cross validate a binary text classifier,
      // then train it on one split and measure accuracy on another.
      // NOTE(review): several statements in this excerpt are truncated (missing
      // right-hand sides, call arguments, print calls, loop bodies and closing
      // braces). Restore them from the project sources before using this snippet.
      //
      // Load the text features and their labels from the training file.
      TrainingData data = new TrainingData("data/training.txt");
      ArrayList<TextFeature> textFeatures = data.getTextFeatures();
      ArrayList<Integer> labels = data.getLabels();
      // NOTE(review): presumably shuffles both lists in step so that features and
      // labels stay aligned — confirm against FeatureUtils.
      FeatureUtils.shuffle(textFeatures, labels);

      // Tokenizer and token processor used for preprocessing.
      // NOTE(review): the meaning of the arguments (3, 3, 25) is not visible here.
      NormalizingTextTokenizer normalizingTextTokenizer = new NormalizingTextTokenizer();
      StemmingNGrammProcessor filteringProcessor = new StemmingNGrammProcessor(3, 3, 25);

      // NOTE(review): the expression that builds the corpus is missing; it
      // presumably uses the tokenizer and filteringProcessor above — confirm.
      Set<Token> corpus =;
      // The pipeline combines a bag-of-words feature extractor over the corpus with the tokenizer.
      DefaultTextPipeline pipeline = new DefaultTextPipeline(new BagOfWordsFeatureExtractor(corpus), normalizingTextTokenizer);

      // NOTE(review): the meaning of the two 0.5 arguments is not visible here.
      BinaryTextClassifierTrainer binaryTextClassifierTrainer = new BinaryTextClassifierTrainer(0.5, 0.5, pipeline);

      // cross validate
      // NOTE(review): the remaining crossValidate arguments and the statement that
      // prints the confusion matrix are truncated on the next line.
      BinaryUtils.BinaryConfusionMatrixMeasure confusionMatrixMeasure = binaryTextClassifierTrainer
              .crossValidate(textFeatures.toArray(new TextFeature[0]),;" confusion matrix\n" + confusionMatrixMeasure);

      // train and validate
      // NOTE(review): trainSize is presumably the number of samples in the training
      // split; the expressions that slice the train/test arrays are missing below.
      int trainSize = 6000;
      TextFeature[] trainX =[]::new);
      int[] trainY =;

      TextFeature[] testX =[]::new);
      int[] testY =;

      // Train on the training split.
      // NOTE(review): the purpose of the null third argument is not visible here.
      TextClassifier classifier = binaryTextClassifierTrainer.train(trainX, trainY, null);
      // NOTE(review): the call that produces predictions (presumably for testX) is missing.
      int[] predictions =;

      // Compare each prediction with the expected test label.
      int correct = 0;
      for (int i = 0; i < testY.length; i++) {
         if (predictions[i] == testY[i]) {
            // NOTE(review): the body of this `if` (presumably incrementing `correct`)
            // and the closing braces of the `if` and `for` are missing.


      // Accuracy is the fraction of test labels predicted correctly.
      // NOTE(review): the print call around the "Accuracy: " message is truncated.
      double accurancy = correct / (double) testY.length;"Accuracy: " + accurancy);