In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

%matplotlib inline

In [2]:
# compute classifier accuracy
def get_accuracy(y, y_hat):
  return np.float64(sum(y == y_hat)) / np.float64(y.size)

def tf_idf_classifier(X_train, y_train, X_test, y_test):
  bow = CountVectorizer()
  tdchow = bow.fit(X_train)
  tdmat = bow.fit_transform(X_train)

  tfidf_transformer = TfidfTransformer()
  X_train_tfidf = tfidf_transformer.fit_transform(tdmat)
  X_train_tfidf.shape

  clf = MultinomialNB().fit(X_train_tfidf, y_train)

  test_tdmat = bow.transform(X_test)
  X_test_tfidf = tfidf_transformer.transform(test_tdmat)

  y_predict = clf.predict(X_test_tfidf)
  acc = np.mean(y_test == y_predict)

  return acc, tdchow


In [3]:
# load data
data = pd.read_csv('sentiment_analysis.csv', delimiter=',', header=0)
data  = data.values

# classes
NUM_CLASSES = 2
NUM_EXAMPLES = 2000
NUM_TEST = 200
NUM_TRAIN = 800

all_examples = np.arange(0,NUM_EXAMPLES // 2)

acc = np.empty((20,1))

for i in range(1):
  pidx = np.random.choice(NUM_EXAMPLES // 2, NUM_TEST, replace=False)
  nidx = np.random.choice(np.arange(1000,NUM_EXAMPLES), NUM_TEST, replace=False)

  # load data
  X_train = np.concatenate((data[np.setdiff1d(all_examples, pidx, assume_unique=True),1], 
                           data[np.setdiff1d(all_examples+1000, nidx, assume_unique=True),1]), axis=0)
  y_train = np.concatenate((np.ones(NUM_TRAIN), np.zeros(NUM_TRAIN)))
  X_test = np.concatenate((data[pidx,1], data[nidx,1]), axis=0)
  y_test = np.concatenate((np.ones(NUM_TEST), np.zeros(NUM_TEST)))

  acc[i], tdchow = tf_idf_classifier(X_train, y_train, X_test, y_test)
  
print(np.mean(acc))

0.04125


In [5]:
print(tdchow.vocabulary_)

