### Sanity Check! 
#### Using tf-idf and SVM to ensure that labels & documents are appropriately matched

In [1]:
# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload
# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from src.models import metrics
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

# incorporate only frequent labels
def isolate_frequent_labels(X, label_column, threshold_count):
    # returns: dataframe with only infrequent labels
    df = X.groupby(label_column).size()[X.groupby(label_column).size() > threshold_count].reset_index()
    frequent_labels = df.iloc[:,0]
    return X[X.label.isin(frequent_labels)]

OHSUcsv = pd.read_csv("../data/processed/ohsumed_abstracts.csv", index_col ="Unnamed: 0")
data = isolate_frequent_labels(OHSUcsv, 'label', 200)
#data = OHSUcsv[OHSUcsv.label.isin(frequent_labels)]

train_posts = data.loc[data.split == 'train', 'doc']
train_tags = data.loc[data.split == 'train', 'label']
test_posts = data.loc[data.split == 'test', 'doc']
test_tags = data.loc[data.split == 'test', 'label']

  from ._conv import register_converters as _register_converters


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize_documents(documents):
    # create the transform and build the vocabulary 
    vectorizer = TfidfVectorizer()
    vectorizer.fit(documents)
    
    # create document vectors
    vectorized_documents = []
    for document in documents:
        document = [document]
        vectorized_documents.append(vectorizer.transform(document))
    return vectorized_documents

train_posts_tfidf = vectorize_documents(train_posts)


[<1x29868 sparse matrix of type '<class 'numpy.float64'>'
 	with 159 stored elements in Compressed Sparse Row format>,
 <1x29868 sparse matrix of type '<class 'numpy.float64'>'
 	with 32 stored elements in Compressed Sparse Row format>,
 <1x29868 sparse matrix of type '<class 'numpy.float64'>'
 	with 79 stored elements in Compressed Sparse Row format>,
 <1x29868 sparse matrix of type '<class 'numpy.float64'>'
 	with 66 stored elements in Compressed Sparse Row format>,
 <1x29868 sparse matrix of type '<class 'numpy.float64'>'
 	with 115 stored elements in Compressed Sparse Row format>,
 <1x29868 sparse matrix of type '<class 'numpy.float64'>'
 	with 111 stored elements in Compressed Sparse Row format>,
 <1x29868 sparse matrix of type '<class 'numpy.float64'>'
 	with 88 stored elements in Compressed Sparse Row format>,
 <1x29868 sparse matrix of type '<class 'numpy.float64'>'
 	with 138 stored elements in Compressed Sparse Row format>,
 <1x29868 sparse matrix of type '<class 'numpy.float

In [13]:
from sklearn.naive_bayes import MultinomialNB

text_clf = MultinomialNB().fit(train_posts_tfidf, train_tags)

test_posts_tfidf = vectorize_documents(test_posts)
predicted = text_clf.predict(test_posts_tfidf)
np.mean(predicted == test_tags)

ValueError: Expected 2D array, got 1D array instead:
array=[<1x33149 sparse matrix of type '<class 'numpy.float64'>'
	with 93 stored elements in Compressed Sparse Row format>
 <1x33149 sparse matrix of type '<class 'numpy.float64'>'
	with 97 stored elements in Compressed Sparse Row format>
 <1x33149 sparse matrix of type '<class 'numpy.float64'>'
	with 74 stored elements in Compressed Sparse Row format>
 ...
 <1x33149 sparse matrix of type '<class 'numpy.float64'>'
	with 114 stored elements in Compressed Sparse Row format>
 <1x33149 sparse matrix of type '<class 'numpy.float64'>'
	with 92 stored elements in Compressed Sparse Row format>
 <1x33149 sparse matrix of type '<class 'numpy.float64'>'
	with 102 stored elements in Compressed Sparse Row format>].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.