In [1]:
import numpy as np
import pandas as pd
import csv
import pickle
import prepare_functions
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [2]:
def preprocess(df, bow_transformer, tfidf_transformer=None):
    """
    Turn a labelled message dataframe into a TF-IDF matrix plus numeric labels.

    The following preprocessing steps are done.

    1. The labels "ham" and "spam" are converted to 0 and 1 respectively.
       Any other label value is passed through unchanged.
    2. ``bow_transformer`` converts ``df["message"]`` to a bag-of-words matrix.
    3. The bag-of-words matrix is passed through a ``TfidfTransformer``
       (inverse document frequency weighting and normalisation with the
       scikit-learn default settings when the transformer is created here).

    ``df`` is not modified; the converted labels are only returned.

    :param df: Dataframe with a ``"label"`` and a ``"message"`` column.
    :param bow_transformer: A ``CountVectorizer`` to transform the sentences in ``df``.
        Only its ``transform`` method is called, so it must already be fitted.
    :param tfidf_transformer: Optional, already-fitted ``TfidfTransformer``.
        When given, it is applied as-is, which lets validation/test data be
        weighted with IDF statistics learned from the training data. When
        ``None`` (the default), a new transformer is fitted on ``df`` itself.

    :returns: Tuple (result_matrix, labels) where ``labels`` is a new Series
        sharing the index of ``df["label"]``.
    """

    # Convert string labels to numeric labels on a *new* Series. A chained
    # ``df["label"].replace(..., inplace=True)`` mutates the caller's frame on
    # older pandas and is a silent no-op under copy-on-write, so it is avoided.
    label_codes = {"ham": 0, "spam": 1}
    labels = df["label"].map(lambda value: label_codes.get(value, value))

    sparse_bow = bow_transformer.transform(df["message"])

    # Default behaviour: fit the IDF weights on the data being transformed.
    if tfidf_transformer is None:
        tfidf_transformer = TfidfTransformer().fit(sparse_bow)

    result_matrix = tfidf_transformer.transform(sparse_bow)
    return (result_matrix, labels)

In [3]:
# Load the pickled bag-of-words transformer that is later handed to preprocess().
# NOTE(review): unpickling can execute code stored in the file, so this file
# must come from a trusted source.
with open('bow_transformer.pickle', 'rb') as pickle_file:
    bow_transformer = pickle.load(pickle_file)

In [6]:
# Read the three CSV splits in a fixed order: train, validation, test.
train_data, validation_data, test_data = [
    prepare_functions.load_data(csv_path, separator=',')
    for csv_path in ("data/train.csv", "data/validation.csv", "data/test.csv")
]

# One shape per line, in the same order as the splits were loaded.
print(train_data.shape, validation_data.shape, test_data.shape, sep="\n")

In [10]:
# Vectorise every split with the shared bag-of-words transformer.
# NOTE(review): each preprocess() call here fits its own TfidfTransformer, so
# the validation and test matrices are weighted with IDF statistics computed
# from those splits rather than from the training split — confirm this is intended.
(
    (train_matrix, train_labels),
    (validation_matrix, validation_labels),
    (test_matrix, test_labels),
) = [
    preprocess(split_frame, bow_transformer)
    for split_frame in (train_data, validation_data, test_data)
]

# One shape per line: train, validation, test.
print(train_matrix.shape, validation_matrix.shape, test_matrix.shape, sep="\n")

(3901, 8731)
(836, 8731)
(837, 8731)
