In [13]:
import numpy as np
import pandas as pd
import csv
import pickle
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [14]:
def load_data(file_path):
    """
    Read the tab-separated file at ``file_path`` into a dataframe.

    Column names are supplied explicitly (``label`` and ``message``), and
    quote characters are treated as ordinary text (``csv.QUOTE_NONE``).

    :param file_path: Path of the data file.

    :returns: Dataframe with the columns ``label`` and ``message``.
    """
    read_options = {
        "sep": "\t",
        "names": ["label", "message"],
        # Messages may contain stray quote characters; do not interpret them.
        "quoting": csv.QUOTE_NONE,
    }
    return pd.read_csv(file_path, **read_options)

def save_data(file_path, df):
    """
    Write ``df`` to ``file_path`` in CSV format.

    The dataframe index is not written; only the columns are saved.

    :param file_path: Destination path of the CSV file.
    :param df: Dataframe holding the data to write.
    """
    df.to_csv(path_or_buf=file_path, index=False)

def preprocess(df, bow_transformer):
    """
    Preprocess the data in ``df``. The following preprocessing steps are done.

    1. The labels "ham" and "spam" are converted to 0 and 1 respectively.
       Any other label value is passed through unchanged.
    2. ``bow_transformer.transform`` converts the messages to count vectors
       (tokenization is whatever that transformer was configured with).
    3. The vectors are re-weighted and normalized by a ``TfidfTransformer``
       fitted on those same vectors.

    ``df`` itself is not modified; the numeric labels are returned as a new
    Series.

    :param df: Dataframe containing the data to be preprocessed. Must have the
        columns ``label`` and ``message``.
    :param bow_transformer: An already fitted ``CountVectorizer`` used to
        transform the messages in ``df``.

    :returns: Tuple (result_matrix, labels) where ``labels`` is the converted
        ``label`` column.
    """

    # Convert string labels to numeric labels on a *new* Series.
    # ``df["label"].replace(..., inplace=True)`` would mutate the caller's
    # dataframe, and under pandas Copy-on-Write it only changes a temporary,
    # so the conversion would silently be lost.
    label_codes = {"ham": 0, "spam": 1}
    labels = df["label"].map(lambda value: label_codes.get(value, value))

    sparse_bow = bow_transformer.transform(df["message"])

    # NOTE(review): the IDF weights are fitted on the data passed in, so train,
    # validation and test sets each get their own weighting when preprocessed
    # separately. Confirm this is intended by the callers.
    tfidf_transformer = TfidfTransformer().fit(sparse_bow)
    result_matrix = tfidf_transformer.transform(sparse_bow)
    return (result_matrix, labels)
    
    
def prepare_train_validation_test_split(df, train_percent=0.7, test_percent=0.15, seed=None):
    """
    Randomly partition ``df`` into train, validation and test subsets.

    With the default arguments this yields a 70-15-15 split. The validation
    share is whatever remains after the train and test shares are taken.

    :param df: Dataframe containing the data to be split.
    :param train_percent: Fraction of ``df`` allotted to training data.
    :param test_percent: Fraction of ``df`` allotted to test data.
    :param seed: Seed for the random number generator; the same seed is used
        for both splitting steps.

    :returns: A tuple (data_train, data_validate, data_test)
    """
    # Step 1: carve the test set off the full data.
    remainder, data_test = train_test_split(df, test_size=test_percent, random_state=seed)

    # Step 2: the train fraction is relative to the full data, so rescale it
    # to the part that is left after removing the test set.
    train_share_of_remainder = train_percent / (1 - test_percent)
    data_train, data_validate = train_test_split(
        remainder,
        train_size=train_share_of_remainder,
        random_state=seed,
    )
    return (data_train, data_validate, data_test)

In [15]:
# Load the raw dataset into a dataframe with the columns "label" and "message".
messages = load_data("data/SMSSpamCollection")
# Bare last expression: the notebook renders the dataframe as this cell's output.
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5569,spam,This is the 2nd time we have tried 2 contact u...
5570,ham,Will ü b going to esplanade fr home?
5571,ham,"Pity, * was in mood for that. So...any other s..."
5572,ham,The guy did some bitching but I acted like i'd...


In [16]:
# Fix the seed so that the split, and the CSV files written from it, is the
# same on every Restart & Run All.
SPLIT_SEED = 42
train_data, validation_data, test_data = prepare_train_validation_test_split(messages, seed=SPLIT_SEED)
# Report the (rows, columns) of each subset as a sanity check on the proportions.
print(train_data.shape)
print(validation_data.shape)
print(test_data.shape)

(3901, 2)
(836, 2)
(837, 2)


In [17]:
# Write each subset to its own CSV file under data/.
for out_path, split_df in (
    ("data/train.csv", train_data),
    ("data/validation.csv", validation_data),
    ("data/test.csv", test_data),
):
    save_data(out_path, split_df)

In [18]:
def split_into_lemmas(message):
    """
    Tokenize ``message`` and reduce every token to its lemma.

    The message is lower-cased first, then split into words with ``TextBlob``.

    :param message: The text to tokenize.

    :returns: List with the lemma ("base form") of each word, in order.
    """
    blob = TextBlob(message.lower())
    lemmas = []
    for token in blob.words:
        lemmas.append(token.lemma)
    return lemmas

# Learn the vocabulary from the training split only, so that nothing from the
# validation/test messages leaks into the features the model is trained on.
# Tokens that only occur in the held-out data are simply ignored by transform().
bow_transformer = CountVectorizer(analyzer=split_into_lemmas).fit(train_data['message'])

# Persist the fitted vectorizer for later use.
# NOTE: pickle stores the custom analyzer by reference (module and function
# name), not by value, so whatever loads this file must have
# ``split_into_lemmas`` defined under the same module path.
with open('bow_transformer.pickle', 'wb') as f:
    pickle.dump(bow_transformer, f, pickle.HIGHEST_PROTOCOL)