In [49]:
import csv
import os
import pickle
import subprocess

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

import prepare_functions

In [50]:
# Load the raw SMS spam collection through the project's load_data helper.
# Ending the cell on the bare `messages` expression renders the frame as the
# cell output (columns: label, message).
messages = prepare_functions.load_data("data/SMSSpamCollection")
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [51]:
# Split the messages into training, validation and test sets using a fixed
# random seed (seed = 1), then report the shape of each split, one per line.
train_data, validation_data, test_data = prepare_functions.prepare_train_validation_test_split(
    messages, seed=1
)
print(train_data.shape, validation_data.shape, test_data.shape, sep="\n")

(3900, 2)
(836, 2)
(836, 2)


In [52]:
# Persist each of the three splits as its own csv file under data/
for csv_path, split_frame in (
    ("data/train.csv", train_data),
    ("data/validation.csv", validation_data),
    ("data/test.csv", test_data),
):
    prepare_functions.save_data(csv_path, split_frame)

In [53]:
# function to get distribution of the split data
def get_distribution():
    """
    Report the class balance of each split currently saved on disk.

    Reads data/train.csv, data/validation.csv and data/test.csv through
    prepare_functions.load_data and prints, in that order, the value
    counts of the "label" column (ham / spam) for each split.
    """
    split_files = {
        "training": "data/train.csv",
        "validation": "data/validation.csv",
        "test": "data/test.csv",
    }

    # Load every split up front so that a missing file fails before any
    # partial report has been printed.
    split_frames = {
        split_name: prepare_functions.load_data(csv_path, separator=',')
        for split_name, csv_path in split_files.items()
    }

    for split_name, frame in split_frames.items():
        print(f"Distribution for {split_name} data: \n", frame["label"].value_counts(), "\n")

In [54]:
# Distribution for the first version of the data (seed = 1).
# get_distribution() re-reads the csv files under data/, which at this point
# hold the split that was just saved above.
get_distribution()

Distribution for training data: 
 ham     3381
spam     519
Name: label, dtype: int64 

Distribution for validation data: 
 ham     721
spam    115
Name: label, dtype: int64 

Distribution for test data: 
 ham     723
spam    113
Name: label, dtype: int64 



In [55]:
# getting the distribution for the updated version of data (seed = 10)
# Restore the data.dvc pointer committed at HEAD, then have DVC check out the
# data it refers to, so the csv files read by get_distribution() are the
# latest tracked version.
# check=True makes a failed git/dvc command raise CalledProcessError instead
# of silently reporting the distribution of stale files; the second command
# only runs if the first succeeded. stdout is discarded rather than appended
# to a scratch file in the working directory.
subprocess.run(["git", "checkout", "HEAD", "data.dvc"], check=True, stdout=subprocess.DEVNULL)
subprocess.run(["dvc", "checkout", "data.dvc"], check=True, stdout=subprocess.DEVNULL)
get_distribution()

Updated 0 paths from 65d0987


Distribution for training data: 
 ham     3394
spam     506
Name: label, dtype: int64 

Distribution for validation data: 
 ham     709
spam    127
Name: label, dtype: int64 

Distribution for test data: 
 ham     722
spam    114
Name: label, dtype: int64 



In [56]:
# Fit the bag-of-words count vectorizer and pickle it so the same fitted
# object can be reloaded later during preprocessing.
# NOTE(review): the vocabulary is fitted on every message, including the rows
# that ended up in the validation and test splits — confirm this is intended.
bow_transformer = CountVectorizer(analyzer=prepare_functions.split_into_lemmas)
bow_transformer.fit(messages['message'])
with open('bow_transformer.pickle', mode='wb') as pickle_file:
    pickle.dump(bow_transformer, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)