In [None]:
import pandas as pd
from pydatagen.functions import textify
from fastai.vision.all import *

# Load the generated sample data; every column other than `phone` will later
# serve as a source of negative examples.
sample_csv_path = "../datagen/small_phone_number_sample.csv"
raw = pd.read_csv(sample_csv_path)
raw.head(5)

There is a lot of data here.  The phone column contains the positive examples; the remaining columns are essentially negative examples.  However, before we can feed the data to a classifier, we need to turn each individual column into a set of (key, value) pairs and label each row appropriately.

First we'll create a label column, initialized with all 0's.

In [None]:
# Every row starts out labelled as "not a phone number" (0).
raw.loc[:, 'is_phone'] = 0

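Now use the 'melt' function to turn each column into (variable, value) rows: the column name becomes the key in 'variable', the cell contents go into 'value', and the 'is_phone' label is carried along on every row as the identifier column.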

In [None]:
# Columns to unpivot; each column name becomes a key in the 'variable' column.
value_columns = ['locale', 'name', 'language', 'phone', 'license', 'address', 'city', 'country']

# Long format: one row per (is_phone, variable, value) triple.
melted = pd.melt(raw, id_vars=['is_phone'], value_vars=value_columns)
melted.head()


There are obviously a lot of duplicates for locale here (it is more of a categorical variable), so we might as well remove them.

In [None]:
# Keep only the first occurrence of each fully identical row.
melted = melted.loc[~melted.duplicated()]
melted.head()

In [None]:
# Number of remaining rows per original column ("type").
group_counts = melted.groupby('variable').size()
x_pos = list(range(len(group_counts)))

# Bar chart of the per-type row counts.
plt.figure(figsize=(9, 5))
plt.bar(x_pos, group_counts.to_numpy())
plt.xticks(x_pos, group_counts.index)
plt.xlabel("Type")
plt.ylabel("Count")

plt.show()

We have roughly the same number of records for phone (positive examples), as we do for name, license, and address (negative examples).  We'll
create a 50-50 mix of positive to negative examples and draw a stratified sample for the negatives from each of name, license, and address.

In [None]:
# Flag every row that came from the phone column as a positive example.
is_phone_row = melted['variable'].eq('phone')
melted.loc[is_phone_row, 'is_phone'] = 1

# The positive set is simply every row carrying the flag.
positives = melted[melted['is_phone'].eq(1)]
positives

Now create the negative examples of roughly the same size.

In [None]:
# Negative examples come from the name, address and license rows.
# Sample the same fraction from each type separately (a true stratified
# sample) instead of one pooled draw, so each negative type is represented
# in proportion to its size rather than by chance.
negative_pool = melted.loc[melted['variable'].isin(['name', 'address', 'license'])]
negatives = negative_pool.groupby('variable').sample(frac=0.33, replace=False, random_state=42)
negatives

Let's check our negatives count just to be sure.

In [None]:
# Number of sampled negative rows per type.
neg_counts = negatives.groupby('variable').size()
x_pos = list(range(len(neg_counts)))

# Bar chart of the negative sample's composition.
plt.figure(figsize=(5, 5))
plt.bar(x_pos, neg_counts.to_numpy())
plt.xticks(x_pos, neg_counts.index)
plt.xlabel("Type")
plt.ylabel("Count")

plt.show()

In [None]:
# Row counts of the two classes, as (positives, negatives).
positives.shape[0], negatives.shape[0]

In [None]:
# Stack positives on top of negatives to form the full labelled data set.
full_data_df = pd.concat([positives, negatives])

# Run every value (as a string) through `textify` with length=30 and keep the
# results in a one-column frame.
value_text = full_data_df['value'].astype(str)
textified_df = value_text.apply(lambda x: textify(x, length=30)).to_frame(name='textified')
textified_df

In [None]:
# Spread each textified sequence across columns to inspect its width.
pd.DataFrame(list(textified_df['textified']))


Not sure why, but our 30-element textified list is somehow being split into 69 columns.  We need to select only the columns we want.

In [None]:
# Expand the textified sequences into columns and keep only the first 30
# (column labels 0 through 29) as model features.
feature_df = pd.DataFrame(textified_df['textified'].tolist()).iloc[:, :30]
feature_df

In [None]:
# Target labels, in the same row order as the features built from full_data_df.
label_df = full_data_df.loc[:, 'is_phone']
label_df

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation; the fixed seed makes the split repeatable.
split_parts = train_test_split(feature_df, label_df, random_state=42, test_size=0.10)
train_x_df, valid_x_df, train_y_df, valid_y_df = split_parts
tuple(part.shape for part in split_parts)

In [None]:
# Features become 2-D tensors as they are.
train_x_tens = torch.tensor(train_x_df.to_numpy())
valid_x_tens = torch.tensor(valid_x_df.to_numpy())

# Labels gain a trailing axis of size 1, giving shape (rows, 1).
train_y_tens = torch.tensor(train_y_df.to_numpy())[:, None]
valid_y_tens = torch.tensor(valid_y_df.to_numpy())[:, None]

tuple(t.shape for t in (train_x_tens, train_y_tens, valid_x_tens, valid_y_tens))

In [None]:
# Pair each float feature row with its label so the loaders can index (x, y) items.
trainset = [(x, y) for x, y in zip(train_x_tens.float(), train_y_tens)]
validset = [(x, y) for x, y in zip(valid_x_tens.float(), valid_y_tens)]

In [None]:
# batch_size of about 100 seems best so far
traindl = DataLoader(trainset, batch_size=256)
# xb,yb = first(traindl)
# print("Training example: {}, {}".format(xb,yb))

# validdl = DataLoader(validset, batch_size=30)
validdl = DataLoader(validset, batch_size=256)
# xt, yt = first(validdl)
# print("Validation example: {}, {}".format(xt.shape, yt.shape))

In [None]:
def phone_loss(predictions, targets):
    """Mean distance between the sigmoid-squashed predictions and the 0/1 targets.

    Where the target is 1 the per-item loss is ``1 - p``; elsewhere it is ``p``,
    with ``p = sigmoid(prediction)``. Returns the mean over all items.
    """
    probs = torch.sigmoid(predictions)
    per_item = torch.where(targets == 1, 1 - probs, probs)
    return per_item.mean()

def phone_accuracy(xb, yb):
    """Fraction of items whose thresholded prediction matches the label.

    `xb` holds raw model outputs; a sigmoid value above 0.5 counts as a
    predicted 1, which is then compared element-wise with `yb`.
    """
    predicted_positive = torch.sigmoid(xb) > 0.5
    hits = predicted_positive == yb
    return hits.float().mean()

In [None]:
# Bundle the training and validation loaders for the Learner.
dls = DataLoaders(traindl, validdl)

# Fully connected net: 30 inputs -> 20 -> 10 -> 1 output, with a ReLU between
# consecutive linear layers (none after the last one).
layer_widths = [30, 20, 10, 1]
net_layers = []
for n_in, n_out in zip(layer_widths[:-1], layer_widths[1:]):
    net_layers += [nn.Linear(n_in, n_out), nn.ReLU()]
simple_net = nn.Sequential(*net_layers[:-1])  # drop the trailing ReLU

In [None]:
# Plain SGD on the small fully connected net, using the custom loss and accuracy metric.
learn = Learner(
    dls,
    simple_net,
    loss_func=phone_loss,
    opt_func=SGD,
    metrics=phone_accuracy,
)

In [None]:
# Train for 80 epochs at a learning rate of 0.1.
learn.fit(n_epoch=80, lr=0.1)

In [None]:
# try with resnet
learn = cnn_learner(dls, resnet18, pretrained=False,
                    loss_func=F.cross_entropy, metrics=phone_accuracy)
learn.fit_one_cycle(1, 0.1)
