In [19]:
import torch
import torchtext
import torchdata
import portalocker
import time
import pandas as pd
import os, re
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier

import matplotlib.pyplot as plt
import time
%matplotlib inline

torch.set_default_dtype(torch.float64)


In [2]:
os.chdir("..")
os.path.abspath(os.curdir)

'/Users/claireboyd/courses/advanced_ml/dirty_comments_and_clean_plates'

#### Preprocessing using Custom Vectorizer

In [3]:
class Vectorizer(object):
    def __init__(self, df_filepath, ngram_range=None, clean_regex=None, max_features=None, stop_words=None):
        # read in reviews to vectorizor object
        self.df = pd.read_csv(df_filepath)
        self.texts=self.df['reviews']

        # vectorizor params
        self.clean_regex = clean_regex
        self.max_features = max_features #vocab size
        self.stopwords = stop_words #if we want to remove these or not
        self.ngram_range = ngram_range #size of ngrams to use as each observation
        self.tfidf = TfidfVectorizer(analyzer='word',
                                     stop_words=self.stopwords,
                                     ngram_range=self.ngram_range, 
                                     max_features=self.max_features)

    def clean_texts(self):
        cleaned = []
        for text in self.texts:
            if self.clean_regex is not None:
                text = re.sub(self.clean_regex," ",text)
            text = text.lower().strip()
            cleaned.append(text)
        return cleaned
    
    def set_tfidf(self,cleaned_texts):
        self.tfidf.fit(cleaned_texts)
    
    def build_vectorizer(self):
        cleaned_texts = self.clean_texts()
        self.set_tfidf(cleaned_texts)
        
    def vectorizeTexts(self):
        cleaned_texts = self.clean_texts()
        return self.tfidf.transform(cleaned_texts)


In [4]:
vectorizer = Vectorizer(df_filepath="data/phila/labeled_inspections_with_reviews.csv",
                        clean_regex="[^a-zA-Z0-9]",
                        max_features=7000,
                        ngram_range=(1,2), 
                        stop_words="english")
vectorizer.build_vectorizer()
vectorized_x = vectorizer.vectorizeTexts().toarray()
vectorized_x.shape

(2165, 7000)

#### Custom Dataloader

In [5]:
class ReviewsDataset(torch.utils.data.Dataset):
    def __init__(self, vectorized_reviews, df_filepath, y_col="y", features=None):

        # read in data and encode outcome variable
        self.df = pd.read_csv(df_filepath)
        self.df[[y_col]] = 0
        self.df.loc[self.df.loc[:,'Overall Compliance'] == "No",y_col] = 1
        
        self.text = vectorized_reviews
        self.labels = self.df.loc[:,'y']
        self.features = features
        
        if self.features is not None:
            self.features = self.df.loc[:,features]
    
    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        sample = {}
        sample["text"] = self.text[idx]
        sample["labels"] = self.labels[idx]
        if self.features is not None:
            sample["features"] = self.features[idx] #.to_xarray()
        return sample

In [6]:
full_dataset = ReviewsDataset(vectorized_reviews=vectorized_x,
                              df_filepath="data/phila/labeled_inspections_with_reviews.csv")

In [7]:
train_freq = 0.8
val_freq = 0.5 #of remaining % from train

train_size = int(train_freq * len(full_dataset))
val_test_size = len(full_dataset) - train_size
train_dataset, val_test_dataset = torch.utils.data.random_split(full_dataset, [train_size, val_test_size])

val_size = int(val_freq * len(val_test_dataset))
test_size = len(val_test_dataset) - val_size
val_dataset, test_dataset = torch.utils.data.random_split(val_test_dataset, [val_size, test_size])

In [110]:
# wrap with any params here: https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader
BATCH_SIZE = 1

train_dataloader = torch.utils.data.DataLoader(train_dataset,batch_size=BATCH_SIZE,shuffle=True)
valid_dataloader = torch.utils.data.DataLoader(val_dataset,batch_size=BATCH_SIZE,shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset,batch_size=BATCH_SIZE,shuffle=True)

In [111]:
# see data in dataloader
for i, data in enumerate(train_dataloader):
    text = data['text']
    labels = data['labels']
    #features = data['features']
    print(text)
    print(labels)
    if i > 2:
        break

tensor([[0., 0., 0.,  ..., 0., 0., 0.]])
tensor([0])
tensor([[0., 0., 0.,  ..., 0., 0., 0.]])
tensor([0])
tensor([[0.0659, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]])
tensor([0])
tensor([[0., 0., 0.,  ..., 0., 0., 0.]])
tensor([0])


#### Model Testing

Functions needed across all model testing

In [122]:
def train_an_epoch(train_dataloader, optimizer, model, loss_function):
    model.train() # Sets the module in training mode.
    log_interval = 200
    for idx, data in enumerate(train_dataloader):
        model.zero_grad()
        log_probs = model(data['text'])
        loss = loss_function(log_probs, data['labels'])
        loss.backward()
        optimizer.step()
        if idx % log_interval == 0 and idx > 0:
            print(f'At iteration {idx} the loss is {loss:.3f}.')

In [123]:
def get_accuracy(validation_dataloader, model):
    model.eval()
    with torch.no_grad():
        ## WRITE YOUR CODE BELOW.    
        n_correct = 0
        n_examples = 0
        for data in validation_dataloader:
            # get predicted probabilities, and labels with highest probability
            outputs = model(data['text'])
            _, predicted_labels = torch.max(outputs, 1)
            
            # count correct predictions & update counts
            batch_correct = (predicted_labels == data['labels']).sum().item()
            n_correct += batch_correct
            n_examples += len(data['labels'])
    
    # calculate average accuracy across all batches
    average_accuracy = n_correct / n_examples
    return average_accuracy

In [124]:
def plot_accuracy(train_dataloader, valid_dataloader, optimizer, model, loss_function, epochs):
    optimizer = torch.optim.SGD(model.parameters(), lr=3)
    accuracies=[]
    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        train_an_epoch(train_dataloader, optimizer, model, loss_function)
        accuracy = get_accuracy(valid_dataloader, model)
        accuracies.append(accuracy)
        time_taken = time.time() - epoch_start_time
        print()
        print(f'After epoch {epoch} the validation accuracy is {accuracy:.3f}.')
        print()
        
    plt.plot(range(1, epochs+1), accuracies)

In [134]:
# MAIN INPUTS
# model = XGBClassifier(nthread=4,
#                       verbosity=1,
#                       #n_estimators=2, 
#                       #max_depth=2, 
#                       learning_rate=0.3, #default 
#                       #lambda=1, #default
#                       #objective='binary:logistic',
#                       #objective='binary:logistic'
#                      )

# simple SVM
# https://github.com/kazuto1011/svm-pytorch/blob/master/main.py
model = torch.nn.Linear(7000, 2)
loss_function = torch.nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=3)
epochs=10

In [135]:
train_an_epoch(train_dataloader, optimizer, model, loss_function)

At iteration 200 the loss is -479.545.
At iteration 400 the loss is -974.109.
At iteration 600 the loss is -1477.942.
At iteration 800 the loss is -1851.174.
At iteration 1000 the loss is -2331.146.
At iteration 1200 the loss is -946.508.
At iteration 1400 the loss is -3370.203.
At iteration 1600 the loss is -3825.008.


In [127]:
outputs = model(data['text'])

In [128]:
outputs

tensor([[4161.7091, 1206.6766]], grad_fn=<AddmmBackward0>)

In [129]:
data

{'text': tensor([[0., 0., 0.,  ..., 0., 0., 0.]]), 'labels': tensor([0])}

In [131]:
#why is my accuracy staying constant?
#plot_accuracy(train_dataloader, valid_dataloader, optimizer, model, loss_function, epochs)