In [None]:
import os

import csv

import nltk

nltk.download('stopwords')

import pandas as pd
import csv
import numpy as np
import torch

from nltk.tokenize import TextTilingTokenizer

from imblearn.over_sampling import RandomOverSampler

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from torch import nn
from google.colab import drive
from torch.utils.data import Dataset, DataLoader

drive.mount('/content/drive')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Mounted at /content/drive


In [None]:
%cd drive/MyDrive/CSE572_final_project

/content/drive/.shortcut-targets-by-id/1CNAp-Xs1N3eIhL96mqNkfaYE413By43q/CSE572_final_project


In [None]:
# Read the vocabulary file: one entry per line, surrounding whitespace removed.
# dictionary_reverse[i] is the entry found on (0-based) line i.
with open("dictionary.txt", "r") as dictionary_file:
    dictionary_reverse = [entry.strip() for entry in dictionary_file]

# Forward lookup entry -> line number; an entry that occurs more than once
# keeps the number of its last occurrence.
dictionary = {token: position for position, token in enumerate(dictionary_reverse)}

# Model Code

In [None]:
class DataframeSeriesDataset(Dataset):
    """Torch ``Dataset`` that pairs row ``i`` of a data frame with element ``i`` of a series.

    Both containers are read positionally (``.iloc``), so their index labels
    play no role; only the row order matters.
    """

    def __init__(self, x_dataframe, y_series):
        """Keep references to the features and their targets.

        ``x_dataframe.iloc[i]`` is the feature row whose target value is
        ``y_series.iloc[i]``.
        """
        self.x_data_frame, self.y_series = x_dataframe, y_series
        # The length is captured once, at construction time.
        self.len = len(x_dataframe)

    def __len__(self):
        """Return the number of rows the feature frame had at construction."""
        return self.len

    def __getitem__(self, index):
        """Return ``(features, target)`` at position ``index`` as float32 numpy arrays."""
        feature_row = self.x_data_frame.iloc[index]
        target_value = self.y_series.iloc[index]
        features = np.array(feature_row).astype(np.float32)
        target = np.array(target_value).astype(np.float32)
        return features, target

def train_model(model,
    dataset,
    batch_sz=100,
    learning_rate=0.1,
    loss_fn=nn.BCELoss(),
    optimizer_type="sgd",
    num_epochs=100):
    """Train ``model`` in place on ``dataset`` and return the per-batch losses.

    Args:
        model: module whose ``parameters()`` are optimized; it is called on
            each feature batch and its output is squeezed before the loss.
        dataset: dataset yielding ``(features, target)`` pairs; it is wrapped
            in a shuffling ``DataLoader``.
        batch_sz: number of samples per batch.
        learning_rate: learning rate handed to the optimizer.
        loss_fn: callable ``loss_fn(prediction, target)`` returning a scalar
            tensor. The default instance is created once, at definition time.
        optimizer_type: one of ``"sgd"``, ``"adam"`` or ``"rmsprop"``.
        num_epochs: number of full passes over ``dataset``.

    Returns:
        A list with one float loss per processed batch, in processing order.

    Raises:
        ValueError: if ``optimizer_type`` is not one of the supported names.
    """
    optimizer_classes = {
        "sgd": torch.optim.SGD,
        "adam": torch.optim.Adam,
        "rmsprop": torch.optim.RMSprop,
    }
    # Fail early with a clear message instead of hitting an unbound
    # ``optimizer`` variable inside the training loop.
    if optimizer_type not in optimizer_classes:
        raise ValueError(
            f"Unknown optimizer_type {optimizer_type!r}; "
            f"expected one of {sorted(optimizer_classes)}"
        )

    train_dataloader = DataLoader(dataset=dataset, batch_size=batch_sz, shuffle=True)
    optimizer = optimizer_classes[optimizer_type](model.parameters(), lr=learning_rate)

    loss_values = []

    for _ in range(num_epochs):
        for X, y in train_dataloader:
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            pred = model(X).squeeze()
            loss = loss_fn(pred, y.squeeze())
            loss_values.append(loss.item())
            loss.backward()
            optimizer.step()

    return loss_values

class TwoHiddenLayerCategorizingNeuralNetwork(nn.Module):
    """Feed-forward binary classifier: two 128-unit ReLU layers and a sigmoid output unit."""

    def __init__(self, input_dim):
        """Create the layers for inputs with ``input_dim`` features.

        The weights of both hidden layers are re-initialised with Kaiming
        uniform initialisation (ReLU variant); the output layer keeps the
        default ``nn.Linear`` initialisation.
        """
        super().__init__()
        hidden_width = 128

        self.hidden_1 = nn.Linear(input_dim, hidden_width)
        nn.init.kaiming_uniform_(self.hidden_1.weight, nonlinearity="relu")

        self.hidden_2 = nn.Linear(hidden_width, hidden_width)
        nn.init.kaiming_uniform_(self.hidden_2.weight, nonlinearity="relu")

        self.out = nn.Linear(hidden_width, 1)

    def forward(self, x):
        """Map a batch of feature rows to one sigmoid score per row."""
        activations = x
        for hidden_layer in (self.hidden_1, self.hidden_2):
            activations = torch.relu(hidden_layer(activations))
        return torch.sigmoid(self.out(activations))

## Standard Code to Run a 5-Fold Cross-Validation Experiment

In [None]:
def run_experiment(
        train_dataframe,
        batch_sz=100,
        learning_rate=0.1,
        loss_fn=nn.BCELoss(),
        optimizer_type="sgd",
        num_epochs=100):
    """Run a shuffled 5-fold cross-validation with a freshly built model per fold.

    Args:
        train_dataframe: data frame whose last column is the label column
            named ``"label"``; every other column is used as a feature.
            The label ``"non-sponsor"`` maps to 0 and any other value to 1.
        batch_sz, learning_rate, loss_fn, optimizer_type, num_epochs:
            forwarded unchanged to ``train_model`` for every fold.

    Returns:
        A 6-tuple ``(train_accuracies, validation_accuracies,
        mean_train_accuracy, std_train_accuracy,
        mean_validation_accuracy, std_validation_accuracy)`` where the first
        two items are lists with one accuracy per fold.
    """
    input_features = len(train_dataframe.columns) - 1

    kf = KFold(n_splits=5, shuffle=True, random_state=400)

    x = train_dataframe[train_dataframe.columns[:-1]]
    y_binary = train_dataframe["label"].map(lambda label: 0 if label == "non-sponsor" else 1)

    def accuracy_of(model, features, targets):
        # Fraction of rows whose thresholded (> 0.5) score equals the 0/1 target.
        scores = model(torch.from_numpy(np.array(features).astype(np.float32)))
        predictions = scores.numpy().reshape(-1) > 0.5
        return np.sum(predictions == targets.to_numpy()) / len(features)

    train_accuracies = []
    validation_accuracies = []

    for train_index, validation_index in kf.split(x):

        train_x = x.iloc[train_index]
        validation_x = x.iloc[validation_index]
        # KFold yields positions, not index labels, so the targets must be
        # selected with .iloc exactly like the features above.
        train_y = y_binary.iloc[train_index]
        validation_y = y_binary.iloc[validation_index]

        model = TwoHiddenLayerCategorizingNeuralNetwork(input_features)

        train_model(model,
                    DataframeSeriesDataset(train_x, train_y),
                    batch_sz,
                    learning_rate,
                    loss_fn,
                    optimizer_type,
                    num_epochs)

        with torch.no_grad():
            train_accuracies.append(accuracy_of(model, train_x, train_y))
            validation_accuracies.append(accuracy_of(model, validation_x, validation_y))

    np_train_accuracies = np.array(train_accuracies)
    np_validation_accuracies = np.array(validation_accuracies)
    return train_accuracies, validation_accuracies, np.average(np_train_accuracies), np.std(np_train_accuracies), np.average(np_validation_accuracies), np.std(np_validation_accuracies)

## Hyperparameter Optimization

In [None]:
def find_best_optimizer(learning_rates, optimizers, batch_szs, train_dataframe):
    """Grid-search learning rate, optimizer and batch size with ``run_experiment``.

    Every combination is evaluated with ``run_experiment`` using 15 epochs.
    The combination with the highest mean validation accuracy wins; on a
    tie the combination evaluated first is kept. Progress and a summary of
    the winner are printed.

    Args:
        learning_rates: iterable of learning rates to try.
        optimizers: iterable of optimizer names accepted by ``run_experiment``.
        batch_szs: iterable of batch sizes to try.
        train_dataframe: data frame handed unchanged to ``run_experiment``.

    Returns:
        A list with one entry per combination, in evaluation order:
        ``((learning_rate, optimizer, batch_sz), mean_validation_accuracy,
        std_validation_accuracy)``. The same list is also printed.
    """
    best_learning_rate = None
    best_optimizer = None
    best_batch_sz = None
    best_train_accuracy = 0.0
    best_train_std = 0.0
    best_validation_accuracy = 0.0
    best_validation_std = 0.0

    results = []

    for learning_rate in learning_rates:
        for optimizer in optimizers:
            # Progress output: best mean validation accuracy seen so far.
            print(best_validation_accuracy)
            for batch_sz in batch_szs:
                print(learning_rate, optimizer, batch_sz)
                _, _, average_train_accuracy, std_train_accuracy, average_validation_accuracy, std_validation_accuracy = run_experiment(
                    train_dataframe,
                    learning_rate=learning_rate,
                    optimizer_type=optimizer,
                    batch_sz=batch_sz,
                    num_epochs=15)
                # Strict comparison: a later tie does not replace the current best.
                if average_validation_accuracy > best_validation_accuracy:
                    best_train_accuracy = average_train_accuracy
                    best_train_std = std_train_accuracy
                    best_validation_accuracy = average_validation_accuracy
                    best_validation_std = std_validation_accuracy
                    best_learning_rate = learning_rate
                    best_optimizer = optimizer
                    best_batch_sz = batch_sz
                results.append(((learning_rate, optimizer, batch_sz), average_validation_accuracy, std_validation_accuracy))

    print(f"Best train accuracy: {best_train_accuracy}")
    print(f"Best train std: {best_train_std}")
    print(f"Best validation accuracy: {best_validation_accuracy}")
    print(f"Best validation std: {best_validation_std}")
    print(f"Best learning rate: {best_learning_rate}")
    print(f"Best optimizer: {best_optimizer}")
    print(f"Best batch size: {best_batch_sz}")
    print(results)

    # Hand the raw grid back so callers can store it without copying the
    # printed text by hand.
    return results

# Training YouTube Model


## Preparing Data

In [None]:
segment_texts = []
segment_labels = []

# newline="" is what the csv module asks for: it does its own line-ending
# handling, which keeps quoted fields containing line breaks intact.
with open("segments.csv", newline="") as f:
    reader = csv.reader(f)
    next(reader)  # skip the first row (presumably the header - confirm)
    for row in reader:
        # Column 2 is read as the label, column 3 as the segment text.
        segment_labels.append(row[2])
        segment_texts.append(row[3])

# Fixed vocabulary: the feature columns follow the order of the dictionary keys.
vectorizer = CountVectorizer(vocabulary=dictionary.keys())
train_matrix = vectorizer.transform(segment_texts).toarray()

# Built once and shared by both data frames below.
feature_columns = [f"{word}_presence" for word in dictionary.keys()]

vectorizer_dataframe = pd.DataFrame(train_matrix, columns=feature_columns)
vectorizer_dataframe["label"] = segment_labels

# 80/20 train/test split with a fixed seed.
x_train, x_test, y_train, y_test_actual = train_test_split(train_matrix, segment_labels, train_size=0.8, test_size=0.2, random_state=400)

train_dataframe = pd.DataFrame(x_train, columns=feature_columns)
train_dataframe["label"] = y_train

## 5-Fold Cross Validation for Optimizing Hyperparameters

In [None]:
# Wider grid, currently disabled:
#   learning_rates = [0.0001, 0.001, 0.01]
#   optimizers = ["adam", "rmsprop"]
#   batch_szs = [10, 100, 500]
# The active grid below holds a single configuration.
learning_rates, optimizers, batch_szs = [0.0001], ["rmsprop"], [10]

find_best_optimizer(
    learning_rates=learning_rates,
    optimizers=optimizers,
    batch_szs=batch_szs,
    train_dataframe=train_dataframe,
)



0.0
0.0001 rmsprop 10
Best train accuracy: 0.9479875252270755
Best train std: 0.002524263714953769
Best validation accuracy: 0.855721479423511
Best validation std: 0.01627572654379301
Best learning rate: 0.0001
Best optimizer: rmsprop
Best batch size: 10
[((0.0001, 'rmsprop', 10), 0.855721479423511, 0.01627572654379301)]


## Model Training with Optimized Hyperparameters

In [None]:
# Seed torch so weight initialisation and batch shuffling are repeatable
# across runs (same seed value as the random_state used for the data split).
torch.manual_seed(400)

input_features = len(train_dataframe.columns) - 1

yt_model = TwoHiddenLayerCategorizingNeuralNetwork(input_features)

# Every column except the last is a feature; "non-sponsor" maps to 0, anything else to 1.
x = train_dataframe[train_dataframe.columns[:-1]]
y_binary = train_dataframe["label"].map(lambda label: 0 if label == "non-sponsor" else 1)

# NOTE(review): batch_sz=15 is not the batch size (10) used in the
# cross-validation grid for this data set - confirm this is intended.
train_model(yt_model, DataframeSeriesDataset(x, y_binary), batch_sz=15, optimizer_type="rmsprop", num_epochs=15, learning_rate=0.0001)

# Evaluation needs no gradients.
with torch.no_grad():
    y_test_predict = yt_model(torch.from_numpy(np.array(x_test).astype(np.float32))).numpy() > 0.5
y_test_actual_binary = np.array([0 if label == "non-sponsor" else 1 for label in y_test_actual])
test_correct = np.sum(y_test_predict.squeeze() == y_test_actual_binary)
# Held-out test accuracy.
print(test_correct/len(x_test))

0.8661844484629295


## Sample Output

In [None]:

# NOTE(review): only the first line of the file is read - presumably the
# whole transcript is stored on a single line; confirm.
with open("example_transcript.txt") as f:
  raw_transcript = f.readline()

# Put a blank line after every period before handing the text to the tokenizer.
transcript = raw_transcript.replace(".", ".\n\n")

segmenter = TextTilingTokenizer(cutoff_policy=0)
segments = segmenter.tokenize(transcript)

spotify_test_vectorization = vectorizer.transform(segments).toarray()

# Scores come from yt_model despite the "spotify_" prefix of the variable names.
# Inference only, so gradient tracking is switched off.
with torch.no_grad():
  spotify_predicted_labels = yt_model(torch.from_numpy(np.array(spotify_test_vectorization).astype(np.float32)))

for segment, label in zip(segments, spotify_predicted_labels):
  print(f"{'Sponsor' if label > 0.5 else 'Non-Sponsor'}: {segment}")


Non-Sponsor: Hi, I'm Kenya and I am one year in you're listening to the GC youth Ministries podcast.

 You plus me plus we welcome welcome pull up a chair grab a seed and listening.
Non-Sponsor: 

  All right, so welcome.

 I'm Anya.

 Hi Kenya.

 Are you feeling tired tired? You don't know dying.

 Hmm.

 I can assure you that much.

 I can only imagine so hello to everyone that's listening in welcome.

 Hey, how are you sir? How are y'all? How are you doing? Kenya? You asked me how I was doing.

 Yeah.

 Look at that politeness.

  Not that person.

 I'm doing good doing better feeling excited.

 I'm excited about this.
Non-Sponsor: 

 Me too.

 Yeah, because I feel like this is fine like our baby.

 Hmm.

 I think Kenny and I have been trying to do a podcast for like a year outside of thousands of years and then and then pasta gay was like go ahead and gave us the green light because that's what it's all about.

 Right supporting our young people and letting them just like Flex thei

# Training Podcast Model

## Preparing Data

In [None]:
podcast_segment_texts = []
podcast_segment_labels = []

# newline="" is what the csv module asks for: it does its own line-ending
# handling, which keeps quoted fields containing line breaks intact.
with open("spotify-train.csv", newline="") as f:
  reader = csv.reader(f)
  next(reader)  # skip the first row (presumably the header - confirm)
  for row in reader:
    # Column 1 is read as the label, column 0 as the segment text.
    podcast_segment_labels.append(row[1])
    podcast_segment_texts.append(row[0])

# Fixed vocabulary: the feature columns follow the order of the dictionary keys.
vectorizer = CountVectorizer(vocabulary=dictionary.keys())
podcast_train_matrix = vectorizer.transform(podcast_segment_texts).toarray()

podcast_feature_columns = [f"{word}_presence" for word in dictionary.keys()]

vectorizer_dataframe = pd.DataFrame(podcast_train_matrix, columns=podcast_feature_columns)
vectorizer_dataframe["label"] = podcast_segment_labels

# 80/20 train/test split with a fixed seed.
x_podcast_train, x_podcast_test, y_podcast_train, y_podcast_test_actual = train_test_split(podcast_train_matrix, podcast_segment_labels, train_size=0.8, test_size=0.2, random_state=400)

### Upsampling Ads

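There are fewer sponsor (ad) segments than non-sponsor segments, so we randomly oversample the sponsor class in the training split until both classes have the same number of samples.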

In [None]:
# Resample the training split only; the test split is left untouched.
# NOTE(review): the cross-validation that follows splits this already
# oversampled frame, so copies of one row may land in both the training and
# the validation fold - worth confirming this is acceptable.
ros = RandomOverSampler(random_state=400)
x_podcast_train_oversample, y_podcast_train_oversample = ros.fit_resample(
    x_podcast_train,
    y_podcast_train,
)

oversampled_columns = [f"{word}_presence" for word in dictionary.keys()]
podcast_train_dataframe_oversampled = pd.DataFrame(
    x_podcast_train_oversample,
    columns=oversampled_columns,
)
podcast_train_dataframe_oversampled["label"] = y_podcast_train_oversample

# Report the class sizes after resampling.
oversampled_labels = podcast_train_dataframe_oversampled["label"]
print(f"Num sponsored samples: {int((oversampled_labels == 'sponsor').sum())}")
print(f"Num non sponsored samples: {int((oversampled_labels == 'non-sponsor').sum())}")

Num sponsored samples: 332
Num non sponsored samples: 332


## 5-Fold Cross Validation for Optimizing Hyperparameters

In [None]:
# Search grid for the podcast model: 3 learning rates x 2 optimizers x 3 batch sizes.
learning_rates, optimizers, batch_szs = (
    [0.0001, 0.001, 0.01],
    ["adam", "rmsprop"],
    [10, 100, 500],
)

find_best_optimizer(
    learning_rates=learning_rates,
    optimizers=optimizers,
    batch_szs=batch_szs,
    train_dataframe=podcast_train_dataframe_oversampled,
)



0.0
0.0001 adam 10
0.0001 adam 100
0.0001 adam 500
0.9909774436090226
0.0001 rmsprop 10
0.0001 rmsprop 100
0.0001 rmsprop 500
0.9909774436090226
0.001 adam 10
0.001 adam 100
0.001 adam 500
0.9909774436090226
0.001 rmsprop 10
0.001 rmsprop 100
0.001 rmsprop 500
0.9909774436090226
0.01 adam 10
0.01 adam 100
0.01 adam 500
0.9909774436090226
0.01 rmsprop 10
0.01 rmsprop 100
0.01 rmsprop 500
Best train accuracy: 0.9951049941237274
Best train std: 0.002260596653276766
Best validation accuracy: 0.9909774436090226
Best validation std: 0.011050329666691056
Best learning rate: 0.0001
Best optimizer: adam
Best batch size: 10
[((0.0001, 'adam', 10), 0.9909774436090226, 0.011050329666691056), ((0.0001, 'adam', 100), 0.9082365003417635, 0.05885659646483553), ((0.0001, 'adam', 500), 0.7816700843016633, 0.10561239298221829), ((0.0001, 'rmsprop', 10), 0.9909774436090226, 0.008768348714053098), ((0.0001, 'rmsprop', 100), 0.9352927773980404, 0.03206252490538751), ((0.0001, 'rmsprop', 500), 0.923160173160

In [None]:
# Cross-validation grid results, hard-coded here as
# ((learning_rate, optimizer, batch_size), mean_validation_accuracy, std_validation_accuracy).
result = [
    ((0.0001, 'adam', 10), 0.9909774436090226, 0.011050329666691056),
    ((0.0001, 'adam', 100), 0.9082365003417635, 0.05885659646483553),
    ((0.0001, 'adam', 500), 0.7816700843016633, 0.10561239298221829),
    ((0.0001, 'rmsprop', 10), 0.9909774436090226, 0.008768348714053098),
    ((0.0001, 'rmsprop', 100), 0.9352927773980404, 0.03206252490538751),
    ((0.0001, 'rmsprop', 500), 0.9231601731601732, 0.01999470633878093),
    ((0.001, 'adam', 10), 0.9879471405787197, 0.003702280172604136),
    ((0.001, 'adam', 100), 0.9894736842105264, 0.007667698516680888),
    ((0.001, 'adam', 500), 0.9563454089769878, 0.02786861020553428),
    ((0.001, 'rmsprop', 10), 0.983447254499886, 0.005608300962977921),
    ((0.001, 'rmsprop', 100), 0.9864547732968786, 0.008760559929171798),
    ((0.001, 'rmsprop', 500), 0.9894736842105264, 0.007667698516680888),
    ((0.01, 'adam', 10), 0.9879585326953748, 0.0036695349947571424),
    ((0.01, 'adam', 100), 0.9894736842105264, 0.0060150375939849576),
    ((0.01, 'adam', 500), 0.9894736842105264, 0.007667698516680888),
    ((0.01, 'rmsprop', 10), 0.989462292093871, 0.0036742004684999882),
    ((0.01, 'rmsprop', 100), 0.9909774436090226, 0.005626552461314191),
    ((0.01, 'rmsprop', 500), 0.9894736842105264, 0.007667698516680888),
]

# Write-only access is enough; newline="" lets the csv writer emit its own
# line terminators without platform newline translation.
with open("nncvoptimization.csv", "w", newline="") as f:
  writer = csv.writer(f)
  writer.writerow(["lr", "optimizer", "batchsz", "avg_accuracy", "std_accuracy"])
  for config, avg_accuracy, std_accuracy in result:
    lr, optimizer, batch_sz = config
    writer.writerow([lr, optimizer, batch_sz, avg_accuracy, std_accuracy])

## Model Training with Optimized Hyperparameters

In [None]:
# Seed torch so weight initialisation and batch shuffling are repeatable
# across runs (same seed value as the random_state used for the data split).
torch.manual_seed(400)

input_features = len(podcast_train_dataframe_oversampled.columns) - 1

podcast_model = TwoHiddenLayerCategorizingNeuralNetwork(input_features)

# Every column except the last is a feature; "non-sponsor" maps to 0, anything else to 1.
x = podcast_train_dataframe_oversampled[podcast_train_dataframe_oversampled.columns[:-1]]
y_binary = podcast_train_dataframe_oversampled["label"].map(lambda label: 0 if label == "non-sponsor" else 1)

# NOTE(review): the recorded cross-validation output reports adam with batch
# size 10 as the best configuration, while rmsprop with batch size 15 is
# used here - confirm this is intended.
train_model(podcast_model, DataframeSeriesDataset(x, y_binary), batch_sz=15, optimizer_type="rmsprop", num_epochs=15, learning_rate=0.0001)

# Evaluation on the untouched (not oversampled) test split; no gradients needed.
with torch.no_grad():
    y_podcast_test_predict = podcast_model(torch.from_numpy(np.array(x_podcast_test).astype(np.float32))).numpy() > 0.5
y_podcast_test_actual_binary = np.array([0 if label == "non-sponsor" else 1 for label in y_podcast_test_actual])
y_podcast_test_actual_binary_array = y_podcast_test_actual_binary
test_correct = np.sum(y_podcast_test_predict.squeeze() == y_podcast_test_actual_binary_array)

original_sponsors = y_podcast_test_actual_binary_array == 1   # truly sponsor
new_sponsors = y_podcast_test_predict.squeeze() == 1          # predicted sponsor
# Both variables hold the true-positive COUNT; the ratios are formed when printing.
recall = np.sum(y_podcast_test_predict.squeeze()[original_sponsors] == y_podcast_test_actual_binary_array[original_sponsors])
precision = np.sum(y_podcast_test_predict.squeeze()[new_sponsors] == y_podcast_test_actual_binary_array[new_sponsors])

print(f"Overall correct: {test_correct/len(x_podcast_test)}")
print(f"Recall: {recall/np.sum(original_sponsors)}")
print(f"Precision: {precision/np.sum(new_sponsors)}")

Overall correct: 0.9775280898876404
Recall: 0.875
Precision: 0.875


## Test Comparison with the Original (YouTube) Model

In [None]:
# Score the podcast test split with the YouTube-trained model.
podcast_test_tensor = torch.from_numpy(np.array(x_podcast_test).astype(np.float32))
y_podcast_test_predict = yt_model(podcast_test_tensor).detach().numpy() > 0.5

# Ground truth as 0/1: "non-sponsor" -> 0, anything else -> 1.
y_podcast_test_actual_binary = np.array(
    [0 if label == "non-sponsor" else 1 for label in y_podcast_test_actual]
)
y_podcast_test_actual_binary_array = np.array(y_podcast_test_actual_binary)

flat_predictions = y_podcast_test_predict.squeeze()
test_correct = np.sum(flat_predictions == y_podcast_test_actual_binary_array)

original_sponsors = y_podcast_test_actual_binary_array == 1   # truly sponsor
new_sponsors = flat_predictions == 1                          # predicted sponsor

# Both names hold the count of segments that are sponsor in truth AND in
# prediction; the denominators are applied when printing.
recall = np.sum(new_sponsors & original_sponsors)
precision = np.sum(original_sponsors & new_sponsors)

print(f"Overall correct: {test_correct/len(x_podcast_test)}")
print(f"Recall: {recall/np.sum(original_sponsors)}")
print(f"Precision: {precision/np.sum(new_sponsors)}")

Overall correct: 0.6179775280898876
Recall: 1.0
Precision: 0.19047619047619047


## End to End Labeling of Unseen Podcasts

In [None]:
transcript_dir = "test_transcripts"
output_dir = "test_segmentations_nn"

# Make sure the destination exists instead of failing on the first write.
os.makedirs(output_dir, exist_ok=True)

# Sorted so the files are processed in a stable order.
transcript_files = sorted(os.listdir(transcript_dir))

# The tokenizer configuration is the same for every file, so build it once.
segmenter = TextTilingTokenizer(cutoff_policy=0)

for transcript_file in transcript_files:
  transcript_path = os.path.join(transcript_dir, transcript_file)
  if not os.path.isfile(transcript_path):
    # Sub-directories and other non-file entries cannot be opened as transcripts.
    continue

  # NOTE(review): only the first line of each file is read - presumably
  # every transcript is stored on a single line; confirm.
  with open(transcript_path) as f:
    raw_transcript = f.readline()

  # Put a blank line after every period before handing the text to the tokenizer.
  transcript = raw_transcript.replace(".", ".\n\n")

  segments = segmenter.tokenize(transcript)

  spotify_test_vectorization = vectorizer.transform(segments).toarray()

  # Inference only, so gradient tracking is switched off.
  with torch.no_grad():
    spotify_predicted_labels = podcast_model(torch.from_numpy(np.array(spotify_test_vectorization).astype(np.float32)))

  # Swap only the file extension for ".csv"; str.replace would also rewrite
  # any ".txt" occurring earlier in the file name.
  output_name = os.path.splitext(transcript_file)[0] + ".csv"

  # newline="" lets the csv writer emit its own line terminators.
  with open(os.path.join(output_dir, output_name), "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(['label', 'text'])
    for segment, label in zip(segments, spotify_predicted_labels):
      writer.writerow(['sponsor' if label > 0.5 else 'non-sponsor', segment])
