# Imports

In [1]:
from dataProcessing import getTextsInDateRange, getPairs, getFullWordset, buildTokenizer
import numpy as np
from keras.models import Model
from keras.layers import Input, Dense

import random
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/ebolton/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ebolton/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Conda env checking
From article here:
https://towardsdatascience.com/get-your-conda-environment-to-show-in-jupyter-notebooks-the-easy-way-17010b76e874

If conda env is being used correctly, the environment variable below should be `base`.

In [2]:
import os
print(os.environ['CONDA_DEFAULT_ENV'])

base


# Constants

In [14]:
CHANNEL_MAPPINGS = {
#     "Daily Wire": "UCroKPvbmaQKGK5tjtQsvaDw",
    "Ben Shapiro": "UCnQC_G5Xsjhp9fEJKuIcrSw",
#     "Daily Wire Plus": "UCaeO5vkdj5xOQHp4UmIN6dw",
#     "Matt Walsh": "UCO01ytfzgXYy4glnPJm4PPQ",
#     "Michael Knowls": "UCr4kgAUTFkGIwlWSodg43QA",
#     "Candace Owens": "UCkY4fdKOFk3Kiq7g5LLKYLw"
}

# Parameters

In [15]:
# channels = ['Ben Shapiro']
channels = list(CHANNEL_MAPPINGS.keys())
start = 2021
stop = 2023

# Get Texts

In [16]:
nltk.download('omw-1.4')
channelIds = [CHANNEL_MAPPINGS[c] for c in channels]
texts = getTextsInDateRange(channelIds, start, stop, cleaned=True)
channelstext = "\n  - ".join(channels)
print([t[:30] for t in texts[:10]])
print()
print(f'Found {len(texts)} videos for channels: \n  - {channelstext}')

[nltk_data] Downloading package omw-1.4 to /Users/ebolton/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


[['sad', 'left', 'california', 'tell', 'thought', 'one', 'day', 'decision', 'one', 'best', 'decision', 'ever', 'made', 'family', 'know', 'orthodox', 'jewish', 'faith', 'mean', 'move', 'place', 'significant', 'jewish', 'resource', 'inundation', 'homelessness', 'area', 'like', 'kid', 'could'], ['bizarre', 'new', 'world', 'created', 'counter-reality', 'new', 'world', "we've", 'created', 'assigning', 'sex', 'baby', 'dc', 'forcing', 'label', 'mother', "that's", 'case', 'throne', 'mother', 'whole', 'birth', 'certificate', 'thing', 'really', 'really', 'causing', 'lot', 'oh', 'i'], ['end', 'entire', 'group', 'people', 'single', 'woman', 'falling', 'behind', 'married', 'woman', 'engaged', 'social', 'structure', 'important', 'mediating', 'institution', 'human', 'existence', 'marriage', 'ripped', 'away', "they've", 'told', 'longer', 'essential', 'important', 'fact', 'many', 'way', 'institution'], ['idea', 'left', 'everybody', 'right', 'keep', 'saying', "there's", 'election', 'fraud', "that's", 'r

# Build Pairs

In [17]:
pairs = getPairs(texts, window=2)
print(pairs[:10])
print(random.sample(pairs, 100))
print(f'Built {len(pairs)} pairs')

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 583/583 [00:00<00:00, 759.48it/s]


[('sad', 'left'), ('sad', 'california'), ('left', 'california'), ('left', 'sad'), ('left', 'tell'), ('california', 'tell'), ('california', 'left'), ('california', 'thought'), ('california', 'sad'), ('tell', 'thought')]
[('boy', 'penis'), ('activity', 'year'), ('corporation', 'tech'), ('itself', 'way'), ('daily', 'copyright'), ('run', "he'll"), ('side', 'always'), ('abortion', 'concluded'), ('preventing', 'election'), ('and', 'want'), ('house', '41'), ('coveted', 'saying'), ('like', 'make'), ('leverage', 'additional'), ('anger', 'hatred'), ('republican', 'announces'), ('doubling', 'basically'), ('happily', "he's"), ('around', 'medical'), ('devil', 'satanist'), ("there's", 'like'), ('even', 'right'), ('amendment', 'constitutional'), ('hurt', 'dot'), ('staring', 'biden'), ('daca', 'right'), ('represented', 'descent'), ('poll', 'two'), ('already', 'actual'), ('safe', 'hand'), ('exactly', 'new'), ('ceiling', 'could'), ('assimilation', 'worried'), ('putin', 'vladimir'), ('red', 'red'), ('dem

# Get wordset and build tokenizer

In [18]:
wordset = getFullWordset(texts)
word2idx, idx2word = buildTokenizer(wordset)
print(list(wordset)[:20])
print(random.sample(list(wordset), 20))
print(f'Wordset is size: {len(wordset)}')

['', 'crossover', 'upping', 'dismayed', 'trespasser', 'non-issue', 'houston', 'bluetooth', 'fargo', 'lib', 'absurdly', 'infuses', 'fleece', 'usps', '76.1', 'squanto', 'self-declared', 'effectuate', 'demonstrative', 'visor']
['hampering', 'sherpa', 'eighteen', 'sotomayor', 'arkham', 'delicate', "bond's", 'staked', 'wrestler', 'hovering', 'istanbul', '1968', "christ's", 'catfishing', "sock'em", 'wear', 'debating', "fascia's", 'nationless', 'conflation']
Wordset is size: 30493


# Count pairs for creating weights

In [29]:
pairWeights = {}
for pair in pairs:
    if pair not in pairWeights:
        pairWeights[pair] = 0
    pairWeights[pair] += 1

# Convert Pairs to ints

In [34]:
def pairToInts(pair, mapping):
    a, b = pair
    return (mapping[a], mapping[b])

pairset = set(pairs)
weights = []
intpairs = []
# intpairs = [pairToInts(p, word2idx) for p in pairs]
for p in pairset:
    intpairs.append(pairToInts(p, word2idx))
    weights.append(pairWeights[p])
print(intpairs[:10])
print(weights[:10])
print(f'Total pairs: {len(intpairs)}')

[(26344, 12636), (71, 11521), (6064, 29736), (17539, 7186), (28974, 11433), (20220, 21700), (21250, 27427), (2642, 13738), (29056, 7183), (27688, 1967)]
[11, 2, 1, 1, 1, 1, 1, 2, 2, 1]
Total pairs: 1775754


# Convert ints to one-hot vectors

In [35]:
ins = [p[0] for p in intpairs]
outs = [p[1] for p in intpairs]
print(min(ins), max(ins))
print(min(outs), max(outs))
print(len(wordset))

0 30492
0 30492
30493


# Build X, Y matrices

In [37]:
import time
from tqdm import tqdm
n = np.array(intpairs)
a = n[:, 0]
b = n[:, 1]
trainingWeights = np.array(weights)
print(n.shape)
print(a.shape)
print(b.shape)
print(trainingWeights.shape)
print(f'Final size will be: {len(n), len(wordset)}')
print(f'Total points: {format(len(n)*len(wordset), ",d")}')

(1775754, 2)
(1775754,)
(1775754,)
(1775754,)
Final size will be: (1775754, 30493)
Total points: 54,148,066,722


In [40]:
print(len(intpairs))
print(len(a), len(b))
start = time.time()

X = np.zeros((a.size, a.max()+1), dtype=np.bool_)
X[np.arange(a.size), a] = 1
Y = np.zeros((b.size, b.max()+1), dtype=np.bool_)
Y[np.arange(b.size), b] = 1
    
print(f'{time.time() - start} seconds')
print(X.shape)
print(Y.shape)

1775754
1775754 1775754
7.732770919799805 seconds
(1775754, 30493)
(1775754, 30493)


# Build and compile NNet

In [52]:
EMBED_SIZE = 64
os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [53]:
inp = Input(shape=(X.shape[1],))
x = Dense(units=EMBED_SIZE, activation='linear')(inp)
x = Dense(units=Y.shape[1], activation='softmax')(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [54]:
print(model.summary())

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 30493)]           0         
                                                                 
 dense_2 (Dense)             (None, 64)                1951616   
                                                                 
 dense_3 (Dense)             (None, 30493)             1982045   
                                                                 
Total params: 3,933,661
Trainable params: 3,933,661
Non-trainable params: 0
_________________________________________________________________
None


# Train NNet

In [None]:
os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [50]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [58]:
hist = model.fit(
    x=X,
    y=Y,
    batch_size=32,
    epochs=150,
    sample_weight=trainingWeights
)

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.

In [48]:
import tensorflow as tf
tf.config.list_physical_devices('GPU')
# tf.test.is_gpu_available()

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

# Performance Stats

In [None]:
import matplotlib.pyplot as plt
plt.plot(hist.history['loss'])
plt.show()

# Build word -> vector mapping

In [None]:
weights = model.get_weights()[0]
print(weights)
print(weights.shape)
word2vec = {}
for word in wordset:
    vec = weights[word2idx[word]]
    word2vec[word] = vec

# Check pair counts

In [None]:
print(len(pairs))
print(len(set(pairs)))
s = set(pairs)
counts = {}
for p in pairs:
    if p not in counts:
        counts[p] = 1
    else:
        counts[p] = counts[p]+1
sortedpairs = sorted(list(s), key=lambda p: counts[p], reverse=True)
for p in sortedpairs:
    print(counts[p], p)

# Get nearest neighbors

In [None]:
targetWord = 'Biden'
v = word2vec[targetWord]
print(v.shape)

dists = {}
for word, vec in word2vec.items():
    d = np.linalg.norm(v - vec)
    dists[word] = d

allwords = list(wordset)
allwords = sorted(allwords, key=lambda w: dists[w])
N = 10
for i in range(N):
    d = dists[allwords[i]]
    print(f"{i:<2} {str(round(d, 3)):<6} {allwords[i]}")

# Export wordset

In [None]:
wordtext = '\n'.join(wordset)
with open('wordset.txt', 'w') as file:
    file.write(wordtext)
    print('WROTE FILE')