# Running BERTopic on Google Drive

The following runs BERTopic on files stored in Google Drive.


In [2]:
# This initializes pydrive with a set of credentials.  Credential files are not in the git repository - please ask @jintrone for these files in Slack

from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
 
# Build an authenticated GoogleDrive client, reusing credentials cached in
# "mycreds.txt" when they exist.
gauth = GoogleAuth()
gauth.LoadCredentialsFile("mycreds.txt")
if gauth.credentials is None:
    # No cached credentials: configure the OAuth flow before authenticating.
    # NOTE(review): 'access_type': 'offline' together with 'approval_prompt': 'force'
    # presumably makes Google issue a refresh token so later runs can take the
    # Refresh() branch below — confirm against the PyDrive2 / Google OAuth docs.
    gauth.GetFlow()
    gauth.flow.params.update({'access_type': 'offline'})
    gauth.flow.params.update({'approval_prompt': 'force'})
    
    # Authenticate if they're not there
    gauth.LocalWebserverAuth()
elif gauth.access_token_expired:
    # Refresh them if expired
    gauth.Refresh()
else:
    # Initialize the saved creds
    gauth.Authorize()
# Save the current credentials to a file
gauth.SaveCredentialsFile("mycreds.txt")
# `drive` is the module-level client used by the helper functions defined in later cells.
drive = GoogleDrive(gauth)

In [3]:
# This block just sets up pointers to the relevant directories on the google drive

import re
import pandas as pd
import re

# Google Drive folder ids, keyed by the short folder name used when calling
# the helpers in this notebook (e.g. get_files("CNN")).
folders = {
    "CNN": "1Lz8u7wkDr4wYmSzsIKle0lQVWUvO1yim",
    "CrooksAndLiars": "102aK5TvKde43bQ-lgS5bX1oVeh1lA6u3",
    "NPR": "1ImjUNTH9tzQnQItdjQeb82HppngNx9Ir",
    "Reason": "1CaeCbBCpaIsdHtmOa0OhJ-weDKjQqv-P",
    "OANN": "1WFCXxukdjngqjQ9HzKVE3f11L2rF9EP4",
    "Demo": "1GwbUCc_k-3_fScEa3bspWdRF0Mpfzjjl",
}

# URL template for downloading a file by id; fill the placeholder with .format(file_id).
download_link = "https://drive.google.com/uc?export=download&id={}"

# Passed as the 'driveId' query parameter when listing files.
# NOTE(review): presumably the id of the shared drive that holds the folders above — confirm.
driveId = "0AM-VeyaNeDrSUk9PVA"

# List the files that sit directly inside one of the configured folders.
# Example:  get_files("CNN")

def get_files(folder):
    """Return the non-trashed Drive files whose parent is ``folders[folder]``.

    Args:
        folder: A key of the module-level ``folders`` mapping.

    Returns:
        A new list holding the entries produced by ``drive.ListFile(...).GetList()``.

    Raises:
        KeyError: If ``folder`` is not a key of ``folders``.
    """
    listing_params = {
        'q': f"'{folders[folder]}' in parents and trashed=false",
        'corpora': 'drive',
        'driveId': f'{driveId}',
        "includeItemsFromAllDrives": "true",
        "supportsAllDrives": "true",
    }
    return list(drive.ListFile(listing_params).GetList())



In [6]:
# Demo: list the contents of the "Demo" folder and show each file's id and name

x = get_files("Demo")
pd.DataFrame([{"id": entry['id'], "name": entry['title']} for entry in x])

Unnamed: 0,id,name
0,1zragDSmDe5gEpZE0ptGTDfSDG31-C5En,154041465_CNN.topic_defs.csv
1,1eytMjnGAp0aEANbUXsyXWascjxlbFCIL,154041465_CNN.topics.csv
2,1gyHbAPmcrrAu-vuYQkxybRmh6V23PZB3,154041465_CNN.csv


In [7]:
from bertopic import BERTopic
import os

# Run BERTopic on a file id - presumes a csv with tweets in a column called "text"
# and a per-tweet identifier in a column called "id"
def _clean_tweet_text(text):
    """Normalize one tweet: strip URLs, lowercase, drop @-tokens, keep letters only."""
    without_urls = re.sub(r"http\S+", "", text).lower()
    # split() never yields empty tokens, so startswith is safe on every token.
    without_mentions = " ".join(
        token for token in without_urls.split() if not token.startswith("@")
    )
    # Replace every run of non-letters with a space, then collapse whitespace.
    return " ".join(re.sub("[^a-zA-Z]+", " ", without_mentions).split())


def build_topic_model(id):
    """Download a tweet CSV by file id, fit BERTopic on it and tabulate the result.

    Args:
        id: File id substituted into the module-level ``download_link`` template.

    Returns:
        A tuple ``(membership, topic_defs)``:
        * ``membership`` - pivot table indexed by tweet ``id`` with one column per
          topic; a cell holds the weight 1 where the tweet was assigned that
          topic and 0 otherwise. Tweets assigned topic -1 are left out.
        * ``topic_defs`` - DataFrame with columns ``topic``, ``word``, ``weight``
          built from ``topic_model.get_topics()``, ordered by topic and then by
          the order of the terms within each topic.

    Raises:
        ValueError: If no tweet has any text left after cleaning.
    """
    raw = pd.read_csv(download_link.format(id), dtype={'id': str}, engine="python")
    raw = raw.dropna(subset=['text'])
    # astype(str) guards against a text column that was parsed as numbers.
    raw = raw.assign(text=raw['text'].astype(str).map(_clean_tweet_text))
    raw = raw.loc[raw['text'] != "", :]
    if raw.empty:
        raise ValueError(f"No usable tweet text in file {id}")
    tweets = raw['text'].to_list()

    topic_model = BERTopic(verbose=True)
    topics, probs = topic_model.fit_transform(tweets)

    topic_df = pd.DataFrame({"id": raw['id'].tolist(), "topic": topics})
    # Drop tweets assigned to topic -1, then give every remaining assignment weight 1.
    topic_df = topic_df.loc[topic_df['topic'] != -1, :].assign(weight=1)

    # One row per (topic, term); built directly so topics may have differing term counts.
    topic_defs = pd.DataFrame(
        [
            (topic, word, weight)
            for topic, terms in sorted(topic_model.get_topics().items())
            for word, weight in terms
        ],
        columns=["topic", "word", "weight"],
    )

    membership = pd.pivot_table(
        topic_df, values='weight', fill_value=0, columns='topic', index='id'
    )
    return membership, topic_defs

# Upload a single data file to a drive folder
def upload_data_file(folderid,filename,data,id=None):
    """Write ``data`` to a local CSV, upload it, and always remove the local copy.

    Args:
        folderid: Id of the parent folder placed in the file's ``parents`` metadata.
        filename: Local path the CSV is written to before uploading.
        data: Object exposing ``to_csv(path)`` (the callers in this notebook pass
            pandas DataFrames).
        id: Optional file id; when truthy it is added to the metadata handed to
            ``drive.CreateFile``.
            NOTE(review): presumably this makes the upload replace that existing
            file instead of creating a new one — confirm against PyDrive2.

    Raises:
        Whatever ``data.to_csv`` or the drive calls raise; the local file is
        removed before the exception propagates.
    """
    try:
        data.to_csv(filename)
        meta = {"mimeType": "text/csv", "parents": [{"id": folderid}]}
        if id:
            meta["id"] = id
        file1 = drive.CreateFile(meta)
        file1.SetContentFile(filename)
        file1.Upload(param={'supportsTeamDrives': True}) # Upload the file.
    finally:
        # Clean up even when the write or upload fails, so aborted runs leave no stray CSVs.
        if os.path.exists(filename):
            os.remove(filename)

# Upload the processed topic model / defs to a drive
def upload_files(stub,folder,topics,defs,ids):
    """Upload the topics table and the topic definitions for one raw file.

    Args:
        stub: Base name of the raw file; outputs are written as
            ``<stub>.topics.csv`` and ``<stub>.topic_defs.csv``.
        folder: Key of the module-level ``folders`` mapping naming the target folder.
        topics: Data passed to ``upload_data_file`` for the topics file.
        defs: Data passed to ``upload_data_file`` for the topic definitions file.
        ids: Mapping whose optional ``"topic"`` / ``"topic_defs"`` entries are the
            ids of already-existing output files; missing entries are passed as None.
    """
    folder_id = folders[folder]
    upload_data_file(folder_id, f"{stub}.topics.csv", topics, ids.get("topic"))
    upload_data_file(folder_id, f"{stub}.topic_defs.csv", defs, ids.get("topic_defs"))
        

# Generates a list of raw files to process - if overwrite is True, this will also 
# include raw files that already have topic output, so that output gets replaced
def get_files_to_process(folder,overwrite = False):
    """Group a folder's CSV files by stub and pick the raw files to process.

    Files that do not end in ".csv" or whose name contains "annotation" are
    ignored. The stub is the part of the name before the first "."; the rest of
    the name decides whether the file is the raw data (``'main'``), a topics
    file (``'topic'``) or a topic definitions file (``'topic_defs'``).

    Args:
        folder: Key of the module-level ``folders`` mapping.
        overwrite: When False, stubs that already have a topics file are left out.

    Returns:
        Dict mapping stub -> ``{'main': id, 'topic': id, 'topic_defs': id}``
        (the last two only when such files exist). Every returned entry has a
        ``'main'`` id; stubs with output files but no raw file are dropped
        because there is nothing to process for them.
    """
    retain = {}
    for entry in get_files(folder):
        name = entry['title']
        if not name.endswith(".csv") or "annotation" in name:
            continue
        stub = name.split(".")[0]
        # Classify on the part after the stub so a stub that happens to contain
        # "topic" is not mistaken for an output file.
        suffix = name[len(stub):]
        if "topic_defs" in suffix:
            role = 'topic_defs'
        elif "topic" in suffix:
            role = 'topic'
        else:
            role = 'main'
        retain.setdefault(stub, {})[role] = entry['id']

    # Without a raw file the caller's lookup of ['main'] would raise KeyError.
    retain = {k: v for k, v in retain.items() if 'main' in v}
    if not overwrite:
        retain = {k: v for k, v in retain.items() if 'topic' not in v}
    return retain

# Entry point: run the whole pipeline for one folder
def process_folder(folder,overwrite = False):
    """Build and upload topic output for every raw file selected in ``folder``.

    Args:
        folder: Key of the module-level ``folders`` mapping.
        overwrite: Forwarded to ``get_files_to_process``.

    For each selected stub this prints a progress line, fits the topic model on
    the stub's ``'main'`` file id and uploads the two resulting tables.
    """
    pending = get_files_to_process(folder, overwrite)
    for stub in pending:
        ids = pending[stub]
        print(f"Processing {stub}:{ids}")
        model_output = build_topic_model(ids['main'])
        topics, defs = model_output
        upload_files(stub, folder, topics, defs, ids)


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# NOTE(review): looks like a Drive file id kept around for ad-hoc testing; nothing
# visible in this notebook reads `test` — confirm before relying on or removing it.
test = '125jKjLQFldQUPPkWAk94WHbcffFlBlon'



In [8]:
# Run the full pipeline on the "Reason" folder. overwrite is left at its default
# (False), so stubs that already have a topics file are skipped.
process_folder("Reason")


Processing 102155783_Reason:{'main': '10DP8bE3xeor9QcRksVW6Q_Fbjr0GkS0B'}


Batches: 100%|██████████| 22/22 [00:00<00:00, 119.44it/s]
2022-10-14 15:36:33,973 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:36:40,408 - BERTopic - Reduced dimensionality
2022-10-14 15:36:40,445 - BERTopic - Clustered reduced embeddings


Processing 275757377_Reason:{'main': '1Z8XduQCuWP2qenfCvJmyI_ao52rnD0_2'}


Batches: 100%|██████████| 97/97 [00:00<00:00, 131.58it/s]
2022-10-14 15:36:46,579 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:36:56,882 - BERTopic - Reduced dimensionality
2022-10-14 15:36:57,060 - BERTopic - Clustered reduced embeddings


Processing 398960000_Reason:{'main': '1G9HNvtCZrDYKsjoe3_J8eZrhXHgyyHVc'}


Batches: 100%|██████████| 82/82 [00:00<00:00, 131.03it/s]
2022-10-14 15:37:03,368 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:37:14,883 - BERTopic - Reduced dimensionality
2022-10-14 15:37:15,038 - BERTopic - Clustered reduced embeddings


Processing 68544333_Reason:{'main': '1fAosQibcu4kAZMhX6GTvkF38ULTWSUSV'}


Batches: 100%|██████████| 61/61 [00:00<00:00, 111.06it/s]
2022-10-14 15:37:21,117 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:37:29,302 - BERTopic - Reduced dimensionality
2022-10-14 15:37:29,399 - BERTopic - Clustered reduced embeddings


Processing 440453364_Reason:{'main': '1hILwFglgX64qGrLthcH8gUaz5lhMCjQa'}


Batches: 100%|██████████| 42/42 [00:00<00:00, 105.18it/s]
2022-10-14 15:37:34,939 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:37:40,508 - BERTopic - Reduced dimensionality
2022-10-14 15:37:40,580 - BERTopic - Clustered reduced embeddings


Processing 44803131_Reason:{'main': '1h3wVCraD891MPDMjbTCJx2Q_niX3uE0c'}


Batches: 100%|██████████| 33/33 [00:00<00:00, 131.06it/s]
2022-10-14 15:37:45,788 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:37:50,220 - BERTopic - Reduced dimensionality
2022-10-14 15:37:50,270 - BERTopic - Clustered reduced embeddings


Processing 191680942_Reason:{'main': '1WlyUdUxumbFk-YwK5VF3oPapd4RUhv9f'}


Batches: 100%|██████████| 538/538 [00:04<00:00, 118.16it/s]
2022-10-14 15:38:01,862 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:38:25,415 - BERTopic - Reduced dimensionality


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

2022-10-14 15:38:27,317 - BERTopic - Clustered reduced embeddings


Processing 18009089_Reason:{'main': '1lHo13Ktbq5UDJQPRw6obM3K5xYgVjf4n'}


Batches: 100%|██████████| 271/271 [00:02<00:00, 133.88it/s]
2022-10-14 15:38:37,555 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:38:59,096 - BERTopic - Reduced dimensionality
2022-10-14 15:38:59,649 - BERTopic - Clustered reduced embeddings


Processing 13297002_Reason:{'main': '1-F5IuaGI03qfj1wHGreTI4ITG_dnswwn'}


Batches: 100%|██████████| 23/23 [00:00<00:00, 126.56it/s]
2022-10-14 15:39:05,510 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:39:09,817 - BERTopic - Reduced dimensionality
2022-10-14 15:39:09,854 - BERTopic - Clustered reduced embeddings


Processing 22234524_Reason:{'main': '1xfjqR8FefblNwYM8iJw-zjDNx0gMtztW'}


Batches: 100%|██████████| 490/490 [00:03<00:00, 131.63it/s]
2022-10-14 15:39:22,586 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:39:34,680 - BERTopic - Reduced dimensionality
2022-10-14 15:39:35,787 - BERTopic - Clustered reduced embeddings


Processing 423332623_Reason:{'main': '1eHKSZ5DhhN2FXFws3azlSyCIEDGYFcSw'}


Batches: 100%|██████████| 30/30 [00:00<00:00, 109.34it/s]
2022-10-14 15:39:42,200 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:39:46,854 - BERTopic - Reduced dimensionality
2022-10-14 15:39:46,906 - BERTopic - Clustered reduced embeddings


Processing 38679608_Reason:{'main': '1ayzqNoVahhAAG2lL81wtGx3IKvz5atHt'}


Batches: 100%|██████████| 263/263 [00:02<00:00, 118.11it/s]
2022-10-14 15:39:55,773 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:40:08,336 - BERTopic - Reduced dimensionality
2022-10-14 15:40:08,884 - BERTopic - Clustered reduced embeddings


Processing 35814923_Reason:{'main': '1mQBpR5qojZobFt0pzcDI1S22vHxfL-Bv'}


Batches: 100%|██████████| 928/928 [00:07<00:00, 127.72it/s]
2022-10-14 15:40:25,756 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:40:45,528 - BERTopic - Reduced dimensionality
2022-10-14 15:40:47,451 - BERTopic - Clustered reduced embeddings


Processing 14607764_Reason:{'main': '11dL8A-6qpTf9T1YGAsqGimRgmM8pZL46'}


Batches: 100%|██████████| 278/278 [00:02<00:00, 116.92it/s]
2022-10-14 15:40:59,668 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:41:13,273 - BERTopic - Reduced dimensionality
2022-10-14 15:41:13,904 - BERTopic - Clustered reduced embeddings


Processing 18963489_Reason:{'main': '1IXwAToBuFj97gEsn_AFTdIyTJnmP6eBE'}


Batches: 100%|██████████| 57/57 [00:00<00:00, 116.57it/s]
2022-10-14 15:41:19,999 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:41:27,114 - BERTopic - Reduced dimensionality
2022-10-14 15:41:27,216 - BERTopic - Clustered reduced embeddings


Processing 376030198_Reason:{'main': '1fDvOcPtQCC2Df82ivBlvGvy7bLk25Oi3'}


Batches: 100%|██████████| 318/318 [00:02<00:00, 127.28it/s]
2022-10-14 15:41:37,022 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:41:48,180 - BERTopic - Reduced dimensionality
2022-10-14 15:41:48,944 - BERTopic - Clustered reduced embeddings


Processing 28048361_Reason:{'main': '18o9F5u1wP2j1pcRWhtw_9p8oD395W_4G'}


Batches: 100%|██████████| 88/88 [00:00<00:00, 125.81it/s]
2022-10-14 15:41:56,370 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:42:10,821 - BERTopic - Reduced dimensionality
2022-10-14 15:42:10,968 - BERTopic - Clustered reduced embeddings


Processing 25139043_Reason:{'main': '1_Tgt85CL_5kyxprDGYd9i1KNLaOY_nJ8'}


Batches: 100%|██████████| 384/384 [00:03<00:00, 110.62it/s]
2022-10-14 15:42:22,749 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:42:32,525 - BERTopic - Reduced dimensionality
2022-10-14 15:42:33,095 - BERTopic - Clustered reduced embeddings


Processing 19725637_Reason:{'main': '1qq1rp0v55yoRTT1ob1VhuiAvBb3Pjg8i'}


Batches: 100%|██████████| 687/687 [00:05<00:00, 135.04it/s]
2022-10-14 15:42:47,357 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:43:02,369 - BERTopic - Reduced dimensionality
2022-10-14 15:43:03,584 - BERTopic - Clustered reduced embeddings


Processing 209097259_Reason:{'main': '1bz3h3N1FmLP0D1yS8UrJdXWQvlNY7j8s'}


Batches: 100%|██████████| 88/88 [00:00<00:00, 132.93it/s]
2022-10-14 15:43:10,704 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:43:23,536 - BERTopic - Reduced dimensionality
2022-10-14 15:43:23,689 - BERTopic - Clustered reduced embeddings


Processing 124786881_Reason:{'main': '1P1elmDxyTBQ8FMtje5dL-kH4NFPjL-tq'}


Batches: 100%|██████████| 544/544 [00:04<00:00, 133.39it/s]
2022-10-14 15:43:36,833 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:43:48,021 - BERTopic - Reduced dimensionality
2022-10-14 15:43:49,079 - BERTopic - Clustered reduced embeddings


Processing 77938670_Reason:{'main': '1ZlVWa8a55hsQLbWLsxrCn9YtbxC1Y3n_'}


Batches: 100%|██████████| 70/70 [00:00<00:00, 118.02it/s]
2022-10-14 15:43:56,126 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:44:05,169 - BERTopic - Reduced dimensionality
2022-10-14 15:44:05,288 - BERTopic - Clustered reduced embeddings


Processing 24495964_Reason:{'main': '1yA9AJA4QfMlT5XycYSbppknPI1734aNJ'}


Batches: 100%|██████████| 33/33 [00:00<00:00, 116.03it/s]
2022-10-14 15:44:10,381 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:44:14,771 - BERTopic - Reduced dimensionality
2022-10-14 15:44:14,828 - BERTopic - Clustered reduced embeddings


Processing 153874383_Reason:{'main': '1UDoZn1q-A2j2aL5LMrTV93oImMlkCHAv'}


Batches: 100%|██████████| 29/29 [00:00<00:00, 101.74it/s]
2022-10-14 15:44:20,102 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:44:24,613 - BERTopic - Reduced dimensionality
2022-10-14 15:44:24,661 - BERTopic - Clustered reduced embeddings


Processing 24139632_Reason:{'main': '1vHk9Pg6Rt6mSNj3HTQblKDs3ajwaUswt'}


Batches: 100%|██████████| 1075/1075 [00:07<00:00, 136.95it/s]
2022-10-14 15:44:43,007 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:45:08,647 - BERTopic - Reduced dimensionality
2022-10-14 15:45:10,843 - BERTopic - Clustered reduced embeddings


Processing 234229170_Reason:{'main': '1vefxXBHTtlzfjlcIb91c2BNm2CKvj1bQ'}


Batches: 100%|██████████| 25/25 [00:00<00:00, 121.83it/s]
2022-10-14 15:45:19,775 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:45:23,475 - BERTopic - Reduced dimensionality
2022-10-14 15:45:23,517 - BERTopic - Clustered reduced embeddings


Processing 322668208_Reason:{'main': '1XU30VwEEqLj8BvM3gfeyTdjUbO-gyWwL'}


Batches: 100%|██████████| 309/309 [00:02<00:00, 131.16it/s]
2022-10-14 15:45:32,078 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:45:47,126 - BERTopic - Reduced dimensionality
2022-10-14 15:45:47,485 - BERTopic - Clustered reduced embeddings


Processing 27468554_Reason:{'main': '1NT2bTdWZEl4ZShwI_lBU7sftt45Z3VIu'}


Batches: 100%|██████████| 708/708 [00:05<00:00, 134.17it/s]
2022-10-14 15:46:03,098 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:46:19,671 - BERTopic - Reduced dimensionality
2022-10-14 15:46:21,119 - BERTopic - Clustered reduced embeddings


Processing 60962714_Reason:{'main': '19iz2awSAXZg51XnSfD930EJBb2lSrfx6'}


Batches: 100%|██████████| 66/66 [00:00<00:00, 128.27it/s]
2022-10-14 15:46:29,555 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:46:37,542 - BERTopic - Reduced dimensionality
2022-10-14 15:46:37,597 - BERTopic - Clustered reduced embeddings


Processing 63640738_Reason:{'main': '1VBCh-SiI8CWz5LvBzhASY1CIB2R8Y7JN'}


Batches: 100%|██████████| 146/146 [00:01<00:00, 109.21it/s]
2022-10-14 15:46:46,011 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:46:58,860 - BERTopic - Reduced dimensionality
2022-10-14 15:46:58,988 - BERTopic - Clustered reduced embeddings


Processing 106863668_Reason:{'main': '153MTbTZJ0PTq27bqvF4JZjpKYwJiXh2B'}


Batches: 100%|██████████| 1393/1393 [00:10<00:00, 133.03it/s]
2022-10-14 15:47:20,323 - BERTopic - Transformed documents to Embeddings
2022-10-14 15:47:49,518 - BERTopic - Reduced dimensionality
2022-10-14 15:47:52,667 - BERTopic - Clustered reduced embeddings


Processing 14352000_Reason:{'main': '1FBr4fyPfhytSfFrkehEz_id7aP6v7cvR'}


HTTPError: HTTP Error 403: Forbidden