# Applying Cosine Similarity Between TikToks and NYT Articles

In [1]:
import pandas as pd
import numpy as np
from numpy.linalg import norm
import string
import seaborn as sns 
import matplotlib.pyplot as plt
import os
import json
import datetime as dt
import time
import using_nyt_api as nyt

main_dir = os.path.dirname(os.getcwd())
analysis_dir = f'{main_dir}/analysis'
pyktok_dir = f'{main_dir}/pyktok-results'

## Cosine Similarity

In [2]:
def getVocabulary(textchunk):
    """Given some text, create the vocabulary of unique words."""
    textchunk = textchunk.lower()
    cleantext = "".join(char for char in textchunk if char not in string.punctuation)
    words = set(cleantext.split())
    voc = sorted(words)
    return voc

def text2vector(sentence, voc):
    """Given a sentence and the vocabulary for the problem,
    turn every sentence into a vector.
    """
    cleantext = "".join(char for char in sentence if char not in string.punctuation)
    words = cleantext.lower().split()
    vector = [words.count(w) for w in voc]
    return vector

def cosineSimilarity(vec1,vec2):
    """Calculate the cosine similarity between two vectors."""
    V1 = np.array(vec1)
    V2 = np.array(vec2)
    cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))
    return cosine

def rankDocuments(query,sentences):
    """Given a query and some sentences, rank the sentences for
    which are the most similar to the query"""
    # Step 1: create vocabulary
    voc = getVocabulary(" ".join(sentences))

    # Step 2: generate vector for query
    queryVec = text2vector(query,voc)

    # Step 3: generate vector for sentences and calculate cosine similarity at once
    similarities = []
    for sent in sentences:
        sentVec = text2vector(sent,voc)
        sim = cosineSimilarity(queryVec,sentVec)
        similarities.append((round(sim,4),sent)) # keep track of sentences

    similarities.sort(reverse=True)
    return similarities # a tuple of (cosine similarity, sentence)

## NYTimes API

In [3]:
def check_file_exists(filepath,filename):
    if filename in os.listdir(filepath):
        return True
    return False

def get_nyt_data(date,key,save_csv):
    '''Takes a date, NYT API key, and boolean for whether to save data as csv. 
    Returns a df containing the abstract, lead_paragraph, pub_date, 
    document_type, section_name, type_of_material, headline, and keywords 
    of articles for that date and saves df to {date}-articles.csv.'''
    # check if articles for given date exist
    filepath = f'{analysis_dir}/nyt_data/' # save files to analysis/nyt_data folder
    filename = f'{date}-articles.csv'
    file_exists = check_file_exists(filepath,filename)
    
    if file_exists: # reads df from existing file if file exists
        nyt_data = pd.read_csv(f'{filepath}/{filename}')
    else: # gets articles by calling NYT API and creates df
        articles = nyt.get_articles_by_date(date,key)
        nyt_data = nyt.create_df(articles,date,filepath,write_csv=save_csv)
    return nyt_data

In [4]:
# Testing on one date
date = '2024-03-13'
key = '1DFmIMxxqdYl8wJBPqAFxtHkimk86Qtn'
save_csv = True
nyt_data = get_nyt_data(date,key,save_csv)
nyt_data.head(3)

Unnamed: 0.1,Unnamed: 0


## Tiktoks with News Hashtags

In [5]:
def get_pyktok_metadata(code):
    '''Returns a df of metadata collected from pyktok for the given code'''
#     filename = f"pyktok-results/results_{code}.csv"
    filename = f'{pyktok_dir}/results_{code}.csv'
    metadata = pd.read_csv(filename)
    return metadata

def get_news_vids_by_code(code):
    '''Get the videos with news hashtags based on code'''
#     filename = f'analysis/videos-newsHashtags-{code}.json'
    filename = f'{analysis_dir}/videos-newsHashtags-{code}.json'
    with open(filename, 'r') as inFile:
        data = json.load(inFile)
    news_vids = pd.read_json(data, orient="split")
    return news_vids

def add_date_to_news_vids(metadata,news_vids):
    updated_df = news_vids.merge(metadata[['video_id','video_timestamp']], on='video_id', how='left') # keeps all ids in news_vids
    updated_df['date'] = updated_df['video_timestamp'].astype(str).apply(lambda timestamp: timestamp[:10])
    return updated_df

In [6]:
code = 12345

# Pyktok Metadata is located within pyktok-results folder
metadata = get_pyktok_metadata(code)
print(f'Metadata DF Shape: {metadata.shape}')

# JSON files containing news-related videos located in analysis folder
news_vids = get_news_vids_by_code(code)
print(f'Shape of News-related Videos DF: {news_vids.shape}')

Metadata DF Shape: (5951, 20)
Shape of News-related Videos DF: (361, 7)


  news_vids = pd.read_json(data, orient="split")


In [7]:
metadata.head(3)

Unnamed: 0,video_id,video_timestamp,video_duration,video_locationcreated,suggested_words,video_diggcount,video_sharecount,video_commentcount,video_playcount,video_description,video_is_ad,video_stickers,author_username,author_name,author_followercount,author_followingcount,author_heartcount,author_videocount,author_diggcount,author_verified
0,7331569476502064414,2024-02-03T21:34:29,7.0,US,"Hamsters, Cute Hamster, Hamsters Of TikTok, Ha...",70000.0,727.0,148.0,345800.0,Lol pronoun check ! #gay #hamster #nyc i love ...,False,,kyiskuel,Ky,,,,,,False
1,7334088719881882913,2024-02-10T16:30:26,46.0,IE,,8977.0,144.0,74.0,132200.0,Dennis was sculpted by the gods #alwayssunny #...,False,,itsalwayssunnydaily,ItsAlwaysFunnyInPhilidelphia,,,,,,False
2,7317823927156329771,2023-12-28T20:34:44,5.0,US,"baddieshub, baddiehub noemy, noemy baddieshub ...",753300.0,3817.0,3700.0,9300000.0,Huh?? #fyp #funny #viral,False,,noemyiscool_,Noemy,,,,,,False


In [8]:
news_vids.head(3)

Unnamed: 0,video_id,author_username,video_description,hashtags,suggested_words,isNewsAccount,isNewsHashtag
56,7343058460025703722,msnbc,NBC News and MSNBC Correspondent Shaquille Bre...,"[fyp, foryou, politicaltiktok, politics, 2024p...","haley, nbc, nicki halley",False,True
334,7340809195807132971,blakes_takes1,#greenscreen #greenscreenvideo Rebecca Ferguso...,"[greenscreen, greenscreenvideo, movie, news, d...","Rebecca Ferguson, hugh jackman, rebecca fergus...",False,True
460,7341877699629108523,randomusa10,TEXAS THO😬👎🏼 #states #50statessong #50niftyuni...,"[states, 50statessong, 50niftyunitedstates, vi...","rhode island, wyoming state, 50 state song, ar...",False,True


Let's also find the dates that these videos with news-related hashtags were watched.
This way we'll be able to get the data of the relevant NYT headlines for that date.

In [9]:
# Add video_timestamp from metadata df to news_vids df
updated_news_vids = add_date_to_news_vids(metadata,news_vids)
updated_news_vids.head(3)

Unnamed: 0,video_id,author_username,video_description,hashtags,suggested_words,isNewsAccount,isNewsHashtag,video_timestamp,date
0,7343058460025703722,msnbc,NBC News and MSNBC Correspondent Shaquille Bre...,"[fyp, foryou, politicaltiktok, politics, 2024p...","haley, nbc, nicki halley",False,True,2024-03-05T20:37:20,2024-03-05
1,7340809195807132971,blakes_takes1,#greenscreen #greenscreenvideo Rebecca Ferguso...,"[greenscreen, greenscreenvideo, movie, news, d...","Rebecca Ferguson, hugh jackman, rebecca fergus...",False,True,2024-02-28T19:09:27,2024-02-28
2,7341877699629108523,randomusa10,TEXAS THO😬👎🏼 #states #50statessong #50niftyuni...,"[states, 50statessong, 50niftyunitedstates, vi...","rhode island, wyoming state, 50 state song, ar...",False,True,2024-03-02T16:15:33,2024-03-02


## Cosine Similarity between some TikToks and some NYT articles

Each of your transcribed videos should have both the date and the various text features (transcript, description, hashtags, or suggested words). Ideally, we will create embeddings of each of them separately and even an embedding of transcript + description + suggested words (hashtags are already in the description) and then compare each of them against the respective NYT headlines of the day. 

However, it is okay if you pick only one category to get the embedding and calculate the cosine similarity. 
For each post, save the most similar NYT article and the cosine score. Put everything in a dataframe so that you can sort to see the most similar pairs and the least similar ones, and also use describe() to see the range of cosine similarity scores. 

For each post, save the most similar NYT article and the cosine score. Put everything in a dataframe so that you can sort to see the most similar pairs and the least similar ones, and also use describe() to see the range of cosine similarity scores.

In [10]:
def get_all_nyt_data(dates):
    '''Saves and returns cleaned dfs for each date'''
    nyt_dfs = []
    for date in dates:
        df = get_nyt_data(date,key,True)
        print(f'\nShape of df for {date}: {df.shape}')
        nyt_dfs.append(df)
    return nyt_dfs

In [14]:
%%time
# Get NYT data for the dates where user watched a news-related video
key = '1DFmIMxxqdYl8wJBPqAFxtHkimk86Qtn'
dates = updated_news_vids['date']
len(dates)

nyt_dfs = get_all_nyt_data(dates)


Shape of df for 2024-03-05: (308, 9)

Shape of df for 2024-02-28: (173, 9)

Shape of df for 2024-03-02: (91, 9)

Shape of df for 2024-03-01: (171, 9)

Shape of df for 2024-03-02: (91, 9)

Shape of df for 2024-02-03: (77, 9)

Shape of df for 2024-02-27: (144, 9)

Shape of df for 2024-03-01: (171, 9)

Shape of df for 2024-03-01: (171, 9)

Shape of df for 2024-02-23: (163, 9)

Shape of df for 2024-02-28: (173, 9)

Shape of df for 2024-02-28: (173, 9)

Shape of df for 2024-02-28: (173, 9)

Shape of df for 2024-02-28: (173, 9)

Shape of df for 2024-02-28: (173, 9)

Shape of df for 2024-02-27: (144, 9)

Shape of df for 2024-02-27: (144, 9)

Shape of df for 2024-02-27: (144, 9)

Shape of df for 2024-02-25: (67, 9)

Shape of df for 2024-02-15: (173, 9)

Shape of df for 2024-02-22: (155, 9)

Shape of df for 2024-02-25: (67, 9)

Shape of df for 2023-06-12: (130, 9)

Shape of df for 2023-02-26: (61, 9)

Shape of df for 2022-10-10: (99, 9)

Shape of df for 2023-10-31: (79, 9)

Shape of df for 202

In [15]:
def apply_cosine_similarity_by_feature(news_vids,video_id,key,feature):
    '''Takes df of videos, video id (as int64), NYT API key, and an feature
    of the news_vids df (suggested_words, video_description, etc), and finds the 
    cosine similarity between that attribute and all the NYTimes headlines 
    from the date the video was watched.'''
    # Get the relevant tiktok video and its video description
    video = news_vids[news_vids['video_id']==video_id].iloc[0] # get 1 video
    feature_text = video[feature] # gets feature, such as description, suggested words, etc
    date = video['date']

    # Load the relevant nytdata for given date
    nyt_filepath = f'{analysis_dir}/nyt_data'
    nyt_filename = f'{date}-articles.csv'
    print(f'Looking for {nyt_filepath}/{nyt_filename}')
    
    try:
        nyt_df = pd.read_csv(f'{nyt_filepath}/{nyt_filename}')
        print(f'Loaded data from {nyt_filename}')
    except FileNotFoundError as fnfe:
        print(fnfe)
        print(f'File {nyt_filename} not found. Creating it instead.')
        nyt_df = get_nyt_data(date,key,True)

    # Get all the headlines for the date
    try:
        headlines = nyt_df['headline'].values
    except KeyError:
        print("\'headline\' not found in nyt_df.")
        headlines = []

    # Apply cosine similarity
    similarities = rankDocuments(feature_text,headlines) # a tuple of (cosine similarity, sentence)
    return similarities, feature_text, date

In [17]:
len(os.listdir(f'{analysis_dir}/nyt_data'))

157

In [18]:
%%time
code = 12345
metadata = get_pyktok_metadata(code)
news_vids = get_news_vids_by_code(code)
updated_news_vids = add_date_to_news_vids(metadata,news_vids)
video_id = updated_news_vids['video_id'].values[0] # get a particular video id
feature = 'video_description' #'suggested_words' 

similarities,feature_text,date = apply_cosine_similarity_by_feature(updated_news_vids,video_id,key,feature)

print(f'\nTikTok User Code: {code}')
print(f'\nVideo ID: {video_id}')
print(f'\nVideo\'s {feature}: {feature_text}')
print(f'\nDate watched: {date}')
print(f'\nTop 5 similarities:')
for sim in similarities[:5]:
    print(sim)

Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-03-05-articles.csv
Loaded data from 2024-03-05-articles.csv

TikTok User Code: 12345

Video ID: 7343058460025703722

Video's video_description: NBC News and MSNBC Correspondent Shaquille Brewster speaks to a North Carolina Trump voter who would not consider voting for Nikki Haley because she's a woman. "A woman’s not gonna be a good president," he tells Brewster, "All a woman's good for in my book is having babies and taking care of the house." #fyp #foryou #politicaltiktok #politics #2024president #election2024 #trump #haley #vote 

Date watched: 2024-03-05

Top 5 similarities:
(0.4616, 'The Lindbergh Baby Kidnapping: A Grisly Theory and a Renewed Debate')
(0.3967, 'A Grainy Photo and a Dilemma: How U.K. Papers Are Covering Princess Catherine')
(0.3696, 'A Trans-Atlantic Crackdown')
(0.3659, 'A Shape-Shifting House in Los Angeles')
(0.3474, 'A Deadly Aid Delivery and Growing Threat of Famine in Gaza')


  news_vids = pd.read_json(data, orient="split")


In [26]:
# def get_all_cosine_similarities_by_feature(news_vids,video_ids,key,feature):
#     results = []

#     for video_id in video_ids:
#         print(f'Video ID: {video_id}')
#         try:
#             result_dict = {}
#             similarities,video_feature,date = apply_cosine_similarity_by_feature(news_vids,video_id,key,feature)
#             result_dict['video_id'] = video_id
#             result_dict['date'] = date
#             result_dict['video_feature']
#             result_dict['similarities'] = similarities
#             print(f'Date Watched: {date}')
#             print(f'{feature}: {video_feature}')
#             print(f'Top 5 most similar NYT headlines:')
#             for sim in similarities[:5]: 
#                 print(sim)
#             print()
#             results.append((similarities,video_feature,date))
#         except TypeError as te:
#             print(f'Problem with getting the cosine similarities for video {video_id}')
#             print(te)

#     return results


def get_all_cosine_similarities_by_feature(updated_news_vids,video_ids,key,feature):
    results = []

    for video_id in video_ids:
        print(f'Video ID: {video_id}')
        try:
            result_dict = {}
            similarities,feature_text,date = apply_cosine_similarity_by_feature(updated_news_vids,video_id,key,feature)
            result_dict['video_id'] = video_id
            result_dict['date'] = date
            result_dict[f'{feature}'] = feature_text
            result_dict['similarities'] = similarities
            cos_sim_scores = [sim[0] for sim in similarities]
            nyt_headlines = [sim[1] for sim in similarities]
            result_dict['cos_sim_scores'] = cos_sim_scores
            result_dict['nyt_headlines'] = nyt_headlines
            results.append(result_dict)
        except TypeError as te:
            print(f'Problem with getting the cosine similarities for video {video_id}')
            print(te)

    return results

In [27]:
# %%time
# feature = 'video_description'
# video_ids = news_vids['video_id'].values
# # results = get_all_cosine_similarities(news_vids,video_ids,key)
# results = get_all_cosine_similarities_by_feature(news_vids,video_ids,key,feature)

## Putting Everything Together

In [28]:
key = '1DFmIMxxqdYl8wJBPqAFxtHkimk86Qtn'

In [32]:
def get_results(key,code,feature):
    # Get data for given code
    metadata = get_pyktok_metadata(code)
    news_vids = get_news_vids_by_code(code)
    updated_news_vids = add_date_to_news_vids(metadata,news_vids)
    video_ids = updated_news_vids['video_id'].values 

    results = get_all_cosine_similarities_by_feature(updated_news_vids,video_ids,key,feature)
    df = pd.DataFrame(results)
    
    # Save results
    filename = f'cosine_sim_{feature}_{code}.csv'
    df.to_csv(filename)
    
    return df

# print(f'\nTikTok User Code: {code}')
# print(f'\nVideo ID: {video_id}')
# print(f'\nVideo\'s {feature}: {feature_text}')
# print(f'\nDate watched: {date}')
# print(f'\nTop 5 similarities:')
# for sim in similarities[:5]:
#     print(sim)

In [34]:
code = 12345
feature = "suggested_words"#"video_description" 
df = get_results(key,code,feature)
df

  news_vids = pd.read_json(data, orient="split")
  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Video ID: 7343058460025703722
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-03-05-articles.csv
Loaded data from 2024-03-05-articles.csv
Video ID: 7340809195807132971
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-28-articles.csv
Loaded data from 2024-02-28-articles.csv
Video ID: 7341877699629108523
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-03-02-articles.csv
Loaded data from 2024-03-02-articles.csv
Video ID: 7341485301010877728
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-03-01-articles.csv
Loaded data from 2024-03-01-articles.csv
Video ID: 7341694662866046254
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-03-02-articles.csv
Loaded data from 2024-03-02-articles.csv
Video ID: 7331462452195970337
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-03-articles.csv
Loaded da

Video ID: 7338958081285459231
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-23-articles.csv
Loaded data from 2024-02-23-articles.csv
Video ID: 7339169514493447454
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-24-articles.csv
Loaded data from 2024-02-24-articles.csv
Video ID: 7339041283610037534
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-24-articles.csv
Loaded data from 2024-02-24-articles.csv
Problem with getting the cosine similarities for video 7339041283610037534
'NoneType' object is not iterable
Video ID: 7337716616479788331
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-20-articles.csv
Loaded data from 2024-02-20-articles.csv
Video ID: 7337716616479788331
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-20-articles.csv
Loaded data from 2024-02-20-articles.csv
Video ID: 7337716616479788331
L

Video ID: 7333747699298127150
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-09-articles.csv
Loaded data from 2024-02-09-articles.csv
Video ID: 7332966069767802154
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-07-articles.csv
Loaded data from 2024-02-07-articles.csv
Video ID: 7332797813983677717
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-07-articles.csv
Loaded data from 2024-02-07-articles.csv
Video ID: 7332174452706397486
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-05-articles.csv
Loaded data from 2024-02-05-articles.csv
Video ID: 7331850550503329066
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-04-articles.csv
Loaded data from 2024-02-04-articles.csv
Video ID: 7328492050683333921
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-26-articles.csv
Loaded da

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Video ID: 7326690380009852202
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-21-articles.csv
Loaded data from 2024-01-21-articles.csv
Video ID: 7325842797091736864
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-19-articles.csv
Loaded data from 2024-01-19-articles.csv
Video ID: 7326473620648414510
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-21-articles.csv
Loaded data from 2024-01-21-articles.csv
Video ID: 7326606097400302878
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-21-articles.csv
Loaded data from 2024-01-21-articles.csv
Video ID: 7326672765593996590
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-21-articles.csv
Loaded data from 2024-01-21-articles.csv
Video ID: 7326660951997041963
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/nan-articles.csv
Loaded data from

Loaded data from nan-articles.csv
'headline' not found in nyt_df.
Problem with getting the cosine similarities for video 7313353147525745922
'NoneType' object is not iterable
Video ID: 7312616929229868330
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/nan-articles.csv
Loaded data from nan-articles.csv
'headline' not found in nyt_df.
Video ID: 7306595312922021163
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/nan-articles.csv
Loaded data from nan-articles.csv
'headline' not found in nyt_df.
Problem with getting the cosine similarities for video 7306595312922021163
'NoneType' object is not iterable
Video ID: 7312638495430823211
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/nan-articles.csv
Loaded data from nan-articles.csv
'headline' not found in nyt_df.
Video ID: 7312566726024056106
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/nan-articles.csv
Loaded data from nan-a

Loaded data from nan-articles.csv
'headline' not found in nyt_df.
Video ID: 7279046355992071457
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/nan-articles.csv
Loaded data from nan-articles.csv
'headline' not found in nyt_df.
Problem with getting the cosine similarities for video 7279046355992071457
'NoneType' object is not iterable
Video ID: 7279704894506945798
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/nan-articles.csv
Loaded data from nan-articles.csv
'headline' not found in nyt_df.
Problem with getting the cosine similarities for video 7279704894506945798
'NoneType' object is not iterable
Video ID: 7280016453565156650
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/nan-articles.csv
Loaded data from nan-articles.csv
'headline' not found in nyt_df.
Problem with getting the cosine similarities for video 7280016453565156650
'NoneType' object is not iterable
Video ID: 7279568128894340395
Looking

Unnamed: 0,video_id,date,suggested_words,similarities,cos_sim_scores,nyt_headlines
0,7343058460025703722,2024-03-05,"haley, nbc, nicki halley","[(nan, Trump Is Running on Dystopian Fantasies...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[Trump Is Running on Dystopian Fantasies, Micr..."
1,7340809195807132971,2024-02-28,"Rebecca Ferguson, hugh jackman, rebecca fergus...","[(0.2236, E-Bike Battery Caused Fire That Kill...","[0.2236, 0.2041, 0.1826, 0.0, 0.0, 0.0, 0.0, 0...",[E-Bike Battery Caused Fire That Killed Young ...
2,7341877699629108523,2024-03-02,"rhode island, wyoming state, 50 state song, ar...","[(0.4243, Scenes From the Smokehouse Creek Fir...","[0.4243, 0.3536, 0.3207, 0.291, 0.2828, 0.2774...",[Scenes From the Smokehouse Creek Fire in Texa...
3,7341485301010877728,2024-03-01,"Wendy Williams, denial is a river in egypt, we...","[(0.3105, If You Had $1 Billion to Give Away, ...","[0.3105, 0.27, 0.252, 0.2224, 0.2224, 0.1782, ...","[If You Had $1 Billion to Give Away, What Char..."
4,7341694662866046254,2024-03-02,"royal family news, kate middleton brother, lad...","[(0.3015, News Leaders Around the World Pledge...","[0.3015, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....",[News Leaders Around the World Pledge Support ...
...,...,...,...,...,...,...
231,7278032406232223019,,"the fault in our stars, The Fault In Our Stars...",[],[],[]
232,7277248227693333802,,"margaritaville, trump ai singing, margaritavil...",[],[],[]
233,7277312227831549230,,"college, College Student, College Life, Time-L...",[],[],[]
234,7274906751294524704,,"history, History Tiktok, jewish, Middle Easter...",[],[],[]


### Account 12345

In [None]:
# Get pyktok data and videos with news-related hashtags
metadata_12345 = get_pyktok_metadata(12345)
news_vids_12345 = get_news_vids_by_code(12345)
news_vids_12345 = add_date_to_news_vids(metadata_12345,news_vids_12345)
print(news_vids_12345.shape)
news_vids_12345.head(3)

In [None]:
%%time
# Get NYT Article Data for all the unique dates
dates_12345 = news_vids_12345['date'].unique()
# get_all_nyt_data(dates_12345)

In [None]:
# Perform Analysis for 12345 using 'video description'
video_ids_12345 = news_vids_12345['video_id'].values

In [None]:
%%time
feature = 'video_description'
results_description_12345 = get_all_cosine_similarities_by_feature(news_vids_12345,video_ids_12345,key,feature)

In [None]:
%%time
feature = 'suggested_words'
results_suggestedwords_12345 = get_all_cosine_similarities_by_feature(news_vids_12345,video_ids_12345,key,feature)

In [None]:
# Find the videos who had the highest cosine similarities



### Account 50405

In [None]:
# Get pyktok data and videos with news-related hashtags
# metadata_50405 = get_pyktok_metadata(50405)
# news_vids_50405 = get_news_vids_by_code(50405)
# news_vids_date_50405 = add_date_to_news_vids(metadata_50405,news_vids_50405)
# news_vids_date_50405