# Applying Cosine Similarity Between TikToks and NYT Articles

## Cosine Similarity

In [1]:
import pandas as pd
import numpy as np
from numpy.linalg import norm
import string
import seaborn as sns 
import matplotlib.pyplot as plt
import os
import json
import datetime as dt
import time

In [2]:
def getVocabulary(textchunk):
    """Given some text, create the vocabulary of unique words."""
    textchunk = textchunk.lower()
    cleantext = "".join(char for char in textchunk if char not in string.punctuation)
    words = set(cleantext.split())
    voc = sorted(words)
    return voc

def text2vector(sentence, voc):
    """Given a sentence and the vocabulary for the problem,
    turn every sentence into a vector.
    """
    cleantext = "".join(char for char in sentence if char not in string.punctuation)
    words = cleantext.lower().split()
    vector = [words.count(w) for w in voc]
    return vector

def cosineSimilarity(vec1,vec2):
    """Calculate the cosine similarity between two vectors."""
    V1 = np.array(vec1)
    V2 = np.array(vec2)
    cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))
    return cosine

def rankDocuments(query,sentences):
    """Given a query and some sentences, rank the sentences for
    which are the most similar to the query"""
    # Step 1: create vocabulary
    voc = getVocabulary(" ".join(sentences))

    # Step 2: generate vector for query
    queryVec = text2vector(query,voc)

    # Step 3: generate vector for sentences and calculate cosine similarity at once
    similarities = []
    for sent in sentences:
        sentVec = text2vector(sent,voc)
        sim = cosineSimilarity(queryVec,sentVec)
        similarities.append((round(sim,4),sent)) # keep track of sentences

    similarities.sort(reverse=True)
    return similarities

## NYTimes API

In [3]:
import using_nyt_api as nyt

def get_nyt_data(date,key,save_csv):
    '''Takes a date, NYT API key, and boolean for whether to save data as csv. 
    Returns a df containing the abstract, lead_paragraph, pub_date, 
    document_type, section_name, type_of_material, headline, and keywords 
    of articles for that date and saves df to {date}-articles.csv.'''
    articles = nyt.get_articles_by_date(date,key)
    nyt_data = nyt.create_df(articles,date,write_csv=save_csv)
    return nyt_data

In [4]:
# Testing on one date
date = '2024-03-12'
key = '1DFmIMxxqdYl8wJBPqAFxtHkimk86Qtn'

nyt_data = get_nyt_data(date,key,True)
nyt_data.shape

Successful get request.
1700 documents found for 2024-3.
93 documents found for 2024-03-12.
File 2024-03-12-articles.csv already exists


(93, 8)

In [5]:
nyt_data.head(3)

Unnamed: 0,abstract,lead_paragraph,pub_date,document_type,section_name,type_of_material,headline,keywords
0,The Republican Party apparatus is undergoing a...,Days after allies took over the Republican Nat...,2024-03-12T00:03:05+0000,article,U.S.,News,"Trump Aides, Taking Over R.N.C., Order Mass La...",Republican National Committee;Republican Party...
1,ABC’s telecast of the 96th Academy Awards on S...,The comeback of live event TV continues.,2024-03-12T00:09:17+0000,article,Business Day,News,"‘Barbenheimer,’ and an Early Start, Boost Osca...",Academy Awards (Oscars);Ratings (Audience Meas...
2,People spit out the word “Zionist” when they r...,"Every time I write, as I did last week, that I...",2024-03-12T00:15:17+0000,article,Opinion,Op-Ed,Where Antisemitism and Anti-Zionism Collide,Israel;Gaza Strip;Zionism;Palestinians;Jews an...


## Tiktoks with News Hashtags

In [6]:
def get_pyktok_metadata(code):
    '''Returns a df of metadata collected from pyktok for the given code'''
    filename = f"pyktok-results/results_{code}.csv"
    metadata = pd.read_csv(filename)
    return metadata

def get_news_vids_by_code(code):
    '''Get the videos with news hashtags based on code'''
    filename = f'analysis/videos-newsHashtags-{code}.json'
    with open(filename, 'r') as inFile:
        data = json.load(inFile)
    news_vids = pd.read_json(data, orient="split")
    return news_vids

def add_date_to_news_vids(metadata,news_vids):
    updated_df = news_vids.merge(metadata[['video_id','video_timestamp']], on='video_id', how='left') # keeps all ids in news_vids
    updated_df['date'] = updated_df['video_timestamp'].astype(str).apply(lambda timestamp: timestamp[:10])
    return updated_df

In [7]:
os.chdir('/Users/edithpo/Documents/GitHub/CS315_Project2_Group1/')
cwd = os.getcwd()
code = 10824

# Pyktok Metadata is located within pyktok-results folder
metadata = get_pyktok_metadata(code)
print(f'Metadata DF Shape: {metadata.shape}')

# JSON files containing news-related videos located in analysis folder
news_vids = get_news_vids_by_code(code)
print(f'Shape of News-related Videos DF: {news_vids.shape}')
os.chdir('/Users/edithpo/Documents/GitHub/CS315_Project2_Group1/analysis')

Metadata DF Shape: (460, 20)
Shape of News-related Videos DF: (9, 7)


  news_vids = pd.read_json(data, orient="split")


In [8]:
metadata.head(3)

Unnamed: 0,video_id,video_timestamp,video_duration,video_locationcreated,suggested_words,video_diggcount,video_sharecount,video_commentcount,video_playcount,video_description,video_is_ad,video_stickers,author_username,author_name,author_followercount,author_followingcount,author_heartcount,author_videocount,author_diggcount,author_verified
0,7328672125441658117,2024-01-27T02:10:53,6.0,US,"when you see cute guy in public, when u see gu...",479600.0,7373.0,6081.0,5000000.0,🫣,False,,elsarca,Elsarca,,,,,,False
1,7326611895509585185,2024-01-21T12:56:09,18.0,GB,"Eggs, Ghanaian Food, Cracking Egg, Egg Breakfa...",36700.0,722.0,797.0,204800.0,Sad story 💔 #ghanatiktok🇬🇭 #ghana #egg #emicha...,False,,e.michael_,Michael,,,,,,False
2,7324765984139644192,2024-01-16T13:33:06,178.0,IT,"ateez mama 2023, Ateez, treasure, idols reacti...",449200.0,886.0,1230.0,3000000.0,TREASURE ZB1 reaction to ATEEZ - Bouncy🔥 #atee...,False,,jaehyunsocks_2,:),,,,,,False


In [9]:
news_vids.head(3)

Unnamed: 0,video_id,author_username,video_description,hashtags,suggested_words,isNewsAccount,isNewsHashtag
70,7333618266436406561,thehistorygossip,all sh1ts & giggles until someone giggles & sh...,"[history, historyfacts, victorian, learnontikt...",,False,True
81,7337266783147101446,damolaa__,They’ll cook up any reason for you to transfer...,"[fyp, relatable, police, nigeria, damolaa__]",,False,True
168,7339991777320979754,abc7ny,The Montefiore Albert Einstein School of Medic...,"[nyc, college, news, wow]","albert einstein college of medicine, ruth gott...",False,True


Let's also find the dates that these videos with news-related hashtags were watched.
This way we'll be able to get the data of the relevant NYT headlines for that date.

In [10]:
# Add video_timestamp from metadata df to news_vids df
news_vids = add_date_to_news_vids(metadata,news_vids)
news_vids.head(3)

Unnamed: 0,video_id,author_username,video_description,hashtags,suggested_words,isNewsAccount,isNewsHashtag,video_timestamp,date
0,7333618266436406561,thehistorygossip,all sh1ts & giggles until someone giggles & sh...,"[history, historyfacts, victorian, learnontikt...",,False,True,2024-02-09T10:04:30,2024-02-09
1,7337266783147101446,damolaa__,They’ll cook up any reason for you to transfer...,"[fyp, relatable, police, nigeria, damolaa__]",,False,True,2024-02-19T06:02:34,2024-02-19
2,7339991777320979754,abc7ny,The Montefiore Albert Einstein School of Medic...,"[nyc, college, news, wow]","albert einstein college of medicine, ruth gott...",False,True,2024-02-26T14:17:05,2024-02-26


## Cosine Similarity between some TikToks and some NYT articles

**FINDING COSINE SIMILARITY TASK**  

Each of your transcribed videos should have both the date and the various text features (transcript, description, hashtags, or suggested words). Ideally, we will create embeddings of each of them separately and even an embedding of transcript + description + suggested words (hashtags are already in the description) and then compare each of them against the respective NYT headlines of the day. 

However, it is okay if you pick only one category to get the embedding and calculate the cosine similarity. 
For each post, save the most similar NYT article and the cosine score. Put everything in a dataframe so that you can sort to see the most similar pairs and the least similar ones, and also use describe() to see the range of cosine similarity scores. 

For each post, save the most similar NYT article and the cosine score. Put everything in a dataframe so that you can sort to see the most similar pairs and the least similar ones, and also use describe() to see the range of cosine similarity scores.

In [11]:
def get_all_nyt_data(dates):
    '''Saves and returns cleaned dfs for each date'''
    nyt_dfs = []
    for date in dates:
        df = get_nyt_data(date,key,True)
        print(df.shape)
        nyt_dfs.append(df)
    return nyt_dfs

In [14]:
%%time
# Get NYT data for the dates where user 10824 watched a news-related video
os.chdir('/Users/edithpo/Documents/GitHub/CS315_Project2_Group1/analysis') # To save in analysis/nyt_data folder
date = '2024-03-10'
key = '1DFmIMxxqdYl8wJBPqAFxtHkimk86Qtn'
# dates = news_vids['date']
dates = ['2024-03-12']

nyt_dfs = get_all_nyt_data(dates)
print(len(nyt_dfs))
print(type(nyt_dfs[0]))

Successful get request.
1700 documents found for 2024-3.
93 documents found for 2024-03-12.
Saving new file: nyt_data/2024-03-12-articles.csv


OSError: Cannot save file into a non-existent directory: 'nyt_data'

In [30]:
# def apply_cosine_similarity(news_vids,video_id,key):
#     '''Takes video id (as int64) gets NYTimes headlines from that date that have
#       the highest cosine similarity with the video's DESCRIPTION'''
#     # Get the relevant tiktok video and its video description
#     video = news_vids[news_vids['video_id']==video_id].iloc[0] # get 1 video
#     video_description = video['video_description']
#     date = video['date']

#     # Load the relevant nytdata for given date
#     try:
#         nyt_filename = f'{date}-articles.csv'
#         nyt_df = pd.read_csv(nyt_filename)
#         print(f'Loaded data from {nyt_filename}')
#     except:
#         print(f'File {nyt_filename} not found. Creating it instead.')
#         nyt_df = get_nyt_data(date,key,True)

#     # Get all the headlines for the date
#     try:
#         headlines = nyt_df['headline'].values
#     except KeyError:
#         print("\'headline\' not found in nyt_df.")
#         headlines = []

#     # Apply cosine similarity
#     similarities = rankDocuments(video_description,headlines)
#     return similarities, video_description, date


def apply_cosine_similarity_by_feature(news_vids,video_id,key,feature):
    '''Takes df of videos, video id (as int64), NYT API key, and an feature
    of the news_vids df (suggested_words, video_description, etc, and finds the 
    cosine similarity between that attribute and all the NYTimes headlines 
    from the date the video was watched.'''
    # Get the relevant tiktok video and its video description
    video = news_vids[news_vids['video_id']==video_id].iloc[0] # get 1 video
    video_feature = video[feature]
    date = video['date']

    # Load the relevant nytdata for given date
    try:
        nyt_filename = f'{date}-articles.csv'
        nyt_df = pd.read_csv(nyt_filename)
        print(f'Loaded data from {nyt_filename}')
    except:
        print(f'File {nyt_filename} not found. Creating it instead.')
        nyt_df = get_nyt_data(date,key,True)

    # Get all the headlines for the date
    try:
        headlines = nyt_df['headline'].values
    except KeyError:
        print("\'headline\' not found in nyt_df.")
        headlines = []

    # Apply cosine similarity
    similarities = rankDocuments(video_feature,headlines)
    return similarities, video_feature, date

In [15]:
# %%time
# video_feature = 'suggested_words' #'video_description'
# similarities3, video_feature3,date3 = apply_cosine_similarity_by_feature(news_vids,7326246871024930091,key, video_feature)
# print(video_feature)
# print(date3)
# print(video_feature3)
# print(similarities3[:5])

In [32]:
# def get_all_cosine_similarities(news_vids,video_ids,key):
#     results = []
#     for video_id in video_ids:
#         similarities,video_description,date = apply_cosine_similarity(news_vids,video_id,key)

#         print(f'Video ID: {video_id}')
#         print(f'Date Watched: {date}')
#         print(f'Video Description: {video_description}')
#         print(f'Top 5 most similar NYT headlines:')
#         for sim in similarities[:5]: 
#             print(sim)
#         print()
#         results.append((similarities,video_description,date))
#     return results

def get_all_cosine_similarities_by_feature(news_vids,video_ids,key,feature):
    results = []

    for video_id in video_ids:
        print(f'Video ID: {video_id}')
        try:
            result_dict = {}
            similarities,video_feature,date = apply_cosine_similarity_by_feature(news_vids,video_id,key,feature)
            result_dict['video_id'] = video_id
            result_dict['date'] = date
            result_dict['video_feature']
            result_dict['similarities'] = similarities
            print(f'Date Watched: {date}')
            print(f'{feature}: {video_feature}')
            print(f'Top 5 most similar NYT headlines:')
            for sim in similarities[:5]: 
                print(sim)
            print()
            results.append((similarities,video_feature,date))
        except TypeError as te:
            print(f'Problem with getting the cosine similarities for video {video_id}')
            print(te)

    return results


In [17]:
# %%time
# feature = 'video_description'
# video_ids = news_vids['video_id'].values
# # results = get_all_cosine_similarities(news_vids,video_ids,key)
# results = get_all_cosine_similarities_by_feature(news_vids,video_ids,key,feature)

## Putting Everything Together

In [18]:
os.chdir('/Users/edithpo/Documents/GitHub/CS315_Project2_Group1/')

In [19]:
key = '1DFmIMxxqdYl8wJBPqAFxtHkimk86Qtn'

### Account 12345

In [27]:
# Get pyktok data and videos with news-related hashtags
metadata_12345 = get_pyktok_metadata(12345)
news_vids_12345 = get_news_vids_by_code(12345)
news_vids_12345 = add_date_to_news_vids(metadata_12345,news_vids_12345)
print(news_vids_12345.shape)
news_vids_12345.head(3)

(381, 9)


  news_vids = pd.read_json(data, orient="split")


Unnamed: 0,video_id,author_username,video_description,hashtags,suggested_words,isNewsAccount,isNewsHashtag,video_timestamp,date
0,7343058460025703722,msnbc,NBC News and MSNBC Correspondent Shaquille Bre...,"[fyp, foryou, politicaltiktok, politics, 2024p...","haley, nbc, nicki halley",False,True,2024-03-05T20:37:20,2024-03-05
1,7340809195807132971,blakes_takes1,#greenscreen #greenscreenvideo Rebecca Ferguso...,"[greenscreen, greenscreenvideo, movie, news, d...","Rebecca Ferguson, hugh jackman, rebecca fergus...",False,True,2024-02-28T19:09:27,2024-02-28
2,7341877699629108523,randomusa10,TEXAS THO😬👎🏼 #states #50statessong #50niftyuni...,"[states, 50statessong, 50niftyunitedstates, vi...","rhode island, wyoming state, 50 state song, ar...",False,True,2024-03-02T16:15:33,2024-03-02


In [None]:
%%time
# Get NYT Article Data for all the unique dates
dates_12345 = news_vids_12345['date'].unique()
# get_all_nyt_data(dates_12345)

In [24]:
# Perform Analysis for 12345 using 'video description'
video_ids_12345 = news_vids_12345['video_id'].values

In [37]:
%%time
feature = 'video_description'
results_description_12345 = get_all_cosine_similarities_by_feature(news_vids_12345,video_ids_12345,key,feature)

Video ID: 7343058460025703722
Loaded data from 2024-03-05-articles.csv
Date Watched: 2024-03-05
video_description: NBC News and MSNBC Correspondent Shaquille Brewster speaks to a North Carolina Trump voter who would not consider voting for Nikki Haley because she's a woman. "A woman’s not gonna be a good president," he tells Brewster, "All a woman's good for in my book is having babies and taking care of the house." #fyp #foryou #politicaltiktok #politics #2024president #election2024 #trump #haley #vote 
Top 5 most similar NYT headlines:
(0.4616, 'The Lindbergh Baby Kidnapping: A Grisly Theory and a Renewed Debate')
(0.3967, 'A Grainy Photo and a Dilemma: How U.K. Papers Are Covering Princess Catherine')
(0.3696, 'A Trans-Atlantic Crackdown')
(0.3659, 'A Shape-Shifting House in Los Angeles')
(0.3474, 'A Deadly Aid Delivery and Growing Threat of Famine in Gaza')

Video ID: 7340809195807132971
Loaded data from 2024-02-28-articles.csv
Date Watched: 2024-02-28
video_description: #greenscre

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))
  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))
  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Loaded data from 2024-02-14-articles.csv
Date Watched: 2024-02-14
video_description: #newyork #subway #foryou #fyp #crime #70s #80s #nostalgic #oldtimes #viral #eerie #interesting #guardianangels #bronx #brroklyn #queens #trains
Top 5 most similar NYT headlines:
(nan, 'The Isolationist G.O.P., Again')
(nan, 'Talks in Cairo Aim for a Deal to Halt Gaza War and Free Hostages')
(nan, 'A Feared Ex-General Appears Set to Become Indonesia’s New Leader')
(nan, 'New Menendez Details: A Diamond Ring, a ‘James Bond’ Phone and Covid Tests')
(nan, 'Mayorkas Was Impeached. What Happens Next?')

Video ID: 7336244840558087455
Loaded data from 2024-02-16-articles.csv
Date Watched: 2024-02-16
video_description: Mitt Romney: “I will not be voting for former President Trump. I must admit that I find sexual assault to be a line I will not cross in the people I select to be my president.” #republican #accountability #gop #nevertrump #never #trump #politics #political #fyp #foryou #washingtondc #conservative

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Date Watched: 2024-02-04
video_description: #stitch with @ as a lesbian…🤞 #ameliaearhart #queerhistory #lgbtqhistory #history #historytok 
Top 5 most similar NYT headlines:
(0.6124, 'A Slide Show of a Neighborhood, as Some Saw It')
(0.3922, 'How Toilets Got a Starring Role in a Wim Wenders Movie')
(0.3922, 'A Dead Child, Too Much Booze and a Family in Crisis')
(0.3651, 'Amid a Fraught Process, a Philadelphia Museum Entombs Remains of 19 Black People')
(0.2673, 'Photos From a Firestorm: Chile’s Deadly Blazes')

Video ID: 7331610060587535646
Loaded data from 2024-02-04-articles.csv
Date Watched: 2024-02-04
video_description: Looks like Nikki Haley on Saturday  Night Libe was not popular decision. But Ayo Edebiri asked Haley #SNL “I was just curious, what would you say was the main cause of the civil war, & do you think it starts with an 'S' and ends with a 'LAVERY?'" Haley giggled. Here is part of it #NikkiHaley #Trump #republican #SNL #Politics 
Top 5 most similar NYT headlines:
(0.4193

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Date Watched: 2024-01-26
video_description: Bushisms (part 4) #politics #politicaltiktok #democrat #republican #conservative #liberal #foryoupage #fyp #america #usa 
Top 5 most similar NYT headlines:
(0.2041, 'Republican-Appointed Judge Denounces Republican Distortions of Jan. 6')
(0.1667, 'Why Nikki Haley Has So Few Friends Left in South Carolina Politics')
(0.1491, 'The Best Part of Tiny Living? The ‘Freedom We Have Created for Ourselves.’')
(0.0, '‘Zoomers’ Review: Just Don’t Kill the Vibes')
(0.0, '‘The Door’s Open!’')

Video ID: 7328828915198397742
Loaded data from 2024-01-27-articles.csv
Date Watched: 2024-01-27
video_description: new hyperfixation incoming #history #americanhistory #historytok #worldhistory #princessdiana #joanofarc #ladyjanegrey #romanov 
Top 5 most similar NYT headlines:
(0.2236, 'Ukraine Criticizes HBO, Saying New ‘White Lotus’ Actor Supports War')
(0.2236, 'New Jersey Political Races, Often Raucous, Are Bananas This Year')
(0.2236, 'Israel Has History of Fri

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Loaded data from 2024-01-26-articles.csv
Date Watched: 2024-01-26
video_description: #foryou #history #doctorwho #vincentvangogh #art #rosaparks #bus #robinhood #sherwood #shakespeare #germany #pompeii #italy #volcano #agathachristie #books #charlesdickens #book #fypage #foryoupage #drwho #england #murraygold #music #composer #classicalcomposer #goviral #historicalepisodes #episode #historytok #fypシ #historybooks #travel #time #timetravel #tardis #thedoctor 
Top 5 most similar NYT headlines:
(0.2, 'New Music for Your Weekend')
(0.169, 'Picture Books About the Way We Look')
(0.1581, 'Suggest Events for Flashback, Our Weekly History Quiz')
(0.1581, 'George Carlin’s Estate Sues Podcasters Over A.I. Episode')
(0.1581, 'Flashback: Your Weekly History Quiz, January 27, 2024')

Video ID: 7328507661446581537
Loaded data from 2024-01-26-articles.csv
Date Watched: 2024-01-26
video_description: Coming to netflix 9th February! #deadwaterfell #davidtennant #netflix #crime #drama #fypシ #fypシ゚viral #

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Date Watched: 2024-01-23
video_description: #relatable #barbie #oscars 
Top 5 most similar NYT headlines:
(nan, 'Is the Vibecession Finally Coming to an End?')
(nan, 'Arno A. Penzias, 90, Dies; Nobel Physicist Confirmed Big Bang Theory')
(nan, 'With Border Deal Near, Parole and Money Take Center Stage in Senate Talks')
(nan, 'Israel-E.U. Meeting on Gaza’s Future Yields Division and Confusion')
(nan, 'Trump Rakes In Endorsements in Final Hours Before New Hampshire Votes')

Video ID: 7326987827630083371
Loaded data from 2024-01-22-articles.csv
Date Watched: 2024-01-22
video_description: Bruh i love the internet #math #mathematics #learnontiktok #stem #science 
Top 5 most similar NYT headlines:
(0.3333, 'The Power-Building Workout')
(0.3333, 'The Emasculation of Ron DeSantis by the Bully Donald Trump')
(0.3333, 'The Connections Companion')
(0.3203, 'The Hot New Accessory From the Paris Runways: A Robot Baby')
(0.2981, 'Norman Jewison, 97, Dies; Directed ‘In the Heat of the Night’ and ‘Moo

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Date Watched: 2023-02-17
video_description: we love a factual queen 🥰 #fy #foryoupage #fyp #foryou #feminism #feminist #feminine #feminista #politics 
Top 5 most similar NYT headlines:
(0.3651, 'A Signature Ferragamo Clasp Becomes a Gravity-Defying Heel')
(0.3651, 'A Relationship That Moved Like a Rolling Stone')
(0.3482, 'In a Violent America, Safety Becomes a Sales Pitch')
(0.3482, 'A Study Abroad Cut Short. A Relationship That Endured.')
(0.3203, 'How a Tiny\xa0NATO Nation Tackled a Big Problem: Arming Ukraine')

Video ID: 7201164179913329925
Loaded data from 2023-02-17-articles.csv
Date Watched: 2023-02-17
video_description: we love a factual queen 🥰 #fy #foryoupage #fyp #foryou #feminism #feminist #feminine #feminista #politics 
Top 5 most similar NYT headlines:
(0.3651, 'A Signature Ferragamo Clasp Becomes a Gravity-Defying Heel')
(0.3651, 'A Relationship That Moved Like a Rolling Stone')
(0.3482, 'In a Violent America, Safety Becomes a Sales Pitch')
(0.3482, 'A Study Abroad Cut 

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Date Watched: 2024-01-12
video_description: Part 1 Woman calmly handles arrest on 911 call about DUI over 'School board feud' #bodycam #cops #copsoftiktok #police #policeofficer #policeoftiktok #policeman #chase #tiktok #fyp #foryou 
Top 5 most similar NYT headlines:
(0.2462, 'Former D.C. Police Officer Convicted of Selling Information About Crash Victims')
(0.2357, 'House Spends $40,000 on New Member Pins as Republicans Fume Over Spending')
(0.1826, 'BlackRock’s Giant Bet on Infrastructure')
(0.1543, 'When Trailers Hit Mute on the Musical')
(0.1543, 'Federal Scientists Recommend Easing Restrictions on Marijuana')

Video ID: 7321847184893152558
Loaded data from 2024-01-08-articles.csv
Date Watched: 2024-01-08
video_description: vc: @Maggie Brewer | Mag N. Cheez like why did he do that at every show 😭 and why did we think it was hot💀 #brendonurie #patd #nonbinary #theythempronouns #mentallyilltiktok #michigan #fyp 
Top 5 most similar NYT headlines:
(0.2649, '20 Looks That Did the Most a

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Loaded data from 2024-01-04-articles.csv
Date Watched: 2024-01-04
video_description: Cant show the full vid because violence but its on reddit and yt #breakingnews #foryou 
Top 5 most similar NYT headlines:
(0.4529, 'In Europe, Trains Are Full, and More Are on the Way')
(0.4082, '‘Society of the Snow’ Review: The Stranded and the Faithful')
(0.3693, 'The Man Who Destroys $3,000 Handbags on the Internet')
(0.3397, 'The N.Y.P.D. Dance Team Walks the Beat and Feels It Too')
(0.2887, 'Homes for Sale in Manhattan and the Bronx')

Video ID: 7320299248153644331
Loaded data from 2024-01-04-articles.csv
Date Watched: 2024-01-04
video_description: A Nevada judge was attacked Wednesday by a defendant in a felony battery case who leaped over a defense table and the judge's bench, landing atop her and sparking a bloody brawl involving court officials and attorneys. #lasvegas #court #judge #news #attack #crime 
Top 5 most similar NYT headlines:
(0.3604, 'Luna Luna: A Fantasy That Comes With a Price 

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Date Watched: 2023-12-12
video_description: yapping yap yap yap #scotus #apgov #apush #history #memes
Top 5 most similar NYT headlines:
(nan, 'Watch What People Do, Not What They Say About the Economy')
(nan, 'A Texas Case Shows That Abortion Ban Exemptions Are a Sham')
(nan, 'Google Loses Antitrust Court Battle With Makers of Fortnite Video Game')
(nan, 'Review: In ‘Translations,’ What’s Lost When Language Is Looted')

Video ID: 7321857225390083361
Loaded data from 2024-01-08-articles.csv
Date Watched: 2024-01-08
video_description: Yes I did sob while weiting the text. Literally all I can focus on is her hair. It’s so easy to focus on the girls. Its so facinating. But think of the horror when they realised what had happened and a girl with braid in her hair #archaeology #archaeologytiktok #archaeologist #learnontiktok #history #historytok #archaeologynews #egyptianmummy #mummy #ancienthistory #ancientegypt #sadarchaeology #sadhistory 
Top 5 most similar NYT headlines:
(0.3904, 'The St

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Loaded data from 2024-01-03-articles.csv
Date Watched: 2024-01-03
video_description: I am NO expert in this area by any means lol i just think abt this sometimes #foryou #roanokecolony #historytok 
Top 5 most similar NYT headlines:
(0.3727, 'I Just Learned My Son Is a Webcam Model. Should I Be Troubled?')
(0.2887, 'What’s Going On in This Graph? | Local News Outlets')
(0.2182, 'I Left My Faith. God Didn’t Flinch.')
(0.1925, 'Why I Welcome New York City’s Congestion Pricing Plan')
(0.1826, 'An Easy, Thrilling Tofu Recipe for Just About Any Resolution')

Video ID: 7320684994173685035
Loaded data from 2024-01-05-articles.csv
Date Watched: 2024-01-05
video_description: CNN's Elizabeth Wagmeister interviews Gypsy Rose Blanchard, who was released from prison after serving eight years over a plot to kill her abusive mother, Dee Dee Blanchard. She pleaded guilty to second-degree murder in 2016 and admitted that she convinced her then-boyfriend to stab her mother to death. Blanchard was a victi

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))
  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Date Watched: 2024-01-01
video_description: A devastating earthquake hit central Japan today prompting mass evacuations along much of the country's coastline.   The quakes rang out shortly after 4pm local time and the first tsunami waves, more than a metre high, have already begun lashing the north coast of central Japan. #fyp #japan #earthquake #tsunami #magnitude #breakingnews 
Top 5 most similar NYT headlines:
(0.4276, 'A Gaza Mayor’s View of the Conflict')
(0.3771, 'Powerful Earthquake Hits Japan, and Officials Warn of Aftershocks')
(0.3536, 'Map: Earthquake Strikes Japan')
(0.3266, 'The Connections Companion')

Video ID: 7319012445689941294
Loaded data from 2024-01-01-articles.csv
Date Watched: 2024-01-01
video_description: Neil Patrick Harris and Andy Cohen give Anderson Cooper the giggles with their answers during Never Have I Ever. #cnn #nye #andersoncooper #andycohen #neilpatrickharris 
Top 5 most similar NYT headlines:
(0.2942, 'Lemony White Bean Soup With Turkey and Greens t

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Date Watched: 2023-12-18
video_description: Stick around for 4 minutes and you’ll have a strong understanding of how this will affect you.  #currentevents #storytime #politics #news #breakingnews #lizkuhn #todaywelearned #todayilearned #iwishiknew 
Top 5 most similar NYT headlines:
(0.2981, 'How a 6-Second Video Turned a Campus Protest Into a National Firestorm')
(0.2582, 'Shave and a Haircut? No Thanks.')
(0.239, 'Notable Deaths 2023: Politics and Public Affairs')
(0.2372, 'Do You Prefer Gifts That You’ve Asked For or Ones That Are a Surprise?')
(0.2236, 'Lizzo, Travis Scott, and the Limits of Persona')

Video ID: 7309353690501090606
Loaded data from 2023-12-06-articles.csv
Date Watched: 2023-12-06
video_description: She was so remarkable in the show and deserved recognition for it. She truly deserves her flowers. #fyp #fypシ゚viral #fyppppppppppppppppppppppp #thisisus #football #superbowl #nbc #mandymoore #miloventimiglia #rebecca #kate #randall #emmys #emmy 
Top 5 most similar NYT hea

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Date Watched: 2023-09-15
video_description: #timotheechalamet spits bars🔥 #grahamnorton #thegrahamnortonshow #wonka #rappers #newmusic #statistics #learnontiktok #hilarious
Top 5 most similar NYT headlines:
(nan, 'Trump Lawsuit Against Judge Could Delay N.Y. Fraud Trial')
(nan, 'Studios Say Talks With Striking Writers May Resume Next Week')
(nan, 'Mitt Romney’s Tragic Ambivalence')
(nan, '‘Death, Let Me Do My Show’ Review: Rachel Bloom Can’t Shake the Dread')
(nan, 'Iowa Teen Found Guilty of Fatally Shooting 2 Students')

Video ID: 7311560440146857259
Loaded data from 2023-12-11-articles.csv
Date Watched: 2023-12-11
video_description: #memecut #viralvideo #trend #funny #fyp #ftpシ #edi #CapCut #trains #economy #rail #railroads #concerts #meme #politics #railway #history #maps #historytok #urbanism #trainsoftiktok #funny #15minutecity #annarbor #lansing #detoit #grandrapids #america #US #traffic #technology #modern #dd #taxes #traffic #congestion #cars #subways #ilovetrains #benefits #tr

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))
  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Date Watched: 2023-12-07
video_description: Replying to @Arum Natzorkhang The Toaster project! #learnontiktok #science #learneclecticthings 
Top 5 most similar NYT headlines:
(0.6708, 'The Latest Challenge to the Voting Rights Act')
(0.6124, 'Behind the Plan to Eliminate the Deer on Catalina Island')
(0.5345, 'Jon Fosse Wants to Say the Unsayable')
(0.5, '‘Our Son’ Review: The Right to Break Up')
(0.4714, 'The College Presidents and the ‘Genocide’ Question')

Video ID: 7288036531309235458
Loaded data from 2023-10-09-articles.csv
Date Watched: 2023-10-09
video_description: ⏱️ #arresteddevelopment #tv #politics 
Top 5 most similar NYT headlines:
(0.2357, 'Does the Supreme Court’s Cherry-Picking Inject Politics Into Judging?')
(0.2236, 'What We Can Do to Make American Politics Less Dysfunctional')
(0.2132, 'What’s on TV This Week: ‘Saturday Night Live’ and ‘Shining Vale’')
(0.2132, 'Do You Know These Science Fiction Novels That Became TV Shows?')
(0.0, '‘The Mill’ Review: A Cog in the Mac

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Loaded data from 2023-04-01-articles.csv
Date Watched: 2023-04-01
video_description: Send this to someone who loves Doctor Who! #doctor #doctorwho #doctorwhotiktok #thedoctor #whovian #9thdoctor #10thdoctor #11thdoctor #12thdoctor #mattsmith #davidtennant #petercapaldi #60thanniversary #doctorwho60thanniversary #dwsr #news #whovian #tardis #trivia #fyp #trending #viral #aprilfools #soupysoups 
Top 5 most similar NYT headlines:
(0.3162, 'What We Had to Lose')
(0.2887, 'Let’s Really Try to Abolish Poverty')
(0.2673, 'Far-Left Canadians Susceptible to Russian Influence Too')
(0.25, '10 Places to See Wildflowers in the West')
(0.2357, 'How Trump’s Playboy Persona Came Back to Haunt Him')

Video ID: 7308434456371596577
Loaded data from 2023-12-03-articles.csv
Date Watched: 2023-12-03
video_description: The love of my life and I no longer have access #Humanities #Graduate #Jstor #HistoryTok #LiteratureMemes #Booktok #University #Postgrad #Academia #Undergrad 
Top 5 most similar NYT headlines

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Date Watched: 2023-11-29
video_description: #duet with @bayashi.tiktok #fyp #biden #election #2024 #president
Top 5 most similar NYT headlines:
(0.2981, 'Arizona Officials Charged With Conspiring to Delay Election Results')
(0.1581, 'With an Artist’s Help, Paddington Can Go Anywhere')
(0.1581, 'How to Actually Work Out With Your Kids')
(0.1414, 'Harris to Stand In for Biden at U.N. Climate Conference')
(0.1414, 'Harris Not Worried About Biden Trailing Trump in Key Polls')

Video ID: 7301067655862127905
Loaded data from 2023-11-13-articles.csv
Date Watched: 2023-11-13
video_description: the price we pay #ukpolitics #doctorwho #davidcameron #davidtennant #politics 
Top 5 most similar NYT headlines:
(0.3849, 'Because the Sides Are Really the Star')
(0.3482, 'What We Know About the Criminal Investigation Into Eric Adams’s Campaign')
(0.3333, 'The Two-State Platitude')
(0.3333, 'The Two-State Platitude')
(0.3333, 'The Connections Companion')

Video ID: 7305952098984807711
Loaded data from 2

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))
  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Date Watched: 2023-11-20
video_description: Historical photos that might change ur perspective. pt.26 #phototok #viral #fyp #historytok #oldphotos #historyvideos #foryou #sadhistory 
Top 5 most similar NYT headlines:
(0.2887, 'Clothes that Conjure the Holiday Spirit')
(0.2132, 'Dinners That Don’t Feel Insane to Cook the Week of Thanksgiving')
(0.1961, '‘A Beautiful Place That Has a Dragon’: Where Hurricane Risk Meets Booming Growth')
(0.1961, 'This Is Why Google Paid Billions for Apple to Change a Single Setting')
(0.1961, 'Hundreds Sentenced in Trial That Sought to Break Mafia’s Grip on Southern Italy')

Video ID: 7305368352174755074
Loaded data from 2023-11-25-articles.csv
Date Watched: 2023-11-25
video_description: I feel like my character is developing #history #historymemes #fyp #tbt #vietnam #history101 #historytok 
Top 5 most similar NYT headlines:
(0.2182, 'Sebastian Maniscalco’s Toughest Audience Is His Kids')
(0.1925, 'Everybody Knows Flo From Progressive. Who Is Stephanie Co

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Date Watched: 2023-09-19
video_description: #fyp #xiandivyne #laurenboebert #politics 
Top 5 most similar NYT headlines:
(nan, 'Michigan State Seeks to Fire Football Coach Amid Sexual Harassment Case')
(nan, 'The Restaurant List 2023')
(nan, 'The Kids on the Night Shift')
(nan, 'Symbol of Royalty')
(nan, 'A Vanishing Nomadic Clan, With a Songlike Language All Their Own')

Video ID: 7249058636037934382
Loaded data from 2023-06-26-articles.csv
Date Watched: 2023-06-26
video_description: Ranking SEC school academics #college #sec #mississippi #olemiss #arkansas #lsu #kentucky #alabama #missouri #collegeranking #collegefootball #football #fyp #foryoupage 
Top 5 most similar NYT headlines:
(0.2887, 'Diwali Is Set to Become a School Holiday in New York City')
(0.0, '‘The Idol’ Season 1, Episode 4 Recap: Just a Jealous Guy')
(0.0, '‘National Dish’ Charts a World Tour of Iconic Dishes')
(0.0, '‘Invisible’ Review: Brown, British and Overlooked')
(0.0, '‘Flamin’ Hot’ Is So Bad, It Burns')

Video

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))
  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Loaded data from 2023-09-18-articles.csv
Date Watched: 2023-09-18
video_description: Replying to @iguana_delrey #historytiktok #historytok  
Top 5 most similar NYT headlines:
(0.5774, '‘Purlie Victorious’: Ossie Davis’s ‘Gospel to Humanity’ Returns to Broadway')
(0.4082, 'How to Cool Down a City')
(0.378, 'The Road From Mitt Romney to MAGA')
(0.378, 'Striking Autoworkers Are Cool to Biden’s Embrace')
(0.378, 'Bill Maher Reverses Decision to Restart Show')

Video ID: 7279204670474603818
Loaded data from 2023-09-15-articles.csv
Date Watched: 2023-09-15
video_description: #stitch with @kaisa she’s the new “ok boomer”, which still works btw #leftist #kaisa #okboomer #agitprop #propaganda 
Top 5 most similar NYT headlines:
(0.4, 'Biden Sides With the Autoworkers')
(0.3162, 'The Weekender')
(0.3162, 'The Anti-Abortion Movement Won the Legal Battle, but It’s Losing the War')
(0.2981, 'The New York Times News Quiz, Sept. 15, 2023')
(0.2981, 'The High Stakes Behind the U.A.W.’s Strike')

Video 

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Date Watched: 2023-09-13
video_description: Surveillance video from a performing arts theater in Denver, shows Republican Rep. Lauren Boebert being escorted out of the musical “Beetlejuice” after "multiple complaints" from patrons. In an incident report shared with CNN, theater officials escorted two patrons out of the theater after they received three different complaints that they were “vaping, singing, causing a disturbance.” Boebert’s name was not listed in the incident report, nor could theater officials confirm that she was the individual escorted out from the theater. Boebert later confirmed she attended the performance on social media. #cnn #news #laurenboebert #colorado #denver #beetlejuice #republican #vaping #singing #denver #musical 
Top 5 most similar NYT headlines:
(0.4229, 'As a Doctor, a Mother and the Head of the C.D.C., I Recommend That You Get the Latest Covid Booster')
(0.3873, 'The Crisis in Libya')
(0.3514, 'The Pandemic Was a Time Machine')
(0.345, 'Saratoga Spri

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Loaded data from 2023-09-14-articles.csv
Date Watched: 2023-09-14
video_description: #JoJoSiwa - #Craigs In #WestHollywood (📸: Backgrid) TheHollywoodFix.net #fyp #fypシ #fypシ゚viral #foryoupage #viral #viralvideo #viraltiktok #news #explore #trend #trending #fame #famous #celeb #celebrity #clout #paparazzi #photography #fashion #spotted #hollywood #lol #omg #cute #love #funny #memes #popular #lifestyle #look #tmz #thehollywoodfix #drama #tea #news #mtv #vh1 #abc #nbc #cnn #rumor #promo
Top 5 most similar NYT headlines:
(0.2279, 'Today’s Top News: The Effect of an Auto Strike, and More')
(0.2182, 'SpongeBob Lives in a Pineapple. These Sharks Live in Sponges.')
(0.169, '36 Hours in Charleston, S.C.')
(0.1543, 'In Rome, It’s Luxury vs. Squalor')
(0.1429, 'Homes for Sale in Manhattan and Brooklyn')

Video ID: 7278821094021172523
Loaded data from 2023-09-14-articles.csv
Date Watched: 2023-09-14
video_description: Selina Meyer is SHAKING with rage rn #veep #cnn 
Top 5 most similar NYT headline

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Loaded data from 2023-09-08-articles.csv
Date Watched: 2023-09-08
video_description: history buffs beware #shanegillis #standup #comedy #history #historybuffs #politics
Top 5 most similar NYT headlines:
(0.3536, 'Flashback: Your Weekly History Quiz, September 10, 2023')
(0.0, '‘You Wanted to Know Where the Towers Had Gone’')
(0.0, '‘There Was Definitely a Thumb on the Scale to Get Boys’')
(0.0, '‘Dumb Money’ Lampoons Wall Street Titans With a Knowing Eye')
(0.0, '‘Billions’ Season 7, Episode 5 Recap: A Plan Starts to Form')

Video ID: 7276635366172527914
Loaded data from 2023-09-08-articles.csv
Date Watched: 2023-09-08
video_description: While CNN's Kyung Lah reports ahead of former President Donald Trump's rally in Rapid City, South Dakota, Anderson Cooper tries to figure out what music is playing in the background. #CNN #News #DonaldTrump 
Top 5 most similar NYT headlines:
(0.4899, 'What to See in N.Y.C. Galleries in September')
(0.345, 'Islamists Kill Dozens of Civilians and Soldier

In [None]:
%%time
feature = 'suggested_words'
results_suggestedwords_12345 = get_all_cosine_similarities_by_feature(news_vids_12345,video_ids_12345,key,feature)

In [36]:
# Find the videos who had the highest cosine similarities



'NBC News and MSNBC Correspondent Shaquille Brewster speaks to a North Carolina Trump voter who would not consider voting for Nikki Haley because she\'s a woman. "A woman’s not gonna be a good president," he tells Brewster, "All a woman\'s good for in my book is having babies and taking care of the house." #fyp #foryou #politicaltiktok #politics #2024president #election2024 #trump #haley #vote '

### Account 50405

In [22]:
# Get pyktok data and videos with news-related hashtags
# metadata_50405 = get_pyktok_metadata(50405)
# news_vids_50405 = get_news_vids_by_code(50405)
# news_vids_date_50405 = add_date_to_news_vids(metadata_50405,news_vids_50405)
# news_vids_date_50405