# Applying Cosine Similarity Between TikToks and NYT Articles

## Cosine Similarity

In [171]:
import pandas as pd
import numpy as np
from numpy.linalg import norm
import string
import seaborn as sns 
import matplotlib.pyplot as plt
import os
import json
import datetime as dt
import time

In [172]:
def getVocabulary(textchunk):
    """Given some text, create the vocabulary of unique words."""
    textchunk = textchunk.lower()
    cleantext = "".join(char for char in textchunk if char not in string.punctuation)
    words = set(cleantext.split())
    voc = sorted(words)
    return voc

def text2vector(sentence, voc):
    """Given a sentence and the vocabulary for the problem,
    turn every sentence into a vector.
    """
    cleantext = "".join(char for char in sentence if char not in string.punctuation)
    words = cleantext.lower().split()
    vector = [words.count(w) for w in voc]
    return vector

def cosineSimilarity(vec1,vec2):
    """Calculate the cosine similarity between two vectors."""
    V1 = np.array(vec1)
    V2 = np.array(vec2)
    cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))
    return cosine

def rankDocuments(query,sentences):
    """Given a query and some sentences, rank the sentences for
    which are the most similar to the query"""
    # Step 1: create vocabulary
    voc = getVocabulary(" ".join(sentences))

    # Step 2: generate vector for query
    queryVec = text2vector(query,voc)

    # Step 3: generate vector for sentences and calculate cosine similarity at once
    similarities = []
    for sent in sentences:
        sentVec = text2vector(sent,voc)
        sim = cosineSimilarity(queryVec,sentVec)
        similarities.append((round(sim,4),sent)) # keep track of sentences

    similarities.sort(reverse=True)
    return similarities

## NYTimes API

In [183]:
import articles_to_csv as nyt

In [189]:
import articles_to_csv as nyt

def get_nyt_data(date,key):
    '''Takes a date and NYT API key. Returns a df of the abstract, lead_paragraph, 
    pub_date, document_type, section_name, type_of_material, headline, and keywords 
    of articles for that date and saves df to {date}-articles.csv.'''
    articles = nyt.get_articles_by_date(date,key)
    nyt_data = nyt.create_df(articles,date)
    return nyt_data

In [190]:
os.chdir('/Users/edithpo/Documents/GitHub/CS315_Project2_Group1/analysis')

date = '2024-03-10'
key = '1DFmIMxxqdYl8wJBPqAFxtHkimk86Qtn'

nyt_data = get_nyt_data(date,key)
nyt_data.shape

Successfully got the data.
Documents found!
Saving new file: 2024-03-10-articles.csv


(1561, 8)

In [39]:
nyt_data.head()

Unnamed: 0,abstract,lead_paragraph,pub_date,document_type,section_name,type_of_material,headline,keywords
0,"On economics, Donald Trump is just Mitch McCon...",There was a lot of breathless speculation befo...,2024-03-01T00:00:08+0000,article,Opinion,Op-Ed,Who Really Stands With American Workers?,"Michigan;Primaries and Caucuses;Biden, Joseph ..."
1,Donald Trump didn’t remake the Republican Part...,I’ve recently been reading about Warren Buffet...,2024-03-01T00:00:09+0000,article,Opinion,Op-Ed,The G.O.P. Returns to Its Bad Old Self,United States Politics and Government;United S...
2,Palestinian and Israeli officials offered diff...,Israeli forces opened fire on Thursday as a cr...,2024-03-01T00:00:22+0000,article,World,News,"As Hungry Gazans Crowd an Aid Convoy, a Crush ...",Israel-Gaza War (2023- );Israel;Defense and Mi...
3,The hit Netflix dating show seems to prove tha...,In the sixth season of the Netflix series “Lov...,2024-03-01T00:04:32+0000,article,Style,News,"After Six Seasons, Viewers Have an Answer: No,...",Dating and Relationships;Love Is Blind (TV Pro...
4,"The Alabama senator, the youngest Republican w...",Republicans have chosen Senator Katie Britt of...,2024-03-01T00:12:29+0000,article,U.S.,News,Katie Britt to Deliver Republican Response to ...,United States Politics and Government;State of...


## Tiktoks with News Hashtags

In [56]:
def get_original_metadata(code):
    '''Returns a df of metadata collected from pyktok for the given code'''
    filepath = f"results_{code}.csv"
    metadata = pd.read_csv(filepath)
    return metadata

In [57]:
os.chdir("/Users/edithpo/Documents/GitHub/CS315_Project2_Group1/pyktok-results/")
code = 10824

metadata = get_original_metadata(code)
metadata.shape

(460, 20)

In [60]:
def loadFromJSON(filename):
    '''Load data from json file that was saved with split orientation'''
    with open(filename, 'r') as inFile:
        data = json.load(inFile)
    df = pd.read_json(data, orient="split")
    return df

In [83]:
os.chdir("/Users/edithpo/Documents/GitHub/CS315_Project2_Group1/analysis/")
cwd = os.getcwd()
filename = f'{cwd}/videos-newsHashtags-{code}.json'

news_vids = loadFromJSON(filename)
news_vids.shape

  df = pd.read_json(data, orient="split")


(9, 7)

In [84]:
news_vids

Unnamed: 0,video_id,author_username,video_description,hashtags,suggested_words,isNewsAccount,isNewsHashtag
70,7333618266436406561,thehistorygossip,all sh1ts & giggles until someone giggles & sh...,"[history, historyfacts, victorian, learnontikt...",,False,True
81,7337266783147101446,damolaa__,They’ll cook up any reason for you to transfer...,"[fyp, relatable, police, nigeria, damolaa__]",,False,True
168,7339991777320979754,abc7ny,The Montefiore Albert Einstein School of Medic...,"[nyc, college, news, wow]","albert einstein college of medicine, ruth gott...",False,True
195,7326271896960503072,thehistorygossip,They said trial and error but mostly error 😍 S...,"[history, historytok, historytiktok, medieval]","medieval practices, medieval times, Medievalti...",False,True
199,7332620181874773290,altfour102,Part 1| 16 year old boys blog themselves bruta...,"[murder, crime, truecrime, crimestories]",,False,True
225,7335453895377915141,kwadwosheldonstudios,Madrid secure narrow victory over RB Leipzig +...,"[ucl, review, football, prrrr, trending, trend...",,False,True
302,7332721185731644718,altfour102,Part 4| 16 year old boys blog themselves bruta...,"[murder, crime, truecrime, crimestories]",,False,True
324,7326246871024930091,marshalldiorr,Only in Africa 😂 #afcon #CAN #marshalldior #fo...,"[afcon, can, marshalldior, football, africanti...","Afcon, Football, Football Player, football mom...",False,True
425,7339620455587925254,antycececomedy,Replying to @Starlit #antycece #antycececomedy...,"[antycece, antycececomedy#onuatv, antycecebars...","Comedy Video, Comedy Show, comedian, African C...",False,True


Let's also find the dates that these videos with news-related hashtags were watched.
This way we'll be able to get the data of the relevant NYT headlines for that date.

In [92]:
# Another method of merging the dataframes
news_vids = news_vids.merge(metadata[['video_id','video_timestamp']], on='video_id', how='left') # keeps all ids in news_vids
news_vids

Unnamed: 0,video_id,author_username,video_description,hashtags,suggested_words,isNewsAccount,isNewsHashtag,video_timestamp
0,7333618266436406561,thehistorygossip,all sh1ts & giggles until someone giggles & sh...,"[history, historyfacts, victorian, learnontikt...",,False,True,2024-02-09T10:04:30
1,7337266783147101446,damolaa__,They’ll cook up any reason for you to transfer...,"[fyp, relatable, police, nigeria, damolaa__]",,False,True,2024-02-19T06:02:34
2,7339991777320979754,abc7ny,The Montefiore Albert Einstein School of Medic...,"[nyc, college, news, wow]","albert einstein college of medicine, ruth gott...",False,True,2024-02-26T14:17:05
3,7326271896960503072,thehistorygossip,They said trial and error but mostly error 😍 S...,"[history, historytok, historytiktok, medieval]","medieval practices, medieval times, Medievalti...",False,True,2024-01-20T14:56:47
4,7332620181874773290,altfour102,Part 1| 16 year old boys blog themselves bruta...,"[murder, crime, truecrime, crimestories]",,False,True,2024-02-06T17:31:30
5,7335453895377915141,kwadwosheldonstudios,Madrid secure narrow victory over RB Leipzig +...,"[ucl, review, football, prrrr, trending, trend...",,False,True,2024-02-14T08:47:35
6,7332721185731644718,altfour102,Part 4| 16 year old boys blog themselves bruta...,"[murder, crime, truecrime, crimestories]",,False,True,2024-02-07T00:03:31
7,7326246871024930091,marshalldiorr,Only in Africa 😂 #afcon #CAN #marshalldior #fo...,"[afcon, can, marshalldior, football, africanti...","Afcon, Football, Football Player, football mom...",False,True,2024-01-20T13:19:57
8,7339620455587925254,antycececomedy,Replying to @Starlit #antycece #antycececomedy...,"[antycece, antycececomedy#onuatv, antycecebars...","Comedy Video, Comedy Show, comedian, African C...",False,True,2024-02-25T14:15:59


In [94]:
# Create date column
def to_date(timestamp):
    return timestamp[:10]

news_vids['date'] = news_vids['video_timestamp'].astype(str).apply(to_date)

news_vids

Unnamed: 0,video_id,author_username,video_description,hashtags,suggested_words,isNewsAccount,isNewsHashtag,video_timestamp,date
0,7333618266436406561,thehistorygossip,all sh1ts & giggles until someone giggles & sh...,"[history, historyfacts, victorian, learnontikt...",,False,True,2024-02-09T10:04:30,2024-02-09
1,7337266783147101446,damolaa__,They’ll cook up any reason for you to transfer...,"[fyp, relatable, police, nigeria, damolaa__]",,False,True,2024-02-19T06:02:34,2024-02-19
2,7339991777320979754,abc7ny,The Montefiore Albert Einstein School of Medic...,"[nyc, college, news, wow]","albert einstein college of medicine, ruth gott...",False,True,2024-02-26T14:17:05,2024-02-26
3,7326271896960503072,thehistorygossip,They said trial and error but mostly error 😍 S...,"[history, historytok, historytiktok, medieval]","medieval practices, medieval times, Medievalti...",False,True,2024-01-20T14:56:47,2024-01-20
4,7332620181874773290,altfour102,Part 1| 16 year old boys blog themselves bruta...,"[murder, crime, truecrime, crimestories]",,False,True,2024-02-06T17:31:30,2024-02-06
5,7335453895377915141,kwadwosheldonstudios,Madrid secure narrow victory over RB Leipzig +...,"[ucl, review, football, prrrr, trending, trend...",,False,True,2024-02-14T08:47:35,2024-02-14
6,7332721185731644718,altfour102,Part 4| 16 year old boys blog themselves bruta...,"[murder, crime, truecrime, crimestories]",,False,True,2024-02-07T00:03:31,2024-02-07
7,7326246871024930091,marshalldiorr,Only in Africa 😂 #afcon #CAN #marshalldior #fo...,"[afcon, can, marshalldior, football, africanti...","Afcon, Football, Football Player, football mom...",False,True,2024-01-20T13:19:57,2024-01-20
8,7339620455587925254,antycececomedy,Replying to @Starlit #antycece #antycececomedy...,"[antycece, antycececomedy#onuatv, antycecebars...","Comedy Video, Comedy Show, comedian, African C...",False,True,2024-02-25T14:15:59,2024-02-25


## Cosine Similarity between some TikToks and some NYT articles

Each of your transcribed videos should have both the date and the various text features (transcript, description, hashtags, or suggested words). Ideally, we will create embeddings of each of them separately and even an embedding of transcript + description + suggested words (hashtags are already in the description) and then compare each of them against the respective NYT headlines of the day. 

However, it is okay if you pick only one category to get the embedding and calculate the cosine similarity. 
For each post, save the most similar NYT article and the cosine score. Put everything in a dataframe so that you can sort to see the most similar pairs and the least similar ones, and also use describe() to see the range of cosine similarity scores. 

For each post, save the most similar NYT article and the cosine score. Put everything in a dataframe so that you can sort to see the most similar pairs and the least similar ones, and also use describe() to see the range of cosine similarity scores.**

In [101]:
def get_all_nyt_data(dates):
    '''Saves and returns cleaned dfs for each date'''
    nyt_dfs = []
    for date in dates:
        df = get_nyt_data(date,key)
        nyt_dfs.append(df)
    return nyt_dfs

In [102]:
# Get NYT data for dates user 10824 watched a news-related video
os.chdir('/Users/edithpo/Documents/GitHub/CS315_Project2_Group1/analysis')
date = '2024-03-10'
key = '1DFmIMxxqdYl8wJBPqAFxtHkimk86Qtn'
dates = news_vids['date']

nyt_dfs = get_all_nyt_data(dates)
print(len(nyt_dfs))
print(type(nyt_dfs[0]))

## NOTE: Need to update the articles_to_csv.py file to change the print statement
## when the data is not collected successfully

Successfully got the data.
Documents found!
Saving new file: 2024-02-09-articles.csv
Successfully got the data.
Documents found!
Saving new file: 2024-02-19-articles.csv
Successfully got the data.
Documents found!
Saving new file: 2024-02-26-articles.csv
Successfully got the data.
Documents found!
Saving new file: 2024-01-20-articles.csv
Successfully got the data.
Documents found!
Saving new file: 2024-02-06-articles.csv
Successfully got the data.
Documents found!
Saving new file: 2024-02-14-articles.csv
Successfully got the data.
Documents found!
Saving new file: 2024-02-07-articles.csv
Successfully got the data.
Documents found!
Saving new file: 2024-01-20-articles.csv
Did not get the data successfully.
Documents not found.
Saving new file: 2024-02-25-articles.csv
9
<class 'pandas.core.frame.DataFrame'>


In [118]:
news_vids[(news_vids['video_id']==7326246871024930091) & (news_vids['date']=="2024-01-20")]

Unnamed: 0,video_id,author_username,video_description,hashtags,suggested_words,isNewsAccount,isNewsHashtag,video_timestamp,date
7,7326246871024930091,marshalldiorr,Only in Africa 😂 #afcon #CAN #marshalldior #fo...,"[afcon, can, marshalldior, football, africanti...","Afcon, Football, Football Player, football mom...",False,True,2024-01-20T13:19:57,2024-01-20


In [200]:
def apply_cosine_similarity_id_and_date(video_id,date,key):
    '''Takes video id (as int64) and date (as string in YYYY-MM-DD format)
    and returns ranked NYTimes headlines from given date that have the highest
    cosine similarity with the video's DESCRIPTION'''
    # Get the relevant tiktok video and its video description
    video = news_vids[(news_vids['video_id']==video_id) & (news_vids['date']==date)].iloc[0]
    video_description = video['video_description']

    # Load the relevant nytdata for given date
    try:
        nyt_filename = f'{date}-articles.csv'
        nyt_df = pd.read_csv(nyt_filename)
        print(f'Loaded data from {nyt_filename}')
    except:
        print(f'File {nyt_filename} not found. Creating it instead.')
        nyt_df = get_nyt_data(date,key)

    headlines = nyt_df['headline'].values

    # Apply cosine similarity
    similarities = rankDocuments(video_description,headlines)
    return similarities,video_description # get the top 10 most similar headlines

In [201]:
def apply_cosine_similarity(video_id,key):
    '''Takes video id (as int64) gets NYTimes headlines from that date that have
      the highest cosine similarity with the video's DESCRIPTION'''
    # Get the relevant tiktok video and its video description
    video = news_vids[news_vids['video_id']==video_id].iloc[0] # get 1 video
    video_description = video['video_description']
    date = video['date']

    # Load the relevant nytdata for given date
    try:
        nyt_filename = f'{date}-articles.csv'
        nyt_df = pd.read_csv(nyt_filename)
        print(f'Loaded data from {nyt_filename}')
    except:
        print(f'File {nyt_filename} not found. Creating it instead.')
        nyt_df = get_nyt_data(date,key)

    # Get all the headlines for the date
    try:
        headlines = nyt_df['headline'].values
    except KeyError:
        print("\'headline\' not found in nyt_df.")
        headlines = []

    # Apply cosine similarity
    similarities = rankDocuments(video_description,headlines)
    return similarities, video_description, date

In [205]:
%%time
similarities1, video_description1 = apply_cosine_similarity_id_and_date(7326246871024930091,"2024-01-20",key)
print(video_description1)
print(similarities1[:10])

Loaded data from 2024-01-20-articles.csv
Only in Africa 😂 #afcon #CAN #marshalldior #football #africantiktok #african 
[(0.3397, 'How the Federal Government Can Rein In A.I. in Law Enforcement'), (0.3333, 'Only Voters Can Truly Disqualify Trump'), (0.2887, 'The Wild World of Money in College Football'), (0.2887, 'Subway Train Derails in Brooklyn in 2nd Such Episode in a Week'), (0.281, 'Suspect in Fatal Shootings of 8 in Illinois Is Found Dead in Texas'), (0.2739, 'To Be a ‘Palatable Jew’ — in Germany in 1940, in the U.S. Today'), (0.2722, 'In Praise of Whistling in Pop Music'), (0.2582, 'What to See in N.Y.C. Galleries in January'), (0.2582, 'Three Killed in Medical Helicopter Crash in Oklahoma'), (0.2582, 'Suspect in Davis Stabbings Can Now Stand Trial, Judge Rules')]
CPU times: user 13.9 s, sys: 229 ms, total: 14.1 s
Wall time: 14.4 s


In [206]:
%%time
similarities2,video_description2,date2 = apply_cosine_similarity(7326246871024930091,key)
print(date2)
print(video_description2)
print(similarities2[:10])


Loaded data from 2024-01-20-articles.csv
2024-01-20
Only in Africa 😂 #afcon #CAN #marshalldior #football #africantiktok #african 
[(0.3397, 'How the Federal Government Can Rein In A.I. in Law Enforcement'), (0.3333, 'Only Voters Can Truly Disqualify Trump'), (0.2887, 'The Wild World of Money in College Football'), (0.2887, 'Subway Train Derails in Brooklyn in 2nd Such Episode in a Week'), (0.281, 'Suspect in Fatal Shootings of 8 in Illinois Is Found Dead in Texas'), (0.2739, 'To Be a ‘Palatable Jew’ — in Germany in 1940, in the U.S. Today'), (0.2722, 'In Praise of Whistling in Pop Music'), (0.2582, 'What to See in N.Y.C. Galleries in January'), (0.2582, 'Three Killed in Medical Helicopter Crash in Oklahoma'), (0.2582, 'Suspect in Davis Stabbings Can Now Stand Trial, Judge Rules')]
CPU times: user 14.3 s, sys: 188 ms, total: 14.5 s
Wall time: 15 s


In [214]:
def get_all_cosine_similarities(video_ids,key):
    results = []
    for video_id in video_ids:
        similarities,video_description,date = apply_cosine_similarity(video_id,key)

        print(f'Video ID: {video_id}')
        print(f'Date Watched: {date}')
        print(f'Video Description: {video_description}')
        print(f'Top 5 most similar NYT headlines:')
        for sim in similarities[:5]: 
            print(sim)
        print()
        results.append((similarities,video_description,date))
    return results


In [215]:
%%time
video_ids = news_vids['video_id'].values
results = get_all_cosine_similarities(video_ids,key)

Loaded data from 2024-02-09-articles.csv
Video ID: 7333618266436406561
Date Watched: 2024-02-09
Video Description: all sh1ts & giggles until someone giggles & sh1ts (out a tapeworm) Sources: The Ugly Girl Papers: SD Powers (1874) Atlas Obscura: The Horrifying Legacy of the Victorian Tapeworm Diet History Extra: Tapeworms, arsenic and magic soap: fact-checking history’s weirdest fad diets and weight-loss gimmicks #history #historyfacts #victorian #learnontiktok #learn 
Top 5 most similar NYT headlines:
(0.5314, 'The Comedy, and the Horror, of the Infertility Plot')
(0.5186, 'The Times and the Tablet')
(0.4991, 'The Gender Split and the ‘Looming Apocalypse of the Developed World’')
(0.4881, 'The Rise and Fall of The Village Voice')
(0.4851, 'The SAT and the Supreme Court')

Loaded data from 2024-02-19-articles.csv
Video ID: 7337266783147101446
Date Watched: 2024-02-19
Video Description: They’ll cook up any reason for you to transfer 😭 #fyp #relatable #police #nigeria #damolaa__ 
Top 5 mo

KeyError: 'headline'