# Applying Cosine Similarity Between TikToks and NYT Articles

In [21]:
import pandas as pd
import numpy as np
from numpy.linalg import norm
import string
import seaborn as sns 
import matplotlib.pyplot as plt
import os
import json
import datetime as dt
import time
import using_nyt_api as nyt

main_dir = os.path.dirname(os.getcwd())
analysis_dir = f'{main_dir}/analysis'
pyktok_dir = f'{main_dir}/pyktok-results'

## Cosine Similarity

In [26]:
def getVocabulary(textchunk):
    """Given some text, create the vocabulary of unique words."""
    textchunk = textchunk.lower()
    cleantext = "".join(char for char in textchunk if char not in string.punctuation)
    words = set(cleantext.split())
    voc = sorted(words)
    return voc

def text2vector(sentence, voc):
    """Given a sentence and the vocabulary for the problem,
    turn every sentence into a vector.
    """
    cleantext = "".join(char for char in sentence if char not in string.punctuation)
    words = cleantext.lower().split()
    vector = [words.count(w) for w in voc]
    return vector

def cosineSimilarity(vec1,vec2):
    """Calculate the cosine similarity between two vectors."""
    V1 = np.array(vec1)
    V2 = np.array(vec2)
    cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))
    return cosine

def rankDocuments(query,sentences):
    """Given a query and some sentences, rank the sentences for
    which are the most similar to the query"""
    # Step 1: create vocabulary
    voc = getVocabulary(" ".join(sentences))

    # Step 2: generate vector for query
    queryVec = text2vector(query,voc)

    # Step 3: generate vector for sentences and calculate cosine similarity at once
    similarities = []
    for sent in sentences:
        sentVec = text2vector(sent,voc)
        sim = cosineSimilarity(queryVec,sentVec)
        similarities.append((round(sim,4),sent)) # keep track of sentences

    similarities.sort(reverse=True)
    return similarities # a tuple of (cosine similarity, sentence)

## NYTimes API

In [36]:
def check_file_exists(filepath,filename):
    if filename in os.listdir(filepath):
        return True
    return False

# def get_nyt_data(date,key,save_csv):
#     '''Takes a date, NYT API key, and boolean for whether to save data as csv. 
#     Returns a df containing the abstract, lead_paragraph, pub_date, 
#     document_type, section_name, type_of_material, headline, and keywords 
#     of articles for that date and saves df to {date}-articles.csv.'''
#     # check if articles for given date exist
#     filepath = f'{analysis_dir}/nyt_data/' # save files to analysis/nyt_data folder
#     filename = f'{date}-articles.csv'
#     file_exists = check_file_exists(filepath,filename)
    
#     if file_exists: # reads df from existing file if file exists
#         nyt_data = pd.read_csv(f'{filepath}/{filename}')
#     else: # gets articles by calling NYT API and creates df
#         articles = nyt.get_articles_by_date(date,key)
#         nyt_data = nyt.create_df(articles,date,filepath,write_csv=save_csv)
#     return nyt_data


def get_nyt_data(date,key,save_csv):
    '''Takes a date, NYT API key, and boolean for whether to save data as csv. 
    Returns a df containing the abstract, lead_paragraph, pub_date, 
    document_type, section_name, type_of_material, headline, and keywords 
    of articles for that date and saves df to {date}-articles.csv.'''
    # check if articles for given date exist
    filepath = f'{analysis_dir}/nyt_data/' # save files to analysis/nyt_data folder
    filename = f'{date}-articles.csv'
    file_exists = check_file_exists(filepath,filename)
    
    if file_exists: # reads df from existing file if file exists
        try:
            nyt_data = pd.read_csv(f'{filepath}/{filename}')
        except Exception as e:
            print(e)
            return e
    else: # gets articles by calling NYT API and creates df
        articles = nyt.get_articles_by_date(date,key)
        nyt_data = nyt.create_df(articles,date,filepath,write_csv=save_csv)
    return nyt_data

In [105]:
# Testing on one date
date = '2024-03-12'
key = '1DFmIMxxqdYl8wJBPqAFxtHkimk86Qtn'
save_csv = True
nyt_data = get_nyt_data(date,key,save_csv)
nyt_data.head(3)

Unnamed: 0.1,Unnamed: 0,abstract,lead_paragraph,pub_date,document_type,section_name,type_of_material,headline,keywords
0,0,The Republican Party apparatus is undergoing a...,Days after allies took over the Republican Nat...,2024-03-12T00:03:05+0000,article,U.S.,News,"Trump Aides, Taking Over R.N.C., Order Mass La...",Republican National Committee;Republican Party...
1,1,ABC‚Äôs telecast of the 96th Academy Awards on S...,The comeback of live event TV continues.,2024-03-12T00:09:17+0000,article,Business Day,News,"‚ÄòBarbenheimer,‚Äô and an Early Start, Boost Osca...",Academy Awards (Oscars);Ratings (Audience Meas...
2,2,People spit out the word ‚ÄúZionist‚Äù when they r...,"Every time I write, as I did last week, that I...",2024-03-12T00:15:17+0000,article,Opinion,Op-Ed,Where Antisemitism and Anti-Zionism Collide,Israel;Gaza Strip;Zionism;Palestinians;Jews an...


## Tiktoks with News Hashtags

In [28]:
def get_pyktok_metadata_by_code(code): # NOT USING
    '''Returns a df of metadata collected from pyktok for the given code'''
    filename = f'{pyktok_dir}/results_{code}.csv'
    print(f'Getting pyktok data from file {filename}')
    metadata = pd.read_csv(filename)
    return metadata

def get_news_vids_by_code(code): # NOT USING
    '''Get the videos with news hashtags based on code'''
    filename = f'{analysis_dir}/videos-newsHashtags-{code}.json'
    print(f'Getting news-related videos from file {filename}')
    with open(filename, 'r') as inFile:
        data = json.load(inFile)
    news_vids = pd.read_json(data, orient="split")
    return news_vids

def add_date_to_news_vids(metadata,news_vids):
    updated_df = news_vids.merge(metadata[['video_id','video_timestamp']], on='video_id', how='left') # keeps all ids in news_vids
    updated_df['date'] = updated_df['video_timestamp'].astype(str).apply(lambda timestamp: timestamp[:10])
    return updated_df


def get_pyktok_metadata_by_filename(filename):
    '''Returns a df of metadata collected from pyktok for the given file'''
    file = f'{pyktok_dir}/{filename}'
    print(f'Getting pyktok data from file {file}')
    metadata = pd.read_csv(file)
    return metadata

def get_news_vids_by_filename(filename):
    '''Get the videos with news hashtags based on file'''
    file = f'{analysis_dir}/{filename}'
    print(f'Getting news-related videos from file {file}')
    with open(file, 'r') as inFile:
        data = json.load(inFile)
    news_vids = pd.read_json(data, orient="split")
    return news_vids

In [109]:
# # Test for code 12345
# code = 12345

# # Pyktok Metadata is located within pyktok-results folder
# metadata = get_pyktok_metadata_by_code(code)
# print(f'Metadata DF Shape: {metadata.shape}')
# metadata.head(3)

# # JSON files containing news-related videos located in analysis folder
# news_vids = get_news_vids_by_code(code)
# print(f'Shape of News-related Videos DF: {news_vids.shape}')
# news_vids.head(3)

# # Add video_timestamp from metadata df to news_vids df
# updated_news_vids = add_date_to_news_vids(metadata,news_vids)
# updated_news_vids.head(3)

## Cosine Similarity between some TikToks and some NYT articles

Each of your transcribed videos should have both the date and the various text features (transcript, description, hashtags, or suggested words). Ideally, we will create embeddings of each of them separately and even an embedding of transcript + description + suggested words (hashtags are already in the description) and then compare each of them against the respective NYT headlines of the day. 

However, it is okay if you pick only one category to get the embedding and calculate the cosine similarity. 
For each post, save the most similar NYT article and the cosine score. Put everything in a dataframe so that you can sort to see the most similar pairs and the least similar ones, and also use describe() to see the range of cosine similarity scores. 

For each post, save the most similar NYT article and the cosine score. Put everything in a dataframe so that you can sort to see the most similar pairs and the least similar ones, and also use describe() to see the range of cosine similarity scores.

In [40]:
# def get_all_nyt_data(dates):
#     '''Saves and returns cleaned dfs for each date'''
#     nyt_dfs = []
#     for date in dates:
#         df = get_nyt_data(date,key,True)
#         print(f'\nShape of df for {date}: {df.shape}')
#         nyt_dfs.append(df)
#     return nyt_dfs

def get_all_nyt_data(dates):
    '''Saves and returns cleaned dfs for each date'''
    nyt_dfs = []
    for date in dates:
        df = get_nyt_data(date,key,True)
        try:
            print(f'\nShape of df for {date}: {df.shape}')
            nyt_dfs.append(df)
        except Exception as e:
            print(e)
    return nyt_dfs

In [30]:
def apply_cosine_similarity_by_feature(news_vids,video_id,key,feature):
    '''Takes df of videos, video id (as int64), NYT API key, and an feature
    of the news_vids df (suggested_words, video_description, etc), and finds the 
    cosine similarity between that attribute and all the NYTimes headlines 
    from the date the video was watched.'''
    # Get the relevant tiktok video and its video description
    video = news_vids[news_vids['video_id']==video_id].iloc[0] # get 1 video
    feature_text = video[feature] # gets feature, such as description, suggested words, etc
    date = video['date']

    # Load the relevant nytdata for given date
    nyt_filepath = f'{analysis_dir}/nyt_data'
    nyt_filename = f'{date}-articles.csv'
    print(f'Looking for {nyt_filepath}/{nyt_filename}')
    
    try:
        nyt_df = pd.read_csv(f'{nyt_filepath}/{nyt_filename}')
        print(f'Loaded data from {nyt_filename}')
    except FileNotFoundError as fnfe:
        print(fnfe)
        print(f'File {nyt_filename} not found. Creating it instead.')
        nyt_df = get_nyt_data(date,key,True)
    except Exception as e:
        print(e)
        print(f'File {nyt_filename} not found. Creating it instead.')
        nyt_df = get_nyt_data(date,key,True)
        nyt_df = pd.DataFrame() # empty dataframe

    # Get all the headlines for the date
    try:
        headlines = nyt_df['headline'].values
    except KeyError:
        print("\'headline\' not found in nyt_df.")
        headlines = []

    # Apply cosine similarity
    similarities = rankDocuments(feature_text,headlines) # a tuple of (cosine similarity, sentence)
    return similarities, feature_text, date

In [83]:
%%time
# EXAMPLE FOR ONE VIDEO ID
code = 12345
metadata_file = 'results_12345(19019 rows).csv'
news_vids_file = 'videos-newsHashtags-12345.json'
# metadata = get_pyktok_metadata_by_code(code)
# news_vids = get_news_vids_by_code(code)
metadata = get_pyktok_metadata_by_filename(metadata_file)
news_vids = get_news_vids_by_filename(news_vids_file)
updated_news_vids = add_date_to_news_vids(metadata,news_vids)
video_id = updated_news_vids['video_id'].values[0] # get a particular video id
feature = 'video_description' #'suggested_words' 

similarities,feature_text,date = apply_cosine_similarity_by_feature(updated_news_vids,video_id,key,feature)

print(f'\nTikTok User Code: {code}')
print(f'\nVideo ID: {video_id}')
print(f'\nVideo\'s {feature}: {feature_text}')
print(f'\nDate watched: {date}')
print(f'\nTop 5 similarities:')
for sim in similarities[:5]:
    print(sim)

Getting pyktok data from file /Users/edithpo/Downloads/CS315_Project2_Group1/pyktok-results/results_12345(19019 rows).csv
Getting news-related videos from file /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/videos-newsHashtags-12345.json
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-03-05-articles.csv
Loaded data from 2024-03-05-articles.csv

TikTok User Code: 12345

Video ID: 7343058460025703722

Video's video_description: NBC News and MSNBC Correspondent Shaquille Brewster speaks to a North Carolina Trump voter who would not consider voting for Nikki Haley because she's a woman. "A woman‚Äôs not gonna be a good president," he tells Brewster, "All a woman's good for in my book is having babies and taking care of the house." #fyp #foryou #politicaltiktok #politics #2024president #election2024 #trump #haley #vote 

Date watched: 2024-03-05

Top 5 similarities:
(0.4616, 'The Lindbergh Baby Kidnapping: A Grisly Theory and a Renewed Debate')
(0.

  news_vids = pd.read_json(data, orient="split")


In [31]:
def get_all_cosine_similarities_by_feature(updated_news_vids,video_ids,key,feature):
    results = []

    for video_id in video_ids:
        print(f'Video ID: {video_id}')
        try:
            result_dict = {}
            similarities,feature_text,date = apply_cosine_similarity_by_feature(updated_news_vids,video_id,key,feature)
            result_dict['video_id'] = video_id
            result_dict['date'] = date
            result_dict[f'{feature}'] = feature_text
            result_dict['similarities'] = similarities
            cos_sim_scores = [sim[0] for sim in similarities]
            nyt_headlines = [sim[1] for sim in similarities]
            result_dict['cos_sim_scores'] = cos_sim_scores
            result_dict['nyt_headlines'] = nyt_headlines
            results.append(result_dict)
        except TypeError as te:
            print(f'Problem with getting the cosine similarities for video {video_id}')
            print(te)

    return results

## Putting Everything Together

In [32]:
key = '1DFmIMxxqdYl8wJBPqAFxtHkimk86Qtn'

In [42]:
def get_results_by_key(key,code,feature):
    # Get data for given code
    metadata = get_pyktok_metadata_by_code(code)
    news_vids = get_news_vids_by_code(code)
    updated_news_vids = add_date_to_news_vids(metadata,news_vids)
    video_ids = updated_news_vids['video_id'].values 

    results = get_all_cosine_similarities_by_feature(updated_news_vids,video_ids,key,feature)
    df = pd.DataFrame(results)
    
    # Save results
    filename = f'cosine_sim_{feature}_{code}.csv'
    df.to_csv(filename)
    
    return df

def get_results_by_filenames(metadata_file,news_vid_file,feature,code):
    # Get data for given files
    metadata = get_pyktok_metadata_by_filename(metadata_file)
    news_vids = get_news_vids_by_filename(news_vid_file)
    updated_news_vids = add_date_to_news_vids(metadata,news_vids)
    video_ids = updated_news_vids['video_id'].values 

    results = get_all_cosine_similarities_by_feature(updated_news_vids,video_ids,key,feature)
    df = pd.DataFrame(results)
    
    # Save results
    filename = f'cosine_sim_{feature}_{code}.csv'
    df.to_csv(filename)
    
    return df

# print(f'\nTikTok User Code: {code}')
# print(f'\nVideo ID: {video_id}')
# print(f'\nVideo\'s {feature}: {feature_text}')
# print(f'\nDate watched: {date}')
# print(f'\nTop 5 similarities:')
# for sim in similarities[:5]:
#     print(sim)

### Account 12345

In [47]:
# Using functions to retrieve data by filename instead of code
code = 12345
metadata_file = 'results_12345(19019 rows).csv'
news_vid_file = 'videos-newsHashtags-12345.json'
feature = 'suggested_words'#'video_description'
df12345 = get_results_by_filenames(metadata_file,news_vid_file,feature,code)
df12345

Getting pyktok data from file /Users/edithpo/Downloads/CS315_Project2_Group1/pyktok-results/results_12345(19019 rows).csv
Getting news-related videos from file /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/videos-newsHashtags-12345.json
Video ID: 7343058460025703722
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-03-05-articles.csv
Loaded data from 2024-03-05-articles.csv


  news_vids = pd.read_json(data, orient="split")
  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Video ID: 7340809195807132971
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-28-articles.csv
Loaded data from 2024-02-28-articles.csv
Video ID: 7341877699629108523
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-03-02-articles.csv
Loaded data from 2024-03-02-articles.csv
Video ID: 7341485301010877728
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-03-01-articles.csv
Loaded data from 2024-03-01-articles.csv
Video ID: 7341694662866046254
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-03-02-articles.csv
Loaded data from 2024-03-02-articles.csv
Video ID: 7331462452195970337
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-03-articles.csv
Loaded data from 2024-02-03-articles.csv
Problem with getting the cosine similarities for video 7331462452195970337
'NoneType' object is not iterable
Video ID: 7340363649212550405
L

Video ID: 7339041283610037534
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-24-articles.csv
Loaded data from 2024-02-24-articles.csv
Problem with getting the cosine similarities for video 7339041283610037534
'NoneType' object is not iterable
Video ID: 7337716616479788331
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-20-articles.csv
Loaded data from 2024-02-20-articles.csv
Video ID: 7337716616479788331
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-20-articles.csv
Loaded data from 2024-02-20-articles.csv
Video ID: 7337716616479788331
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-20-articles.csv
Loaded data from 2024-02-20-articles.csv
Video ID: 7338131827300846890
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-21-articles.csv
Loaded data from 2024-02-21-articles.csv
Video ID: 7338131827300846890
L

Video ID: 7332966069767802154
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-07-articles.csv
Loaded data from 2024-02-07-articles.csv
Video ID: 7332797813983677717
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-07-articles.csv
Loaded data from 2024-02-07-articles.csv
Video ID: 7332174452706397486
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-05-articles.csv
Loaded data from 2024-02-05-articles.csv
Video ID: 7331850550503329066
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-04-articles.csv
Loaded data from 2024-02-04-articles.csv
Video ID: 7328492050683333921
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-26-articles.csv
Loaded data from 2024-01-26-articles.csv
Video ID: 7330744742084431147
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-01-articles.csv
Loaded da

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Video ID: 7325842797091736864
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-19-articles.csv
Loaded data from 2024-01-19-articles.csv
Video ID: 7326473620648414510
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-21-articles.csv
Loaded data from 2024-01-21-articles.csv
Video ID: 7326606097400302878
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-21-articles.csv
Loaded data from 2024-01-21-articles.csv
Video ID: 7326672765593996590
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-21-articles.csv
Loaded data from 2024-01-21-articles.csv
Video ID: 7326660951997041963
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-21-articles.csv
Loaded data from 2024-01-21-articles.csv
Video ID: 7325826997706345770
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-19-articles.csv
Loaded da

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Video ID: 7201164179913329925
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-02-17-articles.csv
Loaded data from 2023-02-17-articles.csv
Video ID: 7324795427918777633
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-16-articles.csv
Loaded data from 2024-01-16-articles.csv
Video ID: 7325170083137932587
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-17-articles.csv
Loaded data from 2024-01-17-articles.csv
Problem with getting the cosine similarities for video 7325170083137932587
'NoneType' object is not iterable
Video ID: 7325432729946885408
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-18-articles.csv
Loaded data from 2024-01-18-articles.csv
Problem with getting the cosine similarities for video 7325432729946885408
'NoneType' object is not iterable
Video ID: 7304443990173158699
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analy

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Video ID: 7322914431942380831
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-11-articles.csv
Loaded data from 2024-01-11-articles.csv
Video ID: 7320257794832125227
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-04-articles.csv
Loaded data from 2024-01-04-articles.csv
Video ID: 7320299248153644331
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-04-articles.csv
Loaded data from 2024-01-04-articles.csv
Video ID: 7322941190603541806
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-11-articles.csv
Loaded data from 2024-01-11-articles.csv
Video ID: 7322887601323625734
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-11-articles.csv
Loaded data from 2024-01-11-articles.csv
Video ID: 7322844328101989675
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-11-articles.csv
Loaded da

Video ID: 7316886547599609120
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-12-26-articles.csv
Loaded data from 2023-12-26-articles.csv
Video ID: 7305903832339631406
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-11-26-articles.csv
Loaded data from 2023-11-26-articles.csv
Video ID: 7317632544634195231
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-12-28-articles.csv
Loaded data from 2023-12-28-articles.csv
Video ID: 7298646488656203009
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-11-07-articles.csv
Loaded data from 2023-11-07-articles.csv
Problem with getting the cosine similarities for video 7298646488656203009
'NoneType' object is not iterable
Video ID: 7312460303105527041
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-12-14-articles.csv
Loaded data from 2023-12-14-articles.csv
Video ID: 7315639447364603138
L

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Video ID: 7311793904947498286
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-12-12-articles.csv
Loaded data from 2023-12-12-articles.csv
Video ID: 7311546043445906718
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-12-11-articles.csv
Loaded data from 2023-12-11-articles.csv
Video ID: 7283624694681472289
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-09-27-articles.csv
Loaded data from 2023-09-27-articles.csv
Video ID: 7306635194415451423
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-11-28-articles.csv
Loaded data from 2023-11-28-articles.csv
Video ID: 7311070914261585194
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-12-10-articles.csv
Loaded data from 2023-12-10-articles.csv
Video ID: 7311286465080741121
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-12-11-articles.csv
Loaded da

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Loaded data from 2023-12-07-articles.csv
Video ID: 7288036531309235458
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-10-09-articles.csv
Loaded data from 2023-10-09-articles.csv
Video ID: 7310024971839868190
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-12-07-articles.csv
Loaded data from 2023-12-07-articles.csv
Problem with getting the cosine similarities for video 7310024971839868190
'NoneType' object is not iterable
Video ID: 7285072990901898498
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-10-01-articles.csv
Loaded data from 2023-10-01-articles.csv
Video ID: 7308551872070290734
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-12-03-articles.csv
Loaded data from 2023-12-03-articles.csv
Problem with getting the cosine similarities for video 7308551872070290734
'NoneType' object is not iterable
Video ID: 7309685473230949678
Looking for /Users/edit

Loaded data from 2023-11-28-articles.csv
Video ID: 7306248074697985323
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-11-27-articles.csv
Loaded data from 2023-11-27-articles.csv
Video ID: 7306159936478743839
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-11-27-articles.csv
Loaded data from 2023-11-27-articles.csv
Problem with getting the cosine similarities for video 7306159936478743839
'NoneType' object is not iterable
Video ID: 7305990194275093802
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-11-26-articles.csv
Loaded data from 2023-11-26-articles.csv
Problem with getting the cosine similarities for video 7305990194275093802
'NoneType' object is not iterable
Video ID: 7291687866688638251
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-10-19-articles.csv
Loaded data from 2023-10-19-articles.csv
Video ID: 7303695815296535840
Looking for /Users/edit

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Loaded data from 2023-09-15-articles.csv
Problem with getting the cosine similarities for video 7279046355992071457
'NoneType' object is not iterable
Video ID: 7279704894506945798
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-09-17-articles.csv
Loaded data from 2023-09-17-articles.csv
Problem with getting the cosine similarities for video 7279704894506945798
'NoneType' object is not iterable
Video ID: 7280016453565156650
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-09-18-articles.csv
Loaded data from 2023-09-18-articles.csv
Problem with getting the cosine similarities for video 7280016453565156650
'NoneType' object is not iterable
Video ID: 7279568128894340395
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-09-16-articles.csv
Loaded data from 2023-09-16-articles.csv
Video ID: 7280455645134966048
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-09-1

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Video ID: 7276635366172527914
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-09-08-articles.csv
Loaded data from 2023-09-08-articles.csv
Problem with getting the cosine similarities for video 7276635366172527914
'NoneType' object is not iterable


Unnamed: 0,video_id,date,suggested_words,similarities,cos_sim_scores,nyt_headlines
0,7343058460025703722,2024-03-05,"haley, nbc, nicki halley","[(nan, Trump Is Running on Dystopian Fantasies...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...","[Trump Is Running on Dystopian Fantasies, Micr..."
1,7340809195807132971,2024-02-28,"Rebecca Ferguson, hugh jackman, rebecca fergus...","[(0.2236, E-Bike Battery Caused Fire That Kill...","[0.2236, 0.2041, 0.1826, 0.0, 0.0, 0.0, 0.0, 0...",[E-Bike Battery Caused Fire That Killed Young ...
2,7341877699629108523,2024-03-02,"rhode island, wyoming state, 50 state song, ar...","[(0.4243, Scenes From the Smokehouse Creek Fir...","[0.4243, 0.3536, 0.3207, 0.291, 0.2828, 0.2774...",[Scenes From the Smokehouse Creek Fire in Texa...
3,7341485301010877728,2024-03-01,"Wendy Williams, denial is a river in egypt, we...","[(0.3105, If You Had $1 Billion to Give Away, ...","[0.3105, 0.27, 0.252, 0.2224, 0.2224, 0.1782, ...","[If You Had $1 Billion to Give Away, What Char..."
4,7341694662866046254,2024-03-02,"royal family news, kate middleton brother, lad...","[(0.3015, News Leaders Around the World Pledge...","[0.3015, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....",[News Leaders Around the World Pledge Support ...
...,...,...,...,...,...,...
235,7278032406232223019,2023-09-12,"the fault in our stars, The Fault In Our Stars...","[(0.2914, Football Season), (0.2899, N.F.L. Re...","[0.2914, 0.2899, 0.2485, 0.2203, 0.2203, 0.194...","[Football Season, N.F.L. Reporter Files Racial..."
236,7277248227693333802,2023-09-10,"margaritaville, trump ai singing, margaritavil...","[(0.2585, Covid Hero or ‚ÄòLockdown Ron‚Äô? DeSant...","[0.2585, 0.1819, 0.1715, 0.0, 0.0, 0.0, 0.0, 0...",[Covid Hero or ‚ÄòLockdown Ron‚Äô? DeSantis and Tr...
237,7277312227831549230,2023-09-10,"college, College Student, College Life, Time-L...","[(0.3706, Russia‚Äôs ‚ÄòMerchant of Death‚Äô Is Look...","[0.3706, 0.239, 0.2224, 0.1945, 0.1429, 0.138,...",[Russia‚Äôs ‚ÄòMerchant of Death‚Äô Is Looking to Fo...
238,7274906751294524704,2023-09-04,"history, History Tiktok, jewish, Middle Easter...","[(nan, Lea Michele Ends ‚ÄòDream‚Äô Run in ‚ÄòFunny ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",[Lea Michele Ends ‚ÄòDream‚Äô Run in ‚ÄòFunny Girl‚Äô ...


### Account 10824

In [45]:
code = 10824
metadata_file = 'results_10824_full.csv'
news_vid_file = 'videos-newsHashtags-10824_full.json'


metadata = get_pyktok_metadata_by_filename(metadata_file)
news_vids = get_news_vids_by_filename(news_vid_file)
updated_news_vids = add_date_to_news_vids(metadata,news_vids)
dates = updated_news_vids['date'].values
# get_all_nyt_data(dates)

Getting pyktok data from file /Users/edithpo/Downloads/CS315_Project2_Group1/pyktok-results/results_10824_full.csv
Getting news-related videos from file /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/videos-newsHashtags-10824_full.json


  news_vids = pd.read_json(data, orient="split")


In [46]:
# Use get results by filename
code = 10824
metadata_file = 'results_10824_full.csv'
news_vid_file = 'videos-newsHashtags-10824_full.json'
feature = 'video_description'#'suggested_words'
df10824 = get_results_by_filenames(metadata_file,news_vid_file,feature,code)
df10824

Getting pyktok data from file /Users/edithpo/Downloads/CS315_Project2_Group1/pyktok-results/results_10824_full.csv
Getting news-related videos from file /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/videos-newsHashtags-10824_full.json
Video ID: 7333618266436406561
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-09-articles.csv
Loaded data from 2024-02-09-articles.csv
Video ID: 7333618266436406561
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-09-articles.csv
Loaded data from 2024-02-09-articles.csv


  news_vids = pd.read_json(data, orient="split")


Video ID: 7337266783147101446
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-19-articles.csv
Loaded data from 2024-02-19-articles.csv
Video ID: 7337266783147101446
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-19-articles.csv
Loaded data from 2024-02-19-articles.csv
Video ID: 7339991777320979754
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-26-articles.csv
Loaded data from 2024-02-26-articles.csv
Video ID: 7339991777320979754
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-26-articles.csv
Loaded data from 2024-02-26-articles.csv
Video ID: 7326271896960503072
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-20-articles.csv
No columns to parse from file
File 2024-01-20-articles.csv not found. Creating it instead.
No columns to parse from file
'headline' not found in nyt_df.
Video ID: 732627189696050307

Video ID: 7339089508496969003
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-24-articles.csv
Loaded data from 2024-02-24-articles.csv
Video ID: 7339952744247102762
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-26-articles.csv
Loaded data from 2024-02-26-articles.csv
Video ID: 7339289170008919338
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-24-articles.csv
Loaded data from 2024-02-24-articles.csv
Video ID: 7338819967816666410
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-23-articles.csv
Loaded data from 2024-02-23-articles.csv
Video ID: 7326490307707669792
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-21-articles.csv
Loaded data from 2024-01-21-articles.csv
Video ID: 7339900052472630560
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-26-articles.csv
Loaded da

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Video ID: 7329539172828532000
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-29-articles.csv
Loaded data from 2024-01-29-articles.csv
Video ID: 7336273394998054176
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-16-articles.csv
Loaded data from 2024-02-16-articles.csv
Video ID: 7319853953431129377
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-03-articles.csv
Loaded data from 2024-01-03-articles.csv
Video ID: 7328575268031860010
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-26-articles.csv
Loaded data from 2024-01-26-articles.csv
Video ID: 7318887789049761066
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-12-31-articles.csv
Loaded data from 2023-12-31-articles.csv
Video ID: 7320994353005006112
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-06-articles.csv
Loaded da

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Loaded data from 2024-02-18-articles.csv
Video ID: 7338653354014297374
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-22-articles.csv
Loaded data from 2024-02-22-articles.csv
Video ID: 7337822676783795499
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-20-articles.csv
Loaded data from 2024-02-20-articles.csv
Video ID: 7339224135303728414
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-24-articles.csv
Loaded data from 2024-02-24-articles.csv
Video ID: 7334474436877258016
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-11-articles.csv
Loaded data from 2024-02-11-articles.csv
Video ID: 7325892642158431519
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-21-articles.csv
Loaded data from 2024-01-21-articles.csv
Video ID: 7338939866111085857
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/n

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Loaded data from 2024-01-28-articles.csv
Video ID: 7338602232297966890
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-22-articles.csv
Loaded data from 2024-02-22-articles.csv
Video ID: 7334689898064022789
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-12-articles.csv
Loaded data from 2024-02-12-articles.csv
Video ID: 7339176915997150506
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-24-articles.csv
Loaded data from 2024-02-24-articles.csv
Video ID: 7339304215627517227
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-24-articles.csv
Loaded data from 2024-02-24-articles.csv
Video ID: 7337282652317191454
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-19-articles.csv
Loaded data from 2024-02-19-articles.csv
Video ID: 7332957228292377861
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/n

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Video ID: 7338617273747557678
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-22-articles.csv
Loaded data from 2024-02-22-articles.csv
Video ID: 7338555590450564398
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-22-articles.csv
Loaded data from 2024-02-22-articles.csv
Video ID: 7339354994438573358
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-24-articles.csv
Loaded data from 2024-02-24-articles.csv
Video ID: 7338521826043153706
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-22-articles.csv
Loaded data from 2024-02-22-articles.csv
Video ID: 7339274835152964910
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-24-articles.csv
Loaded data from 2024-02-24-articles.csv
Video ID: 7339350159911947563
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-24-articles.csv
Loaded da

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Loaded data from 2024-01-13-articles.csv
Video ID: 7323422936705092906
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-12-articles.csv
Loaded data from 2024-01-12-articles.csv
Video ID: 7329236164794158368
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-28-articles.csv
Loaded data from 2024-01-28-articles.csv
Video ID: 7324348018424024362
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-15-articles.csv
Loaded data from 2024-01-15-articles.csv
Video ID: 7328489835084123435
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-26-articles.csv
Loaded data from 2024-01-26-articles.csv
Video ID: 7323553422500711723
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-13-articles.csv
Loaded data from 2024-01-13-articles.csv
Video ID: 7332048885537344801
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/n

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))
  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Loaded data from 2023-12-23-articles.csv
Video ID: 7330440084958842155
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-31-articles.csv
Loaded data from 2024-01-31-articles.csv
Video ID: 7333852267956686126
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-10-articles.csv
Loaded data from 2024-02-10-articles.csv
Video ID: 7331858271298637102
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-04-articles.csv
Loaded data from 2024-02-04-articles.csv
Video ID: 7333678427683228959
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-09-articles.csv
Loaded data from 2024-02-09-articles.csv
Video ID: 7329630685570845994
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-29-articles.csv
Loaded data from 2024-01-29-articles.csv
Video ID: 7333675404965793054
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/n

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Loaded data from 2024-02-01-articles.csv
Video ID: 7328708278207204650
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-27-articles.csv
Loaded data from 2024-01-27-articles.csv
Video ID: 7333257454932839723
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-08-articles.csv
Loaded data from 2024-02-08-articles.csv
Video ID: 7331513739105045802
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-02-03-articles.csv
Loaded data from 2024-02-03-articles.csv
Video ID: 7311887489206209834
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-12-12-articles.csv
Loaded data from 2023-12-12-articles.csv
Video ID: 7298001050705972522
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-11-05-articles.csv
Loaded data from 2023-11-05-articles.csv
Video ID: 7311341277436497158
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/n

Unnamed: 0,video_id,date,video_description,similarities,cos_sim_scores,nyt_headlines
0,7333618266436406561,2024-02-09,all sh1ts & giggles until someone giggles & sh...,"[(0.5658, When the Food Is Lovingly Prepared b...","[0.5658, 0.5401, 0.5401, 0.488, 0.4629, 0.4629...",[When the Food Is Lovingly Prepared by the Act...
1,7333618266436406561,2024-02-09,all sh1ts & giggles until someone giggles & sh...,"[(0.5658, When the Food Is Lovingly Prepared b...","[0.5658, 0.5401, 0.5401, 0.488, 0.4629, 0.4629...",[When the Food Is Lovingly Prepared by the Act...
2,7337266783147101446,2024-02-19,They‚Äôll cook up any reason for you to transfer...,"[(0.3849, U.S. Strike Killed Afghans Recruited...","[0.3849, 0.2582, 0.2182, 0.2182, 0.2041, 0.192...",[U.S. Strike Killed Afghans Recruited to Fight...
3,7337266783147101446,2024-02-19,They‚Äôll cook up any reason for you to transfer...,"[(0.3849, U.S. Strike Killed Afghans Recruited...","[0.3849, 0.2582, 0.2182, 0.2182, 0.2041, 0.192...",[U.S. Strike Killed Afghans Recruited to Fight...
4,7339991777320979754,2024-02-26,The Montefiore Albert Einstein School of Medic...,"[(0.5678, Quotation of the Day: Florida Man Is...","[0.5678, 0.5476, 0.501, 0.4851, 0.4766, 0.4736...",[Quotation of the Day: Florida Man Is In on th...
...,...,...,...,...,...,...
153,7298001050705972522,2023-11-05,That squeaky toy never stood a chance. #Pupper...,"[(0.5298, A Chance Encounter and a Fugitive Li...","[0.5298, 0.3849, 0.2182, 0.2041, 0.1826, 0.182...",[A Chance Encounter and a Fugitive Linked to a...
154,7311341277436497158,2023-12-11,What's the problem with Chelsea And Manchester...,"[(0.433, The Power Vacuum at the Top of the Cr...","[0.433, 0.3651, 0.3482, 0.3482, 0.3333, 0.3203...",[The Power Vacuum at the Top of the Crypto Ind...
155,7307082877282340126,2023-11-29,Someone needs to hit him so we can find out ho...,"[(0.3198, How to Actually Work Out With Your K...","[0.3198, 0.3015, 0.2417, 0.2132, 0.1907, 0.190...","[How to Actually Work Out With Your Kids, Is S..."
156,7306342533108452650,2023-11-27,Trinidadian Accent - Between the Scenes Trevor...,"[(0.3892, Biden Hopes to Alter the Trajectory ...","[0.3892, 0.3345, 0.3114, 0.3015, 0.286, 0.2727...",[Biden Hopes to Alter the Trajectory of the Wa...


### Account 50405

In [48]:
%%time
code = 50405
metadata_file = 'results_50405_full.csv'
news_vid_file = 'videos-newsHashtags-50405_full.json'
feature = 'video_description'#'suggested_words'
df50405 = get_results_by_filenames(metadata_file,news_vid_file,feature,code)
df50405

Getting pyktok data from file /Users/edithpo/Downloads/CS315_Project2_Group1/pyktok-results/results_50405_full.csv
Getting news-related videos from file /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/videos-newsHashtags-50405_full.json
Video ID: 7324495899001081134
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-15-articles.csv
Loaded data from 2024-01-15-articles.csv


  news_vids = pd.read_json(data, orient="split")


Video ID: 7328132247750528298
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-25-articles.csv
Loaded data from 2024-01-25-articles.csv
Video ID: 7328586289157066027
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-26-articles.csv
Loaded data from 2024-01-26-articles.csv
Video ID: 7328915227339263278
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-27-articles.csv
Loaded data from 2024-01-27-articles.csv
Video ID: 7326245192158252330
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-20-articles.csv
No columns to parse from file
File 2024-01-20-articles.csv not found. Creating it instead.
No columns to parse from file
'headline' not found in nyt_df.
Video ID: 7324784817080421674
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-16-articles.csv
Loaded data from 2024-01-16-articles.csv


  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))
  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Video ID: 7322099351638101290
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-09-articles.csv
Loaded data from 2024-01-09-articles.csv
Video ID: 7323305405579365664
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-12-articles.csv
Loaded data from 2024-01-12-articles.csv
Video ID: 7324795427918777633
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-16-articles.csv
Loaded data from 2024-01-16-articles.csv
Video ID: 7324838891402349866
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-16-articles.csv
Loaded data from 2024-01-16-articles.csv
Video ID: 7324740645396991263
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-16-articles.csv
Loaded data from 2024-01-16-articles.csv
Video ID: 7323644304512142634
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-13-articles.csv
Loaded da

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Video ID: 7312610914606468398
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-12-14-articles.csv
Loaded data from 2023-12-14-articles.csv
Video ID: 7324521742255328555
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-15-articles.csv
Loaded data from 2024-01-15-articles.csv
Video ID: 7311491625362951467
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-12-11-articles.csv
Loaded data from 2023-12-11-articles.csv
Video ID: 7324463384307567918
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-15-articles.csv
Loaded data from 2024-01-15-articles.csv
Video ID: 7323728490744827182
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-13-articles.csv
Loaded data from 2024-01-13-articles.csv
Video ID: 7319592285451095342
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-02-articles.csv
Loaded da

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Video ID: 7322952037258267950
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-11-articles.csv
Loaded data from 2024-01-11-articles.csv
Video ID: 7309514540629134635
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-12-06-articles.csv
Loaded data from 2023-12-06-articles.csv
Video ID: 7322620149117242670
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-10-articles.csv
Loaded data from 2024-01-10-articles.csv
Video ID: 7302223864006724910
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-11-16-articles.csv
[Errno 2] No such file or directory: '/Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-11-16-articles.csv'
File 2023-11-16-articles.csv not found. Creating it instead.
Successful get request.
3734 documents found for 2023-11.
174 documents found for 2023-11-16.
Saving new file: 2023-11-16-articles.csv
Video ID: 7322670461949611307

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))
  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Loaded data from 2024-01-06-articles.csv
Video ID: 7300811143365299462
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-11-13-articles.csv
Loaded data from 2023-11-13-articles.csv
Video ID: 7301069391104429355
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-11-13-articles.csv
Loaded data from 2023-11-13-articles.csv
Video ID: 7321433767913082143
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-07-articles.csv
Loaded data from 2024-01-07-articles.csv
Video ID: 7321541810004757802
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-07-articles.csv
Loaded data from 2024-01-07-articles.csv
Video ID: 7317864696747642158
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-12-28-articles.csv
Loaded data from 2023-12-28-articles.csv
Video ID: 7317157949695085855
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/n

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Successful get request.
3525 documents found for 2023-12.
95 documents found for 2023-12-27.
Saving new file: 2023-12-27-articles.csv
Video ID: 7320699425712540971
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-05-articles.csv
Loaded data from 2024-01-05-articles.csv
Video ID: 7302250831213120798
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-11-16-articles.csv
Loaded data from 2023-11-16-articles.csv
Video ID: 7320311584209849642
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-04-articles.csv
Loaded data from 2024-01-04-articles.csv
Video ID: 7309961465216257322
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-12-07-articles.csv
Loaded data from 2023-12-07-articles.csv
Video ID: 7311706941280013610
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-12-12-articles.csv
Loaded data from 2023-12-12-articles.csv
Video 

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


[Errno 2] No such file or directory: '/Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2022-09-12-articles.csv'
File 2022-09-12-articles.csv not found. Creating it instead.
Unsuccessful get request.
Saving new file: 2022-09-12-articles.csv
'headline' not found in nyt_df.
Video ID: 7320325173209517317
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-04-articles.csv
Loaded data from 2024-01-04-articles.csv
Video ID: 7320286987527752990
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-04-articles.csv
Loaded data from 2024-01-04-articles.csv
Video ID: 7320684994173685035
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-05-articles.csv
Loaded data from 2024-01-05-articles.csv
Video ID: 7320392676593995050
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2024-01-04-articles.csv
Loaded data from 2024-01-04-articles.csv
Video ID: 73202431389231

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Unsuccessful get request.
Saving new file: 2023-11-15-articles.csv
'headline' not found in nyt_df.
Video ID: 7316954481332112671
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-12-26-articles.csv
Loaded data from 2023-12-26-articles.csv
Video ID: 7311167714217790751
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-12-10-articles.csv
Loaded data from 2023-12-10-articles.csv
Video ID: 7318887789049761066
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-12-31-articles.csv
Loaded data from 2023-12-31-articles.csv
Video ID: 7317407685878484255
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-12-27-articles.csv
Loaded data from 2023-12-27-articles.csv
Video ID: 7317463403184868651
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-12-27-articles.csv
Loaded data from 2023-12-27-articles.csv
Video ID: 6835398606862732549
Looking for

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Unsuccessful get request.
Saving new file: 2023-12-20-articles.csv
'headline' not found in nyt_df.
Video ID: 7299235442908581163
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-11-08-articles.csv
[Errno 2] No such file or directory: '/Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-11-08-articles.csv'
File 2023-11-08-articles.csv not found. Creating it instead.
Unsuccessful get request.
Saving new file: 2023-11-08-articles.csv
'headline' not found in nyt_df.
Video ID: 7286970136056171822
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-10-06-articles.csv
[Errno 2] No such file or directory: '/Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-10-06-articles.csv'
File 2023-10-06-articles.csv not found. Creating it instead.
Unsuccessful get request.
Saving new file: 2023-10-06-articles.csv
'headline' not found in nyt_df.
Video ID: 7287238250723462446
Looking for /Users/edithpo/Downlo

  cosine = np.dot(V1,V2)/(norm(V1)*norm(V2))


Unsuccessful get request.
Saving new file: 2023-09-09-articles.csv
'headline' not found in nyt_df.
Video ID: 7273270653707390241
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-08-30-articles.csv
[Errno 2] No such file or directory: '/Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-08-30-articles.csv'
File 2023-08-30-articles.csv not found. Creating it instead.
Unsuccessful get request.
Saving new file: 2023-08-30-articles.csv
'headline' not found in nyt_df.
Video ID: 7274133453950520619
Looking for /Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-09-02-articles.csv
[Errno 2] No such file or directory: '/Users/edithpo/Downloads/CS315_Project2_Group1/analysis/nyt_data/2023-09-02-articles.csv'
File 2023-09-02-articles.csv not found. Creating it instead.
Unsuccessful get request.
Saving new file: 2023-09-02-articles.csv
'headline' not found in nyt_df.
Video ID: 7271374993664134442
Looking for /Users/edithpo/Downlo

Unnamed: 0,video_id,date,video_description,similarities,cos_sim_scores,nyt_headlines
0,7324495899001081134,2024-01-15,*removes browns sweater from cart* #browns #cl...,"[(0.3162, See the Looks From the Emmys Red Car...","[0.3162, 0.3015, 0.3015, 0.0, 0.0, 0.0, 0.0, 0...","[See the Looks From the Emmys Red Carpet, U.S...."
1,7328132247750528298,2024-01-25,#stitch with @Garza Crew #greenscreen #influen...,"[(0.1741, John Pilger, 84, Dies; Journalist an...","[0.1741, 0.1741, 0.1667, 0.1667, 0.1667, 0.160...","[John Pilger, 84, Dies; Journalist and Filmmak..."
2,7328586289157066027,2024-01-26,Erie is under a spell with all of the fog. Thi...,"[(0.3651, With ‚ÄòMasters of the Air,‚Äô a 10-Year...","[0.3651, 0.3651, 0.3273, 0.3086, 0.3062, 0.298...","[With ‚ÄòMasters of the Air,‚Äô a 10-Year Dream Li..."
3,7328915227339263278,2024-01-27,@zay dante We have a voice and a chance in the...,"[(0.5217, The S&P 500 Through the Prism of a ‚Äò...","[0.5217, 0.483, 0.4619, 0.4472, 0.4216, 0.4082...",[The S&P 500 Through the Prism of a ‚ÄòMagnifice...
4,7326245192158252330,2024-01-20,San Francisco is still a great place this is j...,[],[],[]
...,...,...,...,...,...,...
106,7273137344666963242,2023-08-30,AUSTIN RESIDENT DIES FROM BRAIN EATING INFECTI...,[],[],[]
107,7208988709159570693,2023-03-10,Best shampoos you should be using in 2023 for ...,"[(0.3062, In Dubai, a Wedding Steeped in Pales...","[0.3062, 0.3062, 0.25, 0.2236, 0.2236, 0.2132,...","[In Dubai, a Wedding Steeped in Palestinian an..."
108,7262446668824743214,2023-08-01,Road trip cold@Kristen #covid #fyp #cold,"[(0.2041, The Covid Test Company That Got Into...","[0.2041, 0.1826, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",[The Covid Test Company That Got Into the Migr...
109,7267663486900112682,2023-08-15,An 18-year-old woman has died after an apparen...,"[(0.3819, ‚ÄòReinventing Elvis: The ‚Äô68 Comeback...","[0.3819, 0.3669, 0.3586, 0.3571, 0.3273, 0.327...",[‚ÄòReinventing Elvis: The ‚Äô68 Comeback‚Äô Review:...
