In [2]:
import pandas as pd
import json
import requests
import concurrent.futures
import numpy as np
import time

In [3]:
# Load the raw chart dump. JSON text is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's locale encoding.
with open('all.json', 'r', encoding='utf-8') as file:
    json_data = json.load(file)

# Flatten the nested structure: every element of an entry's 'data' list
# becomes one row, and that entry's 'date' is repeated on each of its rows.
# errors='ignore' tolerates entries that lack the 'date' key instead of
# raising KeyError.
df = pd.json_normalize(
    json_data,
    record_path='data',
    meta=['date'],
    errors='ignore',
)


In [4]:
# Convert the 'date' column to datetime values (bracket access makes it
# explicit that a column, not an attribute, is being replaced).
df['date'] = pd.to_datetime(df['date'])

In [5]:
# Order rows by date descending; within a date, by weeks_on_chart descending;
# remaining ties are broken by the smaller this_week value first.
sorted_df = df.sort_values(
    by=['date', 'weeks_on_chart', 'this_week'],
    ascending=[False, False, True],
)

In [6]:
# Spot check: every row whose artist credit contains "Gracie Abrams"
# (substring match, so collaboration credits are included as well).
sorted_df.loc[sorted_df['artist'].str.contains("Gracie Abrams")]

Unnamed: 0,song,artist,this_week,last_week,peak_position,weeks_on_chart,date
344569,"I Love You, I'm Sorry",Gracie Abrams,83,,83,1,2024-08-17
344065,Close To You,Gracie Abrams,79,60.0,49,4,2024-07-13
344060,Us.,Gracie Abrams Featuring Taylor Swift,74,36.0,36,2,2024-07-13
343946,Close To You,Gracie Abrams,60,69.0,49,3,2024-07-06
343922,Us.,Gracie Abrams Featuring Taylor Swift,36,,36,1,2024-07-06
343980,Risk,Gracie Abrams,94,,94,1,2024-07-06
343855,Close To You,Gracie Abrams,69,49.0,49,2,2024-06-29
343735,Close To You,Gracie Abrams,49,,49,1,2024-06-22
341065,"Everywhere, Everything",Noah Kahan With Gracie Abrams,79,,79,1,2023-12-16


In [7]:
# Collapse to one row per (song, artist) pair. sorted_df is ordered by date
# descending, so keep='first' retains each pair's most recent chart row.
no_dupes = sorted_df.drop_duplicates(
    subset=['song', 'artist'],
    keep='first',
)

In [8]:
# Display the sorted chart rows to eyeball the ordering.
sorted_df

Unnamed: 0,song,artist,this_week,last_week,peak_position,weeks_on_chart,date
344495,Lose Control,Teddy Swims,9,6.0,1,52,2024-08-17
344507,I Remember Everything,Zach Bryan Featuring Kacey Musgraves,21,24.0,1,50,2024-08-17
344519,Stick Season,Noah Kahan,33,30.0,9,45,2024-08-17
344529,Feather,Sabrina Carpenter,43,38.0,21,36,2024-08-17
344499,Cowgirls,Morgan Wallen Featuring ERNEST,13,15.0,12,34,2024-08-17
...,...,...,...,...,...,...,...
95,Over And Over,Thurston Harris,96,,96,1,1958-08-04
96,I Believe In You,Robert & Johnny,97,,97,1,1958-08-04
97,Little Serenade,The Ames Brothers,98,,98,1,1958-08-04
98,I'll Get By (As Long As I Have You),Billy Williams,99,,99,1,1958-08-04


In [9]:
# Read the pre-processed workbook that carries the lyrics and per-song
# feature columns used in the join below.
dataset_with_lyrics = pd.read_excel(
    'Final processed dataset.xlsx',
)

In [10]:
# Display the deduplicated (song, artist) rows.
no_dupes

Unnamed: 0,song,artist,this_week,last_week,peak_position,weeks_on_chart,date
344495,Lose Control,Teddy Swims,9,6.0,1,52,2024-08-17
344507,I Remember Everything,Zach Bryan Featuring Kacey Musgraves,21,24.0,1,50,2024-08-17
344519,Stick Season,Noah Kahan,33,30.0,9,45,2024-08-17
344529,Feather,Sabrina Carpenter,43,38.0,21,36,2024-08-17
344499,Cowgirls,Morgan Wallen Featuring ERNEST,13,15.0,12,34,2024-08-17
...,...,...,...,...,...,...,...
89,Stay,The Ames Brothers,90,,90,1,1958-08-04
95,Over And Over,Thurston Harris,96,,96,1,1958-08-04
97,Little Serenade,The Ames Brothers,98,,98,1,1958-08-04
98,I'll Get By (As Long As I Have You),Billy Williams,99,,99,1,1958-08-04


In [11]:
# Summarise the lyrics dataset: column names, non-null counts and dtypes.
dataset_with_lyrics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22600 entries, 0 to 22599
Data columns (total 49 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               22600 non-null  int64  
 1   url                      22600 non-null  object 
 2   WeekID                   22600 non-null  object 
 3   Week Position            22600 non-null  int64  
 4   Song                     22600 non-null  object 
 5   Performer                22600 non-null  object 
 6   SongID                   22600 non-null  object 
 7   Instance                 22046 non-null  float64
 8   Previous Week Position   5058 non-null   float64
 9   Peak Position            22046 non-null  float64
 10  Weeks on Chart           22046 non-null  float64
 11  Lyrics                   22600 non-null  object 
 12  Artist                   22600 non-null  object 
 13  words                    22600 non-null  object 
 14  wordCount             

In [12]:
# Display no_dupes once more, right before its artist column is normalised.
no_dupes

Unnamed: 0,song,artist,this_week,last_week,peak_position,weeks_on_chart,date
344495,Lose Control,Teddy Swims,9,6.0,1,52,2024-08-17
344507,I Remember Everything,Zach Bryan Featuring Kacey Musgraves,21,24.0,1,50,2024-08-17
344519,Stick Season,Noah Kahan,33,30.0,9,45,2024-08-17
344529,Feather,Sabrina Carpenter,43,38.0,21,36,2024-08-17
344499,Cowgirls,Morgan Wallen Featuring ERNEST,13,15.0,12,34,2024-08-17
...,...,...,...,...,...,...,...
89,Stay,The Ames Brothers,90,,90,1,1958-08-04
95,Over And Over,Thurston Harris,96,,96,1,1958-08-04
97,Little Serenade,The Ames Brothers,98,,98,1,1958-08-04
98,I'll Get By (As Long As I Have You),Billy Williams,99,,99,1,1958-08-04


In [13]:
# Lower-case the artist credit ahead of the join on (song, artist).
# Rebinding through .assign() builds a new frame rather than writing into the
# object returned by drop_duplicates(), which is what triggered pandas'
# SettingWithCopyWarning with the attribute-style assignment.
no_dupes = no_dupes.assign(artist=no_dupes['artist'].str.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_dupes.artist = no_dupes.artist.str.lower()


In [14]:
# Left-join the lyrics dataset onto the deduplicated chart rows; chart rows
# with no (Song, Performer) match keep NaN in every lyrics-side column.
# NOTE(review): only `artist` is lower-cased before this join, `song` is not —
# confirm the casing of Song/Performer in the lyrics file lines up.
merged_df = no_dupes.merge(
    dataset_with_lyrics,
    how='left',
    left_on=['song', 'artist'],
    right_on=['Song', 'Performer'],
)

In [15]:
# Display the joined frame to see which chart rows picked up lyrics columns.
merged_df

Unnamed: 0.1,song,artist,this_week,last_week,peak_position,weeks_on_chart,date,Unnamed: 0,url,WeekID,...,anticipation_normalized,fear,fear_normalized,surprise,surprise_normalized,emo_score,happy,happy_normalized,sorrow,sorrow_normalized
0,Lose Control,teddy swims,9,6.0,1,52,2024-08-17,,,,...,,,,,,,,,,
1,I Remember Everything,zach bryan featuring kacey musgraves,21,24.0,1,50,2024-08-17,,,,...,,,,,,,,,,
2,Stick Season,noah kahan,33,30.0,9,45,2024-08-17,,,,...,,,,,,,,,,
3,Feather,sabrina carpenter,43,38.0,21,36,2024-08-17,,,,...,,,,,,,,,,
4,Cowgirls,morgan wallen featuring ernest,13,15.0,12,34,2024-08-17,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31445,Stay,the ames brothers,90,,90,1,1958-08-04,,,,...,,,,,,,,,,
31446,Over And Over,thurston harris,96,,96,1,1958-08-04,47343.0,http://www.billboard.com/charts/hot-100/1958-0...,8/2/1958,...,0.060645,1.773,0.014653,0.282,0.002331,19.844,6.817,0.056339,13.027,0.107661
31447,Little Serenade,the ames brothers,98,,98,1,1958-08-04,25505.0,http://www.billboard.com/charts/hot-100/1958-0...,8/2/1958,...,0.056108,2.118,0.019081,1.094,0.009856,36.331,24.825,0.223649,11.506,0.103658
31448,I'll Get By (As Long As I Have You),billy williams,99,,99,1,1958-08-04,,,,...,,,,,,,,,,


In [16]:
# View only the Lyrics column (kept as a one-column DataFrame).
merged_df.loc[:, ['Lyrics']]

Unnamed: 0,Lyrics
0,
1,
2,
3,
4,
...,...
31445,
31446,doo doo doo doo\n\nwell i went to a dance the ...
31447,the ames brothers\nmiscellaneous\nmelodie d'am...
31448,


In [17]:
# Summarise the joined frame; the non-null counts on the lyrics-side columns
# show how many chart rows found a match.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31450 entries, 0 to 31449
Data columns (total 56 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   song                     31450 non-null  object        
 1   artist                   31450 non-null  object        
 2   this_week                31450 non-null  int64         
 3   last_week                27502 non-null  float64       
 4   peak_position            31450 non-null  int64         
 5   weeks_on_chart           31450 non-null  int64         
 6   date                     31450 non-null  datetime64[ns]
 7   Unnamed: 0               19506 non-null  float64       
 8   url                      19506 non-null  object        
 9   WeekID                   19506 non-null  object        
 10  Week Position            19506 non-null  float64       
 11  Song                     19506 non-null  object        
 12  Performer                19506 n

In [18]:
# Chart rows that still have no lyrics after the merge (Lyrics is null).
missing_lyrics = merged_df.loc[merged_df['Lyrics'].isna()]