In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import zipfile
import os
import json
import re
from html.parser import HTMLParser
import csv
import json


# Attain and Load the genius.com data

In [4]:
# used for cleaning up the genius annotation and lyrics

class MyHTMLParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.capture = False
        self.captured_data = []

    def handle_starttag(self, tag, attrs):
        if tag == 'p':
            self.capture = True

    def handle_endtag(self, tag):
        if tag == 'p':
            self.capture = False

    def handle_data(self, data):
        if self.capture:
            self.captured_data.append(data)

def parse_content(html_content):
    parser = MyHTMLParser()
    parser.feed(html_content)
    return parser.captured_data

In [5]:

# these function attained from https://github.com/cptq/genius-expertise/tree/master/data
# of how to load the annotation data, song data and the lyrics data

def load_annotation_info(reviewed=True, max_lines=100):
    annotation_info = []
    with open("/content/gdrive/My Drive/AC215 Project/annotation_info.json", 'r') as f:
        for i, line in enumerate(f):
            j = json.loads(line)
            # only choose the annotation that is verified by the artist
            if reviewed and j['type'] == 'verified':
                annotation_info.append(j)
            elif not reviewed:
                annotation_info.append(j)
    return annotation_info



def song_info_gen():
    ''' Loads song information.'''
    with open(f'/content/gdrive/My Drive/AC215 Project/song_info.json', 'r') as f:
        song_info = f.readlines()
        song_info = map(json.loads, song_info)
    return song_info


def lyrics_info_gen():
    '''lyrics information generator.'''
    with open(f'/content/gdrive/My Drive/AC215 Project/lyrics.jl', 'r') as f:
        lyrics_info = f.readlines()
    return map(json.loads, lyrics_info)


# Annotation Data

In [6]:
annotations = load_annotation_info(reviewed=True, max_lines=100)
len(annotations)

8547

In [None]:
# Path to save the CSV file
csv_file_path = '/content/gdrive/My Drive/AC215 Project/annotations.csv'

# Writing to a CSV file
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=["Artist", "Song", "Lyrics", "Annotation"])
    writer.writeheader()
    for annotation in annotations:
        # clean up the annotation
        p_content = parse_content(annotation["content"])
        # print(p_content)
        for content in p_content:
            # print(content)
            writer.writerow({
                "Artist": annotation["artist"],
                "Song": annotation["song"],
                "Lyrics": annotation["lyrics"],
                "Annotation": content
            })

print(f'Data has been written to {csv_file_path}')

Data has been written to /content/gdrive/My Drive/AC215 Project/annotations.csv


In [7]:
csv_file_path = '/content/gdrive/My Drive/AC215 Project/annotations.csv'
df = pd.read_csv(csv_file_path)
df

Unnamed: 0,Artist,Song,Lyrics,Annotation
0,KendrickLamar,Kendrick-lamar-swimming-pools-drank-lyrics,Now I done grew up 'round some people livin' t...,This song is about me reminising about my year...
1,KendrickLamar,Kendrick-lamar-swimming-pools-drank-lyrics,Backstroke every day in Chicago,back stroke. playing on the word swimming pool...
2,KendrickLamar,Kendrick-lamar-money-trees-lyrics,I fucked Sherane and went to tell my bros (tel...,That was a metaphor for both. Not saying that ...
3,JayRock,Kendrick-lamar-money-trees-lyrics,[Verse 3: Jay Rock],"I first heard the record before it dropped, ri..."
4,JayRock,Kendrick-lamar-money-trees-lyrics,Imagine Rock up in them projects\nWhere them n...,Moving out the projects was hard. That’s home....
...,...,...,...,...
15546,BBP,Pnl-naha-lyrics,[Produit par BBP],"J'ai fais plein de prods, je leur ai envoyé et..."
15547,BBP,Pnl-je-thaine-version-orange-lyrics,[Produit par BBP],"À la base j'avais fait cette prod séparément, ..."
15548,BBP,Pnl-je-thaine-version-orange-lyrics,[Produit par BBP],Vald
15549,BBP,Pnl-je-thaine-version-orange-lyrics,[Produit par BBP],", mais son DA la trouvait “sans intérêt”. Du c..."


# Song Data

In [8]:
song_info = song_info_gen()

In [9]:
song_info_list = list(song_info)

In [10]:
song_info_df = pd.DataFrame(song_info_list)
song_info_df.head()

Unnamed: 0,url_name,title,primary_artist,release_date,pyongs,contributors,has_bio,views,tags,annotations
0,Kendrick-lamar-swimming-pools-drank-lyrics,Swimming Pools (Drank),Kendrick-lamar,"July 31, 2012",894.0,403,True,5589280.0,"[Trap, Conscious Hip-Hop, Memes, West Coast, R...",
1,Kendrick-lamar-money-trees-lyrics,Money Trees,Kendrick-lamar,"October 22, 2012",880.0,394,True,4592003.0,"[Conscious Hip-Hop, West Coast, Rap, Producer]",
2,Kendrick-lamar-xxx-lyrics,XXX.,Kendrick-lamar,"April 14, 2017",188.0,389,True,4651514.0,"[Conscious Hip-Hop, Boom Bap, Pop, West Coast,...",
3,A-ap-rocky-fuckin-problems-lyrics,Fuckin’ Problems,A-ap-rocky,"October 24, 2012",706.0,437,True,7378309.0,"[Gangsta Rap, Dirty South, Atlanta, Posse Cut,...",
4,Kendrick-lamar-dna-lyrics,DNA.,Kendrick-lamar,"April 14, 2017",555.0,570,True,5113687.0,"[Politics, Producer, News, Conscious Hip-Hop, ...",


# Lyrics Data

In [11]:
lyrics_info = lyrics_info_gen()

In [12]:
lyrics_info = list(lyrics_info)
lyrics_info_df = pd.DataFrame(lyrics_info)
lyrics_info_df["lyrics"]

Unnamed: 0,lyrics
0,\n\n[Produced by T-Minus]\n\n[Intro]\nPour up ...
1,\n\n[Produced by DJ Dahi]\n\n[Verse 1: Kendric...
2,"\n\n[Intro: Bēkon & Kid Capri]\nAmerica, God b..."
3,"\n\n[Chorus: 2 Chainz, Drake & Both (A$AP Rock..."
4,"\n\n[Verse 1]\nI got, I got, I got, I got—\nLo..."
...,...
37988,"\n\n[Intro : N.O.S.]\nOuais, ouais, ouais, oua..."
37989,"\n\n[Couplet 1 : Ademo]\nChang, chang, chang, ..."
37990,"\n\n[Intro: N.O.S]\nOuais, Ah on va voir\nOuai..."
37991,\n\n[Produit par BBP]\n\n[Couplet 1 : Ademo]\n...


In [13]:
def clean_lyrics_dataset(df):
    def clean_text(text):
        if not isinstance(text, str):
            return text

        # Remove literal '\n' strings
        text = text.replace('\\n', ' ')

        # Remove section headers in brackets
        text = re.sub(r'\[[^\]]*\]', '', text)

        # Remove special characters and numbers
        text = re.sub(r'\*+\d+\*+', '', text)
        text = re.sub(r'\*\*\d+\*\*', '', text)

        # Remove producer tags
        text = re.sub(r'\[Produit par[^\]]*\]', '', text)

        # Replace all types of newlines with spaces
        text = text.replace('\n', ' ')
        text = text.replace('\r', ' ')

        # Remove extra spaces
        text = re.sub(r'\s+', ' ', text)

        return text.strip()

    # Create a copy to avoid modifying the original
    cleaned_df = df.copy()

    # Apply cleaning to the lyrics column
    cleaned_df["lyrics"] = cleaned_df["lyrics"].apply(clean_text)

    return cleaned_df


In [14]:
cleaned_lyrics = clean_lyrics_dataset(lyrics_info_df)
cleaned_lyrics

Unnamed: 0,song,lyrics
0,Kendrick-lamar-swimming-pools-drank-lyrics,"Pour up (Drank), head shot (Drank) Sit down (D..."
1,Kendrick-lamar-money-trees-lyrics,"Uh, me and my niggas tryna get it, ya bish (ya..."
2,Kendrick-lamar-xxx-lyrics,"America, God bless you if it's good to you Ame..."
3,A-ap-rocky-fuckin-problems-lyrics,"I love bad bitches, that's my fuckin' problem ..."
4,Kendrick-lamar-dna-lyrics,"I got, I got, I got, I got— Loyalty, got royal..."
...,...,...
37988,Pnl-tchiki-tchiki-lyrics,"Ouais, ouais, ouais, ouais, ouais J'sens l'ode..."
37989,Pnl-chang-lyrics,"Chang, chang, chang, j'marche sur la muraille ..."
37990,Pnl-simba-lyrics,"Ouais, Ah on va voir Ouais, ouais Et ça r'comm..."
37991,Pnl-je-thaine-version-orange-lyrics,Que dire à part que ça pue dans la street à pa...


# Combine Song, Lyrics and Annotation

In [15]:
song_info_df['url_name'] = song_info_df['url_name'].astype("string")
df['Song'] = df['Song'].astype("string").str.replace(" ","")
song_info_df['url_name'] = song_info_df['url_name'].astype("string").str.replace(" ","")

In [16]:
combined = df.merge(song_info_df, how='left', left_on='Song', right_on='url_name')

In [17]:
combined.head()

Unnamed: 0,Artist,Song,Lyrics,Annotation,url_name,title,primary_artist,release_date,pyongs,contributors,has_bio,views,tags,annotations
0,KendrickLamar,Kendrick-lamar-swimming-pools-drank-lyrics,Now I done grew up 'round some people livin' t...,This song is about me reminising about my year...,Kendrick-lamar-swimming-pools-drank-lyrics,Swimming Pools (Drank),Kendrick-lamar,"July 31, 2012",894.0,403.0,True,5589280.0,"[Trap, Conscious Hip-Hop, Memes, West Coast, R...",
1,KendrickLamar,Kendrick-lamar-swimming-pools-drank-lyrics,Backstroke every day in Chicago,back stroke. playing on the word swimming pool...,Kendrick-lamar-swimming-pools-drank-lyrics,Swimming Pools (Drank),Kendrick-lamar,"July 31, 2012",894.0,403.0,True,5589280.0,"[Trap, Conscious Hip-Hop, Memes, West Coast, R...",
2,KendrickLamar,Kendrick-lamar-money-trees-lyrics,I fucked Sherane and went to tell my bros (tel...,That was a metaphor for both. Not saying that ...,Kendrick-lamar-money-trees-lyrics,Money Trees,Kendrick-lamar,"October 22, 2012",880.0,394.0,True,4592003.0,"[Conscious Hip-Hop, West Coast, Rap, Producer]",
3,JayRock,Kendrick-lamar-money-trees-lyrics,[Verse 3: Jay Rock],"I first heard the record before it dropped, ri...",Kendrick-lamar-money-trees-lyrics,Money Trees,Kendrick-lamar,"October 22, 2012",880.0,394.0,True,4592003.0,"[Conscious Hip-Hop, West Coast, Rap, Producer]",
4,JayRock,Kendrick-lamar-money-trees-lyrics,Imagine Rock up in them projects\nWhere them n...,Moving out the projects was hard. That’s home....,Kendrick-lamar-money-trees-lyrics,Money Trees,Kendrick-lamar,"October 22, 2012",880.0,394.0,True,4592003.0,"[Conscious Hip-Hop, West Coast, Rap, Producer]",


In [18]:
cleaned_lyrics['song'] = cleaned_lyrics['song'].astype("string").str.replace(" ","")

In [19]:
cleaned_lyrics['song'] = cleaned_lyrics['song'].astype("string").str.replace(" ","")
combined_final = combined.merge(cleaned_lyrics, how='left', left_on='Song', right_on='song')
combined_final

Unnamed: 0,Artist,Song,Lyrics,Annotation,url_name,title,primary_artist,release_date,pyongs,contributors,has_bio,views,tags,annotations,song,lyrics
0,KendrickLamar,Kendrick-lamar-swimming-pools-drank-lyrics,Now I done grew up 'round some people livin' t...,This song is about me reminising about my year...,Kendrick-lamar-swimming-pools-drank-lyrics,Swimming Pools (Drank),Kendrick-lamar,"July 31, 2012",894.0,403.0,True,5589280.0,"[Trap, Conscious Hip-Hop, Memes, West Coast, R...",,Kendrick-lamar-swimming-pools-drank-lyrics,"Pour up (Drank), head shot (Drank) Sit down (D..."
1,KendrickLamar,Kendrick-lamar-swimming-pools-drank-lyrics,Backstroke every day in Chicago,back stroke. playing on the word swimming pool...,Kendrick-lamar-swimming-pools-drank-lyrics,Swimming Pools (Drank),Kendrick-lamar,"July 31, 2012",894.0,403.0,True,5589280.0,"[Trap, Conscious Hip-Hop, Memes, West Coast, R...",,Kendrick-lamar-swimming-pools-drank-lyrics,"Pour up (Drank), head shot (Drank) Sit down (D..."
2,KendrickLamar,Kendrick-lamar-money-trees-lyrics,I fucked Sherane and went to tell my bros (tel...,That was a metaphor for both. Not saying that ...,Kendrick-lamar-money-trees-lyrics,Money Trees,Kendrick-lamar,"October 22, 2012",880.0,394.0,True,4592003.0,"[Conscious Hip-Hop, West Coast, Rap, Producer]",,Kendrick-lamar-money-trees-lyrics,"Uh, me and my niggas tryna get it, ya bish (ya..."
3,JayRock,Kendrick-lamar-money-trees-lyrics,[Verse 3: Jay Rock],"I first heard the record before it dropped, ri...",Kendrick-lamar-money-trees-lyrics,Money Trees,Kendrick-lamar,"October 22, 2012",880.0,394.0,True,4592003.0,"[Conscious Hip-Hop, West Coast, Rap, Producer]",,Kendrick-lamar-money-trees-lyrics,"Uh, me and my niggas tryna get it, ya bish (ya..."
4,JayRock,Kendrick-lamar-money-trees-lyrics,Imagine Rock up in them projects\nWhere them n...,Moving out the projects was hard. That’s home....,Kendrick-lamar-money-trees-lyrics,Money Trees,Kendrick-lamar,"October 22, 2012",880.0,394.0,True,4592003.0,"[Conscious Hip-Hop, West Coast, Rap, Producer]",,Kendrick-lamar-money-trees-lyrics,"Uh, me and my niggas tryna get it, ya bish (ya..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15546,BBP,Pnl-naha-lyrics,[Produit par BBP],"J'ai fais plein de prods, je leur ai envoyé et...",Pnl-naha-lyrics,Naha,Pnl,"September 16, 2016",53.0,141.0,True,865612.0,"[Cloud Rap, French Rap, France, Rap]",,Pnl-naha-lyrics,Mes gouttes de sueur ont l'odeur d'l'Enfer Ça ...
15547,BBP,Pnl-je-thaine-version-orange-lyrics,[Produit par BBP],"À la base j'avais fait cette prod séparément, ...",Pnl-je-thaine-version-orange-lyrics,Je t’haine (Version Orange),Pnl,"September 16, 2016",10.0,67.0,True,270890.0,"[Cloud Rap, French Rap, France, Rap]",,Pnl-je-thaine-version-orange-lyrics,Que dire à part que ça pue dans la street à pa...
15548,BBP,Pnl-je-thaine-version-orange-lyrics,[Produit par BBP],Vald,Pnl-je-thaine-version-orange-lyrics,Je t’haine (Version Orange),Pnl,"September 16, 2016",10.0,67.0,True,270890.0,"[Cloud Rap, French Rap, France, Rap]",,Pnl-je-thaine-version-orange-lyrics,Que dire à part que ça pue dans la street à pa...
15549,BBP,Pnl-je-thaine-version-orange-lyrics,[Produit par BBP],", mais son DA la trouvait “sans intérêt”. Du c...",Pnl-je-thaine-version-orange-lyrics,Je t’haine (Version Orange),Pnl,"September 16, 2016",10.0,67.0,True,270890.0,"[Cloud Rap, French Rap, France, Rap]",,Pnl-je-thaine-version-orange-lyrics,Que dire à part que ça pue dans la street à pa...


In [20]:
combined_final = combined_final.rename(columns={'lyrics': 'lyrics_full'})
combined_final

Unnamed: 0,Artist,Song,Lyrics,Annotation,url_name,title,primary_artist,release_date,pyongs,contributors,has_bio,views,tags,annotations,song,lyrics_full
0,KendrickLamar,Kendrick-lamar-swimming-pools-drank-lyrics,Now I done grew up 'round some people livin' t...,This song is about me reminising about my year...,Kendrick-lamar-swimming-pools-drank-lyrics,Swimming Pools (Drank),Kendrick-lamar,"July 31, 2012",894.0,403.0,True,5589280.0,"[Trap, Conscious Hip-Hop, Memes, West Coast, R...",,Kendrick-lamar-swimming-pools-drank-lyrics,"Pour up (Drank), head shot (Drank) Sit down (D..."
1,KendrickLamar,Kendrick-lamar-swimming-pools-drank-lyrics,Backstroke every day in Chicago,back stroke. playing on the word swimming pool...,Kendrick-lamar-swimming-pools-drank-lyrics,Swimming Pools (Drank),Kendrick-lamar,"July 31, 2012",894.0,403.0,True,5589280.0,"[Trap, Conscious Hip-Hop, Memes, West Coast, R...",,Kendrick-lamar-swimming-pools-drank-lyrics,"Pour up (Drank), head shot (Drank) Sit down (D..."
2,KendrickLamar,Kendrick-lamar-money-trees-lyrics,I fucked Sherane and went to tell my bros (tel...,That was a metaphor for both. Not saying that ...,Kendrick-lamar-money-trees-lyrics,Money Trees,Kendrick-lamar,"October 22, 2012",880.0,394.0,True,4592003.0,"[Conscious Hip-Hop, West Coast, Rap, Producer]",,Kendrick-lamar-money-trees-lyrics,"Uh, me and my niggas tryna get it, ya bish (ya..."
3,JayRock,Kendrick-lamar-money-trees-lyrics,[Verse 3: Jay Rock],"I first heard the record before it dropped, ri...",Kendrick-lamar-money-trees-lyrics,Money Trees,Kendrick-lamar,"October 22, 2012",880.0,394.0,True,4592003.0,"[Conscious Hip-Hop, West Coast, Rap, Producer]",,Kendrick-lamar-money-trees-lyrics,"Uh, me and my niggas tryna get it, ya bish (ya..."
4,JayRock,Kendrick-lamar-money-trees-lyrics,Imagine Rock up in them projects\nWhere them n...,Moving out the projects was hard. That’s home....,Kendrick-lamar-money-trees-lyrics,Money Trees,Kendrick-lamar,"October 22, 2012",880.0,394.0,True,4592003.0,"[Conscious Hip-Hop, West Coast, Rap, Producer]",,Kendrick-lamar-money-trees-lyrics,"Uh, me and my niggas tryna get it, ya bish (ya..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15546,BBP,Pnl-naha-lyrics,[Produit par BBP],"J'ai fais plein de prods, je leur ai envoyé et...",Pnl-naha-lyrics,Naha,Pnl,"September 16, 2016",53.0,141.0,True,865612.0,"[Cloud Rap, French Rap, France, Rap]",,Pnl-naha-lyrics,Mes gouttes de sueur ont l'odeur d'l'Enfer Ça ...
15547,BBP,Pnl-je-thaine-version-orange-lyrics,[Produit par BBP],"À la base j'avais fait cette prod séparément, ...",Pnl-je-thaine-version-orange-lyrics,Je t’haine (Version Orange),Pnl,"September 16, 2016",10.0,67.0,True,270890.0,"[Cloud Rap, French Rap, France, Rap]",,Pnl-je-thaine-version-orange-lyrics,Que dire à part que ça pue dans la street à pa...
15548,BBP,Pnl-je-thaine-version-orange-lyrics,[Produit par BBP],Vald,Pnl-je-thaine-version-orange-lyrics,Je t’haine (Version Orange),Pnl,"September 16, 2016",10.0,67.0,True,270890.0,"[Cloud Rap, French Rap, France, Rap]",,Pnl-je-thaine-version-orange-lyrics,Que dire à part que ça pue dans la street à pa...
15549,BBP,Pnl-je-thaine-version-orange-lyrics,[Produit par BBP],", mais son DA la trouvait “sans intérêt”. Du c...",Pnl-je-thaine-version-orange-lyrics,Je t’haine (Version Orange),Pnl,"September 16, 2016",10.0,67.0,True,270890.0,"[Cloud Rap, French Rap, France, Rap]",,Pnl-je-thaine-version-orange-lyrics,Que dire à part que ça pue dans la street à pa...


In [21]:
combined_final = combined_final[["primary_artist", "Annotation", "Lyrics", "title", "release_date", "tags", "lyrics_full"]]
combined_final

Unnamed: 0,primary_artist,Annotation,Lyrics,title,release_date,tags,lyrics_full
0,Kendrick-lamar,This song is about me reminising about my year...,Now I done grew up 'round some people livin' t...,Swimming Pools (Drank),"July 31, 2012","[Trap, Conscious Hip-Hop, Memes, West Coast, R...","Pour up (Drank), head shot (Drank) Sit down (D..."
1,Kendrick-lamar,back stroke. playing on the word swimming pool...,Backstroke every day in Chicago,Swimming Pools (Drank),"July 31, 2012","[Trap, Conscious Hip-Hop, Memes, West Coast, R...","Pour up (Drank), head shot (Drank) Sit down (D..."
2,Kendrick-lamar,That was a metaphor for both. Not saying that ...,I fucked Sherane and went to tell my bros (tel...,Money Trees,"October 22, 2012","[Conscious Hip-Hop, West Coast, Rap, Producer]","Uh, me and my niggas tryna get it, ya bish (ya..."
3,Kendrick-lamar,"I first heard the record before it dropped, ri...",[Verse 3: Jay Rock],Money Trees,"October 22, 2012","[Conscious Hip-Hop, West Coast, Rap, Producer]","Uh, me and my niggas tryna get it, ya bish (ya..."
4,Kendrick-lamar,Moving out the projects was hard. That’s home....,Imagine Rock up in them projects\nWhere them n...,Money Trees,"October 22, 2012","[Conscious Hip-Hop, West Coast, Rap, Producer]","Uh, me and my niggas tryna get it, ya bish (ya..."
...,...,...,...,...,...,...,...
15546,Pnl,"J'ai fais plein de prods, je leur ai envoyé et...",[Produit par BBP],Naha,"September 16, 2016","[Cloud Rap, French Rap, France, Rap]",Mes gouttes de sueur ont l'odeur d'l'Enfer Ça ...
15547,Pnl,"À la base j'avais fait cette prod séparément, ...",[Produit par BBP],Je t’haine (Version Orange),"September 16, 2016","[Cloud Rap, French Rap, France, Rap]",Que dire à part que ça pue dans la street à pa...
15548,Pnl,Vald,[Produit par BBP],Je t’haine (Version Orange),"September 16, 2016","[Cloud Rap, French Rap, France, Rap]",Que dire à part que ça pue dans la street à pa...
15549,Pnl,", mais son DA la trouvait “sans intérêt”. Du c...",[Produit par BBP],Je t’haine (Version Orange),"September 16, 2016","[Cloud Rap, French Rap, France, Rap]",Que dire à part que ça pue dans la street à pa...


In [None]:
combined_final.to_csv("/content/gdrive/My Drive/AC215 Project/combined_df.csv")

In [None]:
combined_final = pd.read_csv("/content/gdrive/My Drive/AC215 Project/combined_df.csv")

In [23]:
combined_final.primary_artist.nunique()

609

In [26]:
combined_final.tags

Unnamed: 0,tags
0,"[Trap, Conscious Hip-Hop, Memes, West Coast, R..."
1,"[Trap, Conscious Hip-Hop, Memes, West Coast, R..."
2,"[Conscious Hip-Hop, West Coast, Rap, Producer]"
3,"[Conscious Hip-Hop, West Coast, Rap, Producer]"
4,"[Conscious Hip-Hop, West Coast, Rap, Producer]"
...,...
15546,"[Cloud Rap, French Rap, France, Rap]"
15547,"[Cloud Rap, French Rap, France, Rap]"
15548,"[Cloud Rap, French Rap, France, Rap]"
15549,"[Cloud Rap, French Rap, France, Rap]"


In [28]:
# Summarize the dataset
combined_final.describe()

Unnamed: 0,primary_artist,Annotation,Lyrics,title,release_date,tags,lyrics_full
count,15549,15551,15461,15549,14636,15549,15551
unique,609,12351,7716,1989,1100,599,2028
top,Pusha-t,\n,[Verse 1],Age Ain’t Nothing But A Number To Write Lyrics...,"September 18, 2015",[Rap],SCENE. NEW YORK CITY. TAXI. Enter SUPERFLYRONA...
freq,467,1236,112,195,180,6347,195


In [30]:
combined_final.isnull().sum()

Unnamed: 0,0
primary_artist,2
Annotation,0
Lyrics,90
title,2
release_date,915
tags,2
lyrics_full,0


In [31]:
combined_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15551 entries, 0 to 15550
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   primary_artist  15549 non-null  object
 1   Annotation      15551 non-null  object
 2   Lyrics          15461 non-null  object
 3   title           15549 non-null  object
 4   release_date    14636 non-null  object
 5   tags            15549 non-null  object
 6   lyrics_full     15551 non-null  object
dtypes: object(7)
memory usage: 850.6+ KB


# Save small set of data

In [22]:
first_500_rows = combined_final.head(500)
first_500_rows.tags.value_counts()
# first_100_rows.to_csv("/content/gdrive/My Drive/AC215 Project/combined_df_test.csv")

Unnamed: 0_level_0,count
tags,Unnamed: 1_level_1
[Rap],113
"[West Coast, Rap]",74
"[Rock, Screen, Rap]",39
"[SHADYXV, Rap]",30
"[Rap Rock, Screen, Soundtrack, SHADYXV, Rap]",29
"[Bay Area, Trap, West Coast, Rap, Soundtrack]",26
"[Trap, Rap]",14
"[Indie Rap, Conscious Hip-Hop, West Coast, Rap]",14
"[Rap, R&B]",14
"[Comedy, Hardcore Hip-Hop, SHADYXV, Horrorcore, G-Funk, Memes, Rap]",12
