In [2]:
import pandas as pd
import numpy as np
import s3fs
import os
import io
import boto3

import s3fs
from dotenv import load_dotenv
# Load the .env file before anything reads credentials from the environment.
load_dotenv(verbose=True)

# Credentials are taken from the environment rather than written into the notebook.
# AWS_ACCESS_KEY_SECRET is the variable name aws_session() reads in this notebook;
# AWS_SECRET_ACCESS_KEY (the standard AWS name) is accepted as a fallback.
# NOTE(review): when a value is missing, None is passed and s3fs is expected to
# fall back to its default credential lookup -- confirm for the installed version.
fs = s3fs.S3FileSystem(
    anon=False,
    key=os.getenv('AWS_ACCESS_KEY_ID'),
    secret=os.getenv('AWS_ACCESS_KEY_SECRET') or os.getenv('AWS_SECRET_ACCESS_KEY'),
)

def aws_session(region_name='us-east-1'):
    """Build a boto3 session from credentials held in environment variables.

    The key id is read from ``AWS_ACCESS_KEY_ID``. The secret is read from
    ``AWS_ACCESS_KEY_SECRET`` (the name this notebook has always used) and,
    when that is unset or empty, from ``AWS_SECRET_ACCESS_KEY``, the name the
    standard AWS tooling uses.

    Args:
        region_name: AWS region passed to the session.

    Returns:
        The ``boto3.session.Session`` built from those values.
    """
    access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
    # Fall back to the standard variable name so a conventional .env also works.
    secret_access_key = (
        os.getenv('AWS_ACCESS_KEY_SECRET')
        or os.getenv('AWS_SECRET_ACCESS_KEY')
    )
    return boto3.session.Session(
        aws_access_key_id=access_key_id,
        aws_secret_access_key=secret_access_key,
        region_name=region_name,
    )

def make_bucket(name, acl):
    """Create an S3 bucket named ``name`` with the ACL ``acl``.

    The bucket is created through the S3 resource of the session returned by
    ``aws_session()``; the value returned by ``create_bucket`` is passed back.
    """
    s3 = aws_session().resource('s3')
    new_bucket = s3.create_bucket(Bucket=name, ACL=acl)
    return new_bucket

def upload_file_to_bucket(bucket_name, file_path, acl='public-read'):
    """Upload a local file to an S3 bucket and return the object's URL.

    The object key is the base name of ``file_path``; any directory part is
    dropped.

    Args:
        bucket_name: Name of the destination bucket.
        file_path: Path of the local file to upload.
        acl: ACL applied to the uploaded object. Defaults to ``'public-read'``,
            the value that was previously hard-coded.

    Returns:
        The ``https://<bucket>.s3.amazonaws.com/<key>`` URL, with the key
        percent-encoded so that names containing spaces or other reserved
        characters still form a valid URL.
    """
    from urllib.parse import quote  # local import keeps the block self-contained

    file_name = os.path.basename(file_path)

    bucket = aws_session().resource('s3').Bucket(bucket_name)
    bucket.upload_file(
        Filename=file_path,
        Key=file_name,  # the stored key stays un-encoded; only the URL is quoted
        ExtraArgs={'ACL': acl},
    )

    return f"https://{bucket_name}.s3.amazonaws.com/{quote(file_name)}"

def download_file_from_bucket(bucket_name, s3_key, dst_path):
    """Download the object ``s3_key`` from ``bucket_name`` to ``dst_path``.

    The bucket is reached through the S3 resource of the session returned by
    ``aws_session()``. Nothing is returned.
    """
    source_bucket = aws_session().resource('s3').Bucket(bucket_name)
    source_bucket.download_file(Key=s3_key, Filename=dst_path)

# Example usage (disabled):
#   download_file_from_bucket('music-demo-lyrics', 'lyrics_25k.csv', 'short_name.csv')
#   with open('short_name.csv') as fo:
#       print(fo.read())

# Read the two source CSV files directly from the 'wrangled-1' S3 bucket.
genre_df = pd.read_csv(filepath_or_buffer='s3://wrangled-1/merged3_genre_df.csv')
decades_df = pd.read_csv(filepath_or_buffer='s3://wrangled-1/decades_df.csv')

In [3]:
# Summarise every column of genre_df; include='all' makes describe() report the
# non-numeric columns (count/unique/top/freq) alongside the numeric statistics.
genre_df.describe(include='all')

Unnamed: 0,original_csv,artist_name,song_name,link,lyrics,language,genre,date
count,94915,94915,94915,94915,94915,94915,94915,0.0
unique,1,1151,67266,87151,86419,1,6,
top,genres_csv,chris-brown,Intro,/duran-duran/im-looking-for-cracks-in-the-pave...,[This song is an instrumental.].,ENGLISH,Rock,
freq,94915,1124,55,4,16,94915,50160,
mean,,,,,,,,
std,,,,,,,,
min,,,,,,,,
25%,,,,,,,,
50%,,,,,,,,
75%,,,,,,,,


In [5]:
# Summarise every column of decades_df; include='all' makes describe() report the
# non-numeric columns (count/unique/top/freq) alongside the numeric statistics.
decades_df.describe(include='all')

Unnamed: 0,original_csv,artist_name,song_name,link,lyrics,language,genre,date
count,28372,28372,28372,0.0,28372,0.0,28372,28372.0
unique,1,5426,23689,,28372,,7,
top,decades_tcc,johnny cash,tonight,,begin live friend live meet common true know e...,,pop,
freq,28372,190,17,,1,,7042,
mean,,,,,,,,1990.236888
std,,,,,,,,18.487463
min,,,,,,,,1950.0
25%,,,,,,,,1975.0
50%,,,,,,,,1991.0
75%,,,,,,,,2007.0


In [6]:
# Stack genre_df on top of decades_df and renumber the rows 0..n-1 in one step.
big2_df = pd.concat([genre_df, decades_df], ignore_index=True)
# Summary of the combined frame, non-numeric columns included.
big2_df.describe(include='all')

Unnamed: 0,original_csv,artist_name,song_name,link,lyrics,language,genre,date
count,123287,123287,123287,94915,123287,94915,123287,28372.0
unique,2,6445,90916,87151,114791,1,13,
top,genres_csv,chris-brown,Intro,/duran-duran/im-looking-for-cracks-in-the-pave...,[This song is an instrumental.].,ENGLISH,Rock,
freq,94915,1124,55,4,16,94915,50160,
mean,,,,,,,,1990.236888
std,,,,,,,,18.487463
min,,,,,,,,1950.0
25%,,,,,,,,1975.0
50%,,,,,,,,1991.0
75%,,,,,,,,2007.0


In [9]:
# Show the last 20 rows of the combined frame.
big2_df.tail(20)

Unnamed: 0,original_csv,artist_name,song_name,link,lyrics,language,genre,date
123267,decades_tcc,mos def,"sex, love & money",,money brooklyn money come danger rockin best b...,,hip hop,2019.0
123268,decades_tcc,cassidy,a.m. to p.m.,,work niggas beef gonna spray pump street nigga...,,hip hop,2019.0
123269,decades_tcc,eric b. & rakim,paid in full,,rakim rakim knowledge try pay check nobry walt...,,hip hop,2019.0
123270,decades_tcc,t-pain,look at her go (feat. chris brown),,boom come baby boom come baby boom come baby c...,,hip hop,2019.0
123271,decades_tcc,$uicideboy$,you're now tuning into 66.6 fm with dj rapture...,,sink fuck spine second guess crime snort slug ...,,hip hop,2019.0
123272,decades_tcc,ghostface killah,iron maiden,,doin turf punk message smokey smokey smokey mo...,,hip hop,2019.0
123273,decades_tcc,q-tip,life is better,,life fill come fall away play bangin gonna ban...,,hip hop,2019.0
123274,decades_tcc,future,tony montana,,fuckin cockroaches motherfuckin freebandz want...,,hip hop,2019.0
123275,decades_tcc,nappy roots,blowin' trees,,nappy root gotta alright flyin dear leave lone...,,hip hop,2019.0
123276,decades_tcc,eazy-e,eazy-duz-it,,eazye bitch galore bitch super duper group com...,,hip hop,2019.0


In [12]:
# Rows whose (artist_name, song_name) pair already appeared in an earlier row.
# With duplicated()'s default keep='first', the first occurrence is not flagged.
big2_df[big2_df.duplicated(subset=['artist_name','song_name'])]

Unnamed: 0,original_csv,artist_name,song_name,link,lyrics,language,genre,date
1,genres_csv,10000-maniacs,More Than This,/10000-maniacs/more-than-this.html,I could feel at the time. There was no way of ...,ENGLISH,Pop,
3,genres_csv,10000-maniacs,Because The Night,/10000-maniacs/because-the-night.html,"Take me now, baby, here as I am. Hold me close...",ENGLISH,Pop,
5,genres_csv,10000-maniacs,These Are Days,/10000-maniacs/these-are-days.html,These are. These are days you'll remember. Nev...,ENGLISH,Pop,
7,genres_csv,10000-maniacs,A Campfire Song,/10000-maniacs/a-campfire-song.html,"A lie to say, ""O my mountain has coal veins an...",ENGLISH,Pop,
9,genres_csv,10000-maniacs,Everyday Is Like Sunday,/10000-maniacs/everyday-is-like-sunday.html,Trudging slowly over wet sand. Back to the ben...,ENGLISH,Pop,
...,...,...,...,...,...,...,...,...
94908,genres_csv,sambo,Smells Like Teen Spirit,/sambo/smells-like-teen-spirit.html,"(Chorus). Hello, hello,hello,how low. Hello,he...",ENGLISH,Rock,
94910,genres_csv,sambo,Valerie,/sambo/valerie.html,Well sometimes I go out by myself. And I look ...,ENGLISH,Rock,
94912,genres_csv,sambo,Wake Me Up,/sambo/wake-me-up.html,Feeling my way through the darkness. Guided by...,ENGLISH,Rock,
97230,decades_tcc,blondie,11:59,,lean corner like candidate sidewalk social sci...,,pop,1978.0


In [13]:
# Rows whose lyrics text already appeared in an earlier row.
# With duplicated()'s default keep='first', the first occurrence is not flagged.
big2_df[big2_df.duplicated(subset=['lyrics'])]

Unnamed: 0,original_csv,artist_name,song_name,link,lyrics,language,genre,date
1,genres_csv,10000-maniacs,More Than This,/10000-maniacs/more-than-this.html,I could feel at the time. There was no way of ...,ENGLISH,Pop,
3,genres_csv,10000-maniacs,Because The Night,/10000-maniacs/because-the-night.html,"Take me now, baby, here as I am. Hold me close...",ENGLISH,Pop,
5,genres_csv,10000-maniacs,These Are Days,/10000-maniacs/these-are-days.html,These are. These are days you'll remember. Nev...,ENGLISH,Pop,
7,genres_csv,10000-maniacs,A Campfire Song,/10000-maniacs/a-campfire-song.html,"A lie to say, ""O my mountain has coal veins an...",ENGLISH,Pop,
9,genres_csv,10000-maniacs,Everyday Is Like Sunday,/10000-maniacs/everyday-is-like-sunday.html,Trudging slowly over wet sand. Back to the ben...,ENGLISH,Pop,
...,...,...,...,...,...,...,...,...
94904,genres_csv,sambo,Jingle Bell Rock,/sambo/jingle-bell-rock.html,"Jingle bell, jingle bell, jingle bell rock. Ji...",ENGLISH,Rock,
94906,genres_csv,sambo,Rock And Roll,/sambo/rock-and-roll.html,"It's been a long time since I rock and rolled,...",ENGLISH,Rock,
94908,genres_csv,sambo,Smells Like Teen Spirit,/sambo/smells-like-teen-spirit.html,"(Chorus). Hello, hello,hello,how low. Hello,he...",ENGLISH,Rock,
94910,genres_csv,sambo,Valerie,/sambo/valerie.html,Well sometimes I go out by myself. And I look ...,ENGLISH,Rock,


In [14]:
# Rows that repeat an earlier row in every column (no subset given).
big2_df[big2_df.duplicated()]

Unnamed: 0,original_csv,artist_name,song_name,link,lyrics,language,genre,date


In [15]:
# Keep only the first row for each distinct lyrics text, then summarise again.
big2_df = big2_df.drop_duplicates(subset=['lyrics'])
big2_df.describe(include='all')

Unnamed: 0,original_csv,artist_name,song_name,link,lyrics,language,genre,date
count,114791,114791,114791,86419,114791,86419,114791,28372.0
unique,2,6441,90519,86419,114791,1,13,
top,genres_csv,elvis-presley,Intro,/joan-osborne/loves-in-need-of-love-today.html,"Well ya know when you're young,. There's such ...",ENGLISH,Rock,
freq,86419,747,50,1,1,86419,47409,
mean,,,,,,,,1990.236888
std,,,,,,,,18.487463
min,,,,,,,,1950.0
25%,,,,,,,,1975.0
50%,,,,,,,,1991.0
75%,,,,,,,,2007.0


In [16]:
# Write big2_df to 'big2_df.csv'; index = False keeps the row index out of the file.
big2_df.to_csv('big2_df.csv', index = False)