In [1]:
"""This script is a repository for all successful code written and tested in
the wrangling phase.  Goal is to have a single place to run script to get from 
seven unique data sets down to four with the same columns and headers.
With the .csv files pulled from the buckets.
"""

import pandas as pd
import numpy as np
import s3fs
import os
import io
import boto3

import s3fs
fs = s3fs.S3FileSystem(anon=False,key='###########',secret='##############')

from dotenv import load_dotenv
load_dotenv(verbose=True)

def aws_session(region_name='us-east-1'):
    return boto3.session.Session(aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), #looks for any .env file
                                aws_secret_access_key=os.getenv('AWS_ACCESS_KEY_SECRET'), #Has to be in same directory
                                region_name=region_name) #from above

def make_bucket(name, acl): 
    session = aws_session()
    s3_resource = session.resource('s3')
    return s3_resource.create_bucket(Bucket=name, ACL=acl)

def upload_file_to_bucket(bucket_name, file_path):
    session = aws_session()
    s3_resource = session.resource('s3')
    file_dir, file_name = os.path.split(file_path)

    bucket = s3_resource.Bucket(bucket_name)
    bucket.upload_file(
      Filename=file_path,
      Key=file_name,
      ExtraArgs={'ACL': 'public-read'}
    )

    s3_url = f"https://{bucket_name}.s3.amazonaws.com/{file_name}"
    return s3_url

## s3_url = upload_file_to_bucket('worm-begin','lyrics_25k.csv')
## print(s3_url) 
## s3_url = upload_file_to_bucket('worm-begin','album_details_25k.csv')
## print(s3_url)
## s3_url = upload_file_to_bucket('worm-begin','songs_details_25k.csv')
## print(s3_url)

def download_file_from_bucket(bucket_name, s3_key, dst_path):
    session = aws_session()
    s3_resource = session.resource('s3')
    bucket = s3_resource.Bucket(bucket_name)
    bucket.download_file(Key=s3_key, Filename=dst_path)

## download_file_from_bucket('music-demo-lyrics', 'lyrics_25k.csv', 'short_name.csv')
## with open('short_name.csv') as fo:
    ## print(fo.read())

"""I am merging the genres dataset first.  This is a large set with lyrics 
and a smaller set with the artist's genre. (Concerned that genre is 
connected to artist rather than song.)
This 'boto3' method requires a place for the csv to go in the target path. 
So I've 'touched' a few csv files in the working dir from the command line.
"""
download_file_from_bucket('worm-begin','genres_lyrics_data.csv','genres_lyrics.csv')
with open('genres_lyrics.csv') as fo:
    lyrics_df = pd.read_csv(fo)

download_file_from_bucket('worm-begin','genres_artists_data.csv','genres_genres.csv')
with open('genres_genres.csv') as fo:
    genres_df = pd.read_csv(fo)

"""Reduce genres_df to just artist-name (the key with lyrics_df) and drop dupes."""
genres1_df = pd.DataFrame(genres_df, columns=['Link','Genre'])
genres2_df = genres1_df.rename(columns={'Link':'artist_name','Genre':'genre'})
genres2_df[genres2_df.duplicated(keep = False)]

"""Reorder lyrics_df columns and rename IAW naming convention. Drop duplicates.
Drop all but the ENGLISH lyrics."""
lyrics2_df = lyrics_df.rename(columns={'ALink':'artist_name','SName':'song_name','SLink':'link','Lyric':'lyrics','Idiom':'language'})
lyrics3_df = (lyrics2_df[lyrics2_df['language']=='ENGLISH'])
lyrics3_df[lyrics3_df.duplicated(keep = False)]

"""Merge lyrics_df with genre_df to add genre to a single df with the lyrics."""

merged_genre_df = pd.merge(lyrics3_df,genres2_df,on = 'artist_name') 

""" NExt need to add a column for the orginal.csv name"""
merged2_genre_df = pd.DataFrame((merged_genre_df), columns = ['original_csv','artist_name','song_name','link','lyrics','language','genre'])
merged2_genre_df['original_csv'] = 'genres_csv'

"""Write the dataframe to a csv.  First round I did it in a notebook, with no
path.  Locked it all up."""


'Write the dataframe to a csv.  First round I did it in a notebook, with no\npath.  Locked it all up.'

"""Like BLEW IT UP exciting..."""

In [2]:
merged2_genre_df

Unnamed: 0,original_csv,artist_name,song_name,link,lyrics,language,genre
0,genres_csv,/10000-maniacs/,More Than This,/10000-maniacs/more-than-this.html,I could feel at the time. There was no way of ...,ENGLISH,Rock
1,genres_csv,/10000-maniacs/,More Than This,/10000-maniacs/more-than-this.html,I could feel at the time. There was no way of ...,ENGLISH,Pop
2,genres_csv,/10000-maniacs/,Because The Night,/10000-maniacs/because-the-night.html,"Take me now, baby, here as I am. Hold me close...",ENGLISH,Rock
3,genres_csv,/10000-maniacs/,Because The Night,/10000-maniacs/because-the-night.html,"Take me now, baby, here as I am. Hold me close...",ENGLISH,Pop
4,genres_csv,/10000-maniacs/,These Are Days,/10000-maniacs/these-are-days.html,These are. These are days you'll remember. Nev...,ENGLISH,Rock
...,...,...,...,...,...,...,...
124225,genres_csv,/sambo/,Valerie,/sambo/valerie.html,Well sometimes I go out by myself. And I look ...,ENGLISH,Rock
124226,genres_csv,/sambo/,Wake Me Up,/sambo/wake-me-up.html,Feeling my way through the darkness. Guided by...,ENGLISH,Samba
124227,genres_csv,/sambo/,Wake Me Up,/sambo/wake-me-up.html,Feeling my way through the darkness. Guided by...,ENGLISH,Rock
124228,genres_csv,/seu-jorge/,Don't,/seu-jorge/dont.html,"Don't, don't, that's what you say. Each time t...",ENGLISH,Samba


In [3]:
import s3fs
fs = s3fs.S3FileSystem(anon=False,key='###########',secret='#############')

In [4]:
test1 = pd.read_csv('s3://worm-begin/decades_tcc_ceds_music.csv')

In [5]:
test1

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
0,0,mukesh,mohabbat bhi jhoothi,1950,pop,hold time feel break feel untrue convince spea...,95,0.000598,0.063746,0.000598,...,0.380299,0.117175,0.357739,0.454119,0.997992,0.901822,0.339448,0.137110,sadness,1.000000
1,4,frankie laine,i believe,1950,pop,believe drop rain fall grow believe darkest ni...,51,0.035537,0.096777,0.443435,...,0.001284,0.001284,0.331745,0.647540,0.954819,0.000002,0.325021,0.263240,world/life,1.000000
2,6,johnnie ray,cry,1950,pop,sweetheart send letter goodbye secret feel bet...,24,0.002770,0.002770,0.002770,...,0.002770,0.225422,0.456298,0.585288,0.840361,0.000000,0.351814,0.139112,music,1.000000
3,10,pérez prado,patricia,1950,pop,kiss lips want stroll charm mambo chacha merin...,54,0.048249,0.001548,0.001548,...,0.225889,0.001548,0.686992,0.744404,0.083935,0.199393,0.775350,0.743736,romantic,1.000000
4,12,giorgos papadopoulos,apopse eida oneiro,1950,pop,till darling till matter know till dream live ...,48,0.001350,0.001350,0.417772,...,0.068800,0.001350,0.291671,0.646489,0.975904,0.000246,0.597073,0.394375,romantic,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28367,82447,mack 10,10 million ways,2019,hip hop,cause fuck leave scar tick tock clock come kno...,78,0.001350,0.001350,0.001350,...,0.065664,0.001350,0.889527,0.759711,0.062549,0.000000,0.751649,0.695686,obscene,0.014286
28368,82448,m.o.p.,ante up (robbin hoodz theory),2019,hip hop,minks things chain ring braclets yap fame come...,67,0.001284,0.001284,0.035338,...,0.001284,0.001284,0.662082,0.789580,0.004607,0.000002,0.922712,0.797791,obscene,0.014286
28369,82449,nine,whutcha want?,2019,hip hop,get ban get ban stick crack relax plan attack ...,77,0.001504,0.154302,0.168988,...,0.001504,0.001504,0.663165,0.726970,0.104417,0.000001,0.838211,0.767761,obscene,0.014286
28370,82450,will smith,switch,2019,hip hop,check check yeah yeah hear thing call switch g...,67,0.001196,0.001196,0.001196,...,0.001196,0.001196,0.883028,0.786888,0.007027,0.000503,0.508450,0.885882,obscene,0.014286


In [6]:
test1.to_csv(decades.csv, index=False)

NameError: name 'decades' is not defined

In [7]:
test1.to_csv('decades.csv', index=False)

In [8]:
merged2_genre_df.to_csv('merged2_genre_df.csv', index=False)

In [10]:
fs.ls('music-demo-lyrics')

['music-demo-lyrics/lyrics_25k.csv',
 'music-demo-lyrics/test2.txt',
 'music-demo-lyrics/test2.txtpwd']

In [11]:
upload_file_to_bucket('music-demo-lyrics','merged2_genre_df.csv')

'https://music-demo-lyrics.s3.amazonaws.com/merged2_genre_df.csv'

In [12]:
merged2_genre_df

Unnamed: 0,original_csv,artist_name,song_name,link,lyrics,language,genre
0,genres_csv,/10000-maniacs/,More Than This,/10000-maniacs/more-than-this.html,I could feel at the time. There was no way of ...,ENGLISH,Rock
1,genres_csv,/10000-maniacs/,More Than This,/10000-maniacs/more-than-this.html,I could feel at the time. There was no way of ...,ENGLISH,Pop
2,genres_csv,/10000-maniacs/,Because The Night,/10000-maniacs/because-the-night.html,"Take me now, baby, here as I am. Hold me close...",ENGLISH,Rock
3,genres_csv,/10000-maniacs/,Because The Night,/10000-maniacs/because-the-night.html,"Take me now, baby, here as I am. Hold me close...",ENGLISH,Pop
4,genres_csv,/10000-maniacs/,These Are Days,/10000-maniacs/these-are-days.html,These are. These are days you'll remember. Nev...,ENGLISH,Rock
...,...,...,...,...,...,...,...
124225,genres_csv,/sambo/,Valerie,/sambo/valerie.html,Well sometimes I go out by myself. And I look ...,ENGLISH,Rock
124226,genres_csv,/sambo/,Wake Me Up,/sambo/wake-me-up.html,Feeling my way through the darkness. Guided by...,ENGLISH,Samba
124227,genres_csv,/sambo/,Wake Me Up,/sambo/wake-me-up.html,Feeling my way through the darkness. Guided by...,ENGLISH,Rock
124228,genres_csv,/seu-jorge/,Don't,/seu-jorge/dont.html,"Don't, don't, that's what you say. Each time t...",ENGLISH,Samba


In [14]:
merged2_genre_df.describe(include='all')

Unnamed: 0,original_csv,artist_name,song_name,link,lyrics,language,genre
count,124230,124230,124230,124230,124230,124230,124230
unique,1,1151,67266,87151,86419,1,6
top,genres_csv,/chris-brown/,Intro,/m-i-a/pull-up-the-people.html,[This song is an instrumental.].,ENGLISH,Rock
freq,124230,2348,66,12,17,124230,60585


I have finished with the genre dataframe.