# Preprocess calls

### Imports

In [1]:
import pandas as pd

from py_files.string_cleaning import clean_chords
from py_files.genre_cleaning import genre_cleaning
from py_files.columns import new_columns, song_length, drop_dups_cols, expand_cols
from py_files.get_data_slice import filter_length, get_songs
from py_files.utils import df_to_csv, count_chords, count_genres, count_artists, \
                  get_csv_data, get_text_data

## Read Data

In [2]:
# Load 'kaggle_raw.csv' through get_csv_data and show the length of the result.
# NOTE(review): the recorded run of this cell raised FileNotFoundError, with
# the path resolved under the pyenv environment's lib/python3.10 directory.
# Confirm how get_csv_data builds its data directory before relying on it.
raw_kaggle_df = get_csv_data('kaggle_raw.csv')
len(raw_kaggle_df)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/Emily/.pyenv/versions/3.10.6/envs/cleaningenv/lib/python3.10/final-project-data/data/raw/kaggle_raw.csv'

In [None]:
# Preview the first three rows of the raw Kaggle data.
raw_kaggle_df.head(3)

In [None]:
# Read 'jazz_raw.txt' and pass the result through expand_cols.
# Presumably expand_cols splits the single raw column into the same columns
# the Kaggle frame uses, so the two can be concatenated later (TODO confirm
# in py_files.columns).
raw_jazz_col = get_text_data('jazz_raw.txt')
raw_jazz_df = expand_cols(raw_jazz_col)
# Bare expression: rendered as the cell output.
raw_jazz_df

## Drop Duplicates and unwanted columns (only if song/artist data exists)

In [None]:
# Only the Kaggle data goes through drop_dups_cols; the jazz frame is not
# passed to it. Presumably this removes duplicate rows and unneeded columns,
# per the section heading (TODO confirm in py_files.columns).
slim_df = drop_dups_cols(raw_kaggle_df)
# Show the length after this step.
len(slim_df)

## Concat DFs

In [None]:
# Stack the Kaggle and jazz frames row-wise. ignore_index=True discards each
# frame's own index and renumbers the combined rows from 0.
concat_df = pd.concat([slim_df, raw_jazz_df], ignore_index=True)
len(concat_df)

## Clean Chords

In [None]:
# Clean the 'chords' column, then build cleaned_df as a new frame carrying the
# cleaned values; the assignment itself does not write into concat_df.
cleaned_chords_column = clean_chords(concat_df['chords'])
cleaned_df = concat_df.assign(chords=cleaned_chords_column)
len(cleaned_df)

In [None]:
# Display the frame after chord cleaning for a visual check.
cleaned_df

### Drop duplicates based on chords and song name

In [None]:
# new_columns is expected to supply the 'chords_list' and 'song_name' columns
# used as the duplicate key below (TODO confirm in py_files.columns).
new_columns_df = new_columns(cleaned_df)

# One pipeline: keep the last row of every (chords_list, song_name) group,
# renumber the index from 0, then drop the two key columns again.
unreplicated_df = (
    new_columns_df
    .drop_duplicates(subset=['chords_list', 'song_name'], keep='last')
    .reset_index(drop=True)
    .drop(columns=['song_name', 'chords_list'])
)

## Clean Genres

In [None]:
# Run genre_cleaning over the 'genres' column and attach the result to a new
# frame; the assignment itself does not write into unreplicated_df.
slim_genres = genre_cleaning(unreplicated_df['genres'])
clean_genres_df = unreplicated_df.assign(genres=slim_genres)

## Get Song Length (chord count)

In [None]:
# Presumably adds a per-song chord count that the length filter below relies
# on, per the section heading (TODO confirm in py_files.columns).
song_len_df = song_length(clean_genres_df)
# Bare expression: rendered as the cell output.
song_len_df

## Filter DF

#### by song length

In [None]:
# Filter songs by length with a threshold of 8.
# NOTE(review): whether 8 is a minimum or a maximum, and whether the bound is
# inclusive, is decided inside filter_length. Confirm in
# py_files.get_data_slice.
final_df = filter_length(song_len_df, 8)
len(final_df)

#### by number of samples (songs)

In [None]:
# Optional step, currently disabled. Presumably limits the dataset to `size`
# songs (TODO confirm against get_songs before re-enabling).
#final_df = get_songs(final_df, size=5000)

## Send df to CSV

In [None]:
# Export is disabled; uncomment to write final_df out.
# NOTE(review): the target directory is a hard-coded, user-specific path.
# Adjust it for your machine before running.
# df_to_csv(final_df, '3', '~/code/emilycardwell/final-project-data/data/clean')

# Utility Calls

In [None]:
# Chord counts over the final dataset, with plotting switched off.
chord_count_df = count_chords(
    final_df,
    low_freq_to_remove=1,
    histplot=False,
    ascending=True,
)
# Show the first 50 rows of the result.
chord_count_df.head(50)

In [None]:
# Genre counts over the final dataset; histplot=False presumably suppresses
# the histogram (TODO confirm in py_files.utils).
genre_count_df = count_genres(final_df, histplot=False)

In [None]:
# Artist counts over the final dataset; histplot=False presumably suppresses
# the histogram (TODO confirm in py_files.utils).
artists_df = count_artists(final_df, histplot=False)