*Computer_Pianist-Create_CSV.ipynb* <p style='text-align: right;'> <b> September 20th 2020 </b> </p>
<p style='text-align: right;'> <b> David Diston </b> </p>

# Create .CSV of All Raw Data

***Resulting .csv will be used for EDA and Modeling***

This code will sort through each Midi file in the project `Raw_Midi` folder to extract:
1. Variables from the file names including:
    * `Composer Name`, `Performer Name`, `Performer Nationality`, `Piece Name`, and `Performer Gender`
2. Variables from the Midi files including:
    * `Piece Length` (seconds), `Tempo` (bpm), `Time Signature`, `Pitch Proportion` (for all 12 tones), and `Unique Note Velocities`
3. Variables from aggregation including:
    * `Average Velocity`, `Minimum Velocity`, and `Maximum Velocity`

In [1]:
# Import all libraries required for handling midi files and dataframes
import numpy as np
import pandas as pd
import mido
from mido import MidiFile
import pretty_midi
from pretty_midi import TimeSignature
import os

In [2]:
# Build one dataframe row of meta information per midi file in Raw_Midi/.
#
# The slicing below reads the file name as:
#   <char><Gender>--<Performer>_N--<Nationality>_C--<Composer>_T--<Piece>.<ext>
# i.e. gender is the second character, and the remaining fields sit between the
# '--', '_N--', '_C--' and '_T--' markers.
column_names = ['Composer', 'Performer', 'Perf_Nationality', 'Perf_Gender', 'Piece', 'Length', 'Tempo', 'Time_Signature', 'Freq_C', 'Freq_C#/Db', 'Freq_D', 'Freq_D#/Eb', 'Freq_E', 'Freq_F', 'Freq_F#/Gb', 'Freq_G', 'Freq_G#/Ab', 'Freq_A', 'Freq_A#/Bb', 'Freq_B', 'Unique_Velocities', 'Average_Velocity', 'Min_Velocity', 'Max_Velocity']

# Rows are collected in a plain list and turned into a dataframe once at the end;
# growing a dataframe one .loc row at a time copies data repeatedly and leaves
# the columns with dtypes inherited from the empty frame.
rows = []

total_files = 0
processed_files = 0

# Iterate over all files in my data directory (sorted, because os.listdir returns
# entries in arbitrary order and the run should be reproducible)
for file in sorted(os.listdir('Raw_Midi/')):

    # Increase the total files by 1 (counts every directory entry, midi or not)
    total_files += 1

    # Here I check to make sure all files are formatted as expected
    if not file.endswith(('.mid', '.MID')):
        print(f'File not midi: {file}')
        continue

    # Create a MidiFile type object of each file in Raw_Midi directory
    clip = MidiFile(f'Raw_Midi/{file}')

    # Check to make sure that the midi file is coded correctly
    if clip.type != 0:
        print(f'File not type 0: {file}')
        continue

    # Create a pretty_midi object to extract meta information
    pretty_clip = pretty_midi.PrettyMIDI(f'Raw_Midi/{file}')

    # Identify the composer, performer, nationality, gender, and piece from the file title
    composer = file[file.find('_C--') + len('_C--') : file.find('_T--')]
    performer = file[file.find('--') + len('--') : file.find('_N--')]
    nationality = file[file.find('_N--') + len('_N--') : file.find('_C--')]
    gender = file[1]
    # Strip only the extension: cutting at the first '.' would truncate (or empty)
    # the piece name whenever another '.' appears earlier in the file name
    stem = os.path.splitext(file)[0]
    piece = stem[stem.find('_T--') + len('_T--') :]

    # Extract length, tempo, time signature, and pitch data from the pretty_midi object
    end_time = pretty_clip.get_end_time()
    tempo = pretty_clip.estimate_tempo()
    # Read the first time signature straight from its attributes. Slicing single
    # characters out of the string representation turned 12/8 into '1/8' and
    # x/16 into 'x/1'.
    signature_changes = pretty_clip.time_signature_changes
    if signature_changes:
        time_sig = f'{signature_changes[0].numerator}/{signature_changes[0].denominator}'
    else:
        # No time signature event in the file: keep the '/' placeholder that the
        # previous string-based parsing produced for this case
        time_sig = '/'
    pitches = pretty_clip.get_pitch_class_histogram().tolist()

    # Create a list of all note velocities used in each piece, taken from the pretty_midi object
    velocity_list = [note.velocity
                     for instrument in pretty_clip.instruments
                     for note in instrument.notes]

    # Collect the values in the same order as column_names
    rows.append([composer, performer, nationality, gender, piece,
                 end_time, tempo, time_sig,
                 *pitches[:12],
                 len(set(velocity_list)),
                 np.mean(velocity_list),
                 min(velocity_list),
                 max(velocity_list)])

    # Increase the processed files by 1
    processed_files += 1

    # Print a progress note for the file being processed
    print(f'Processed {processed_files} files.    ', end = '\r')

# Create the dataframe for all midi file meta information in a single step
midi_files = pd.DataFrame(rows, columns = column_names)

print(f'\nProcessed {processed_files} out of {total_files} total files.')

Processed 2587 files.    
Processed 2587 out of 2587 total files.


In [3]:
# Sanity check: display the first five rows of the freshly built dataframe
midi_files.head(n = 5)

Unnamed: 0,Composer,Performer,Perf_Nationality,Perf_Gender,Piece,Length,Tempo,Time_Signature,Freq_C,Freq_C#/Db,...,Freq_F#/Gb,Freq_G,Freq_G#/Ab,Freq_A,Freq_A#/Bb,Freq_B,Unique_Velocities,Average_Velocity,Min_Velocity,Max_Velocity
0,Bach,AllisonTo,USA,F,PreludeAndFugueInFMinorWTCBook1,354.472936,177.721697,4/4,0.164755,0.062565,...,0.010428,0.131908,0.114181,0.03024,0.116267,0.033889,67,60.607404,3,95
1,Beethoven,AllisonTo,USA,F,SonataNo21Op521stMov,483.795456,202.068834,4/4,0.13308,0.03346,...,0.04289,0.124259,0.059011,0.09749,0.03924,0.12,96,64.732471,3,108
2,Chopin,AllisonTo,USA,F,NocturneOp62No1,414.144885,202.402893,4/4,0.041353,0.101974,...,0.139098,0.034774,0.12735,0.023966,0.088816,0.119361,84,56.144267,6,102
3,Chopin,AllisonTo,USA,F,ScherzoNo4InEMajor,625.507921,202.243617,4/4,0.053622,0.124054,...,0.103211,0.03177,0.133132,0.075475,0.052278,0.143722,108,65.535216,3,112
4,Ligeti,AllisonTo,USA,F,EtudeNo13,271.554216,223.185387,4/4,0.102083,0.078426,...,0.079969,0.076369,0.072255,0.092569,0.091283,0.087426,118,75.663924,3,126


In [4]:
# Sort the rows alphabetically by composer name (rebinding instead of mutating in place)
midi_files = midi_files.sort_values(by = 'Composer', ascending = True)

In [5]:
# Replace the index with a fresh 0..n-1 range, discarding the old index values
midi_files = midi_files.reset_index(drop = True)

In [6]:
# Round all numeric columns to reasonable significance.
# Maps each column to the number of decimals to keep (0 = whole number).
decimals_by_column = {
    'Length': 0,
    'Tempo': 0,
    'Freq_C': 5,
    'Freq_C#/Db': 5,
    'Freq_D': 5,
    'Freq_D#/Eb': 5,
    'Freq_E': 5,
    'Freq_F': 5,
    'Freq_F#/Gb': 5,
    'Freq_G': 5,
    'Freq_G#/Ab': 5,
    'Freq_A': 5,
    'Freq_A#/Bb': 5,
    'Freq_B': 5,
    'Average_Velocity': 2,
}

# Apply the rounding one column at a time, in the order listed above
for column, decimals in decimals_by_column.items():
    midi_files[column] = midi_files[column].round(decimals)

In [7]:
# Cast the whole-number columns to int in a single astype call
int_columns = ['Length', 'Tempo', 'Unique_Velocities', 'Min_Velocity', 'Max_Velocity']
midi_files = midi_files.astype({column: int for column in int_columns})

In [8]:
# Sanity check: display the first five rows after sorting, rounding and casting
midi_files.head(n = 5)

Unnamed: 0,Composer,Performer,Perf_Nationality,Perf_Gender,Piece,Length,Tempo,Time_Signature,Freq_C,Freq_C#/Db,...,Freq_F#/Gb,Freq_G,Freq_G#/Ab,Freq_A,Freq_A#/Bb,Freq_B,Unique_Velocities,Average_Velocity,Min_Velocity,Max_Velocity
0,Albeniz,JieChen,China,F,TrianaFromIberia,299,215,4/4,0.07603,0.14623,...,0.12477,0.03364,0.12954,0.09272,0.03709,0.08159,109,64.54,6,118
1,Albeniz,InesaSinkevych,Israel,F,TrianaFromIberia,261,210,4/4,0.0743,0.14861,...,0.12023,0.03612,0.12822,0.09056,0.03689,0.08127,100,70.56,6,108
2,Albeniz,AndrewStaupe,USA,M,EvocationsFromIberia,324,198,4/4,0.05186,0.10722,...,0.08479,0.08059,0.12964,0.03013,0.10652,0.08199,97,49.84,3,101
3,Albeniz,GregoryDeTurck,USA,M,RondenaFromIberia,412,187,4/4,0.07866,0.06898,...,0.08337,0.10521,0.04467,0.13201,0.06973,0.06725,116,67.68,3,119
4,Albeniz,GregoryDeTurck,USA,M,Iberia,1246,196,4/4,0.08401,0.08824,...,0.0848,0.10109,0.07221,0.09783,0.06041,0.07793,115,63.29,3,117


In [9]:
# Write the dataframe to a .csv in the project working directory, without the index column
output_path = 'midi_files.csv'
midi_files.to_csv(output_path, index = False)

<p style='text-align: right;'> <b> Next Step: </b> EDA and Visualization - <em> EDA_and_Visualization.ipynb </em> </p>