In [None]:
#Importing all necessary packages

import gzip
import pandas as pd
import matplotlib.pyplot as plt
import math
import random
from sklearn.preprocessing import LabelEncoder

#Unzipping the data csv file

# The context manager guarantees the gzip handle is closed even if read_csv raises.
with gzip.open('/content/data.csv.gz', 'rb') as f:
  data = pd.read_csv(f)

In [None]:
# Module-level label encoder; encode() fits it and the driver cell uses it to decode.
le = LabelEncoder()
# Show the raw chord progressions as loaded from the CSV.
print(data)

    Chord1 Chord2 Chord3 Chord4
0       i7     i7     i7     i7
1        V   ♭VII   ♭VII   ♭VII
2        I      I     ii   ii°7
3      iv9      i    iv9   ♭VII
4        I      I     IV     IV
..     ...    ...    ...    ...
728      I     IV     ii   ♭VII
729      I      V    vi7   IVM9
730      I      I    vi9    vi9
731      I     ii     ii      I
732     i9      i      i    iv9

[733 rows x 4 columns]


In [None]:
def encode(data):
  """Label-encode every chord column of ``data``.

  The columns are looked up by name as ``Chord1`` .. ``ChordN``, where N is the
  number of columns in ``data``. Column ``ChordK`` is converted to integer
  codes and stored as ``ChordK_E`` in the returned DataFrame.

  The module-level encoder ``le`` is re-fit on each column in turn, so the
  codes are specific to their column, and once this function returns ``le``
  holds only the fit of the last column processed.
  """
  column_count = len(data.columns)
  encoded_columns = {
      "Chord%d_E" % position: le.fit_transform(data["Chord%d" % position])
      for position in range(1, column_count + 1)
  }
  return pd.DataFrame(encoded_columns)

The data is encoded as numeric values in order to simplify processing and future adaptation for ML models, etc.

In [None]:
def chord_tally(encoded_data, chord_num):
  """Count how often each encoded chord occurs at one position.

  Args:
    encoded_data: mapping/DataFrame holding a ``Chord<chord_num>_E`` column.
    chord_num: position whose column is tallied.

  Returns:
    dict mapping each value found in the column to its number of occurrences,
    in order of first appearance.
  """
  column = encoded_data["Chord%s_E" % chord_num]
  occurrences = {}
  for code in column:
    occurrences[code] = occurrences.get(code, 0) + 1
  return occurrences

The number of instances of each chord in a given position (one of the 4) is tallied and returned as a dictionary.

In [None]:
def first_to_second(encoded_data, num):
  """Count every two-chord sequence that starts at position ``num``.

  Args:
    encoded_data: mapping/DataFrame with ``Chord<num>_E`` and
      ``Chord<num + 1>_E`` columns.
    num: position of the first chord of the pair.

  Returns:
    dict keyed by ``"<first> <second>"`` (the two codes separated by a single
    space) whose values are the number of rows containing that pair.
  """
  firsts = encoded_data["Chord%s_E" % num]
  seconds = encoded_data["Chord%s_E" % (num + 1)]

  counts = dict()
  # Walk the two columns in lockstep rather than indexing by 0..n-1: integer
  # indexing on a Series is a label lookup and fails when the index is not the
  # default 0..n-1 range.
  for first, second in zip(firsts, seconds):
    pair = str(first) + " " + str(second)
    counts[pair] = counts.get(pair, 0) + 1

  return counts

The number of occurrences of each two-chord sequence starting at a given position (one of 3 possible positions) is tallied and returned as a dictionary.

In [None]:
def first_to_second_ratios(c1_c2_counts, c1):
  """Turn pair counts into ratios relative to the first chord's tally.

  Args:
    c1_c2_counts: dict mapping ``"<first> <second>"`` keys to pair counts.
    c1: dict mapping an integer first-chord code to its tally.

  Returns:
    dict with the same keys as ``c1_c2_counts``; each value is the pair count
    divided by the tally of the pair's first chord.
  """
  return {
      pair: pair_count / c1[find_first_chord(pair)]
      for pair, pair_count in c1_c2_counts.items()
  }

Using the tallies generated in the previous steps, the sequence ratios (i.e. occurrences of a two-chord sequence divided by the occurrences of its first chord in that same position) are calculated and returned.

In [None]:
def find_first_chord(chords):
  """Return the integer code that precedes the first space in ``chords``.

  ``chords`` is a string such as ``"12 5"``; everything before the first space
  (or the whole string when there is no space) is converted with ``int``.
  """
  leading, _, _ = chords.partition(" ")
  return int(leading)

A function is defined to find the first chord (returned as its integer code) given a two-chord sequence in the form of a string.

In [None]:
def find_completed_ratio(encoded_data):
  """Build the ratio dictionary for every adjacent pair of positions.

  For each position from 1 up to (number of columns - 1), the position's chord
  tally and its pair counts with the next position are combined through
  first_to_second_ratios().

  Returns:
    list of dicts, one per starting position, in position order.
  """
  def ratios_from(position):
    # Tally first, then pair counts, matching the order of the helper calls.
    tally = chord_tally(encoded_data, position)
    pair_counts = first_to_second(encoded_data, position)
    return first_to_second_ratios(pair_counts, tally)

  last_position = len(encoded_data.columns)
  return [ratios_from(position) for position in range(1, last_position)]

A function calls the first_to_second_ratios() function on all positions in the data, returning a list of dictionaries.

In [None]:
def select_first_chord(first_chord_data, user_value):
  """Randomly pick a first chord, weighted by how often it occurs.

  Args:
    first_chord_data: dict mapping a chord code to its integer count.
    user_value: accepted but not used by this function.

  Returns:
    One key of ``first_chord_data``; each key appears in the sampling pool as
    many times as its count.
  """
  by_ascending_count = sorted(first_chord_data.items(), key=lambda entry: entry[1])
  # Repeat each chord `count` times so random.choice is count-weighted.
  pool = [copy for chord, count in by_ascending_count for copy in [chord] * count]
  return random.choice(pool)

A chord is randomly chosen from a weighted list for the first chord of the generated sequence.

In [None]:
def generate_next_chord(finished_ratios, curr, prev):
  """Pick the chord that follows ``prev`` at position ``curr``.

  Args:
    finished_ratios: list of dicts, one per starting position, mapping
      ``"<first> <second>"`` keys (two chord codes separated by a space) to
      the ratio of that sequence.
    curr: 1-based position of ``prev``; ``finished_ratios[curr - 1]`` is used.
    prev: code of the chord at position ``curr``, as an int or a numeric
      string (surrounding whitespace is ignored).

  Returns:
    The code of the next chord as a string of digits with no padding, drawn at
    random with probability proportional to the sequence's ratio.

  Raises:
    IndexError: if no sequence at this position starts with ``prev``.
  """
  wanted = str(prev).strip()
  candidates = []
  weights = []
  for pair, ratio in finished_ratios[curr - 1].items():
    # Split on the separator instead of slicing a fixed two characters, so
    # codes of any number of digits are matched and returned intact.
    first, _, second = pair.partition(" ")
    if first == wanted:
      candidates.append(second)
      # The ratio itself is the weight; counting keys would make every
      # candidate equally likely because each key occurs exactly once.
      weights.append(ratio)

  if not candidates:
    raise IndexError(
        "no chord sequence at position %s starts with chord %s" % (curr, wanted))

  return random.choices(candidates, weights=weights, k=1)[0]

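A function chooses the next chord given the previous chord and its position, using a weighted list built from the chord sequences; it is called once for each of the following three chords.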

In [None]:
def gen_chords():
  """Generate a four-chord progression of encoded chords.

  The first chord comes from select_first_chord() using the module-level
  ``first_chord_data``; each of the remaining three is produced by
  generate_next_chord() from the module-level ``finished_ratios`` and the
  chord generated just before it.

  Returns:
    list of the four generated chord codes, in order.
  """
  progression = [select_first_chord(first_chord_data, 0.4)]
  for position in (1, 2, 3):
    following = generate_next_chord(finished_ratios, position, progression[-1])
    progression.append(following)

  return progression

Four chords are generated and returned as a list.

In [None]:
# Encode the raw progressions and derive the statistics the generator reads.
encoded_data = encode(data)
first_chord_data = chord_tally(encoded_data, 1)
finished_ratios = find_completed_ratio(encoded_data)

# gen_chords() raises IndexError when a chosen chord has no recorded
# continuation. Retry a bounded number of times and catch only that error, so
# real bugs surface instead of being swallowed in an endless loop.
MAX_ATTEMPTS = 1000
chords = None
for _ in range(MAX_ATTEMPTS):
  try:
    chords = gen_chords()
    break
  except IndexError:
    continue
if chords is None:
  raise RuntimeError(
      "Could not generate a chord progression in %d attempts" % MAX_ATTEMPTS)

legible_chords = []
for position, code in enumerate(chords, start=1):
  # encode() fits the shared encoder separately on every column, so a code is
  # only meaningful for its own position. Re-fit on that position's column
  # before decoding; otherwise every chord is decoded with the last column's table.
  le.fit(data["Chord%d" % position])
  legible_chords.append(le.inverse_transform([int(code)])[0])

print(legible_chords)

['I7', '♭VII', 'VI9', 'iv9']


(Driver code) The encoder decodes the encoded chord values, which are then printed.