<a href="https://colab.research.google.com/github/evansuslovich/ChordWave/blob/main/Chord_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Location of the Chordonomicon v2 CSV; the hf:// scheme makes pandas read it
# from the Hugging Face dataset repository rather than from local disk.
CHORDONOMICON_CSV = "hf://datasets/ailsntua/Chordonomicon/chordonomicon_v2.csv"
df = pd.read_csv(CHORDONOMICON_CSV)

In [None]:
# Preview the first five rows of the raw dataset.
df.head(n=5)

In [None]:
# Columns kept for the analysis; everything else in the raw frame is ignored.
SELECTED_COLUMNS = ['id', 'chords', 'genres', 'decade', 'main_genre']
subset_df = df[SELECTED_COLUMNS]
subset_df.head()

In [None]:
# Drop every row that has a missing value in any of the selected columns.
# The explicit .copy() makes cleaned_df an independent frame: later cells add
# new columns to it, and assigning into a frame derived from a slice of another
# frame triggers pandas' SettingWithCopyWarning when copy-on-write is off.
cleaned_df = subset_df.dropna().copy()
cleaned_df.head()

In [None]:
import matplotlib.pyplot as plt

# Bar chart: number of songs per 'main_genre', most frequent genre first
# (value_counts sorts by descending count).
fig, ax = plt.subplots(figsize=(10, 6))
genre_counts = cleaned_df['main_genre'].value_counts()
genre_counts.plot.bar(ax=ax)
ax.set_title('Distribution of Songs by Main Genre')
ax.set_xlabel('Genre')
ax.set_ylabel('Count')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Histogram of the 'decade' column, split into 10 equal-width bins.
fig, ax = plt.subplots(figsize=(10, 6))
cleaned_df['decade'].hist(bins=10, ax=ax)
ax.set_title('Distribution of Songs by Decade')
ax.set_xlabel('Decade')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()

In [None]:
# Contingency table: one row per decade, one column per main genre,
# each cell holding the number of songs with that combination.
decade_genre = pd.crosstab(cleaned_df['decade'], cleaned_df['main_genre'])

# Stacked bars: one bar per decade, segmented by genre.
ax = decade_genre.plot.bar(stacked=True, figsize=(12, 8))
ax.set_title('Genre Distribution Across Decades')
ax.set_xlabel('Decade')
ax.set_ylabel('Count')
# Anchor the legend outside the axes so it does not cover the bars.
ax.legend(title='Genre', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.figure.tight_layout()
plt.show()

In [None]:
import seaborn as sns
import re
from collections import Counter

def extract_chords(chord_string):
    """Split a chord string into tokens, discarding every ``<...>`` marker.

    Each ``<...>`` span is replaced by a single space before splitting, so a
    marker never shows up in the result and never fuses its two neighbours
    into one token.

    Args:
        chord_string: Text containing whitespace-separated tokens and
            optional ``<...>`` markers.

    Returns:
        List of the remaining non-empty tokens, in their original order.
    """
    without_markers = re.sub(r'<[^>]+>', ' ', chord_string)
    # str.split() without arguments splits on any whitespace run and never
    # yields empty strings, so no additional filtering is required.
    return without_markers.split()

In [None]:
# Parse every song's raw chord string into a list of chord tokens.
cleaned_df['chord_list'] = cleaned_df['chords'].apply(extract_chords)

# Number of distinct chords each song uses.
cleaned_df['unique_chord_count'] = [
    len(set(song_chords)) for song_chords in cleaned_df['chord_list']
]

# One bar per genre; seaborn aggregates the per-song counts within each genre
# (its default estimator is the mean, hence the "Average" labels).
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=cleaned_df, x='main_genre', y='unique_chord_count', ax=ax)
ax.set_title('Average Number of Unique Chords by Genre')
ax.set_xlabel('Genre')
ax.set_ylabel('Average Unique Chord Count')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
def top_chords_by_genre(genre_df, n=5):
    """Return the ``n`` most frequent chords across all songs in ``genre_df``.

    Args:
        genre_df: Mapping/frame whose ``'chord_list'`` entry iterates over
            per-song lists of chord tokens.
        n: How many of the most common chords to return.

    Returns:
        List of ``(chord, count)`` tuples ordered from most to least frequent.
    """
    tally = Counter()
    # Fold each song's chords into a single running count.
    for song_chords in genre_df['chord_list']:
        tally.update(song_chords)
    return tally.most_common(n)

# Map each main genre to its list of (chord, count) pairs for the most
# frequent chords, using the rows of cleaned_df that belong to that genre.
genre_top_chords = {
    genre: top_chords_by_genre(cleaned_df[cleaned_df['main_genre'] == genre])
    for genre in cleaned_df['main_genre'].unique()
}

# Flatten the mapping into long-format records: one row per (genre, chord).
chord_genre_data = [
    {'Genre': genre_name, 'Chord': chord_name, 'Count': occurrences}
    for genre_name, ranked_chords in genre_top_chords.items()
    for chord_name, occurrences in ranked_chords
]

chord_genre_df = pd.DataFrame(chord_genre_data)



In [None]:
# Grouped bar chart of the most frequent chords per genre.
#
# Every genre has its own top-chord list, so bars must be positioned by chord,
# not by rank: placing bars by rank and labelling the axis with one genre's
# chords would attach the wrong chord name to the other genres' bars, and a
# fixed per-genre offset makes neighbouring groups overlap once there are more
# genres than fit in one unit of the x axis.
#
# Pivoting to a chord x genre table gives one x position per chord and one bar
# per genre inside it. A chord missing from a genre's top list shows as 0.
top_chord_counts = (
    chord_genre_df
    .pivot(index='Chord', columns='Genre', values='Count')
    .fillna(0)
)

# Order chords by their combined count so the most common ones come first.
chord_order = top_chord_counts.sum(axis=1).sort_values(ascending=False).index
top_chord_counts = top_chord_counts.loc[chord_order]

ax = top_chord_counts.plot(kind='bar', figsize=(14, 8), width=0.8)
ax.set_title('Top 5 Chords by Genre')
ax.set_xlabel('Chord')
ax.set_ylabel('Frequency')
ax.legend(title='Genre')
plt.tight_layout()
plt.show()

In [None]:
def get_chord_freq_by_genre(songs_df=None, top_n=15):
    """Compute, per genre, the percentage share of the overall most common chords.

    The ``top_n`` most frequent chords are determined over all songs. For each
    genre, every one of those chords is then expressed as a percentage of all
    chord occurrences in that genre's songs.

    Args:
        songs_df: Frame with a ``'main_genre'`` column and a ``'chord_list'``
            column holding a list of chord tokens per song. Defaults to the
            notebook-level ``cleaned_df`` when omitted.
        top_n: Number of globally most common chords to report (default 15).

    Returns:
        DataFrame with one row per genre: one column per top chord (percentage
        values, in descending global frequency order) followed by a ``'Genre'``
        column. Genres whose songs contain no chords at all are left out.
    """
    if songs_df is None:
        # Backward-compatible default: operate on the notebook's global frame.
        songs_df = cleaned_df

    overall_counts = Counter(
        chord for song_chords in songs_df['chord_list'] for chord in song_chords
    )
    top_chords = [chord for chord, _ in overall_counts.most_common(top_n)]

    result = {chord: [] for chord in top_chords}
    result['Genre'] = []

    for genre in songs_df['main_genre'].unique():
        genre_rows = songs_df[songs_df['main_genre'] == genre]
        genre_counter = Counter(
            chord for song_chords in genre_rows['chord_list'] for chord in song_chords
        )
        total_chords = sum(genre_counter.values())

        # A genre without any chords would divide by zero; skip it entirely.
        if total_chords == 0:
            continue

        result['Genre'].append(genre)
        for chord in top_chords:
            percentage = (genre_counter.get(chord, 0) / total_chords) * 100
            result[chord].append(percentage)

    return pd.DataFrame(result)

In [None]:
# Per-genre percentage table, re-indexed by genre so the heatmap gets
# genres as rows and chords as columns.
chord_freq_df = get_chord_freq_by_genre()
chord_freq_pivot = chord_freq_df.set_index('Genre')

fig, ax = plt.subplots(figsize=(14, 10))
# annot/fmt print each percentage inside its cell with one decimal place.
sns.heatmap(chord_freq_pivot, annot=True, cmap='YlGnBu', fmt='.1f', ax=ax)
ax.set_title('Percentage of Chord Usage by Genre')
ax.set_ylabel('Genre')
ax.set_xlabel('Chord')
fig.tight_layout()
plt.show()

In [21]:
# Flatten every song's raw chord string into one long list of tokens.
# The raw 'chords' text is split on single spaces as-is; unlike
# extract_chords, nothing is stripped here, so any <...> markers present
# in the strings remain in the token list.
chords = cleaned_df['chords']
chord_data = [
    token
    for progression in chords
    for token in progression.split(" ")
]


print(len(chord_data))



24523034


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Assuming chord_data is already created
# 1. First shuffle and split the data
# train_data, temp_data = train_test_split(chord_data, test_size=0.2, random_state=42)
# val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Hold out 10% of all chord tokens (fixed seed for reproducibility).
full_data, full_test_data = train_test_split(chord_data, test_size=0.1, random_state=42)

# Carve the small "personal" train/test subsets (80% / 20%) out of the held-out
# tokens. `test_data` is only defined by the commented-out split above, so it
# does not exist on a fresh kernel; `full_test_data` is the held-out set this
# cell actually produces.
# NOTE(review): confirm this is the intended source for the personal subsets.
train_data_on_personal, test_data_on_personal = train_test_split(
    full_test_data, test_size=0.2, random_state=42
)


print(len(train_data_on_personal))
print(len(test_data_on_personal))

# 2. Convert to DataFrames
# train_df = pd.DataFrame(train_data, columns=['chord'])
# val_df = pd.DataFrame(val_data, columns=['chord'])
# test_df = pd.DataFrame(test_data, columns=['chord'])

# 3. Save to CSV files
# train_df.to_csv("train_chords.csv", index=False)
# val_df.to_csv("val_chords.csv", index=False)
# test_df.to_csv("test_chords.csv", index=False)

# Single-column frames (column 'text') holding one chord token per row.
train_data_on_personal_df = pd.DataFrame(train_data_on_personal, columns=['text'])
print(len(train_data_on_personal_df))

test_data_on_personal_df = pd.DataFrame(test_data_on_personal, columns=['text'])
print(len(test_data_on_personal_df))

# train_data_on_personal_df.to_csv("train_data_personal.csv", index=False)
# test_data_on_personal_df.to_csv("test_data_personal.csv", index=False)


In [29]:
import pandas as pd

# test_df = pd.read_csv("test_chords.csv")
# print(len(test_df))
# train_df = pd.read_csv("train_chords.csv")
# print(len(train_df))
# val_df = pd.read_csv("val_chords.csv")
# print(len(val_df))


# Read the personal-scale train/test CSVs from the data/ directory.
train_data_personal_df = pd.read_csv("data/train_data_personal.csv")
test_data_personal_df = pd.read_csv("data/test_data_personal.csv")

# Size of each CSV relative to the full flattened token list (chord_data).
total_tokens = len(chord_data)
train_data_compared_to_full_data = train_data_personal_df.shape[0] / total_tokens
test_data_compared_to_full_data = test_data_personal_df.shape[0] / total_tokens

# Report both shares as percentages with three decimals.
for share in (train_data_compared_to_full_data, test_data_compared_to_full_data):
    print(f"{share * 100:.3f}%")



0.320%
0.080%


In [25]:

# Scratch check of one subset's share of the corpus, as a percentage.
# 24523034 equals the chord_data length printed earlier in this notebook;
# 78474 is presumably the row count of the personal training CSV - TODO confirm.
subset_rows = 78474
corpus_tokens = 24523034
print(f"{subset_rows/corpus_tokens * 100}%")

0.00320001187455027
0.0008000233576318493
