In [None]:
pip install scikit-learn

In [2]:
# 1 part a

import pandas as pd
from sklearn.model_selection import train_test_split

def split_data(df, target_column, train_size=0.7, val_size=0.15, test_size=0.15, random_state=42):
    """
    Splits the dataset into training, validation, and test sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset, including the target column.
    target_column : str
        Name of the column to predict; it is dropped from the feature frames.
    train_size, val_size, test_size : float
        Fractions of the rows assigned to each split. They must sum to 1
        (up to floating-point rounding).
    random_state : int
        Seed forwarded to both ``train_test_split`` calls.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.

    Raises
    ------
    AssertionError
        If the three fractions do not sum to 1.
    """
    # Compare with a tolerance: exact float equality rejects valid inputs such
    # as 0.7 + 0.2 + 0.1, which evaluates to 0.9999999999999999.
    assert abs(train_size + val_size + test_size - 1.0) < 1e-9, "Splits must sum to 1"

    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Split into training and temp set (for validation and test)
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=(1 - train_size), random_state=random_state
    )

    # Split temp set into validation and test sets; the test share is expressed
    # relative to the temp set, not to the full dataset.
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=test_size / (test_size + val_size), random_state=random_state
    )

    return X_train, X_val, X_test, y_train, y_val, y_test


# Load the dataset and split it, with "popularity" as the target column.
df = pd.read_csv("./Top_spotify_songs.csv")

X_train, X_val, X_test, y_train, y_val, y_test = split_data(df, target_column="popularity")

# Report how many rows landed in each split.
print(f"Train set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")


Train set size: 1209867
Validation set size: 259257
Test set size: 259258


In [None]:
# part 1 b

# Re-run the split with the same default arguments (including random_state).
X_train, X_val, X_test, y_train, y_val, y_test = split_data(df, target_column="popularity")

# Report the shape of each feature/target pair.
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_val shape: {X_val.shape}, y_val shape: {y_val.shape}")
# No space after "X_test shape:" -- kept exactly as in the recorded output.
print(f"X_test shape:{X_test.shape}, y_test shape: {y_test.shape}")

X_train shape: (1209867, 24), y_train shape: (1209867,)
X_val shape: (259257, 24), y_val shape: (259257,)
X_test shape:(259258, 24), y_test shape: (259258,)


In [1]:
#part 1 c

import seaborn as sns
import matplotlib.pyplot as plt

def encode_categorical_features(df):
    """
    One-hot encode every object-dtype column of ``df``.

    The columns to encode are picked by dtype (``object``). Each is expanded
    with ``pd.get_dummies`` using ``drop_first=True``, so the first level of
    every encoded column is omitted. The encoded frame is returned.
    """
    object_cols = df.select_dtypes(include=['object']).columns
    encoded = pd.get_dummies(df, columns=object_cols, drop_first=True)
    return encoded

def create_correlation_matrix(df, target_column):
    """
    Draw the correlation matrix of ``df`` as an annotated heatmap and save it.

    Correlations are computed with ``df.corr()`` across all columns of ``df``.
    ``target_column`` is used only for the plot title and for the output file
    name ``correlation_matrix_<target_column>.png``. The figure is closed after
    saving and nothing is returned.
    """
    corr = df.corr()

    # Explicit 12x10 inch figure/axes pair instead of pyplot's implicit state.
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', linewidths=0.5, ax=ax)
    ax.set_title(f"Correlation Matrix for {target_column}")

    fig.savefig(f"correlation_matrix_{target_column}.png", bbox_inches='tight')
    plt.close(fig)

# One-hot encode categorical features
# One-hot encode categorical features
# NOTE(review): this cell needs `df` (and `pd`) from the first cell. The
# recorded NameError below, together with the `In [1]` execution count,
# suggests it was run in a kernel where that cell had not been executed --
# run the notebook top to bottom.
df_encoded = encode_categorical_features(df)

# Create and save correlation matrix
# Writes correlation_matrix_popularity.png to the working directory.
create_correlation_matrix(df_encoded, target_column="popularity")

NameError: name 'df' is not defined