# 4. Modeling

## Problem definition

The goal is to predict whether a song by a female rapper will become a hit based on her city of origin. 

## Data preparation and cleaning

In [2]:
import pandas as pd

# Load each dataset 
tidal_df = pd.read_csv("tidal_female_rappers_with_city.csv", sep=";")
spotify_hiphop_df = pd.read_csv("spotify_top_hiphop_artists_tracks_cities.csv", sep=";", engine="python", on_bad_lines="skip")
spotify_10k_df = pd.read_csv("spotify_top_10k_streamed_songs.csv", sep=";", engine="python", on_bad_lines="skip")
complex_cities_df = pd.read_csv("complex_best_rap_cities.csv", sep=";", engine="python", on_bad_lines="skip")
billboard_df = pd.read_csv("billboard_hot_100_with_city.csv", sep=";", engine="python", on_bad_lines="skip")

In [4]:
# Preview each dataset 
print("TIDAL")
display(tidal_df.head())

print("Spotify Hip Hop")
display(spotify_hiphop_df.head())

print("Spotify 10k")
display(spotify_10k_df.head())

print("Complex")
display(complex_cities_df.head())

print("Billboard")
display(billboard_df.head())

TIDAL


Unnamed: 0,Ranking,track_name,artist,artist_id,Album,duration,track_id,city,gender
0,1,Bodak Yellow,Cardi B,7301626,Bodak Yellow,224,75065103,"New York, NY",female
1,2,Work It,Missy Elliott,14686,Under Construction,263,533969,"Portsmouth, VA",female
2,3,Conceited (There's Something About Remy),Remy Ma,27435,There's Something About Remy-Based On A True S...,220,37751158,"New York, NY",female
3,4,Let Me Blow Ya Mind,Eve,18615,Scorpion,230,616971,"Philadelphia, PA",female
4,5,Lighters Up,Lil' Kim,14963,Lighters Up,263,10905756,"Brooklyn, NY",female


Spotify Hip Hop


Unnamed: 0,artist,track_name,ranking,duration (ms),track_id,city,gender
0,Drake,Rich Baby Daddy (feat. Sexyy Red & SZA),92,319191,1yeB8MUNeLo9Ek1UEpsyz6,"Toronto, Canada",male
1,Drake,One Dance,91,173986,1zi7xx7UVEFkmKfv06H8x0,"Toronto, Canada",male
2,Drake,IDGAF (feat. Yeat),90,260111,2YSzYUF3jWqb9YP9VXmpjE,"Toronto, Canada",male
3,Drake,First Person Shooter (feat. J. Cole),88,247444,7aqfrAY2p9BUSiupwk3svU,"Toronto, Canada",male
4,Drake,Jimmy Cooks (feat. 21 Savage),88,218364,3F5CgOj3wFlRv51JsHbxhe,"Toronto, Canada",male


Spotify 10k


Unnamed: 0,ranking,artist,track_name,days,top 10 (xtimes),peak_position,peak_position (xtimes),peak_streams,total_streams,gender,city
0,1,Post Malone,Sunflower SpiderMan: Into the SpiderVerse,1506,302.0,1,(x29),2118242,883369738,male,"Syracuse, NY"
1,2,Juice WRLD,Lucid Dreams,1673,178.0,1,(x20),2127668,864832399,male,"Chicago, IL"
2,3,Lil Uzi Vert,XO TOUR Llif3,1853,212.0,1,(x4),1660502,781153024,male,"Philadelphia, PA"
3,4,J. Cole,No Role Modelz,2547,6.0,7,0,659366,734857487,male,"Fayetteville, NC"
4,5,Post Malone,rockstar,1223,186.0,1,(x124),2905678,718865961,male,"Syracuse, NY"


Complex


Unnamed: 0,artist,City,gender
0,Future,"Atlanta, GA",male
1,Young Thug,"Atlanta, GA",male
2,Lil Baby,"Atlanta, GA",male
3,Latto,"Atlanta, GA",female
4,21 Savage,"Atlanta, GA",male


Billboard


Unnamed: 0,date,ranking,track_name,artist,last-week,peak-rank,weeks-on-board,city,gender
0,2021-11-06,1,Easy On Me,adele,1.0,1,3,"London, UK",unknown
1,2021-11-06,2,Stay,the kid laroi,2.0,1,16,"Sydney, Australia",unknown
2,2021-11-06,3,Industry Baby,lil nas,3.0,1,14,,unknown
3,2021-11-06,4,Fancy Like,walker hayes,4.0,3,19,"Mobile, AL",unknown
4,2021-11-06,5,Bad Habits,ed sheeran,5.0,2,18,"Halifax, UK",unknown


In [10]:
# Clean TIDAL dataset by filtering for female rappers 
tidal_clean = tidal_df[tidal_df['gender'].str.lower() == 'female'].copy()
tidal_clean = tidal_clean[['artist', 'track_name', 'Ranking', 'duration', 'track_id', 'city']]
tidal_clean.rename(columns={'Ranking': 'ranking', 'duration': 'duration_sec'}, inplace=True)
display(tidal_clean.head())

Unnamed: 0,artist,track_name,ranking,duration_sec,track_id,city
0,Cardi B,Bodak Yellow,1,224,75065103,"New York, NY"
1,Missy Elliott,Work It,2,263,533969,"Portsmouth, VA"
2,Remy Ma,Conceited (There's Something About Remy),3,220,37751158,"New York, NY"
3,Eve,Let Me Blow Ya Mind,4,230,616971,"Philadelphia, PA"
4,Lil' Kim,Lighters Up,5,263,10905756,"Brooklyn, NY"


In [8]:
# Clean Spotify Hip Hop dataset by filtering for female rappers 
spotify_hiphop_clean = spotify_hiphop_df[spotify_hiphop_df['gender'].str.lower() == 'female'].copy()
spotify_hiphop_clean = spotify_hiphop_clean[['artist', 'track_name', 'ranking', 'duration (ms)', 'track_id', 'city']]
spotify_hiphop_clean.rename(columns={'duration (ms)': 'duration_ms'}, inplace=True)
display(spotify_hiphop_clean.head())

Unnamed: 0,artist,track_name,ranking,duration_ms,track_id,city
20,Nicki Minaj,FTCU,87,172137,1a73gcEg6h6Re6hHXoVltJ,"New York, NY"
22,Nicki Minaj,Everybody (feat. Lil Uzi Vert),84,180869,5ZJGv7aGdIr9IGpxzSG18T,"New York, NY"
23,Nicki Minaj,Starships,77,210626,1oHNvJVbFkexQc0BpQp7Y4,"New York, NY"
24,Nicki Minaj,Barbie World (with Aqua) [From Barbie The Album],83,109750,741UUVE2kuITl0c6zuqqbO,"New York, NY"
25,Nicki Minaj,Super Bass,83,200013,3hlksXnvbKogFdPbpO9vel,"New York, NY"


In [20]:
# Clean Spotify 10k dataset by filtering for female rappers 
spotify_10k_clean = spotify_10k_df[spotify_10k_df['gender'].str.lower() == 'female'].copy()
spotify_10k_clean = spotify_10k_clean[['artist', 'track_name', 'ranking', 'total_streams', 'peak_position', 'city']]
display(spotify_10k_clean.head())

Unnamed: 0,artist,track_name,ranking,total_streams,peak_position,city
43,Doja Cat,Kiss Me More,44,376965422,1,"Los Angeles, CA"
56,Cardi B,I Like It,57,329059657,2,"New York, NY"
79,Cardi B,WAP,80,296670667,1,"New York, NY"
106,Doja Cat,Need To Know,107,268475081,3,"Los Angeles, CA"
145,Cardi B,Bodak Yellow,146,235788390,2,"New York, NY"


In [22]:
# Clean Complex dataset by filtering for female rappers 
complex_clean = complex_cities_df[complex_cities_df['gender'].str.lower() == 'female'].copy()
complex_clean = complex_clean[['artist', 'City', 'gender']]
display(complex_clean.head())

Unnamed: 0,artist,City,gender
3,Latto,"Atlanta, GA",female
17,Kali,"Atlanta, GA",female
19,Bktherula,"Atlanta, GA",female
22,Baby Tate,"Atlanta, GA",female
23,Lakeyah,"Atlanta, GA",female


In [24]:
# Clean Billboard dataset by filtering for female rappers and keeping relevant columns
billboard_clean = billboard_df[billboard_df['gender'].str.lower() == 'female'].copy()
billboard_clean = billboard_clean[['date', 'ranking', 'track_name', 'artist', 'peak-rank', 'weeks-on-board', 'city']]
display(billboard_clean.head())

Unnamed: 0,date,ranking,track_name,artist,peak-rank,weeks-on-board,city
8,2021-11-06,9,Need To Know,doja cat,9,20,"Los Angeles, CA"
11,2021-11-06,12,Kiss Me More,doja cat,3,29,"Los Angeles, CA"
15,2021-11-06,16,You Right,doja cat,11,18,"Los Angeles, CA"
62,2021-11-06,63,Woman,doja cat,62,13,"Los Angeles, CA"
83,2021-11-06,84,Thot Shit,megan thee stallion,16,20,"Houston, TX"


## Merge datasets

In [26]:
# Align columns across datasets 
tidal_merge = tidal_clean[['artist', 'track_name', 'city', 'ranking']].copy()
tidal_merge['source'] = 'tidal'

spotify_10k_merge = spotify_10k_clean[['artist', 'track_name', 'city', 'ranking']].copy()
spotify_10k_merge['source'] = 'spotify_10k'

spotify_hiphop_merge = spotify_hiphop_clean[['artist', 'track_name', 'city', 'ranking']].copy()
spotify_hiphop_merge['source'] = 'spotify_hiphop'

billboard_merge = billboard_clean[['artist', 'track_name', 'city', 'ranking']].copy()
billboard_merge['source'] = 'billboard'

# Merge datasets
merged_df = pd.concat([
    tidal_merge,
    spotify_10k_merge,
    spotify_hiphop_merge,
    billboard_merge
], ignore_index=True)

print(merged_df.shape)
print(merged_df.head())

(1943, 5)
          artist                                track_name              city  \
0        Cardi B                              Bodak Yellow      New York, NY   
1  Missy Elliott                                   Work It    Portsmouth, VA   
2        Remy Ma  Conceited (There's Something About Remy)      New York, NY   
3            Eve                       Let Me Blow Ya Mind  Philadelphia, PA   
4       Lil' Kim                               Lighters Up      Brooklyn, NY   

   ranking source  
0        1  tidal  
1        2  tidal  
2        3  tidal  
3        4  tidal  
4        5  tidal  


## Standardize city names

In [28]:
# Lowercase and strip city names
merged_df['city'] = merged_df['city'].str.lower().str.strip()

# Replace city names with consistent names
city_replacements = {
    'brooklyn, ny': 'new york, ny',
    'queens, ny': 'new york, ny',
    'bronx, ny': 'new york, ny',
    'atl': 'atlanta, ga',
    'atlanta': 'atlanta, ga',
    'la': 'los angeles, ca',
    'compton, ca': 'los angeles, ca',
    'hollywood, ca': 'los angeles, ca',
    'strafford, vt, vt': 'strafford, vt',
    'halifax, uk': 'halifax, england'
}

merged_df['city'] = merged_df['city'].replace(city_replacements)


print(merged_df['city'].value_counts().head(20))

city
new york, ny            898
houston, tx             273
los angeles, ca         253
bridgetown, barbados     25
portsmouth, va           10
atlanta, ga               8
chicago, il               8
south orange, nj          7
philadelphia, pa          6
sydney, australia         5
hackensack, nj            5
miami, fl                 3
new orleans, la           2
medellín, colombia        2
london, uk, uk            1
st. louis, mo             1
memphis, tn               1
croydon, uk               1
london, uk                1
nashville, tn             1
Name: count, dtype: int64


In [30]:
# Fix london 
final_city_fixes = {
    'london, uk, uk': 'london, uk',
    'croydon, uk': 'london, uk'
}

merged_df['city'] = merged_df['city'].replace(final_city_fixes)

print(merged_df['city'].value_counts().head(20))

city
new york, ny            898
houston, tx             273
los angeles, ca         253
bridgetown, barbados     25
portsmouth, va           10
atlanta, ga               8
chicago, il               8
south orange, nj          7
philadelphia, pa          6
sydney, australia         5
hackensack, nj            5
miami, fl                 3
london, uk                3
new orleans, la           2
medellín, colombia        2
st. louis, mo             1
memphis, tn               1
nashville, tn             1
havana, cuba              1
Name: count, dtype: int64


## Add a hit column

In [32]:
# Convert ranking to numeric 
merged_df['ranking'] = pd.to_numeric(merged_df['ranking'], errors='coerce')

# Create binary hit column (hit = 1 if ranking <= 50 otherwise hit = 0)
merged_df['hit'] = merged_df['ranking'].apply(lambda x: 1 if pd.notnull(x) and x <= 50 else 0)

print(merged_df[['artist', 'track_name', 'ranking', 'hit']].head(10))
print(merged_df['hit'].value_counts())

          artist                                track_name  ranking  hit
0        Cardi B                              Bodak Yellow        1    1
1  Missy Elliott                                   Work It        2    1
2        Remy Ma  Conceited (There's Something About Remy)        3    1
3            Eve                       Let Me Blow Ya Mind        4    1
4       Lil' Kim                               Lighters Up        5    1
5  Missy Elliott                     WTF (Where They From)        6    1
6     Foxy Brown                                   I'll Be        7    1
7    Iggy Azalea                                     Fancy        8    1
8       Lil Mama                       Lip Gloss/ No Music        9    1
9       Lil' Kim                               Not Tonight       10    1
hit
1    987
0    956
Name: count, dtype: int64


## Data preparation

In [34]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Select features and target variable
features = merged_df[['artist', 'city', 'source']]  
target = merged_df['hit']  

# One-hot encode categorical features
X = pd.get_dummies(features, drop_first=True)

# Define y
y = target

# Split into training and test sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("Train target distribution:\n", y_train.value_counts(normalize=True))
print("Test target distribution:\n", y_test.value_counts(normalize=True))

X_train shape: (1360, 76)
X_test shape: (583, 76)
Train target distribution:
 hit
1    0.508088
0    0.491912
Name: proportion, dtype: float64
Test target distribution:
 hit
1    0.507719
0    0.492281
Name: proportion, dtype: float64


## Logistic Regression

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Train logistic regression
log_model = LogisticRegression(max_iter=1000, random_state=42)
log_model.fit(X_train, y_train)

# Predict
y_pred_log = log_model.predict(X_test)

# Evaluate
print("Logistic Regression Results:")
print(confusion_matrix(y_test, y_pred_log))
print(classification_report(y_test, y_pred_log))

Logistic Regression Results:
[[105 182]
 [ 14 282]]
              precision    recall  f1-score   support

           0       0.88      0.37      0.52       287
           1       0.61      0.95      0.74       296

    accuracy                           0.66       583
   macro avg       0.75      0.66      0.63       583
weighted avg       0.74      0.66      0.63       583



- Accuracy = 66%.
- Precision for hits = 0.61 (not great, many false positives).
- Recall for hits = 0.95 (very high, predicts almost all hits).
- F1 for hits = 0.74 (good balance).
The model is solid but overpredicts hits and struggles to identify non-hits (recall for 0 = 0.37).

## Random Forest Classifier

In [38]:
from sklearn.ensemble import RandomForestClassifier

# Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict
y_pred_rf = rf_model.predict(X_test)

# Evaluate
print("Random Forest Results:")
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Random Forest Results:
[[105 182]
 [ 14 282]]
              precision    recall  f1-score   support

           0       0.88      0.37      0.52       287
           1       0.61      0.95      0.74       296

    accuracy                           0.66       583
   macro avg       0.75      0.66      0.63       583
weighted avg       0.74      0.66      0.63       583



Random Forest gives the same results as Logistic Regression:
- Accuracy = 66%.
- Precision = 0.61.
- Recall = 0.95.
- F1 = 0.74.

## XGBoost

In [84]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-macosx_12_0_arm64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.0.2
Note: you may need to restart the kernel to use updated packages.


In [40]:
from xgboost import XGBClassifier

# Train XGBoost model
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Predict
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate
print("XGBoost Results:")
print(confusion_matrix(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))

XGBoost Results:
[[105 182]
 [ 14 282]]
              precision    recall  f1-score   support

           0       0.88      0.37      0.52       287
           1       0.61      0.95      0.74       296

    accuracy                           0.66       583
   macro avg       0.75      0.66      0.63       583
weighted avg       0.74      0.66      0.63       583



XGBoost gives the same results as the other models. 

- True Negatives = 105.
- False Positives =182.
- False Negatives =14.
- True Positives = 282.

## Hyperparameter tuning with GridSearchCV

In [42]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

# Define the parameter grid to search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2']
}

# Set up GridSearchCV
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf,
                           param_grid=param_grid,
                           scoring='f1',
                           cv=3,
                           n_jobs=-1,
                           verbose=1)

# Fit the model
grid_search.fit(X_train, y_train)

# Best model from grid search
best_rf = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Evaluate 
y_pred_best_rf = best_rf.predict(X_test)
print("Tuned Random Forest Results:")
print(confusion_matrix(y_test, y_pred_best_rf))
print(classification_report(y_test, y_pred_best_rf))

Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50}
Tuned Random Forest Results:
[[103 184]
 [ 15 281]]
              precision    recall  f1-score   support

           0       0.87      0.36      0.51       287
           1       0.60      0.95      0.74       296

    accuracy                           0.66       583
   macro avg       0.74      0.65      0.62       583
weighted avg       0.74      0.66      0.63       583



## Conclusion

We evaluated the effectiveness of different classification models in predicting future hits by female rappers based on their city of origin. Three models were built and tested: Logistic Regression, Random Forest and XGBoost.

All three models perform similarly with:
- Accuracy: ~66%
- Recall: ~95%
- F1-score: ~0.74

However, they all struggle to identify non-hits, with a recall for 0 around 37%.

To improve model performance, we used GridSearchCV to tune the hyperparameters of the Random Forest classifier. The best parameters found were: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50}. 

Despite this optimization, the tuned model gives results nearly identical to the untuned one. It may mean that the features used somehow limit the model's performance. 

Although all models perform similarly, we selected Logistic Regression as the final model because it is simpler, faster, easier to interpret than other models, and requires fewer computational resources. 