# Decision Tree Modeling
## Continuous Response (view_count)

In [10]:
# import/install librares/packages
!pip install pandas numpy scikit-learn statsmodels xgboost matplotlib seaborn imbalanced-learn
!pip install -U scikit-learn
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler

# initialize scaler
scaler = StandardScaler()

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [11]:
# load in data set
df = pd.read_csv("model_ready_dataset.csv")
df.head()

Unnamed: 0,acousticness,danceability,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,view_count,...,avg_word_len,unique_words,vocab_richness,Rap_Street_Slang_Topic,Romance_Relationships_Topic,Life_Nostalgia_Topic,Party_Dance_Sensuality_Topic,Love_Emotion_Sentiment_Topic,Loss_Struggle_Reflection_Topic,Energy_Vibes_Epic_Topic
0,0.202,0.759,0.699,0.000131,0.443,-5.745,0.0307,92.96,0.907,1118930000.0,...,4.846395,98,0.30721,0.001245,0.001247,0.001247,0.362314,0.385711,0.001244,0.246992
1,0.0393,0.535,0.505,0.0,0.0923,-8.926,0.245,99.935,0.495,220560700.0,...,5.327024,428,0.450053,0.192193,0.030384,0.000396,0.01585,0.760386,0.000396,0.000396
2,0.542,0.698,0.533,0.0,0.333,-6.246,0.0437,134.001,0.275,87564090.0,...,5.063918,141,0.290722,0.000765,0.418962,0.230767,0.000766,0.347208,0.000765,0.000766
3,0.00364,0.767,0.551,0.0,0.0451,-7.328,0.0616,100.904,0.796,10499470.0,...,4.838269,132,0.300683,0.0009,0.000902,0.000901,0.000906,0.99459,0.0009,0.000901
4,0.175,0.398,0.804,0.0,0.181,-5.559,0.0451,186.752,0.709,21090600.0,...,5.375,117,0.365625,0.001012,0.324886,0.046997,0.624069,0.001012,0.001011,0.001014


In [12]:
# features
X = df.drop(columns=['view_count'])

# numeric columns only
num_cols = X.select_dtypes(include=['float64', 'int64']).columns
X_numeric = X[num_cols]

# scale numeric columns
X = scaler.fit_transform(X_numeric)

In [13]:
# descriptive summary of continuous response
df['view_count'].describe().apply(lambda x: f"{x:,.4f}")

count            5,395.0000
mean       192,167,018.5800
std        441,693,263.0710
min                755.0000
25%         10,632,900.5000
50%         44,285,885.0000
75%        171,219,462.5000
max      6,847,227,502.0000
Name: view_count, dtype: object

In [14]:
# create the target
df['view_count_factor'] = df['view_count'].apply(
    lambda x: 'yes' if x >= 100_000_000 else 'no'
)

# make sure it's a factor
df['view_count_factor'] = df['view_count_factor'].astype('category')

# ordered correctly
df['view_count_factor'] = df['view_count_factor'].cat.set_categories(['no', 'yes'])

y = df['view_count_factor']
print(y.value_counts())

view_count_factor
no     3551
yes    1844
Name: count, dtype: int64


In [15]:
# split data into training (70%) and testing (30%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)

In [16]:
# initialize the algorithm
dtree=DecisionTreeClassifier()

# Generate a new model using training data only
dtree.fit(X_train,y_train)

# predict the test data
y_pred = dtree.predict(X_test)

In [19]:
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred)) 

0.6491661519456454
[[756 289]
 [279 295]]
              precision    recall  f1-score   support

          no       0.73      0.72      0.73      1045
         yes       0.51      0.51      0.51       574

    accuracy                           0.65      1619
   macro avg       0.62      0.62      0.62      1619
weighted avg       0.65      0.65      0.65      1619



In [20]:
# ---------------------------------------------------
# 2. Train/test split
# ---------------------------------------------------
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X, y, 
    test_size=0.3, 
    random_state=1234,
    stratify=y
)

# ---------------------------------------------------
# 3. Cross-validation on the ORIGINAL (unbalanced) training data
# ---------------------------------------------------
classifier = DecisionTreeClassifier(random_state=1234)

# IMPORTANT FIX: use X_train2, y_train2 (not X_train, y_train)
cv_accuracies = cross_val_score(estimator=classifier, 
                                X=X_train2, 
                                y=y_train2, 
                                cv=10)

print("Cross-validation accuracies:", cv_accuracies)
print("Mean CV accuracy:", cv_accuracies.mean(), "\n")

# ---------------------------------------------------
# 4. Grid search for Decision Tree hyperparameters
# ---------------------------------------------------
# IMPORTANT FIX: remove n_estimators (DecisionTreeClassifier does NOT have it)
grid_param = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [1,2,3,4,5],
    'min_samples_split': [2]
}

# FIX: use X_train2, y_train2
gd_sr = GridSearchCV(estimator=classifier,
                     param_grid=grid_param,
                     scoring='accuracy',
                     cv=5,
                     n_jobs=-1)

gd_sr.fit(X_train2, y_train2)

print("Best parameters from grid search:", gd_sr.best_params_)
print("Best CV score:", gd_sr.best_score_, "\n")


# ---------------------------------------------------
# 5. UNDERSAMPLE TRAINING DATA ONLY
# ---------------------------------------------------
rus2 = RandomUnderSampler(sampling_strategy='auto', random_state=1234)
X_train_res2, y_train_res2 = rus2.fit_resample(X_train2, y_train2)

print("Class distribution after undersampling:")
print(y_train_res2.value_counts(), "\n")

# ---------------------------------------------------
# 6. Train final Decision Tree on UNDERSAMPLED data
# ---------------------------------------------------
dtree2 = DecisionTreeClassifier(random_state=1234)
dtree2.fit(X_train_res2, y_train_res2)

# ---------------------------------------------------
# 7. Predict on the ORIGINAL test set
# ---------------------------------------------------
y_pred2 = dtree2.predict(X_test2)

# ---------------------------------------------------
# 8. Evaluate model
# ---------------------------------------------------
print("Accuracy (Tree 2):", accuracy_score(y_test2, y_pred2))
print("\nConfusion Matrix (Tree 2):\n", confusion_matrix(y_test2, y_pred2))
print("\nClassification Report (Tree 2):\n", classification_report(y_test2, y_pred2))

Cross-validation accuracies: [0.65343915 0.64021164 0.62962963 0.62433862 0.5978836  0.63492063
 0.64721485 0.64721485 0.62864721 0.62068966]
Mean CV accuracy: 0.6324189858672618 

Best parameters from grid search: {'criterion': 'gini', 'max_depth': 4, 'min_samples_split': 2}
Best CV score: 0.6806135463751357 

Class distribution after undersampling:
view_count_factor
no     1291
yes    1291
Name: count, dtype: int64 

Accuracy (Tree 2): 0.5793699814700433

Confusion Matrix (Tree 2):
 [[623 443]
 [238 315]]

Classification Report (Tree 2):
               precision    recall  f1-score   support

          no       0.72      0.58      0.65      1066
         yes       0.42      0.57      0.48       553

    accuracy                           0.58      1619
   macro avg       0.57      0.58      0.56      1619
weighted avg       0.62      0.58      0.59      1619

