In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import json
import time
import matplotlib.pyplot as plt


In [42]:
# Cell 2: Data Loading
cv_data = pd.read_csv('results/tiktok_content_analysis.csv')
viral_data = pd.read_csv('Viral Hooks Full Data Set.csv')

print("CV Data Shape:", cv_data.shape)
print("Viral Data Shape:", viral_data.shape)
display(cv_data.head(2))

CV Data Shape: (348, 4)
Viral Data Shape: (46605, 14)


Unnamed: 0,video_filename,duration,social_media_content,audio_features
0,https：//www.tiktok.com/@geocaching/video/71648...,12.666667,"{""total_faces"": 10, ""text_frames"": 1, ""screen_...","{""quality_metrics"": {""volume_level"": 0.0855373..."
1,https：//www.tiktok.com/@yoyobdt_/video/7253793...,47.666667,"{""total_faces"": 57, ""text_frames"": 48, ""screen...","{""quality_metrics"": {""volume_level"": 0.1491330..."


In [56]:
# Cell 3: Data Cleaning
# Clean URLs
cv_data['video_filename'] = cv_data['video_filename'].str.replace('⧸', '/')

# Parse JSON helper function
def parse_json_column(json_str):
    if pd.isna(json_str):
        return {}
    try:
        if isinstance(json_str, dict):
            return json_str
        if isinstance(json_str, str):
            return json.loads(json_str.replace("'", "\""))
        return {}
    except:
        print(f"Failed to parse: {type(json_str)} - {str(json_str)[:100]}...")
        return {}

# Reapply parsing
cv_data['audio_features'] = cv_data['audio_features'].apply(parse_json_column)
cv_data['social_media_content'] = cv_data['social_media_content'].apply(parse_json_column)
cv_data['video_filename'] = cv_data['video_filename'].str.replace('：', ':').str.replace('⧸', '/').str.replace('.mp4', '')

cv_data

Unnamed: 0,video_filename,duration,social_media_content,audio_features
0,https://www.tiktok.com/@geocaching/video/71648...,12.666667,"{'total_faces': 10, 'text_frames': 1, 'screen_...",{'quality_metrics': {'volume_level': 0.0855373...
1,https://www.tiktok.com/@yoyobdt_/video/7253793...,47.666667,"{'total_faces': 57, 'text_frames': 48, 'screen...",{'quality_metrics': {'volume_level': 0.1491330...
2,https://www.tiktok.com/@zebracat.ai/video/7446...,67.000000,"{'total_faces': 130, 'text_frames': 55, 'scree...",{'quality_metrics': {'volume_level': 0.0504763...
3,https://www.tiktok.com/@travel/video/737835575...,14.200000,"{'total_faces': 6, 'text_frames': 10, 'screen_...",{'quality_metrics': {'volume_level': 0.1125932...
4,https://www.tiktok.com/@beepr_app/video/726991...,7.433333,"{'total_faces': 0, 'text_frames': 8, 'screen_r...",{'quality_metrics': {'volume_level': 0.2917576...
...,...,...,...,...
343,https://www.tiktok.com/@thejerryapp/video/7253...,40.000000,"{'total_faces': 206, 'text_frames': 40, 'scree...",{'quality_metrics': {'volume_level': 0.1882175...
344,https://www.tiktok.com/@bumble/video/735589792...,14.920000,"{'total_faces': 20, 'text_frames': 5, 'screen_...",{'quality_metrics': {'volume_level': 0.2770988...
345,https://www.tiktok.com/@duolingo/video/7197477...,7.400000,"{'total_faces': 11, 'text_frames': 8, 'screen_...",{'quality_metrics': {'volume_level': 0.5891649...
346,https://www.tiktok.com/@lingodeer.app/video/71...,17.866667,"{'total_faces': 90, 'text_frames': 18, 'screen...",{'quality_metrics': {'volume_level': 0.1227528...


In [57]:
# Extract features from social_media_content and audio_features
def expand_features(df):
    # Social media content features
    df['total_faces'] = df['social_media_content'].apply(lambda x: x.get('total_faces', 0))
    df['text_frames'] = df['social_media_content'].apply(lambda x: x.get('text_frames', 0))
    df['screen_recording_frames'] = df['social_media_content'].apply(lambda x: x.get('screen_recording_frames', 0))
    df['avg_faces_per_frame'] = df['social_media_content'].apply(lambda x: x.get('avg_faces_per_frame', 0))
    df['text_percentage'] = df['social_media_content'].apply(lambda x: x.get('text_percentage', 0))
    df['screen_recording_percentage'] = df['social_media_content'].apply(lambda x: x.get('screen_recording_percentage', 0))
    
    # Audio quality metrics
    df['volume_level'] = df['audio_features'].apply(lambda x: x.get('quality_metrics', {}).get('volume_level', 0))
    df['volume_consistency'] = df['audio_features'].apply(lambda x: x.get('quality_metrics', {}).get('volume_consistency', 0))
    df['high_frequency_content'] = df['audio_features'].apply(lambda x: x.get('quality_metrics', {}).get('high_frequency_content', 0))
    df['frequency_variation'] = df['audio_features'].apply(lambda x: x.get('quality_metrics', {}).get('frequency_variation', 0))
    
    # Audio dynamic analysis
    df['dynamic_range'] = df['audio_features'].apply(lambda x: x.get('dynamic_analysis', {}).get('dynamic_range', 0))
    df['peak_volume'] = df['audio_features'].apply(lambda x: x.get('dynamic_analysis', {}).get('peak_volume', 0))
    df['median_volume'] = df['audio_features'].apply(lambda x: x.get('dynamic_analysis', {}).get('median_volume', 0))
    df['noise_floor'] = df['audio_features'].apply(lambda x: x.get('dynamic_analysis', {}).get('noise_floor', 0))
    
    # Drop original columns
    df = df.drop(['social_media_content', 'audio_features'], axis=1)
    
    return df

# Apply transformation
cv_data = expand_features(cv_data)

AttributeError: 'NoneType' object has no attribute 'get'

In [None]:
# # Check for null values
# null_rows = cv_data[cv_data['audio_features'].isnull()]
# print("\nRows with null audio_features:")
# print(null_rows[['video_filename', 'audio_features']])

# # Check for non-dict values
# non_dict_rows = cv_data[cv_data['audio_features'].apply(lambda x: not isinstance(x, dict))]
# print("\nRows with non-dictionary audio_features:")
# print(non_dict_rows[['video_filename', 'audio_features']])

In [None]:
# # Print first few rows of audio_features to see what we're dealing with
# print("\nFirst few audio_features values:")
# for i, af in enumerate(cv_data['audio_features'].head()):
#     print(f"\nRow {i}:")
#     print(type(af))
#     print(af)

In [55]:
# Check for None values
print("\nRows with None in audio_features:")
print(cv_data[cv_data['audio_features'].isna()].index)

# Print first few rows where audio_features is None
print("\nSample rows with None:")
bad_rows = cv_data[cv_data['audio_features'].isna()]
if not bad_rows.empty:
    print(bad_rows[['video_filename', 'audio_features']].head())


Rows with None in audio_features:
Index([], dtype='int64')

Sample rows with None:


In [54]:
features = pd.DataFrame({
    # Basic features
    'duration': cv_data['duration'],
    
    # Social media content features
    'total_faces': cv_data['social_media_content'].apply(lambda x: x.get('total_faces', 0)),
    'text_frames': cv_data['social_media_content'].apply(lambda x: x.get('text_frames', 0)),
    'screen_recording_frames': cv_data['social_media_content'].apply(lambda x: x.get('screen_recording_frames', 0)),
    'avg_faces_per_frame': cv_data['social_media_content'].apply(lambda x: x.get('avg_faces_per_frame', 0)),
    'text_percentage': cv_data['social_media_content'].apply(lambda x: x.get('text_percentage', 0)),
    'screen_recording_percentage': cv_data['social_media_content'].apply(lambda x: x.get('screen_recording_percentage', 0)),
    
    # Audio quality metrics
    'volume_level': cv_data['audio_features'].apply(lambda x: x.get('quality_metrics', {}).get('volume_level', 0)),
    'volume_consistency': cv_data['audio_features'].apply(lambda x: x.get('quality_metrics', {}).get('volume_consistency', 0)),
    'high_frequency_content': cv_data['audio_features'].apply(lambda x: x.get('quality_metrics', {}).get('high_frequency_content', 0)),
    'frequency_variation': cv_data['audio_features'].apply(lambda x: x.get('quality_metrics', {}).get('frequency_variation', 0)),
    
    # Audio quality scores
    'volume_quality': cv_data['audio_features'].apply(lambda x: x.get('quality_scores', {}).get('volume_quality', 'unknown')),
    'dynamic_range_quality': cv_data['audio_features'].apply(lambda x: x.get('quality_scores', {}).get('dynamic_range', 'unknown')),
    'frequency_quality': cv_data['audio_features'].apply(lambda x: x.get('quality_scores', {}).get('frequency_quality', 'unknown')),
    'overall_quality': cv_data['audio_features'].apply(lambda x: x.get('quality_scores', {}).get('overall_quality', 'unknown')),
    
    # Audio dynamic analysis
    'dynamic_range': cv_data['audio_features'].apply(lambda x: x.get('dynamic_analysis', {}).get('dynamic_range', 0)),
    'peak_volume': cv_data['audio_features'].apply(lambda x: x.get('dynamic_analysis', {}).get('peak_volume', 0)),
    'median_volume': cv_data['audio_features'].apply(lambda x: x.get('dynamic_analysis', {}).get('median_volume', 0)),
    'noise_floor': cv_data['audio_features'].apply(lambda x: x.get('dynamic_analysis', {}).get('noise_floor', 0)),
    
    'video_filename': cv_data['video_filename']
})

AttributeError: 'NoneType' object has no attribute 'get'

In [50]:
# Cell 4: Feature Engineering
features = pd.DataFrame({
    'duration': cv_data['duration'],
    'total_faces': cv_data['social_media_content'].apply(lambda x: x.get('total_faces', 0)),
    'text_frames': cv_data['social_media_content'].apply(lambda x: x.get('text_frames', 0)),
    'avg_faces': cv_data['social_media_content'].apply(lambda x: x.get('avg_faces_per_frame', 0)),
    'text_percent': cv_data['social_media_content'].apply(lambda x: x.get('text_percentage', 0)),
    'volume_level': cv_data['audio_features'].apply(lambda x: x.get('quality_metrics', {}).get('volume_level', 0)),
    'dynamic_range': cv_data['audio_features'].apply(lambda x: x.get('dynamic_analysis', {}).get('dynamic_range', 0)),
    'peak_volume': cv_data['audio_features'].apply(lambda x: x.get('dynamic_analysis', {}).get('peak_volume', 0)),
    'video_filename': cv_data['video_filename']
})

print("\nSample video filename:", cv_data['video_filename'].iloc[0])
print("\nSample social_media_content:", cv_data['social_media_content'].iloc[0])

TypeError: 'NoneType' object is not subscriptable

In [None]:
# Cell 5: Data Merging
full_data = pd.merge(
    features,
    viral_data[['ad_link', 'views', 'length']],
    left_on='video_filename',
    right_on='ad_link',
    how='inner'
).drop(columns=['video_filename', 'ad_link'])

print("\nMerged Data Shape:", full_data.shape)
full_data.head(2)


In [None]:

# Cell 6: Train/Test Split
X = full_data[['duration', 'total_faces', 'text_frames', 'avg_faces', 
              'text_percent', 'volume_level', 'dynamic_range', 'peak_volume', 'length']]
y = full_data['views']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Cell 7: Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Cell 8: Model Training
model = GradientBoostingRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=5,
    subsample=0.8,
    random_state=42
)

start_time = time.time()
model.fit(X_train_scaled, y_train)
print(f"Training time: {time.time()-start_time:.1f}s")


In [None]:
# Cell 9: Evaluation
predictions = model.predict(X_test_scaled)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
r2 = r2_score(y_test, predictions)

print(f"RMSE: {rmse:,.2f}")
print(f"R²: {r2:.3f}")


In [None]:
# Cell 10: Save Model
joblib.dump(model, 'models/view_predictor.pkl')
joblib.dump(scaler, 'models/scaler.pkl')


In [None]:
# Cell 11: Feature Importance
importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

plt.figure(figsize=(10,6))
plt.barh(importance['feature'], importance['importance'])
plt.gca().invert_yaxis()
plt.title('Feature Importance')
plt.show()