In [None]:
# YouTube Trend Prediction System
# This code predicts the trending potential of a YouTube video based on its title, description, and tags
# It also provides recommendations to improve the video's chances of trending

import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import xgboost as xgb
from transformers import AutoTokenizer, AutoModel
import torch
from collections import Counter
import string
import joblib
import warnings
import time
from tqdm import tqdm
from imblearn.over_sampling import SMOTE
import gensim.downloader as api
import pickle
import re
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import SelectKBest, chi2


In [None]:
# Silence library warnings so the cell outputs stay readable
warnings.filterwarnings('ignore')

# Fetch the NLTK resources needed for tokenizing, stopword removal and lemmatizing
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource, quiet=True)

# Attach Google Drive (Colab only); the saved model artifacts are read from there
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Data preprocessing
class TextPreprocessor:
    """Normalizes free text and tag strings into space-separated lowercase tokens."""

    def __init__(self):
        # NLTK helpers shared by every clean_text call
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def clean_text(self, text):
        """Return `text` lowercased, stripped of punctuation/digits, tokenized,
        with stopwords and tokens of length <= 2 removed, and lemmatized.

        The placeholder '[none]' and missing values (None/NaN) yield ''.
        """
        is_missing = text == '[none]' or pd.isna(text)
        if is_missing:
            return ''

        normalized = text.lower()

        # Applied in order: punctuation/special chars, then digits, then
        # whitespace runs; each match is replaced by a single space.
        for pattern in (r'[^\w\s]', r'\d+', r'\s+'):
            normalized = re.sub(pattern, ' ', normalized)
        normalized = normalized.strip()

        kept_tokens = []
        for token in word_tokenize(normalized):
            if token in self.stop_words or len(token) <= 2:
                continue
            kept_tokens.append(self.lemmatizer.lemmatize(token))

        return ' '.join(kept_tokens)

    def process_tags(self, tags):
        """Return the tags in `tags` lowercased and joined by single spaces.

        Tags are split on ',', '|' or ';'. One pair of enclosing square
        brackets is removed first. The placeholder '[none]' and missing
        values (None/NaN) yield ''.
        """
        is_missing = tags == '[none]' or pd.isna(tags)
        if is_missing:
            return ''

        # Drop a surrounding [...] wrapper if present
        wrapped = tags.startswith('[') and tags.endswith(']')
        inner = tags[1:-1] if wrapped else tags

        # Split on the supported delimiters, trimming each piece and
        # discarding the ones that end up empty
        trimmed_pieces = (piece.strip() for piece in re.split(r'[,|;]', inner))
        return ' '.join(piece.lower() for piece in trimmed_pieces if piece)

# Announce the preprocessing stage and build a module-level TextPreprocessor.
# NOTE(review): no data is actually processed in this cell, and
# predict_trending_potential loads its own `text_processor` from a pickle
# instead of using this instance — confirm whether this global is still needed.
print("\nPreprocessing data...")
text_processor = TextPreprocessor()


Preprocessing data...


In [None]:
# NLTK resources used by TextPreprocessor: tokenizer models (punkt / punkt_tab),
# the English stopword list, and WordNet data for the lemmatizer.
nltk.download('punkt')
nltk.download('wordnet')
# 'corpus' is not an NLTK package id (the download fails with "Package 'corpus'
# not found in index"); the corpus this notebook actually reads is 'stopwords'.
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Error loading corpus: Package 'corpus' not found in index
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Function to predict trending potential and provide recommendations
def predict_trending_potential(title, description, tags, trending_words=None):
    """Estimate how likely a video is to trend and suggest metadata improvements.

    Args:
        title: Video title. An empty, None or NaN title short-circuits to a
            0% result without loading the model.
        description: Video description; '', '[none]', None or NaN count as
            "no description".
        tags: Tag string delimited by ',', '|' or ';'; '', '[none]', None or
            NaN count as "no tags".
        trending_words: Optional sequence of (word, score) pairs. Only the
            first 20 entries are checked against the input text. When omitted,
            a module-level ``trending_words`` is used if one is defined;
            otherwise the high-performing-words suggestion is skipped.

    Returns:
        dict with keys:
            'trending_probability': class-1 probability from the saved model
                (0.0 when the title is missing),
            'trending_percentage': the same value multiplied by 100,
            'recommendations': list of suggestion strings.

    Side effects:
        Prints a short "No Title" report when the title is missing, and reads
        the saved model artifacts from Google Drive otherwise.
    """
    no_title_hint = "Consider making your title longer (at least 40 characters)."

    # `not title` covers '' and None; pd.isna additionally catches NaN, which
    # is truthy and would otherwise crash on len(title) further down.
    if not title or pd.isna(title):
        print("No Title - 0% Chance to Trend")
        print("")
        print("Recommendations :")
        print(no_title_hint)
        # Same shape as the normal result so callers need no special case.
        return {
            'trending_probability': 0.0,
            'trending_percentage': 0.0,
            'recommendations': [no_title_hint],
        }

    # Load the saved model, the combined-text vectorizer and the preprocessor.
    # Only the combined TF-IDF matrix feeds the model, so the per-field
    # vectorizers are not loaded.
    # NOTE(security): joblib.load unpickles arbitrary objects — only point
    # these paths at files you trust.
    model = joblib.load('/content/drive/MyDrive/best_model.pkl')
    tfidf_combined = joblib.load('/content/drive/MyDrive/tfidf_combined.pkl')
    text_processor = joblib.load('/content/drive/MyDrive/text_processor.pkl')

    # Preprocess inputs
    clean_title = text_processor.clean_text(title)
    clean_description = text_processor.clean_text(description)
    clean_tags = text_processor.process_tags(tags)
    combined_text = clean_title + ' ' + clean_description + ' ' + clean_tags

    description_missing = description in ['[none]', ''] or pd.isna(description)
    tags_missing = tags in ['[none]', ''] or pd.isna(tags)

    # Hand-crafted features. Insertion order defines the column order of the
    # model input, so it must not be changed.
    features = {}
    features['title_length'] = len(title)
    features['title_word_count'] = len(clean_title.split()) if clean_title else 0
    features['has_description'] = 0 if description_missing else 1
    features['description_length'] = 0 if description_missing else len(description)
    features['description_word_count'] = len(clean_description.split()) if clean_description else 0
    features['has_tags'] = 0 if tags_missing else 1
    features['tag_count'] = 0 if tags_missing else len(re.split(r'[,|;]', tags))
    features['title_has_question'] = 1 if '?' in title else 0
    features['title_has_exclamation'] = 1 if '!' in title else 0
    features['title_starts_with_number'] = 1 if re.match(r'^\d+', title) else 0
    features['title_has_all_caps'] = 1 if any(word.isupper() and len(word) > 1 for word in title.split()) else 0
    features['title_has_brackets'] = 1 if ('(' in title and ')' in title) or ('[' in title and ']' in title) else 0

    # Model input: hand-crafted features followed by the dense combined TF-IDF row
    X_text_features = np.array(list(features.values())).reshape(1, -1)
    X_combined_array = tfidf_combined.transform([combined_text]).toarray()
    X = np.hstack((X_text_features, X_combined_array))

    # Make prediction
    probability = model.predict_proba(X)[0, 1]
    percentage = probability * 100

    # Generate recommendations
    recommendations = []

    # Title recommendations (the stylistic ones are only offered when the
    # predicted probability is below 0.7)
    if features['title_length'] < 30:
        recommendations.append(no_title_hint)
    if not features['title_has_question'] and probability < 0.7:
        recommendations.append("Consider adding a question mark to your title to increase engagement.")
    if not features['title_has_exclamation'] and probability < 0.7:
        recommendations.append("Consider adding an exclamation mark to make your title more exciting.")
    if not features['title_starts_with_number'] and probability < 0.7:
        recommendations.append("Consider starting your title with a number (e.g., '5 Ways to...').")
    if not features['title_has_all_caps'] and probability < 0.7:
        recommendations.append("Consider adding ONE word in ALL CAPS for emphasis.")

    # Description recommendations
    if features['description_length'] < 100:
        recommendations.append("Your description is too short. Aim for at least 200 characters.")
    elif features['description_length'] < 200:
        recommendations.append("Consider expanding your description with more details.")

    # Tags recommendations
    if features['tag_count'] < 5:
        recommendations.append("Add more tags to increase discoverability. Aim for at least 10 tags.")
    elif features['tag_count'] < 10:
        recommendations.append("Consider adding more tags to reach the optimal count of 15-20 tags.")

    # Check for trending words. Fall back to the notebook-level list when the
    # caller did not pass one; with neither available the hint is skipped.
    if trending_words is None:
        trending_words = globals().get('trending_words') or []

    all_words = set(clean_title.split()) | set(clean_description.split()) | set(clean_tags.split())
    missing_trending_words = [word for word, _score in trending_words[:20] if word not in all_words]

    if missing_trending_words:
        recommendations.append(f"Consider including some of these high-performing words: {', '.join(missing_trending_words[:5])}")

    # Title length recommendations based on data
    avg_trending_title_length = 40  # Replace with actual value from your analysis
    if features['title_length'] < avg_trending_title_length * 0.7:
        recommendations.append(f"Your title is shorter than most trending videos. Consider lengthening it to around {avg_trending_title_length} characters.")
    elif features['title_length'] > avg_trending_title_length * 1.5:
        recommendations.append(f"Your title might be too long. Consider shortening it to around {avg_trending_title_length} characters.")

    # Return results
    return {
        'trending_probability': probability,
        'trending_percentage': percentage,
        'recommendations': recommendations
    }

In [None]:
# Example usage
# Module-level list of (word, score) pairs that predict_trending_potential reads
# as a global; only its first 20 entries are compared against the input text.
# NOTE(review): the scores look like trending-vs-non-trending frequency ratios
# pasted from an earlier analysis — confirm provenance. Several entries carry
# stray quote characters (e.g. '2018"'), presumably left over from tag
# tokenization — verify they are intended.
trending_words = [('goo', 12.879195145493743), ('google', 10.263471237548654), ('jimmy', 8.54731531915323), ('punjabi', 7.401170038109182), ('late', 7.346954547179068), ('star', 6.872629479757715), ('2018"', 6.8162734646580505), ('list', 5.950030840021772), ('facebook', 5.719230899477845), ('bit', 5.646139900219482), ('twitter', 5.493023020752765), ('show"', 5.328208914436512), ('dubbed', 5.046648316156261), ('telugu', 4.789440307339502), ('celebrity', 4.682535718809587), ('director', 4.362665665779235), ('itunes', 4.25467170443928), ('producer', 4.1810224422202005), ('"punjabi', 4.123725802308281), ('movies"', 4.116880692213481), ('comedy"', 4.022820361183379), ('show', 3.9117611445681164), ('"telugu', 3.8792548012889316), ('production', 3.865690347846606), ('com', 3.606691947979348), ('playlist', 3.4739099124808055), ('videos"', 3.429145771353231), ('singh', 3.370740634054742), ('"comedy', 3.286671475013391), ('www', 3.2794115379011965), ('vijay', 3.249725274899692), ('"amit', 3.205696011861368), ('talk', 3.143498052796781), ('nbc', 3.1138312903274303), ('user', 3.04694109040745), ('starring', 3.0314305961590726), ('comedy', 3.014006804610808), ('film', 2.982351704420323), ('plus', 2.925764138623556), ('interview', 2.886233525855246), ('sun', 2.868552366153559), ('bucket', 2.847505833240214), ('south', 2.8457364243672356), ('entertainment', 2.7721232933475566), ('christmas', 2.7309459307013904), ('apps', 2.670345634703313), ('latest', 2.6503051581701578), ('interview"', 2.6432446637209), ('"desi', 2.6381906586277437), ('soundcloud', 2.621086445751337)]

print("\nPrediction Example:")
sample_title = "10 Amazing Facts About Space You Won't Believe!"
sample_description = "In this video, we explore the most fascinating and mind-blowing facts about our universe that most people don't know. From black holes to distant galaxies, prepare to have your mind blown!"
sample_tags = "space, universe, astronomy, facts, amazing, science, education, blackhole"

prediction = predict_trending_potential(sample_title, sample_description, sample_tags)

# With an empty title the predictor prints its own report, so only a blank
# line is emitted here; otherwise print the inputs, the score and the tips.
if sample_title:
    report_lines = [
        f"Title: {sample_title}",
        f"Description: {sample_description}",
        f"Tags: {sample_tags}",
        f"Trending Probability: {prediction['trending_probability']:.4f}",
        f"Trending Percentage: {prediction['trending_percentage']:.2f}%",
        "\nRecommendations:",
    ]
    report_lines.extend(
        f"{rank}. {tip}" for rank, tip in enumerate(prediction['recommendations'], 1)
    )
    for report_line in report_lines:
        print(report_line)
else:
    print("")



Prediction Example:
Title: 10 Amazing Facts About Space You Won't Believe!
Description: In this video, we explore the most fascinating and mind-blowing facts about our universe that most people don't know. From black holes to distant galaxies, prepare to have your mind blown!
Tags: space, universe, astronomy, facts, amazing, science, education, blackhole
Trending Probability: 0.8499
Trending Percentage: 84.99%

Recommendations:
1. Consider expanding your description with more details.
2. Consider adding more tags to reach the optimal count of 15-20 tags.
3. Consider including some of these high-performing words: goo, google, jimmy, punjabi, late


In [None]:
# Example: a short title with a brief description and few tags
print("\nPrediction Example:")
sample_title = "Top 10 Hacks"
sample_description = "These are the top 10 hacks which are never shown by anyone"
sample_tags = "Top10, Hacks, New, Technology, Phone"

prediction = predict_trending_potential(sample_title, sample_description, sample_tags)

# With an empty title the predictor prints its own report, so only a blank
# line is emitted here; otherwise print the inputs, the score and the tips.
if sample_title:
    report_lines = [
        f"Title: {sample_title}",
        f"Description: {sample_description}",
        f"Tags: {sample_tags}",
        f"Trending Probability: {prediction['trending_probability']:.4f}",
        f"Trending Percentage: {prediction['trending_percentage']:.2f}%",
        "\nRecommendations:",
    ]
    report_lines.extend(
        f"{rank}. {tip}" for rank, tip in enumerate(prediction['recommendations'], 1)
    )
    for report_line in report_lines:
        print(report_line)
else:
    print("")




Prediction Example:
Title: Top 10 Hacks
Description: These are the top 10 hacks which are never shown by anyone
Tags: Top10, Hacks, New, Technology, Phone
Trending Probability: 0.3352
Trending Percentage: 33.52%

Recommendations:
1. Consider making your title longer (at least 40 characters).
2. Consider adding a question mark to your title to increase engagement.
3. Consider adding an exclamation mark to make your title more exciting.
4. Consider starting your title with a number (e.g., '5 Ways to...').
5. Consider adding ONE word in ALL CAPS for emphasis.
6. Your description is too short. Aim for at least 200 characters.
7. Consider adding more tags to reach the optimal count of 15-20 tags.
8. Consider including some of these high-performing words: goo, google, jimmy, punjabi, late
9. Your title is shorter than most trending videos. Consider lengthening it to around 40 characters.


In [None]:
# Example: all inputs empty, exercising the missing-title path
print("\nPrediction Example:")
sample_title = ""
sample_description = ""
sample_tags = ""

prediction = predict_trending_potential(sample_title, sample_description, sample_tags)

# With an empty title the predictor prints its own report, so only a blank
# line is emitted here; otherwise print the inputs, the score and the tips.
if sample_title:
    report_lines = [
        f"Title: {sample_title}",
        f"Description: {sample_description}",
        f"Tags: {sample_tags}",
        f"Trending Probability: {prediction['trending_probability']:.4f}",
        f"Trending Percentage: {prediction['trending_percentage']:.2f}%",
        "\nRecommendations:",
    ]
    report_lines.extend(
        f"{rank}. {tip}" for rank, tip in enumerate(prediction['recommendations'], 1)
    )
    for report_line in report_lines:
        print(report_line)
else:
    print("")



Prediction Example:
No Title - 0% Chance to Trend

Recommendations :
Consider making your title longer (at least 40 characters).

