In [1]:


from newspaper import Article
import lxml.html.clean

import requests

import re
import os  # Import os to handle file operations


In [2]:
# Specify the URL of the article
url = "https://insights.blackcoffer.com/ai-and-ml-based-youtube-analytics-and-content-creation-tool-for-optimizing-subscriber-engagement-and-content-strategy/"

# Create an Article object
article = Article(url)

# Download and parse the article
article.download()
article.parse()

# NOTE: the former `article.nlp()` step was removed. Nothing in this notebook
# reads its results, and it ran before the cell that calls nltk.download().
# NOTE(review): newspaper's nlp() is documented as needing NLTK tokenizer data,
# so it presumably fails on a fresh environment — confirm before re-adding.

# Extract title and text
title = article.title
text = article.text

# Print title and text (optional)
print("Title:", title)
print("\nText:\n", text)

# Save extracted content to a text file with URL_ID as filename.
# The later analysis cell picks up every *.txt file in the working directory,
# so this file becomes one row of the final output.
url_id = "Netclan20241017"  # You can modify this to use a specific URL_ID if needed
with open(f"{url_id}.txt", 'w', encoding='utf-8') as f:
    f.write(f"Title: {title}\n\n{text}")

print(f"Extracted content saved to {url_id}.txt")


Title: AI and ML-Based YouTube Analytics and Content Creation Tool for Optimizing Subscriber Engagement and Content Strategy

Text:
 Client Background

Client: A leading IT & tech firm in the USA

Industry Type: IT

Products & Services: IT Consulting, IT Support, SaaS, Marketing Strategy

Organization Size: 10+

The Problem

Building AI and ML based YouTube analytics and content creation tool that will help youtuber to understand their subscriber’s watching behaviour, help them in content research, creation and publication.

Our Solution

Created a MERN stack web application and integrated AI models to helps youtuber to generated titles, descriptions, tags, hashtags, captions etc. Help them to check thumbnail quality, analysis on the videos using video auditor tool, analysis on comments using sentiments analysis, help to under their subscribers using churn predication AI model.

Solution Architecture

https://www.figma.com/file/WQs01mmmNBZ1SjNE2IV8Sl/Youtube-Web-App-By-SHiV?type=design

In [3]:
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from collections import Counter
# Fetch the NLTK resources this notebook asks for.
# NOTE(review): 'punkt' is presumably the tokenizer data behind
# word_tokenize / sent_tokenize imported above — confirm against the
# installed NLTK version.
nltk.download('punkt')
# The NLTK stopwords corpus is downloaded here, but the visible cells build
# their stop-word set from the local "Complete_StopWords" file instead.
# This is the last expression in the cell, so its return value is what the
# cell displays.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Load stop words: one entry per line, lower-cased; blank lines are skipped.
# The loop variable is deliberately NOT called `file`, so it cannot leak into
# analyze_text(), which reads a module-level `file` to build "URL_ID".
stop_words = set()
for stopword_path in ["Complete_StopWords"]:
    with open(stopword_path, "r", encoding="utf-8") as f:
        stop_words.update(
            entry for entry in (line.strip().lower() for line in f) if entry
        )


def _load_sentiment_words(path):
    """Read a one-word-per-line lexicon into a set of lower-cased words.

    Each line is stripped and lower-cased *before* it is compared with
    ``stop_words`` (which holds lower-cased entries), so a capitalised lexicon
    entry can no longer slip past the stop-word filter. Blank lines are
    dropped.

    NOTE(review): the file is opened with the platform default encoding, as
    before; confirm the lexicon files' encoding and pin it explicitly.
    """
    with open(path, "r") as handle:
        normalized = (line.strip().lower() for line in handle)
        return {word for word in normalized if word and word not in stop_words}


# Load positive and negative words
positive_words = _load_sentiment_words("positive-words")
negative_words = _load_sentiment_words("negative-words")


In [5]:
def clean_text(text):
    """Tokenize ``text`` into cleaned words and sentences.

    Args:
        text: Raw document text.

    Returns:
        A ``(cleaned_words, sentences)`` tuple:
        * ``cleaned_words`` — lower-cased, punctuation-free word tokens with
          every entry of the module-level ``stop_words`` set removed.
        * ``sentences`` — the sentences of the original text.
    """
    # Split into sentences on the ORIGINAL text. Doing this after punctuation
    # has been stripped (as before) removes the sentence terminators the
    # tokenizer relies on, so the sentence count comes out wrong.
    sentences = sent_tokenize(text)

    # Remove punctuation and convert to lowercase, for word tokenization only
    normalized = re.sub(r'[^\w\s]', '', text).lower()
    words = word_tokenize(normalized)

    # Remove stop words
    cleaned_words = [word for word in words if word not in stop_words]
    return cleaned_words, sentences

In [6]:
def calculate_sentiment_scores(words):
    """Count lexicon hits among ``words``.

    Args:
        words: Iterable of word tokens.

    Returns:
        ``(positive_score, negative_score)`` — how many tokens are members of
        the module-level ``positive_words`` and ``negative_words`` sets.
        Repeated tokens are counted every time they occur.
    """
    positive_hits = [token for token in words if token in positive_words]
    negative_hits = [token for token in words if token in negative_words]
    return len(positive_hits), len(negative_hits)

In [7]:
def calculate_polarity_subjectivity(positive_score, negative_score, total_words):
    """Derive polarity and subjectivity from the sentiment counts.

    Args:
        positive_score: Number of positive-lexicon hits.
        negative_score: Number of negative-lexicon hits.
        total_words: Number of words the hits were counted over.

    Returns:
        ``(polarity, subjectivity)`` where polarity is
        ``(pos - neg) / (pos + neg + 1e-6)`` and subjectivity is
        ``(pos + neg) / (total_words + 1e-6)``.
    """
    smoothing = 0.000001  # avoids division by zero when the counts are 0
    sentiment_total = positive_score + negative_score
    sentiment_balance = positive_score - negative_score

    polarity = sentiment_balance / (sentiment_total + smoothing)
    subjectivity = sentiment_total / (total_words + smoothing)
    return polarity, subjectivity

In [8]:
def calculate_readability_metrics(words, sentences):
    """Compute sentence-length, complex-word and fog-index figures.

    Args:
        words: List of word tokens.
        sentences: List of sentences.

    Returns:
        A 5-tuple ``(avg_sentence_length, avg_words_per_sentence,
        percentage_complex_words, fog_index, complex_words)``. The first two
        entries are the same value (words / sentences, or 0 with no
        sentences). A word is counted as complex when it contains more than
        two of the letters ``a e i o u y``; ``percentage_complex_words`` is
        the fraction complex / total (0 with no words), and the fog index is
        ``0.4 * (avg_sentence_length + percentage_complex_words)``.
    """
    word_total = len(words)
    sentence_total = len(sentences)

    if sentence_total > 0:
        words_per_sentence = word_total / sentence_total
    else:
        words_per_sentence = 0

    # Tally the tokens holding more than two vowel letters.
    vowel_pattern = re.compile(r'[aeiouy]')
    complex_total = 0
    for token in words:
        if len(vowel_pattern.findall(token)) > 2:
            complex_total += 1

    if word_total > 0:
        complex_share = complex_total / word_total
    else:
        complex_share = 0

    fog = 0.4 * (words_per_sentence + complex_share)
    return words_per_sentence, words_per_sentence, complex_share, fog, complex_total

In [9]:
def calculate_word_stats(words):
    """Return the word count and the mean word length.

    Args:
        words: List of word tokens.

    Returns:
        ``(total_words, avg_word_length)``; the average is 0 for an empty
        list.
    """
    word_total = len(words)
    if not word_total > 0:
        return word_total, 0

    char_total = 0
    for token in words:
        char_total += len(token)
    return word_total, char_total / word_total

In [10]:
def syllable_count(word):
    """Estimate the syllables in ``word`` by counting vowel letters.

    Every occurrence of ``a e i o u y`` (case-insensitive) counts as one
    syllable. For words ending in "es" or "ed" one is subtracted, with the
    result never dropping below 1.

    Args:
        word: A single word.

    Returns:
        The estimated syllable count as an int.
    """
    lowered = word.lower()
    vowel_hits = sum(1 for letter in lowered if letter in 'aeiouy')
    if lowered[-2:] in ('es', 'ed'):
        return max(1, vowel_hits - 1)
    return vowel_hits

In [11]:
def count_personal_pronouns(text):
    """Count occurrences of the personal pronouns I, we, my, ours and us.

    Matching is whole-word and case-insensitive, with one exception: the
    all-caps token "US" is skipped, because it is the country abbreviation
    rather than the pronoun and would otherwise inflate the count.

    Args:
        text: Raw document text.

    Returns:
        Number of pronoun occurrences as an int.
    """
    matches = re.findall(r'\b(I|we|my|ours|us)\b', text, re.IGNORECASE)
    # findall returns the matched text in its original casing, so the
    # abbreviation can be told apart from "us" / "Us".
    return sum(1 for match in matches if match != 'US')

In [12]:
def analyze_text(text, url_id=None, source_url=None):
    """Run the full sentiment / readability analysis on one document.

    Args:
        text: Raw document text.
        url_id: Identifier stored under "URL_ID". When omitted, it is derived
            as before from the module-level ``file`` variable (the part of
            the file name before the first ".").
        source_url: URL stored under "url". When omitted, the module-level
            ``url`` variable is used, as before.

    Returns:
        A dict with one entry per output column (identifiers, sentiment
        scores, readability figures, word statistics and pronoun count).

    Raises:
        NameError: if an identifier is omitted and the corresponding
            module-level variable has not been defined.
    """
    # Prefer explicit arguments; fall back to the notebook globals only for
    # backward compatibility with existing `analyze_text(text)` calls.
    if url_id is None:
        url_id = file.split(".")[0]
    if source_url is None:
        source_url = url

    cleaned_words, sentences = clean_text(text)
    positive_score, negative_score = calculate_sentiment_scores(cleaned_words)
    polarity, subjectivity = calculate_polarity_subjectivity(
        positive_score, negative_score, len(cleaned_words)
    )
    (
        avg_sentence_length,
        avg_words_per_sentence,
        percentage_complex_words,
        fog_index,
        complex_word_count,
    ) = calculate_readability_metrics(cleaned_words, sentences)
    total_words, avg_word_length = calculate_word_stats(cleaned_words)
    syllable_per_word = (
        sum(syllable_count(word) for word in cleaned_words) / total_words
        if total_words > 0
        else 0
    )
    # Pronouns are counted on the raw text, not on the cleaned tokens.
    personal_pronouns = count_personal_pronouns(text)

    return {
        "URL_ID": url_id,
        "url": source_url,
        "Positive Score": positive_score,
        "Negative Score": negative_score,
        "Polarity Score": polarity,
        "Subjectivity Score": subjectivity,
        "Average Sentence Length": avg_sentence_length,
        "Average Words Per Sentence": avg_words_per_sentence,
        "Percentage of Complex Words": percentage_complex_words,
        "Fog Index": fog_index,
        "Complex Word Count": complex_word_count,
        "Word Count": total_words,
        "Syllable Per Word": syllable_per_word,
        "Personal Pronouns": personal_pronouns,
        "Average Word Length": avg_word_length,
    }

In [13]:
results = []

# Analyze every .txt file in the working directory.
# Entries are sorted so the rows of the output file come out in a
# reproducible order; os.listdir() returns names in arbitrary order.
# NOTE: the loop variable must stay named `file` — analyze_text() reads that
# module-level name to fill in "URL_ID" when called with only the text.
for file in sorted(os.listdir(".")):
    if file.endswith(".txt"):
        # errors="ignore" silently drops bytes that are not valid UTF-8.
        with open(file, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()

        results.append(analyze_text(text))

# Save results to a CSV file
output_df = pd.DataFrame(results)
output_df.to_csv("Output Data Structure5.csv", index=False)