In [1]:
import csv
import json
import os

import requests
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from pymongo import MongoClient

# YouTube Data API key. It is read from the YOUTUBE_API_KEY environment variable so
# the real key never has to be saved inside the notebook; the literal is only a
# placeholder fallback that keeps the previous value when the variable is unset.
API_KEY = os.environ.get('YOUTUBE_API_KEY', 'XXXXXXXXXXXXXXXXXXXX')

def save_next_page_token(token, filename='next_page_token.json'):
    """Persist a search pagination token so a later run can pick it up.

    Args:
        token: The page token to store. ``None`` is written as JSON ``null``.
        filename: Path of the JSON file, overwritten on every call.
    """
    payload = {"nextPageToken": token}
    with open(filename, 'w') as handle:
        json.dump(payload, handle)

def load_next_page_token(filename='next_page_token.json'):
    """Read back the pagination token written by ``save_next_page_token``.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The value stored under ``"nextPageToken"``, or ``None`` when the file
        does not exist or the key is absent.
    """
    try:
        handle = open(filename, 'r')
    except FileNotFoundError:
        # No previous run has saved a token yet.
        return None
    with handle:
        stored = json.load(handle)
    return stored.get("nextPageToken")

def tags_to_csv():
    """
    Search for videos matching a fixed tag list and dump their metadata to
    ``video_info.csv`` (one row per video for which details could be fetched).
    """
    search_tags = [
        "Cloud Computing", "Kubernetes", "Docker", "DevOps", "Machine Learning", "AI", "Data Science", "Big Data"
    ]
    header = ['video_id', 'title', 'publishedAt', 'channelId', 'description', 'view_count',
              'like_count', 'favorite_count', 'comment_count', 'tags', 'defaultLanguage']
    # Statistics fields, in the same order as the matching header columns.
    stat_fields = ('viewCount', 'likeCount', 'favoriteCount', 'commentCount')

    # Resolve the tag list to video identifiers first.
    found_ids = get_technology_video_ids(search_tags, max_results=50)

    with open('video_info.csv', mode='w', newline='', encoding='utf-8') as out_file:
        out = csv.writer(out_file)
        out.writerow(header)

        for vid in found_ids:
            details = get_video_info(vid)
            if not details:
                # Lookup failed; skip this video and keep going.
                continue

            snippet = details['snippet']
            stats = details['statistics']
            row = [vid, snippet['title'], snippet['publishedAt'],
                   snippet['channelId'], snippet['description']]
            # Counters that the API omits are recorded as 'N/A'.
            row.extend(stats.get(field, 'N/A') for field in stat_fields)
            row.append(", ".join(snippet.get('tags', [])))
            row.append(snippet.get('defaultLanguage', 'N/A'))
            out.writerow(row)

def get_technology_video_ids(tags, max_results=50):
    """
    Collect YouTube video identifiers for a list of tags, restricted to video
    category "28" (labelled "Technology" by the original author).

    The tags are split into groups of at most 10; each group is sent as one
    space-joined search query and paginated until enough IDs are collected or
    the results run out. After every page the next page token is printed and
    persisted with ``save_next_page_token``.

    Args:
        tags: A list of strings to search for.
        max_results: The maximum number of video IDs to return.

    Returns:
        A list of at most ``max_results`` video identifiers.
    """
    youtube = build("youtube", "v3", developerKey=API_KEY, cache_discovery=False)

    all_video_ids = []
    # Resume from the token persisted by a previous run (None when there is none).
    next_page_token = load_next_page_token()

    # Split the tags into sub-lists of at most 10 tags.
    tag_groups = [tags[i:i + 10] for i in range(0, len(tags), 10)]

    for group_index, tag_group in enumerate(tag_groups):
        # Enough IDs already: do not spend further API requests on later groups
        # (previously only the inner loop stopped, so every remaining group
        # still issued a request and overwrote the saved token).
        if len(all_video_ids) >= max_results:
            break

        # A page token is only meaningful for the query that produced it, so a
        # token left over from the previous group must not leak into this one.
        # NOTE(review): the persisted token is therefore only applied to the
        # first group; confirm that is the intended resume behaviour when more
        # than 10 tags are used.
        if group_index > 0:
            next_page_token = None

        while True:
            try:
                search_response = youtube.search().list(
                    part="snippet",
                    maxResults=min(max_results, 50),  # The request is capped at 50 results per page
                    type="video",
                    videoCategoryId="28",
                    q=" ".join(tag_group),
                    pageToken=next_page_token
                ).execute()

                video_ids = [item['id']['videoId'] for item in search_response['items']]
                all_video_ids.extend(video_ids)

                next_page_token = search_response.get('nextPageToken')
                print('next_page_token', next_page_token)
                save_next_page_token(next_page_token)
                if not next_page_token or len(all_video_ids) >= max_results:
                    break
            except HttpError as err:
                # Best effort: report the failure and give up on this group.
                print(f"Erreur lors de la récupération des vidéos : {err}")
                break

    return all_video_ids[:max_results]


def get_video_info(video_id):
    """
    Fetch the ``snippet`` and ``statistics`` parts of one YouTube video.

    Args:
        video_id: The identifier of the video.

    Returns:
        The first item of the API response as a dictionary, or ``None`` when
        the request fails, the status code is not 200, or the response holds
        no item for this ID.
    """
    url = "https://www.googleapis.com/youtube/v3/videos"
    params = {
        "id": video_id,
        "part": "snippet,statistics",
        "key": API_KEY
    }
    try:
        # Without a timeout a stalled connection would block the whole run.
        response = requests.get(url, params=params, timeout=10)
    except requests.RequestException as err:
        print(f"Erreur pour l'ID vidéo {video_id}: {err}")
        return None

    if response.status_code != 200:
        print(f"Erreur pour l'ID vidéo {video_id}: {response.status_code}")
        return None

    # A 200 response can still carry an empty "items" list; indexing [0]
    # directly raised IndexError in that case.
    items = response.json().get('items') or []
    if not items:
        print(f"Aucune donnée retournée pour l'ID vidéo {video_id}")
        return None
    return items[0]

# Run the collection step: search for videos and write their metadata to video_info.csv.
tags_to_csv()

next_page_token CKwCEAA


In [2]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Tag normalisation table: cleaned, lower-cased tag -> replacement.
# Keys that used to appear several times in the literal have been reduced to
# their last occurrence, which is the value Python kept anyway ("eks",
# "machine language models", "microsoftnshivani patel" and "python" had
# earlier, shadowed values), so the effective mapping is unchanged.
# NOTE(review): the lookup is a single pass and several pairs point at each
# other ("ml" <-> "machine learning", "containers" <-> "containerization",
# "gke" <-> "google kubernetes engine"), so such tags are swapped rather than
# merged into one canonical form. Confirm the intended canonical values.
_TAG_MAPPING = {
    "ml": "machine learning",
    "ai": "artificial intelligence",
    "devops": "dev ops",
    "gcp": "google cloud platform",
    "kuebernetes": "kubernetes",
    "kubeflow": "kubeflow",
    "bigdataldn": "big data london",
    "aks": "azure kubernetes service",
    "tf": "tensorflow",
    "does": "devops enterprise summit",
    "does19": "devops enterprise summit 2019",
    "ndc": "norwegian developer conference",
    "containerization": "containers",
    "cloudbased platform": "cloud platform",
    "no tags": "",
    "type conference talk full production": "conference",
    "prpr google cloud next": "google cloud next",
    "purpose educate": "education",
    "app development": "application development",
    "backend devlopment": "backend development",
    "kubernetes explained": "kubernetes",
    "cloud onair": "cloud",
    "it infrastructure": "infrastructure",
    "webdev": "web development",
    "lesson": "tutorial",
    "tutorialspoint": "tutorial",
    "grok 3 cosmic diplomacy": "grok",
    "xspeculation alliances": "alliances",
    "3000 ai sims": "ai simulations",
    "interstellar prompt tools": "prompt tools",
    "advanced diplomacy reasoning": "diplomacy",
    "cosmic social dynamics": "social dynamics",
    "grokai": "grok",
    "prompting": "prompt",
    "promptengineering": "prompt engineering",
    "all things open": "open source",
    "ato2020": "all things open 2020",
    "technology": "tech",
    "tania allard phd": "tania allard",
    "aws cloud": "amazon web services",
    "cloud computing explained": "cloud computing",
    "docker tutorial": "docker",
    "docker containers explained": "docker",
    "singularity container": "singularity",
    "cloud computing tutorial for beginners": "cloud computing",
    "docker vs kubernetes": "docker and kubernetes",
    "kuebernetes tutorials": "kubernetes tutorials",
    "container tutorial": "containers",
    "natural language processsing": "natural language processing",
    "machine vision": "computer vision",
    "cloud": "cloud computing",
    "salman iqbal": "salman iqbal",
    "ndc conferences 2021 oslo live": "ndc oslo 2021",
    "ai conference": "artificial intelligence conference",
    "virtual conference": "virtual event",
    "containers": "containerization",
    "linux kernel": "linux",
    "cgroups": "control groups",
    "cloud foundry": "cloud",
    "runtimes": "runtime",
    "nodejs": "node.js",
    "vms": "virtual machines",
    "host os": "host operating system",
    "hypervisor": "virtualization",
    "guest os": "guest operating system",
    "agile": "agile methodology",
    "portability": "portable",
    "scalability": "scalable",
    "ibm cloud kubernetes service": "ibm cloud",
    "manifest": "manifest file",
    "yaml": "yaml",
    "image": "docker image",
    "runtime engine": "runtime",
    "docker engine": "docker",
    "cognitive api": "cognitive services",
    "watson": "ibm watson",
    "cloudnative": "cloud native",
    "architecture": "software architecture",
    "continuous integration": "ci",
    "continuous delivery": "cd",
    "aws": "amazon web services",
    "eks": "amazon eks",
    "container day": "container event",
    "big data ldn": "big data london",
    "bigdata": "big data",
    "conference": "event",
    "big data expo": "big data exhibition",
    "big data": "big data analytics",
    "100 seconds of code": "coding tutorial",
    "backend development": "backend",
    "kubernetes architecture": "kubernetes",
    "linux": "operating system",
    "virtualization": "virtual machines",
    "linux on the desktop": "linux desktop",
    "hypervisors": "virtualization",
    "embedded systems": "embedded",
    "linux containers": "containers",
    "linux security": "security",
    "runai": "run ai",
    "anaconda": "data science platform",
    "brainchip": "ai hardware",
    "devoxxbe": "devoxx belgium",
    "devoxxbe19": "devoxx belgium 2019",
    "microsoft azure kubernetes": "azure kubernetes service",
    "aks cluster": "azure kubernetes service",
    "aks devops": "azure devops",
    "data engineer": "data engineering",
    "dataanalyst": "data analyst",
    "engineer": "software engineer",
    "deep learning": "dl",
    "mlops": "machine learning operations",
    "tf serving": "tensorflow serving",
    "sagemaker": "aws sagemaker",
    "pytorch": "deep learning framework",
    "tensorflow": "ml framework",
    "onnx": "open neural network exchange",
    "model deployment": "deployment",
    "workshop": "training",
    "sql server": "microsoft sql server",
    "sql server 2019": "sql server",
    "big data clusters": "big data cluster",
    "polybase": "data integration",
    "enhanced polybase": "polybase",
    "data virtualization": "virtualization",
    "database administrator": "dba",
    "dba": "database administration",
    "devops enterprise summit usa": "devops enterprise summit",
    "jordan edwards": "speaker",
    "microsoftnshivani patel": "speaker",
    "machine learning": "ml",
    "data science": "ds",
    "datascience": "data science",
    "odsc": "open data science conference",
    "docker": "containerization",
    "databricks": "data platform",
    "how to use kubeflow": "kubeflow tutorial",
    "what is kubeflow": "kubeflow",
    "google kubernetes engine": "gke",
    "google compute engine": "gce",
    "machine language models": "ml models",
    "python tutorial": "python",
    "docker container": "docker",
    "ml models": "machine learning models",
    "gke": "google kubernetes engine",
    "python": "programming language",
    "fairing": "kubeflow fairing",
    "google cloud ai huddle": "google ai",
    "google": "google cloud",
    "karthik ramasamy": "author",
    "gds yes": "global data science",
    "sparkaisummit": "apache spark ai summit",
    "apache hadoop": "hadoop",
    "apache spark": "spark",
    "docker  container": "docker"
}


def _clean_tags(tag_list):
    """Drop every character that is not ASCII alphanumeric or whitespace, then trim each tag."""
    return [re.sub(r'[^a-zA-Z0-9\s]', '', tag).strip() for tag in tag_list]


def _normalize_tags(tags, mapping):
    """Lower-case each tag and replace it by its mapping entry when one exists."""
    return [mapping.get(tag.lower(), tag.lower()) for tag in tags]


def clean_tags_videos(input_path="video_info.csv", output_path="video_info_dashboard.csv"):
    """
    Turn the raw video CSV into the one-row-per-(video, tag) dashboard CSV.

    Steps: parse ``publishedAt`` as UTC datetimes, add ``year`` / ``month`` /
    ``month_name`` columns, bucket ``view_count`` into ``popularity_category``,
    replace missing like/comment counts by 0, clean and normalise the
    comma-separated ``tags``, explode them to one tag per row, drop rows whose
    tag became an empty string, and remove duplicate rows.

    Rows of videos that have no tags at all are kept, with a missing tag value.

    Args:
        input_path: CSV file to read (defaults to the file written by the
            collection step).
        output_path: CSV file to write.
    """
    df = pd.read_csv(input_path)

    # Parse the publication timestamps as timezone-aware UTC datetimes.
    df["publishedAt"] = pd.to_datetime(df["publishedAt"], utc=True)

    # Helper columns used for filtering.
    df["year"] = df["publishedAt"].dt.year
    df["month"] = df["publishedAt"].dt.strftime("%m")
    df["month_name"] = df["publishedAt"].dt.strftime("%B")

    # Popularity buckets; right=False makes each bin include its lower bound.
    bins = [0, 1000, 10000, 100000, 1000000, float("inf")]
    labels = ["0-1K", "1K-10K", "10K-100K", "100K-1M", "1M+"]
    df["popularity_category"] = pd.cut(df["view_count"], bins=bins, labels=labels, right=False)

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # df[col]: the in-place form acts on an intermediate object, which pandas
    # warns about and which leaves df untouched under Copy-on-Write.
    df["like_count"] = df["like_count"].fillna(0)
    df["comment_count"] = df["comment_count"].fillna(0)

    # Clean and normalise the tags; a missing tags cell becomes an empty list.
    df["tags"] = df["tags"].apply(
        lambda raw: _normalize_tags(_clean_tags(raw.split(",")), _TAG_MAPPING) if pd.notna(raw) else []
    )

    # One row per tag.
    df = df.explode("tags")

    # Drop tags that cleaning or the mapping reduced to an empty string.
    df = df[df["tags"] != ""]

    df = df.drop_duplicates()

    df.to_csv(output_path, index=False)

# Run the cleaning step: reads video_info.csv and writes video_info_dashboard.csv.
clean_tags_videos()

In [3]:
def csv_to_mongo_compass():
    """
    Read the rows of ``video_info_dashboard.csv`` and insert them into the
    ``videos3`` collection of the ``youtube_data`` MongoDB database.

    The connection string is taken from the ``MONGODB_URI`` environment
    variable when it is set. Every row is inserted as read by
    ``csv.DictReader``, so all field values are strings. Running the function
    again inserts the same rows again.
    """
    # NOTE(review): the fallback URI embeds a user name and password. It is kept
    # only so the notebook behaves as before when MONGODB_URI is unset; rotate
    # that password and remove the literal once the variable is configured.
    mongo_uri = os.environ.get(
        "MONGODB_URI",
        "mongodb+srv://gaetan:05XXXXXXfR@cluster0.wxcxg.mongodb.net/"
    )

    # Load the data from the CSV file.
    with open('video_info_dashboard.csv', mode='r', encoding='utf-8') as file:
        video_data = list(csv.DictReader(file))

    if not video_data:
        print("Aucune donnée trouvée dans le fichier CSV.")
        return

    # The context manager closes the client (and its connections) even if the
    # insert raises; previously the client was never closed.
    with MongoClient(mongo_uri) as client:
        collection = client['youtube_data']['videos3']
        collection.insert_many(video_data)
    print(f"{len(video_data)} vidéos ont été insérées dans la base MongoDB.")

# Run the load step: insert the rows of video_info_dashboard.csv into MongoDB.
csv_to_mongo_compass()

330 vidéos ont été insérées dans la base MongoDB.


In [4]:
import pandas as pd

# Load the CSV files. The merged file is only ever written by this cell, so it
# does not exist on a first run: start from an empty history instead of crashing.
try:
    merged_df = pd.read_csv("video_info_dashboard_merged.csv")
except FileNotFoundError:
    merged_df = None
print("merged_df", 0 if merged_df is None else len(merged_df))
new_df = pd.read_csv("video_info_dashboard.csv")
print("new_df", len(new_df))

# Append the new batch to the history and drop rows that are exact duplicates,
# which keeps a re-run of this cell from growing the file.
frames = [new_df] if merged_df is None else [merged_df, new_df]
merged_df = pd.concat(frames, ignore_index=True).drop_duplicates()

# Save the updated file.
merged_df.to_csv("video_info_dashboard_merged.csv", index=False)

print("Fusion terminée avec succès !", len(merged_df))

merged_df 4261
new_df 330
Fusion terminée avec succès ! 4288
