In [1]:
import pandas as pd
import csv
import numpy as np
import seaborn as sns
import string
import matplotlib.pyplot as plt

from collections import Counter
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Read the raw CSV into a DataFrame.
# on_bad_lines='skip' makes pandas drop rows it cannot parse instead of raising.
youtube_data = pd.read_csv(
    '/content/all_data.csv',
    on_bad_lines='skip',
)

FileNotFoundError: [Errno 2] No such file or directory: '/content/all_data.csv'

In [None]:
# Drop columns that are not used in the rest of the analysis.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop() would raise KeyError.
col_to_drop = ['publish_time', 'thumbnail_link', 'trending_date']
youtube_data = youtube_data.drop(columns=col_to_drop, errors='ignore')

# Drop duplicate videos, keeping the first row seen for each video_id.
# NOTE(review): a video that appears under several countries keeps only one
# row after this step - confirm that is intended.
youtube_data = youtube_data.drop_duplicates(subset='video_id')

In [None]:
# Preview the first five rows of the cleaned frame
youtube_data.head(5)

In [None]:
# Count the distinct values found in every column
youtube_data.nunique(axis=0)

In [None]:
# Count the missing values per column.
# The author's note: 338 rows turn out to contain no data at all.
youtube_data.isnull().sum()

In [None]:
# Remove the rows with no title (the 338 empty rows) and the rows with no country
youtube_data = youtube_data.dropna(subset=['title', 'country'])

In [None]:
# Replace na descriptions with an empty string.
# Assign the result back to the column: calling fillna(inplace=True) on
# youtube_data['description'] operates on a selected Series, which emits a
# chained-assignment warning on recent pandas and leaves youtube_data
# unchanged when Copy-on-Write is enabled.
youtube_data['description'] = youtube_data['description'].fillna('')

In [None]:
# Count the missing values per column again, after the cleaning steps above
youtube_data.isnull().sum()

In [None]:
# Create a column with the number of tags
def count_tags(tags_str):
    """Return the number of tags in a '|'-separated tag string.

    Args:
        tags_str: Raw tags value; may be missing (NaN / None).

    Returns:
        int: Number of non-empty tags, or 0 when the value is missing or
        contains no tags.
    """
    if pd.isna(tags_str):
        return 0
    # Skip empty segments: ''.split('|') yields [''], which would otherwise be
    # counted as one tag, and stray separators ('a||b', 'a|') add empty pieces.
    return sum(1 for tag in tags_str.split('|') if tag.strip())

# Apply the tag counter element-wise to the tags column
youtube_data['num_tags'] = youtube_data['tags'].map(count_tags)

In [None]:
# Create a column with the title length
def title_length(title_str):
    """Return the number of characters in a title.

    Named for what it measures: defining this helper as count_tags would
    replace the tag-counting function from the earlier cell, so re-running
    that cell afterwards would compute string lengths instead of tag counts.

    Args:
        title_str: Title text; may be missing (NaN / None).

    Returns:
        int: len(title_str), or 0 when the value is missing.
    """
    if pd.isna(title_str):
        return 0
    return len(title_str)

youtube_data['title_len'] = youtube_data['title'].apply(title_length)

In [None]:
# Create a column with the description length
def description_length(desc_str):
    """Return the number of characters in a description.

    Named for what it measures: defining this helper as count_tags would
    replace the tag-counting function from the earlier cell, so re-running
    that cell afterwards would compute string lengths instead of tag counts.

    Args:
        desc_str: Description text; may be missing (NaN / None).

    Returns:
        int: len(desc_str), or 0 when the value is missing.
    """
    if pd.isna(desc_str):
        return 0
    return len(desc_str)

youtube_data['description_len'] = youtube_data['description'].apply(description_length)

In [None]:
# Create a column with the percent of the title that is capitalized
def percent_uppercase(title):
    """Return the share of a title's characters that are uppercase, in percent.

    The denominator is the full length of the title, so spaces, digits and
    punctuation count as characters.

    Args:
        title: Title text; may be missing (NaN / None).

    Returns:
        0 for a missing or empty title, otherwise a float between 0 and 100.
    """
    # Missing and empty titles have no characters to measure.
    if pd.isna(title) or len(title) == 0:
        return 0
    n_upper = len([ch for ch in title if ch.isupper()])
    return (n_upper / len(title)) * 100

# Apply the uppercase-share helper element-wise to the title column
youtube_data['title_uppercase_percent'] = youtube_data['title'].map(percent_uppercase)

In [None]:
# Show the dtype of every column; the following cell groups the column names by type
youtube_data.dtypes

In [None]:
# Store category_id as strings so it is handled as a label, not a number
youtube_data['category_id'] = youtube_data['category_id'].astype(str)

# Collect the column names by dtype family: numeric first, then object
numerical_columns = youtube_data.select_dtypes(include='number').columns.tolist()
object_columns = youtube_data.select_dtypes(include='object').columns.tolist()

# Show both lists, separated by a blank line
print(numerical_columns, object_columns, sep='\n\n')

In [None]:
# Summary statistics for the numerical columns
youtube_data.loc[:, numerical_columns].describe()

In [None]:
# Summary statistics for the object (text-like) columns
youtube_data.loc[:, object_columns].describe()

In [None]:
# Show a heatmap of the pairwise correlations between the numerical columns
corr = youtube_data.loc[:, numerical_columns].corr()
sns.heatmap(data=corr)

In [None]:
# Preview the frame again, now including the engineered columns
youtube_data.head(5)

**Classification - Predict if a Video Will Have 250K+ Views**

**Decision Tree**

In [None]:
# Create a new column that is 1 if the video has over 1/4 of a million views and 0 if it doesn't
youtube_data['quarter_million'] = youtube_data['views'].gt(250000).astype(int)

# Print the percent of videos with over 1/4 of a million views
print(youtube_data['quarter_million'].eq(1).mean() * 100)

In [None]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features: every numerical column except 'views', which defines the target.
# 'quarter_million' is dropped too when present: numerical_columns is built
# before the target column exists, but re-running that earlier cell would add
# the target to the list and leak the label into the features.
X = (
    youtube_data[numerical_columns]
    .drop(columns=['views'])
    .drop(columns=['quarter_million'], errors='ignore')
)
y = youtube_data['quarter_million']

# Split the train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Standardize: fit the scaler on the training split only, then apply it to the
# test split. The scaled arrays are reused by the classifiers in later cells.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create and fit the decision tree.
# random_state fixes the feature permutation used at each split so the fitted
# tree, and therefore the report below, is the same on every run.
dt = DecisionTreeClassifier(max_depth=7, min_samples_split=50, min_samples_leaf=10, ccp_alpha=0.0, random_state=0)
dt = dt.fit(X_train, y_train)

y_dt_predicted = dt.predict(X_test)

# Print classification report
print(classification_report(y_test, y_dt_predicted))

The classification report indicates that the decision tree performs quite well overall, although it is better at classifying videos that do not reach a quarter of a million views.

**K-Nearest Neighbors**

---



In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a distance-weighted 7-nearest-neighbours classifier on the training split
knn = KNeighborsClassifier(leaf_size=10, weights='distance', n_neighbors=7)
knn.fit(X_train, y_train)

# Predict the test split and print the per-class report
y_knn_predicted = knn.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_knn_predicted))

**Naive Bayes**

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

# Gaussian Naive Bayes with fixed class priors instead of priors estimated
# from the training labels.
# NOTE(review): the priors presumably map to class 0 -> 0.3 and class 1 -> 0.7
# (sorted class order) - confirm these values are the intended class balance.
nb = GaussianNB(priors=[0.3, 0.7]).fit(X_train, y_train)

# Predict the test split and print the per-class report
y_nb_predicted = nb.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_nb_predicted))

**Neural Network**

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with scikit-learn's default settings.
# random_state fixes the weight initialisation and batch sampling so the
# report below (and the permutation importances computed from this model)
# are the same on every run.
nn = MLPClassifier(random_state=0)

nn.fit(X_train, y_train)
y_nn_predicted = nn.predict(X_test)

# Print report
print(classification_report(y_test, y_nn_predicted))

In [None]:
from sklearn.inspection import permutation_importance
import pandas as pd

# Names to print next to each importance score. They are taken from X because
# X_test was replaced by the scaler's output earlier in the notebook.
# If X is not a DataFrame, generate exactly one placeholder name per feature
# column so the names always line up with the scores.
if isinstance(X, pd.DataFrame):
    feature_names = list(X.columns)
else:
    feature_names = [f"feature{i + 1}" for i in range(X_test.shape[1])]

# Permutation importance of the fitted MLPClassifier (nn) on the test split:
# each feature is shuffled n_repeats times and the score change is recorded.
result = permutation_importance(nn, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1)

# Print the importance of each feature
for name, importance in zip(feature_names, result.importances_mean):
    print(f"{name}: {importance}")


In [None]:
# Show the feature column names (columns is an attribute, so calling it raises TypeError)
X.columns