# EDA Project - Youtube Analysis
## Author - Dakshay Ahuja(2010990178)
***

In [None]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from scipy import stats
from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
# Read the training CSV into a DataFrame, then preview its first five rows
yt_views = pd.read_csv(filepath_or_buffer="Dataset/train.csv")
print(yt_views.head(n=5))

***
### Check the Dimensions of the Data

In [None]:
# DataFrame.shape is a (row count, column count) tuple
rows = yt_views.shape[0]
columns = yt_views.shape[1]
print(f"The dataset contains {rows} rows and {columns} columns.")

***
### Summary Statistics

In [None]:
# Summary statistics for numerical features
print("Summary statistics for numerical features:")
print(yt_views.describe())

# General overview including data types.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("\nGeneral overview of data:")
yt_views.info()

***
### Identifying Numeric and Categorical Columns

In [None]:
# Names of the columns whose dtype is a NumPy numeric type
numeric_frame = yt_views.select_dtypes(include=[np.number])
numeric_columns = list(numeric_frame.columns)
print(f"Numeric columns: {numeric_columns} \n")

# Names of the columns stored with the generic 'object' dtype
object_frame = yt_views.select_dtypes(include=['object'])
categorical_columns = list(object_frame.columns)
print(f"Categorical columns: {categorical_columns}")

***
### Fixing Data type of columns

In [None]:
# Columns that should hold numbers
columns_to_convert = [
    'category_id', 'Tag_count', 'likes', 'dislike',
    'views', 'comment_count', 'Trend_day_count', 'Trend_tag_count',
]

# Parse each listed column as numeric; values that cannot be parsed become NaN
for column in columns_to_convert:
    parsed = pd.to_numeric(yt_views[column], errors='coerce')
    yt_views[column] = parsed

print("Data types after conversion:")
print(yt_views.dtypes)

***
### Missing Value Analysis

##### Dropping unnecessary columns

In [None]:
# Remove the two unnamed columns (presumably empty artefacts of the CSV export
# -- TODO confirm). errors='ignore' keeps this cell safe to re-run, or to run
# on a file that lacks these columns, instead of raising KeyError.
yt_views.drop(columns=['Unnamed: 17', 'Unnamed: 18'], errors='ignore', inplace=True)

##### Display the Count of Missing Values in Each Column

In [None]:
# Flag every missing cell, then total the flags column by column
missing_mask = yt_views.isna()
missing_values_count = missing_mask.sum(axis=0)

# Report the per-column counts
print("Missing Values per Column:")
print(missing_values_count)

##### Display Missing Values as a Percentage for Each Column

In [None]:
# Row count of the DataFrame, used as the denominator below
total_rows = yt_views.shape[0]

# Missing cells per column, expressed as a share of all rows (in percent)
missing_values_percentage = yt_views.isnull().sum().div(total_rows).mul(100)

# Report the per-column percentages
print("Missing Values Percentage per Column:")
print(missing_values_percentage)

#### Fill Missing Values for Numeric Columns

In [None]:
# Columns to be filled (numeric ones from the previous step)
# NOTE(review): 'subscriber' is not in columns_to_convert, so this assumes
# read_csv already parsed it as numeric -- confirm.
columns_to_fill_numeric = ['subscriber', 'Trend_day_count', 'Tag_count', 'Trend_tag_count', 'comment_count', 'likes', 'dislike', 'views']

# Filling missing values with the mean of each respective column.
# The filled Series is assigned back to the DataFrame instead of calling
# fillna(inplace=True) on the selected column: that form acts on an
# intermediate object and is not guaranteed to update yt_views (it does not
# under pandas Copy-on-Write).
for column in columns_to_fill_numeric:
    mean_value = yt_views[column].mean()
    yt_views[column] = yt_views[column].fillna(mean_value)

# Confirm that there are no more missing values in these columns
print("Missing Values After Filling Numeric Columns:")
print(yt_views[columns_to_fill_numeric].isnull().sum())

#### Fill Missing Values for Categorical Columns

In [None]:
# Columns to be filled (categorical ones)
columns_to_fill_cat = ['category_id', 'channel_title', 'title', 'tags', 'description']

# Filling missing values with the mode (most frequent value) of each column.
# The filled Series is assigned back to the DataFrame instead of calling
# fillna(inplace=True) on the selected column, which is not guaranteed to
# update yt_views (it does not under pandas Copy-on-Write).
for column in columns_to_fill_cat:
    modes = yt_views[column].mode()
    if modes.empty:
        # A column with no non-missing values has no mode; leave it untouched
        continue
    mode_value = modes.iloc[0]
    yt_views[column] = yt_views[column].fillna(mode_value)

# Confirm that there are no more missing values in these columns
print("Missing Values After Filling Categorical Columns:")
print(yt_views[columns_to_fill_cat].isnull().sum())

***
### Detecting and treating Outliers

In [None]:
# Numeric columns that are screened for outliers
numerical_columns = [
    'subscriber', 'Trend_day_count', 'Tag_count', 'Trend_tag_count',
    'comment_count', 'likes', 'dislike', 'views',
]

# For every column: flag values outside the 1.5 * IQR fences, report how many
# were flagged, and overwrite them with the column median
for feature in numerical_columns:
    values = yt_views[feature]

    # Quartiles and the interquartile range
    first_quartile, third_quartile = values.quantile(0.25), values.quantile(0.75)
    spread = third_quartile - first_quartile

    # Fences beyond which a value counts as an outlier
    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread

    # Boolean mask of the outlying rows
    is_outlier = values.lt(low_fence) | values.gt(high_fence)
    print(f"Number of outliers in {feature}: {is_outlier.sum()}")

    # The median is taken before any value of this column is replaced
    replacement = values.median()
    yt_views.loc[is_outlier, feature] = replacement


***
## Visualisations
***

#### Histograms for Numerical Columns

In [None]:
# Distribution of 'views' as a 50-bin histogram
plt.hist(yt_views['views'], bins=50)
hist_axes = plt.gca()
hist_axes.set_title('Distribution of Views')
hist_axes.set_xlabel('Views')
hist_axes.set_ylabel('Frequency')
plt.show()

#### Bar Plot for Categorical Columns

In [None]:
# Number of rows per 'category_id', drawn as bars
category_axes = sns.countplot(x='category_id', data=yt_views)
category_axes.set_title('Distribution of Categories')
plt.show()

#### Correlation Heatmap

In [None]:
# Correlation heatmap
# Correlations are only defined for numeric data, so the text columns are
# excluded first; calling corr() on the full frame raises on string values in
# pandas >= 2.0.
numeric_features = yt_views.select_dtypes(include=['number', 'bool'])
corr_matrix = numeric_features.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

#### Scatter Plot for Target Variable

In [None]:
# 'likes' on the x-axis against 'views' on the y-axis, one point per row
scatter_axes = sns.scatterplot(x='likes', y='views', data=yt_views)
scatter_axes.set_title('Likes vs Views')
plt.show()

#### Box Plot for Outliers

In [None]:
# Box plot for 'views'
# The Series is passed explicitly as x: the meaning of the first positional
# argument of sns.boxplot differs between seaborn releases, so relying on it
# makes the plot version-dependent.
sns.boxplot(x=yt_views['views'])
plt.title('Boxplot of Views')
plt.show()

#### Line Plot for Time Series Data

In [None]:
# 'views' plotted as a line over 'Trend_day_count'
trend_axes = sns.lineplot(x='Trend_day_count', y='views', data=yt_views)
trend_axes.set_title('Views Over Trend Day Count')
plt.show()

#### Pair Plot for Multi-Variate Analysis

In [None]:
# Pair plot for numerical columns
sns.pairplot(yt_views[['views', 'likes', 'comment_count']])
# pairplot draws a grid of sub-axes, so the title is set on the figure;
# plt.title would label only the last sub-plot. y > 1 lifts the title above
# the top row of panels.
plt.suptitle('Pair Plot', y=1.02)
plt.show()

#### Violin Plot for Mixed Data

In [None]:
# Distribution of 'views' within each 'category_id', drawn as violins
violin_axes = sns.violinplot(x='category_id', y='views', data=yt_views)
violin_axes.set_title('Violin Plot of Views by Category')
plt.show()

#### Density Plot

In [None]:
# Density plot for 'views'
# fill=True shades the area under the curve; it replaces the deprecated
# shade= argument, which newer seaborn releases warn about and plan to remove.
sns.kdeplot(x=yt_views['views'], fill=True)
plt.title('Density Plot of Views')
plt.show()

#### Strip Plot

In [None]:
# One jittered point per row: 'views' grouped by 'category_id'
strip_axes = sns.stripplot(data=yt_views, x="category_id", y="views", jitter=True)
strip_axes.set_title('Strip Plot of Views by Category')
plt.show()

#### Facet Grid with Scatter Plot

In [None]:
# One likes-vs-views scatter panel per 'category_id', wrapped four to a row
g = sns.FacetGrid(
    data=yt_views,
    col='category_id',
    col_wrap=4,
    height=5,
    aspect=1,
).map(sns.scatterplot, 'likes', 'views')
plt.suptitle('Scatter Plot of Views by Likes, Faceted by Category')
plt.show()

#### Joint Plot

In [None]:
# Hex-binned joint distribution of 'likes' (x) and 'views' (y)
sns.jointplot(
    x='likes',
    y='views',
    data=yt_views,
    kind='hex',
)
plt.suptitle('Joint Plot of Views and Likes')
plt.show()

#### Cluster Map

In [None]:
# Cluster Map
# Correlations are only defined for numeric data, so the text columns are
# excluded first; calling corr() on the full frame raises on string values in
# pandas >= 2.0.
numeric_corr = yt_views.select_dtypes(include=['number', 'bool']).corr()
sns.clustermap(numeric_corr, annot=True, cmap='coolwarm')
# clustermap builds a multi-axes figure, so the title is set on the figure
# rather than on whichever sub-axes happens to be current.
plt.suptitle('Cluster Map of Correlations')
plt.show()

#### 3D Plot

In [None]:
# 3D scatter: views on x, likes on y, comment count on z
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')

x_values = yt_views['views']
y_values = yt_views['likes']
z_values = yt_views['comment_count']
ax.scatter(x_values, y_values, z_values)

ax.set_xlabel('Views')
ax.set_ylabel('Likes')
ax.set_zlabel('Comment Count')
ax.set_title('3D Scatter Plot of Views, Likes, and Comment Count')
plt.show()

*** 
### Predicting the number of views a YouTube video will get using a random forest regression model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Work on a copy so the preprocessing below does not alter yt_views
yt_views_copy = yt_views.copy()

# Preprocessing
# Drop the identifier and free-text columns (and 'subscriber'); they are not
# used as model features
columns_to_drop = ['Video_id', 'channel_title', 'subscriber', 'title', 'tags', 'description']
yt_views_copy = yt_views_copy.drop(columns=columns_to_drop)

# Map the strings 'TRUE'/'FALSE' to 1/0; any other value is left unchanged
bool_columns = ['comment_disabled', 'like dislike disabled', 'tag appered in title']
for col in bool_columns:
    yt_views_copy[col] = yt_views_copy[col].apply(lambda x: 1 if x == 'TRUE' else 0 if x == 'FALSE' else x)

# Handling missing values by replacing them with the median of the column.
# The filled Series is assigned back instead of calling fillna(inplace=True)
# on the selected column: that form is not guaranteed to update the DataFrame
# (it does not under pandas Copy-on-Write), which would leave NaNs behind and
# make the integer cast below fail.
for column in yt_views_copy.columns:
    column_median = yt_views_copy[column].median()
    yt_views_copy[column] = yt_views_copy[column].fillna(column_median)

# Convert boolean columns to integer (after making sure there are no NaNs)
for col in bool_columns:
    yt_views_copy[col] = yt_views_copy[col].astype(int)

# Feature and target variables: 'views' is the target, every remaining column
# is a feature
X = yt_views_copy.drop('views', axis=1)
y = yt_views_copy['views']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a random forest regressor with 100 trees
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on the held-out rows and report the root mean squared error
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Root Mean Squared Error: {rmse}')

***
### Sentiment Analysis on the 'description' column using VADER sentiment analysis tool from the Natural Language Toolkit (NLTK) library

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# SentimentIntensityAnalyzer loads the 'vader_lexicon' NLTK resource and raises
# LookupError when it is not installed, so fetch it once if it is missing
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

# Initialize VADER
sia = SentimentIntensityAnalyzer()

# Score every description and store the 'compound' value in a new 'sentiment'
# column; str(x) guards against non-string cells
yt_views['sentiment'] = yt_views['description'].apply(lambda x: sia.polarity_scores(str(x))['compound'])
print(yt_views.head())

***
### Sentiment Analysis using Naive Bayes classifier along with TF-IDF for feature extraction (the classifier below is trained to predict `comment_disabled` from the description text).

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Work on a copy so the original DataFrame stays untouched
yt_views_copy = yt_views.copy()

# Keep only rows that have both a description and a label
yt_views_copy.dropna(subset=['description', 'comment_disabled'], inplace=True)

# Label: the strings 'TRUE'/'FALSE' become booleans (other values pass through
# unchanged), then the column is cast to integers (True -> 1, False -> 0)
raw_labels = yt_views_copy['comment_disabled'].apply(
    lambda x: True if x == 'TRUE' else False if x == 'FALSE' else x
)
yt_views_copy['comment_disabled'] = raw_labels.astype(int)

# Features: TF-IDF weights of the description text
vectorizer = TfidfVectorizer(sublinear_tf=True, encoding='utf-8', decode_error='ignore')
X = vectorizer.fit_transform(yt_views_copy['description'])

# Target: the integer label column prepared above
y = yt_views_copy['comment_disabled']

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Fit the multinomial Naive Bayes model and predict the held-out labels
clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Report accuracy, confusion matrix and per-class metrics
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))