# Importing packages and data

In [77]:
import pandas as pd
import numpy as np
import sklearn

In [110]:
df = pd.read_csv("data.csv")
df.head()

Unnamed: 0,Rank,Grade,Channel name,Video Uploads,Subscribers,Video views
0,1st,A++,Zee TV,82757,18752951,20869786591
1,2nd,A++,T-Series,12661,61196302,47548839843
2,3rd,A++,Cocomelon - Nursery Rhymes,373,19238251,9793305082
3,4th,A++,SET India,27323,31180559,22675948293
4,5th,A++,WWE,36756,32852346,26273668433


In [111]:
# Renaming columns
df = df.rename(index=str, columns={"Rank": "rank",
                                   "Grade": "grade",
                                   "Channel name": "channel",
                                   "Video Uploads": "uploads",
                                   "Subscribers": "subscribers",
                                   "Video views": "views"})
df.head()

Unnamed: 0,rank,grade,channel,uploads,subscribers,views
0,1st,A++,Zee TV,82757,18752951,20869786591
1,2nd,A++,T-Series,12661,61196302,47548839843
2,3rd,A++,Cocomelon - Nursery Rhymes,373,19238251,9793305082
3,4th,A++,SET India,27323,31180559,22675948293
4,5th,A++,WWE,36756,32852346,26273668433


In [112]:
# checking for NAs
df.isna().sum()

rank           0
grade          0
channel        0
uploads        0
subscribers    0
views          0
dtype: int64

In [113]:
# checking types
df.dtypes

rank           object
grade          object
channel        object
uploads        object
subscribers    object
views           int64
dtype: object

In [114]:
# replacing omitted values
df.uploads = df.uploads.replace("None", None)
df.subscribers = df.subscribers.replace("None", None)
df.views = df.views.replace("None", None)

# Transforming types
df.uploads = pd.to_numeric(df.uploads)
df.subscribers = pd.to_numeric(df.subscribers)
df.views = pd.to_numeric(df.views)

In [115]:
# checking types again
df.dtypes

rank           object
grade          object
channel        object
uploads         int64
subscribers     int64
views           int64
dtype: object

# Feature engineering

In [121]:
# Creating new features

df["views_per_video"] = df["views"] / df["uploads"]
df["views_per_subscriber"] = df["views"] / df["subscribers"]

# words = number of words in channel name, special signs also counted as words

df["words"] = df["channel"].str.lower().str.split().apply(lambda x: len(x))
df.head()

Unnamed: 0,rank,grade,channel,uploads,subscribers,views,views_per_video,views_per_subscriber,words
0,1st,A++,Zee TV,82757,18752951,20869786591,252181.5,1112.880132,2
1,2nd,A++,T-Series,12661,61196302,47548839843,3755536.0,776.988777,1
2,3rd,A++,Cocomelon - Nursery Rhymes,373,19238251,9793305082,26255510.0,509.053816,4
3,4th,A++,SET India,27323,31180559,22675948293,829921.6,727.246368,2
4,5th,A++,WWE,36756,32852346,26273668433,714813.0,799.750144,1


# Visual Data Exploration

In [122]:
import matplotlib.pyplot as plt