# YouTube 頻道排行資料整理
資料網址：https://raw.githubusercontent.com/Code-Gym/python-dataset/master/youtube-channels-data-from-socialblade.csv

In [1]:
import pandas as pd

# Read the Social Blade YouTube channel ranking CSV directly from GitHub.
tubes = pd.read_csv(
    'https://raw.githubusercontent.com/Code-Gym/python-dataset/master/youtube-channels-data-from-socialblade.csv'
)
# A bare final expression renders the frame as the cell output.
tubes

Unnamed: 0,Rank,Grade,Channel name,Video Uploads,Subscribers,Video views
0,1st,A++,Zee TV,82757,18752951,20869786591
1,2nd,A++,T-Series,12661,61196302,47548839843
2,3rd,A++,Cocomelon - Nursery Rhymes,373,19238251,9793305082
3,4th,A++,SET India,27323,31180559,22675948293
4,5th,A++,WWE,36756,32852346,26273668433
...,...,...,...,...,...,...
4995,"4,996th",B+,Uras Benlioğlu,706,2072942,441202795
4996,"4,997th",B+,HI-TECH MUSIC LTD,797,1055091,377331722
4997,"4,998th",B+,Mastersaint,110,3265735,311758426
4998,"4,999th",B+,Bruce McIntosh,3475,32990,14563764


### 確認欄位的資料型態

In [2]:
# Summarize every column's non-null count and dtype to see which ones still need conversion.
tubes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Rank           5000 non-null   object
 1   Grade          5000 non-null   object
 2   Channel name   5000 non-null   object
 3   Video Uploads  5000 non-null   object
 4   Subscribers    5000 non-null   object
 5   Video views    5000 non-null   int64 
dtypes: int64(1), object(5)
memory usage: 234.5+ KB


### 將欄位Rank轉為數字型態
* 移除掉字串後面兩個字元
* 移除掉千分位分隔符號
* 將欄位Rank轉為int型態

In [3]:
# Step 1: cut the two-letter ordinal suffix ("st", "nd", "rd", "th") off the end.
rank_digits = tubes['Rank'].str[:-2]
# Step 2: remove the thousands separator (e.g. "4,996" -> "4996").
rank_digits = rank_digits.str.replace(',','')
# Step 3: cast the remaining digits to integers and write them back to the frame.
tubes['Rank'] = rank_digits.astype('int')
tubes

Unnamed: 0,Rank,Grade,Channel name,Video Uploads,Subscribers,Video views
0,1,A++,Zee TV,82757,18752951,20869786591
1,2,A++,T-Series,12661,61196302,47548839843
2,3,A++,Cocomelon - Nursery Rhymes,373,19238251,9793305082
3,4,A++,SET India,27323,31180559,22675948293
4,5,A++,WWE,36756,32852346,26273668433
...,...,...,...,...,...,...
4995,4996,B+,Uras Benlioğlu,706,2072942,441202795
4996,4997,B+,HI-TECH MUSIC LTD,797,1055091,377331722
4997,4998,B+,Mastersaint,110,3265735,311758426
4998,4999,B+,Bruce McIntosh,3475,32990,14563764


In [4]:
# Re-check the column dtypes to confirm that Rank is now stored as an integer.
tubes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Rank           5000 non-null   int64 
 1   Grade          5000 non-null   object
 2   Channel name   5000 non-null   object
 3   Video Uploads  5000 non-null   object
 4   Subscribers    5000 non-null   object
 5   Video views    5000 non-null   int64 
dtypes: int64(2), object(4)
memory usage: 234.5+ KB


### 將欄位Subscribers轉為int型態
若是欄位資訊包含非數字的字串，轉int型態時會發生錯誤

In [5]:
# Attempt the conversion to demonstrate the failure: while the column still holds
# non-numeric placeholder text, astype('int') raises ValueError.
# The exception is caught and printed so the error message stays visible while
# "Restart Kernel -> Run All" can continue past this demonstration cell.
try:
    print(tubes['Subscribers'].astype('int'))
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: invalid literal for int() with base 10: '-- '

### 找出包含連字號(Hyphen)的索引值

In [None]:
# Boolean mask: True for every row whose Subscribers text contains a double hyphen.
has_hyphen = tubes['Subscribers'].str.contains('--')
# Show the matching rows (their index labels are what gets dropped later).
tubes.loc[has_hyphen]

### 使用函式drop刪除有連字號的資料
官網文件：https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop.html

In [None]:
# str.contains('--') yields a boolean Series marking the rows that hold the placeholder.
hyphen_mask = tubes['Subscribers'].str.contains('--')
# Filtering with the mask and taking .index gives the labels of exactly those rows,
# which can then be passed to drop (or used for any other row selection).
rows_to_drop = tubes[hyphen_mask].index
# Remove the rows from the frame in place.
tubes.drop(labels=rows_to_drop, axis='index', inplace=True)
tubes

### 將欄位Subscribers轉為int型態

In [None]:
# Cast the Subscribers text to integers, then store the result back in the frame.
subscribers_int = tubes['Subscribers'].astype('int')
tubes['Subscribers'] = subscribers_int
# Confirm the new dtype.
tubes.info()

### 尋找Grade欄位有幾種等級
函式unique ( ) <br>
A++, A+, A, A-, B+

In [7]:
# List the distinct values present in the Grade column.
# NOTE(review): this cell's execution count (In [7]) is later than that of the mapping
# cell below (In [6]), so the saved output shows the mapped numbers rather than the
# letter grades; run the notebook top to bottom to see the original labels.
tubes['Grade'].unique()

array([ 5.,  4.,  3., nan,  2.,  1.])

### 將欄位Grade等級，轉換為數字
* 建立對應等級和數字的字典變數
* 使用函式map轉換資料

In [6]:
# Ordinal score for each letter grade (higher is better).
grade_map = {'A++': 5, 'A+': 4, 'A': 3, 'A-': 2, 'B+': 1}

# Convert only while the column still holds text labels. Without this guard,
# re-running the cell would look the already-converted numbers up in grade_map
# and overwrite the whole column with NaN.
if not pd.api.types.is_numeric_dtype(tubes['Grade']):
    # Strip surrounding whitespace before the lookup so the mapping does not depend
    # on the exact padding of the CSV values (the keys used to carry a trailing space).
    # Labels that are not in grade_map become NaN, which makes the column float.
    tubes['Grade'] = tubes['Grade'].str.strip().map(grade_map)
tubes

Unnamed: 0,Rank,Grade,Channel name,Video Uploads,Subscribers,Video views
0,1,5.0,Zee TV,82757,18752951,20869786591
1,2,5.0,T-Series,12661,61196302,47548839843
2,3,5.0,Cocomelon - Nursery Rhymes,373,19238251,9793305082
3,4,5.0,SET India,27323,31180559,22675948293
4,5,5.0,WWE,36756,32852346,26273668433
...,...,...,...,...,...,...
4995,4996,1.0,Uras Benlioğlu,706,2072942,441202795
4996,4997,1.0,HI-TECH MUSIC LTD,797,1055091,377331722
4997,4998,1.0,Mastersaint,110,3265735,311758426
4998,4999,1.0,Bruce McIntosh,3475,32990,14563764
