# Let's test your knowledge about Python, data structures, NumPy, and pandas.
We are going to createa a dummy dataset for some Youtubers regarding their channals. We will create data including:
1.  Decide the size of the dataset by using a SIZE constant variable
1.  ChannelID (`Y0001`, `Y0002`, ...)
1.  Number of subscribers
1.  Number of videos
1.  Number of total views
1.  Category (`['music','news','gaming','food','travel']`)
1.  Language (`['English','Spanish','Japanese','Franch','Russia','Chinese']`)
1.  Type(`['Corperate','Goverment','NGO','Individual']`)


Then we will play with the dummy dataset with our knowledge of pandas, including:
1.  Access
1.  Sampling
1.  Filtering
1.  Aggregation
1.  Manipulation


## Setup environment
Let's import random, Numpy, and pandas

In [1]:
import random
import numpy as np
import pandas as pd

## Dummy dataset generation

### Setup SIZE

In [2]:
# We make this variable SIZE because it supposed to be constant for the runtime.
SIZE = 10000

### Generate ChannelID

In [3]:
ids = ['Y{0:04d}'.format(x) for x in range(SIZE)]
ids[:5], ids[-5:]

(['Y0000', 'Y0001', 'Y0002', 'Y0003', 'Y0004'],
 ['Y9995', 'Y9996', 'Y9997', 'Y9998', 'Y9999'])

### Generate # of subscribers

#### We first simulate a normal distribution

In [4]:
subs = np.random.normal(100000, 50000, (SIZE))
subs[:5], subs[-5:]

(array([131110.87661181, 132838.82710538,  81102.88077267, 116162.81397205,
         82044.06876434]),
 array([ 37523.02387094,  79413.03308778, 100200.16970086, 123748.14866583,
         13167.19165345]))

#### We convert the float numbers to integer

In [5]:
subs = subs.astype(int)
subs[:5], subs[-5:]

(array([131110, 132838,  81102, 116162,  82044]),
 array([ 37523,  79413, 100200, 123748,  13167]))

#### We check if there are non-positive numbers of subscribers

In [6]:
subs[subs <= 0 ]

array([-41150,  -4084,  -2176, -13408,  -3200, -50003, -30932,  -1862,
       -20141,  -9898, -28910, -55530, -35123,  -3769, -42658, -79004,
        -2187, -12186, -25603, -11158, -21568, -28672, -49000,  -8689,
        -8450, -43585, -24194, -27851, -39199, -18005, -20109,  -6448,
       -14114,  -6383,   -680, -15285,   -192, -54221, -16001, -17277,
       -17969, -46835,  -4164,  -2213,  -3396, -12536, -14126, -10973,
        -7054,  -3812,  -5755,  -8183,  -1823, -40982, -22126,  -5933,
       -13563, -62450, -48882, -45437,  -5678, -30699,  -9161, -42062,
        -9908, -25939,    -48, -13992,  -4275,  -8570,  -9042,  -2108,
       -20864,  -7636, -17426, -75897, -11575, -36034, -10478, -30881,
        -3427,  -5375, -16215,   -843, -13126,   -638,  -1073,  -4904,
       -16997,   -298, -32631, -11147,   -164, -13036,   -147,  -1108,
        -2643, -21025, -13331,  -7166, -16407, -16292, -23189,  -6154,
       -17922,   -237,  -3721,  -2981, -28802, -14371, -46565,   -875,
      

#### We set them to be 1

In [7]:
subs[subs <= 0 ] = 1
subs[subs <= 0 ]

array([], dtype=int32)

### Generate number of videos


#### We use a uniform distribution this time (just for practice)

In [8]:
nvideos = [np.random.randint(1, 100) for i in range(SIZE)]
nvideos[:5], nvideos[-5:]

([80, 1, 43, 48, 73], [69, 53, 94, 78, 91])

### We simulate the number of views

#### To make it real, we will use the # of subscribers and # of videos as factors to get the # of total views

In [9]:
views = [ int(x*1.5*np.random.random() + y*2*np.random.random() + np.random.randint(-1000, 1000))
for x, y in zip(subs, nvideos)]
views[:5], views[-5:]

([85907, 186507, 105601, 130846, 68985], [44448, 89431, 111682, 90457, 13063])

In [10]:
views = np.array(views)
views

array([ 85907, 186507, 105601, ..., 111682,  90457,  13063])

In [11]:
views[views < 0]

array([-736,  -28, -520, -780, -181,  -86, -477, -469, -715, -968, -590,
       -500, -227, -514, -349, -361, -307, -748,  -26, -155, -207, -388,
       -950, -543, -182, -821, -647, -787, -646, -691, -305, -140, -734,
        -75,  -67, -190, -635, -783, -706, -235,  -35, -474,  -30,  -76,
       -660, -448, -126,  -60, -735, -356, -739, -756, -880,  -79, -664,
       -493, -194, -431, -865, -695, -768, -876, -113,  -53, -577, -119,
       -860,  -65, -684, -582, -525,  -76, -265, -224, -562, -742, -717,
       -334, -528, -108, -506, -628, -724, -603, -914, -526, -351, -552,
        -76, -900, -562, -116, -781, -884, -700, -537, -152,  -78, -912,
       -245,  -70, -746, -770, -560, -771, -215, -926, -244, -814, -506,
       -679, -779, -406, -178, -761, -875, -847, -547, -238, -243, -704,
       -233, -804, -542, -353, -561,  -38, -421, -797, -449, -707, -853,
       -212, -309, -730, -649, -262, -791, -379, -950,  -66,  -72, -859,
       -497, -376, -111])

In [11]:
views[views < 0] = 0
views[views < 0]

array([], dtype=int32)

### Category

>Category (`['music','news','gaming','food','travel']`)

In [12]:
category = np.random.choice(['music','news','gaming','food','travel'], SIZE)
category[:20]

array(['news', 'gaming', 'music', 'music', 'gaming', 'music', 'gaming',
       'travel', 'food', 'news', 'news', 'gaming', 'news', 'travel',
       'music', 'gaming', 'news', 'travel', 'gaming', 'travel'],
      dtype='<U6')

### Language
>Language (`['English','Spanish','Japanese','Franch','Russia','Chinese']`)


In [13]:
language = np.random.choice(['English','Spanish','Japanese','Franch','Russia','Chinese'],
                            SIZE, p=[0.5, 0.1, 0.1, 0.1, 0.1, 0.1])
language[:20]

array(['Franch', 'Chinese', 'Japanese', 'English', 'Chinese', 'Japanese',
       'English', 'Spanish', 'Chinese', 'English', 'Japanese', 'English',
       'Japanese', 'Spanish', 'Franch', 'Spanish', 'Russia', 'Russia',
       'Russia', 'Japanese'], dtype='<U8')

### Type
>Type(`['Corperate','Goverment','NGO','Individual']`)

In [14]:
tp = np.random.choice(['Corperate','Goverment','NGO','Individual'], SIZE, p=[0.1, 0.2, 0.3, 0.4])
tp[:20]

array(['NGO', 'NGO', 'NGO', 'Individual', 'Goverment', 'Individual',
       'NGO', 'Individual', 'Individual', 'Goverment', 'Individual',
       'Individual', 'Goverment', 'NGO', 'Goverment', 'Individual', 'NGO',
       'Individual', 'NGO', 'Individual'], dtype='<U10')

### Now we have all attributes, let's put them into a dataframe

In [15]:
df = pd.DataFrame({'ChannelID': ids,
                   'subs': subs,
                   'nvideos': nvideos,
                   'views': views,
                   'Category': category,
                   'Language':language,
                   'Type': tp},
                  index = ids)
df.head()

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y0000,Y0000,131110,80,85907,news,Franch,NGO
Y0001,Y0001,132838,1,186507,gaming,Chinese,NGO
Y0002,Y0002,81102,43,105601,music,Japanese,NGO
Y0003,Y0003,116162,48,130846,music,English,Individual
Y0004,Y0004,82044,73,68985,gaming,Chinese,Goverment


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, Y0000 to Y9999
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ChannelID  10000 non-null  object
 1   subs       10000 non-null  int32 
 2   nvideos    10000 non-null  int64 
 3   views      10000 non-null  int32 
 4   Category   10000 non-null  object
 5   Language   10000 non-null  object
 6   Type       10000 non-null  object
dtypes: int32(2), int64(1), object(4)
memory usage: 546.9+ KB


### Let's save the dummy dataset to `youtube_channels.csv`

In [18]:
df.to_csv('youtube_channels.csv', index=False)

## Let's play with the dataframe a little bit

### Sampling

#### Select certain rows

In [19]:
df_sub1 = df.loc[:'Y0100']
df_sub1

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y0000,Y0000,28045,61,34515,food,Russia,Individual
Y0001,Y0001,112407,8,137830,food,English,Goverment
Y0002,Y0002,75610,89,75561,food,Franch,Goverment
Y0003,Y0003,68566,17,14567,food,English,Individual
Y0004,Y0004,154427,92,172224,gaming,English,NGO
...,...,...,...,...,...,...,...
Y0096,Y0096,72594,18,8188,news,Franch,Individual
Y0097,Y0097,125410,90,152805,travel,English,Individual
Y0098,Y0098,137602,73,676,music,English,Individual
Y0099,Y0099,119988,34,52080,news,English,NGO


#### Select certain rows and columns

In [19]:
df_sub2 = df.loc[:'Y1000', ['subs','views']]
df_sub2

Unnamed: 0,subs,views
Y0000,131110,85907
Y0001,132838,186507
Y0002,81102,105601
Y0003,116162,130846
Y0004,82044,68985
...,...,...
Y0996,30722,45014
Y0997,72011,27862
Y0998,116705,14700
Y0999,64891,45268


#### Select randome rows

In [20]:
df_sub3 = df.iloc[random.sample(range(0, SIZE), 100)]
df_sub3

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y7613,Y7613,72617,65,78012,travel,English,NGO
Y4256,Y4256,188452,51,15235,gaming,Japanese,NGO
Y9327,Y9327,144857,84,149312,travel,Franch,NGO
Y5006,Y5006,226231,1,260537,news,Japanese,NGO
Y4101,Y4101,234172,57,133687,music,English,NGO
...,...,...,...,...,...,...,...
Y1156,Y1156,31156,12,41153,music,Russia,Goverment
Y4675,Y4675,81575,15,119368,food,English,NGO
Y6215,Y6215,1,4,686,food,Chinese,NGO
Y2359,Y2359,56640,90,11896,music,English,Individual


#### Select random rows with selected columns

In [21]:
df_sub4 = df.iloc[random.sample(range(0, SIZE), 100)][['subs','views']]
df_sub4

Unnamed: 0,subs,views
Y3686,85001,78705
Y8094,138764,93073
Y2528,73296,18403
Y5571,109524,108565
Y5774,99401,26856
...,...,...
Y0717,39747,57996
Y8458,75224,16835
Y2621,92906,82603
Y1817,146519,189389


### Filtering

#### Super Popular Channel

In [22]:
df_superp = df[df['subs'] > 300000]
df_superp

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y0075,Y0075,314748,65,449893,news,English,Goverment


#### Popular Channel

In [23]:
df_p = df[df['subs'] > 100000]
df_p

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y0000,Y0000,131110,80,85907,news,Franch,NGO
Y0001,Y0001,132838,1,186507,gaming,Chinese,NGO
Y0003,Y0003,116162,48,130846,music,English,Individual
Y0005,Y0005,143581,76,210554,music,Japanese,Individual
Y0011,Y0011,104118,40,1170,gaming,English,Individual
...,...,...,...,...,...,...,...
Y9991,Y9991,116267,48,64783,music,English,Corperate
Y9992,Y9992,139295,65,51608,news,English,Corperate
Y9994,Y9994,167435,7,22403,news,Spanish,NGO
Y9997,Y9997,100200,94,111682,gaming,Russia,Goverment


#### Start up channel

In [24]:
df_begin = df[df['subs'] < 100]
df_begin

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y0016,Y0016,1,7,0,news,Russia,NGO
Y0029,Y0029,1,37,639,food,Japanese,Individual
Y0076,Y0076,1,16,0,travel,Russia,Individual
Y0081,Y0081,1,9,465,travel,Spanish,Goverment
Y0121,Y0121,1,21,0,travel,English,NGO
...,...,...,...,...,...,...,...
Y9676,Y9676,1,25,521,travel,Russia,Individual
Y9690,Y9690,1,81,0,gaming,Japanese,Corperate
Y9833,Y9833,1,99,831,gaming,English,NGO
Y9854,Y9854,1,96,776,food,English,Individual


#### Popular English Channel

In [25]:
df_EnglishP = df[(df['subs'] > 100000) & (df['Language'] == 'English')]
df_EnglishP

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y0003,Y0003,116162,48,130846,music,English,Individual
Y0011,Y0011,104118,40,1170,gaming,English,Individual
Y0026,Y0026,132796,62,175277,food,English,Individual
Y0031,Y0031,122275,89,63855,food,English,Goverment
Y0033,Y0033,183555,12,120727,music,English,Individual
...,...,...,...,...,...,...,...
Y9982,Y9982,137130,79,10359,news,English,Individual
Y9987,Y9987,179280,12,38671,music,English,Corperate
Y9989,Y9989,144054,10,74751,food,English,Individual
Y9991,Y9991,116267,48,64783,music,English,Corperate


#### Gaming channel with many videos

In [26]:
df_gaming_nv = df[(df['Category'] == 'gaming') & (df['nvideos'] > 90)]
df_gaming_nv

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y0035,Y0035,143836,95,81724,gaming,Spanish,Individual
Y0053,Y0053,93467,98,69951,gaming,English,Goverment
Y0185,Y0185,146238,94,213054,gaming,English,Individual
Y0190,Y0190,129937,91,64737,gaming,Franch,Individual
Y0209,Y0209,124912,96,171512,gaming,English,Individual
...,...,...,...,...,...,...,...
Y9887,Y9887,102292,99,5575,gaming,English,NGO
Y9899,Y9899,172115,93,48933,gaming,English,NGO
Y9914,Y9914,67566,91,37650,gaming,English,Individual
Y9936,Y9936,11190,91,14161,gaming,English,NGO


#### Non-Corperate and News channel

In [27]:
df_nc_news = df[(df['Category'] == 'news') & (df['Type'] != 'Corperate')]
df_nc_news

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y0000,Y0000,131110,80,85907,news,Franch,NGO
Y0009,Y0009,17546,11,25032,news,English,Goverment
Y0010,Y0010,79586,69,66617,news,Japanese,Individual
Y0012,Y0012,187031,91,213905,news,Japanese,Goverment
Y0016,Y0016,1,7,0,news,Russia,NGO
...,...,...,...,...,...,...,...
Y9982,Y9982,137130,79,10359,news,English,Individual
Y9986,Y9986,113979,67,110287,news,Spanish,Individual
Y9994,Y9994,167435,7,22403,news,Spanish,NGO
Y9995,Y9995,37523,69,44448,news,Spanish,Individual


#### Top Subs **and** Top nvideos

In [28]:
df_subs_and_nvideos = df[(df['subs'] > 200000) & (df['nvideos'] > 90)]
df_subs_and_nvideos

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y1620,Y1620,222463,91,58074,travel,English,Goverment
Y3562,Y3562,202368,96,262098,gaming,English,NGO
Y4026,Y4026,213651,94,256379,food,English,Individual
Y4563,Y4563,219840,92,230818,travel,English,Individual
Y5112,Y5112,222442,93,296837,music,English,NGO
Y5174,Y5174,249110,92,50593,news,Franch,Goverment
Y5225,Y5225,201099,92,10307,gaming,English,NGO
Y5436,Y5436,215684,97,230188,music,English,Individual
Y6151,Y6151,213768,91,74724,food,Chinese,Corperate
Y6370,Y6370,215687,98,168385,food,English,NGO


#### Top Subs **or** Top nvideos

In [29]:
df_subs_or_nvideos = df[(df['subs'] > 200000) | (df['nvideos'] > 90)]
df_subs_or_nvideos

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y0012,Y0012,187031,91,213905,news,Japanese,Goverment
Y0019,Y0019,125537,93,46156,travel,Japanese,Individual
Y0028,Y0028,174202,97,177610,news,Russia,NGO
Y0032,Y0032,114416,94,17468,travel,Spanish,Individual
Y0035,Y0035,143836,95,81724,gaming,Spanish,Individual
...,...,...,...,...,...,...,...
Y9976,Y9976,173771,95,152090,news,English,Corperate
Y9977,Y9977,73643,98,51195,travel,English,Individual
Y9988,Y9988,209415,29,144174,music,Chinese,Individual
Y9997,Y9997,100200,94,111682,gaming,Russia,Goverment


### Aggregation

In [30]:
byCategory = df.groupby('Category')
byCategory.sum()

Unnamed: 0_level_0,ChannelID,subs,nvideos,views,Language,Type
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
food,Y0008Y0021Y0022Y0026Y0027Y0029Y0031Y0036Y0041Y...,206252257,100224,155139034,ChineseEnglishChineseEnglishRussiaJapaneseEngl...,IndividualNGOCorperateIndividualNGOIndividualG...
gaming,Y0001Y0004Y0006Y0011Y0015Y0018Y0020Y0024Y0025Y...,197574466,100047,149426437,ChineseChineseEnglishEnglishSpanishRussiaChine...,NGOGovermentNGOIndividualIndividualNGOGovermen...
music,Y0002Y0003Y0005Y0014Y0033Y0037Y0038Y0049Y0052Y...,200923599,99206,148884401,JapaneseEnglishJapaneseFranchEnglishFranchChin...,NGOIndividualIndividualGovermentIndividualNGOC...
news,Y0000Y0009Y0010Y0012Y0016Y0028Y0030Y0040Y0046Y...,196803789,98485,149390247,FranchEnglishJapaneseJapaneseRussiaRussiaRussi...,NGOGovermentIndividualGovermentNGONGOGoverment...
travel,Y0007Y0013Y0017Y0019Y0023Y0032Y0047Y0051Y0061Y...,203550368,99345,153144172,SpanishSpanishRussiaJapaneseEnglishSpanishSpan...,IndividualNGOIndividualIndividualNGOIndividual...


In [31]:
byCategory.mean()

TypeError: agg function failed [how->mean,dtype->object]

In [32]:
byLanguage = df.groupby('Language')
byLanguage.sum()

Unnamed: 0_level_0,ChannelID,subs,nvideos,views,Category,Type
Language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Chinese,Y0001Y0004Y0008Y0020Y0022Y0038Y0051Y0054Y0056Y...,99533269,48672,75452085,gaminggamingfoodgamingfoodmusictravelmusicgami...,NGOGovermentIndividualGovermentCorperateCorper...
English,Y0003Y0006Y0009Y0011Y0021Y0023Y0024Y0026Y0031Y...,504138594,250781,378818506,musicgamingnewsgamingfoodtravelgamingfoodfoodm...,IndividualNGOGovermentIndividualNGONGOCorperat...
Franch,Y0000Y0014Y0025Y0037Y0050Y0061Y0086Y0096Y0118Y...,99648798,48127,73644727,newsmusicgamingmusicgamingtraveltravelfoodmusi...,NGOGovermentNGONGOGovermentNGOGovermentIndivid...
Japanese,Y0002Y0005Y0010Y0012Y0019Y0029Y0044Y0067Y0087Y...,99931289,49761,75726709,musicmusicnewsnewstravelfoodfoodfoodnewsnewsfo...,NGOIndividualIndividualGovermentIndividualIndi...
Russia,Y0016Y0017Y0018Y0027Y0028Y0030Y0040Y0049Y0059Y...,104141457,51943,78157977,newstravelgamingfoodnewsnewsnewsmusicfoodnewst...,NGOIndividualNGONGONGOGovermentGovermentCorper...
Spanish,Y0007Y0013Y0015Y0032Y0035Y0047Y0055Y0062Y0070Y...,97711072,48023,74184287,traveltravelgamingtravelgamingtravelfoodtravel...,IndividualNGOIndividualIndividualIndividualNGO...


In [33]:
byLanguage.mean()

TypeError: agg function failed [how->mean,dtype->object]

In [34]:
byLanType = df.groupby(['Language', 'Type'])
byLanType.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,subs,subs,subs,subs,subs,subs,subs,subs,nvideos,nvideos,nvideos,nvideos,nvideos,views,views,views,views,views,views,views,views
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Language,Type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
Chinese,Corperate,86.0,104834.55814,53203.489018,1.0,71248.0,100600.0,139654.25,254585.0,86.0,50.27907,...,80.0,98.0,86.0,83346.465116,60964.750596,0.0,35173.75,74770.0,122153.5,251515.0
Chinese,Goverment,209.0,100488.559809,47688.350051,1.0,68137.0,103180.0,133836.0,257936.0,209.0,49.301435,...,74.0,99.0,209.0,76285.138756,62792.686559,0.0,25900.0,57873.0,118864.0,283545.0
Chinese,Individual,397.0,97019.382872,50799.021041,1.0,57795.0,97128.0,131284.0,291598.0,397.0,46.866499,...,71.0,99.0,397.0,73390.20403,58258.077254,0.0,25399.0,60226.0,105807.0,262697.0
Chinese,NGO,311.0,99674.254019,49513.903787,1.0,66345.5,100575.0,135697.5,269763.0,311.0,49.639871,...,75.5,99.0,311.0,74613.453376,55075.742236,0.0,27496.0,67668.0,110278.5,247243.0
English,Corperate,521.0,98310.775432,48296.011582,1.0,66308.0,97269.0,129627.0,250252.0,521.0,50.554702,...,76.0,99.0,521.0,70245.996161,56865.880065,0.0,25957.0,56664.0,102379.0,313190.0
English,Goverment,969.0,101099.763674,50558.676498,1.0,68580.0,97790.0,134598.0,314748.0,969.0,49.344685,...,72.0,99.0,969.0,75944.74097,61429.43379,0.0,26851.0,64325.0,109064.0,449893.0
English,Individual,2050.0,101520.757073,49468.833725,1.0,67062.0,101133.0,135454.0,246504.0,2050.0,49.754146,...,75.0,99.0,2050.0,76743.376585,61844.597222,0.0,26201.5,62286.5,114085.25,327138.0
English,NGO,1473.0,99684.62797,50141.50375,1.0,63452.0,100124.0,134602.0,298903.0,1473.0,50.665988,...,77.0,99.0,1473.0,75564.131704,62333.142143,0.0,25518.0,60120.0,114469.0,378233.0
Franch,Corperate,82.0,98384.146341,44971.88945,1.0,70829.0,97292.0,123449.25,212576.0,82.0,48.585366,...,70.25,98.0,82.0,78820.890244,55994.981578,0.0,36657.0,62487.5,117069.5,223166.0
Franch,Goverment,175.0,102135.222857,50739.499818,1.0,64264.5,103619.0,134961.5,264029.0,175.0,49.725714,...,74.0,99.0,175.0,75523.114286,63510.006736,0.0,25672.0,60808.0,113872.0,332631.0


In [35]:
byLanType.describe()[['subs','views']]

Unnamed: 0_level_0,Unnamed: 1_level_0,subs,subs,subs,subs,subs,subs,subs,subs,views,views,views,views,views,views,views,views
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Language,Type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
Chinese,Corperate,86.0,104834.55814,53203.489018,1.0,71248.0,100600.0,139654.25,254585.0,86.0,83346.465116,60964.750596,0.0,35173.75,74770.0,122153.5,251515.0
Chinese,Goverment,209.0,100488.559809,47688.350051,1.0,68137.0,103180.0,133836.0,257936.0,209.0,76285.138756,62792.686559,0.0,25900.0,57873.0,118864.0,283545.0
Chinese,Individual,397.0,97019.382872,50799.021041,1.0,57795.0,97128.0,131284.0,291598.0,397.0,73390.20403,58258.077254,0.0,25399.0,60226.0,105807.0,262697.0
Chinese,NGO,311.0,99674.254019,49513.903787,1.0,66345.5,100575.0,135697.5,269763.0,311.0,74613.453376,55075.742236,0.0,27496.0,67668.0,110278.5,247243.0
English,Corperate,521.0,98310.775432,48296.011582,1.0,66308.0,97269.0,129627.0,250252.0,521.0,70245.996161,56865.880065,0.0,25957.0,56664.0,102379.0,313190.0
English,Goverment,969.0,101099.763674,50558.676498,1.0,68580.0,97790.0,134598.0,314748.0,969.0,75944.74097,61429.43379,0.0,26851.0,64325.0,109064.0,449893.0
English,Individual,2050.0,101520.757073,49468.833725,1.0,67062.0,101133.0,135454.0,246504.0,2050.0,76743.376585,61844.597222,0.0,26201.5,62286.5,114085.25,327138.0
English,NGO,1473.0,99684.62797,50141.50375,1.0,63452.0,100124.0,134602.0,298903.0,1473.0,75564.131704,62333.142143,0.0,25518.0,60120.0,114469.0,378233.0
Franch,Corperate,82.0,98384.146341,44971.88945,1.0,70829.0,97292.0,123449.25,212576.0,82.0,78820.890244,55994.981578,0.0,36657.0,62487.5,117069.5,223166.0
Franch,Goverment,175.0,102135.222857,50739.499818,1.0,64264.5,103619.0,134961.5,264029.0,175.0,75523.114286,63510.006736,0.0,25672.0,60808.0,113872.0,332631.0


In [36]:
byLanType.describe().loc['English',['subs','views']]

Unnamed: 0_level_0,subs,subs,subs,subs,subs,subs,subs,subs,views,views,views,views,views,views,views,views
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Corperate,521.0,98310.775432,48296.011582,1.0,66308.0,97269.0,129627.0,250252.0,521.0,70245.996161,56865.880065,0.0,25957.0,56664.0,102379.0,313190.0
Goverment,969.0,101099.763674,50558.676498,1.0,68580.0,97790.0,134598.0,314748.0,969.0,75944.74097,61429.43379,0.0,26851.0,64325.0,109064.0,449893.0
Individual,2050.0,101520.757073,49468.833725,1.0,67062.0,101133.0,135454.0,246504.0,2050.0,76743.376585,61844.597222,0.0,26201.5,62286.5,114085.25,327138.0
NGO,1473.0,99684.62797,50141.50375,1.0,63452.0,100124.0,134602.0,298903.0,1473.0,75564.131704,62333.142143,0.0,25518.0,60120.0,114469.0,378233.0


### Manipulation

In [37]:
df['subpervideo'] = df['subs']/df['nvideos']
df['subpervideo'].describe()

count     10000.000000
mean       5307.643414
std       13790.270483
min           0.010101
25%        1194.877222
50%        2032.823495
75%        3970.044565
max      226231.000000
Name: subpervideo, dtype: float64

In [38]:
df['viewspervideo'] = df['views']/df['nvideos']
df['viewspervideo'].describe()

count     10000.000000
mean       4006.464071
std       12092.529599
min           0.000000
25%         547.935852
50%        1388.834444
75%        3023.065476
max      260537.000000
Name: viewspervideo, dtype: float64

In [39]:
df['viewspersub'] = df['views']/df['subs']
df['viewspersub'].describe()

count    10000.000000
mean         7.116707
std         66.949583
min          0.000000
25%          0.374160
50%          0.745979
75%          1.145403
max       1077.000000
Name: viewspersub, dtype: float64

In [40]:
df.head(10)

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type,subpervideo,viewspervideo,viewspersub
Y0000,Y0000,131110,80,85907,news,Franch,NGO,1638.875,1073.8375,0.655228
Y0001,Y0001,132838,1,186507,gaming,Chinese,NGO,132838.0,186507.0,1.404018
Y0002,Y0002,81102,43,105601,music,Japanese,NGO,1886.093023,2455.837209,1.302076
Y0003,Y0003,116162,48,130846,music,English,Individual,2420.041667,2725.958333,1.12641
Y0004,Y0004,82044,73,68985,gaming,Chinese,Goverment,1123.890411,945.0,0.840829
Y0005,Y0005,143581,76,210554,music,Japanese,Individual,1889.223684,2770.447368,1.466448
Y0006,Y0006,90787,77,82629,gaming,English,NGO,1179.051948,1073.103896,0.910141
Y0007,Y0007,76030,32,54348,travel,Spanish,Individual,2375.9375,1698.375,0.714823
Y0008,Y0008,97254,65,139271,food,Chinese,Individual,1496.215385,2142.630769,1.432034
Y0009,Y0009,17546,11,25032,news,English,Goverment,1595.090909,2275.636364,1.42665


## Congratulations! You completed our course: Data Wrangling -- Fundamentals