# Let's test your knowledge about Python, data structures, NumPy, and pandas.
We are going to create a dummy dataset describing some YouTubers and their channels. The data will include:
1.  Decide the size of the dataset by using a SIZE constant variable
1.  ChannelID (`Y0000`, `Y0001`, ...)
1.  Number of subscribers
1.  Number of videos
1.  Number of total views
1.  Category (`['music','news','gaming','food','travel']`)
1.  Language (`['English','Spanish','Japanese','Franch','Russia','Chinese']`)
1.  Type(`['Corperate','Goverment','NGO','Individual']`)


Then we will play with the dummy dataset with our knowledge of pandas, including:
1.  Access
1.  Sampling
1.  Filtering
1.  Aggregation
1.  Manipulation


## Setup environment
Let's import random, NumPy, and pandas

In [1]:
import random
import numpy as np
import pandas as pd

## Dummy dataset generation

### Setup SIZE

In [2]:
# SIZE is written in upper case because it is meant to stay constant for the whole run.
SIZE = 10_000

### Generate ChannelID

In [3]:
# Build one zero-padded channel ID per row: 'Y0000', 'Y0001', ...
ids = list(map('Y{0:04d}'.format, range(SIZE)))
# Peek at the first and last five IDs.
ids[:5], ids[-5:]

(['Y0000', 'Y0001', 'Y0002', 'Y0003', 'Y0004'],
 ['Y9995', 'Y9996', 'Y9997', 'Y9998', 'Y9999'])

### Generate # of subscribers

#### We first simulate a normal distribution

In [4]:
# Draw SIZE subscriber counts from a normal distribution
# centred on 100000 with a standard deviation of 50000.
subs = np.random.normal(loc=100000, scale=50000, size=SIZE)
subs[:5], subs[-5:]

(array([ 28045.65114534, 112407.63315093,  75610.76598577,  68566.13035803,
        154427.69953221]),
 array([119716.26919506, 111044.52008844,  63854.96432585, 233726.38844679,
         66319.49650509]))

#### We convert the float numbers to integer

In [5]:
# Cast the float draws to integers; the fractional part of each value is dropped.
subs = subs.astype(int)
subs[:5], subs[-5:]

(array([ 28045, 112407,  75610,  68566, 154427]),
 array([119716, 111044,  63854, 233726,  66319]))

#### We check if there are non-positive numbers of subscribers

In [6]:
# Boolean-mask indexing: list every subscriber count that is zero or negative.
subs[subs <= 0 ]

array([-43555, -10281,  -9595,  -9578, -31035, -16535, -26981,   -228,
        -8651, -11262,  -4140,  -7238,  -8909, -30504,   -352,  -1969,
        -6037, -20941,  -3692, -29614, -24630, -20008, -11979, -42748,
        -3704,  -1762, -13142, -15088,  -2583, -15622, -11416, -47707,
       -10122, -10234, -36979,  -1200, -21980, -31153, -32859, -15957,
       -62227,  -3793, -12368,  -3275,  -4564, -40634, -36394,  -5238,
        -7393, -58930, -33036,  -7504,  -5827, -31303, -28492, -18510,
       -18404, -10121, -10687, -33655,  -1446, -10605, -44041,  -8288,
       -12448, -23102,  -8137,  -6630, -36058, -67239, -28889, -35601,
       -64242, -18212, -41231,  -7935,  -7460, -25507,  -3606, -28889,
       -13148,  -9799, -16342,  -3244, -17594, -16562,  -3315,  -7712,
       -22337, -14823, -74174, -61623, -20826,  -4478,   -751, -51497,
        -8895,  -1238,   -671,  -2369,  -4539,  -5003, -20124,  -1109,
       -36848,  -9377, -29740, -11504,   -872,  -7431,  -1976,  -3418,
      

#### We set them to be 1

In [7]:
# Overwrite every non-positive subscriber count with 1, in place.
subs[subs <= 0 ] = 1
# Re-run the same filter; an empty array means no non-positive values remain.
subs[subs <= 0 ]

array([], dtype=int64)

### Generate number of videos


#### We use a uniform distribution this time (just for practice)

In [8]:
# Draw all SIZE video counts with a single vectorized call rather than one
# Python-level call per row. randint's upper bound is exclusive, so every
# value lies in 1..99. .tolist() keeps nvideos a plain list of Python ints.
nvideos = np.random.randint(1, 100, size=SIZE).tolist()
nvideos[:5], nvideos[-5:]

([61, 8, 89, 17, 92], [68, 19, 66, 11, 69])

### We simulate the number of views

#### To make it more realistic, we will use the # of subscribers and # of videos as factors to get the # of total views

In [9]:
# For each channel, combine a random fraction of 1.5 * subscribers, a random
# fraction of 2 * videos, and an integer noise term drawn from
# randint(-1000, 1000); int() then drops the fractional part of the total.
views = [
    int(
        n_subs * 1.5 * np.random.random()
        + n_videos * 2 * np.random.random()
        + np.random.randint(-1000, 1000)
    )
    for n_subs, n_videos in zip(subs, nvideos)
]
views[:5], views[-5:]

([34515, 137830, 75561, 14567, 172224], [5922, 58326, 78399, 78299, 3725])

In [10]:
# Convert the Python list into a NumPy array so that boolean-mask indexing
# can be applied to it in the following cells.
views = np.array(views)
views

array([ 34515, 137830,  75561, ...,  78399,  78299,   3725])

In [11]:
# The additive noise term can push small totals below zero; list those entries.
views[views < 0]

array([-736,  -28, -520, -780, -181,  -86, -477, -469, -715, -968, -590,
       -500, -227, -514, -349, -361, -307, -748,  -26, -155, -207, -388,
       -950, -543, -182, -821, -647, -787, -646, -691, -305, -140, -734,
        -75,  -67, -190, -635, -783, -706, -235,  -35, -474,  -30,  -76,
       -660, -448, -126,  -60, -735, -356, -739, -756, -880,  -79, -664,
       -493, -194, -431, -865, -695, -768, -876, -113,  -53, -577, -119,
       -860,  -65, -684, -582, -525,  -76, -265, -224, -562, -742, -717,
       -334, -528, -108, -506, -628, -724, -603, -914, -526, -351, -552,
        -76, -900, -562, -116, -781, -884, -700, -537, -152,  -78, -912,
       -245,  -70, -746, -770, -560, -771, -215, -926, -244, -814, -506,
       -679, -779, -406, -178, -761, -875, -847, -547, -238, -243, -704,
       -233, -804, -542, -353, -561,  -38, -421, -797, -449, -707, -853,
       -212, -309, -730, -649, -262, -791, -379, -950,  -66,  -72, -859,
       -497, -376, -111])

In [12]:
# Replace every negative view count with 0, in place.
views[views < 0] = 0
# Re-run the same filter; an empty array means no negative values remain.
views[views < 0]

array([], dtype=int64)

### Category

>Category (`['music','news','gaming','food','travel']`)

In [13]:
# Assign each channel one of the five categories; no weights are passed,
# so np.random.choice uses its default probabilities.
category = np.random.choice(
    ['music', 'news', 'gaming', 'food', 'travel'],
    size=SIZE,
)
category[:20]

array(['food', 'food', 'food', 'food', 'gaming', 'food', 'news', 'food',
       'food', 'news', 'music', 'music', 'food', 'gaming', 'news',
       'travel', 'travel', 'news', 'gaming', 'gaming'], dtype='<U6')

### Language
>Language (`['English','Spanish','Japanese','Franch','Russia','Chinese']`)


In [14]:
# Assign each channel a language; 'English' is weighted at 0.5 and each of
# the other five languages at 0.1.
language = np.random.choice(
    ['English', 'Spanish', 'Japanese', 'Franch', 'Russia', 'Chinese'],
    size=SIZE,
    p=[0.5, 0.1, 0.1, 0.1, 0.1, 0.1],
)
language[:20]

array(['Russia', 'English', 'Franch', 'English', 'English', 'English',
       'English', 'Franch', 'English', 'English', 'English', 'English',
       'English', 'English', 'English', 'Russia', 'Russia', 'English',
       'Chinese', 'Russia'], dtype='<U8')

### Type
>Type(`['Corperate','Goverment','NGO','Individual']`)

In [15]:
# Assign each channel an owner type with weights 0.1 / 0.2 / 0.3 / 0.4,
# in the order the labels are listed.
tp = np.random.choice(
    ['Corperate', 'Goverment', 'NGO', 'Individual'],
    size=SIZE,
    p=[0.1, 0.2, 0.3, 0.4],
)
tp[:20]

array(['Individual', 'Goverment', 'Goverment', 'Individual', 'NGO',
       'Individual', 'NGO', 'Goverment', 'Corperate', 'Individual',
       'Goverment', 'NGO', 'NGO', 'Goverment', 'Corperate', 'Goverment',
       'Individual', 'NGO', 'NGO', 'NGO'], dtype='<U10')

### Now we have all attributes, let's put them into a dataframe

In [16]:
# Assemble the generated attributes into one DataFrame. The channel IDs are
# used twice: as the regular 'ChannelID' column and as the row index.
df = pd.DataFrame(
    dict(
        ChannelID=ids,
        subs=subs,
        nvideos=nvideos,
        views=views,
        Category=category,
        Language=language,
        Type=tp,
    ),
    index=ids,
)
df.head()

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y0000,Y0000,28045,61,34515,food,Russia,Individual
Y0001,Y0001,112407,8,137830,food,English,Goverment
Y0002,Y0002,75610,89,75561,food,Franch,Goverment
Y0003,Y0003,68566,17,14567,food,English,Individual
Y0004,Y0004,154427,92,172224,gaming,English,NGO


In [17]:
# Print a summary of the frame: index range, column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, Y0000 to Y9999
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ChannelID  10000 non-null  object
 1   subs       10000 non-null  int64 
 2   nvideos    10000 non-null  int64 
 3   views      10000 non-null  int64 
 4   Category   10000 non-null  object
 5   Language   10000 non-null  object
 6   Type       10000 non-null  object
dtypes: int64(3), object(4)
memory usage: 625.0+ KB


### Let's save the dummy dataset to `youtube_channels.csv`

In [18]:
# Save to a relative path so the file lands in the current working directory
# and the cell does not depend on an absolute /content directory existing.
# index=False is safe because ChannelID is also stored as a regular column.
df.to_csv('youtube_channels.csv', index=False)

## Let's play with the dataframe a little bit

### Sampling

#### Select certain rows

In [19]:
# Label-based row slice from the first row up to ChannelID 'Y0100', all columns.
# NOTE(review): .loc label slices include the end label, so this presumably
# returns 101 rows (Y0000 through Y0100) — confirm that is intended.
df_sub1 = df.loc[:'Y0100']
df_sub1

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y0000,Y0000,28045,61,34515,food,Russia,Individual
Y0001,Y0001,112407,8,137830,food,English,Goverment
Y0002,Y0002,75610,89,75561,food,Franch,Goverment
Y0003,Y0003,68566,17,14567,food,English,Individual
Y0004,Y0004,154427,92,172224,gaming,English,NGO
...,...,...,...,...,...,...,...
Y0096,Y0096,72594,18,8188,news,Franch,Individual
Y0097,Y0097,125410,90,152805,travel,English,Individual
Y0098,Y0098,137602,73,676,music,English,Individual
Y0099,Y0099,119988,34,52080,news,English,NGO


#### Select certain rows and columns

In [20]:
# Label-based row slice up to ChannelID 'Y1000', restricted to the
# 'subs' and 'views' columns.
df_sub2 = df.loc[:'Y1000', ['subs','views']]
df_sub2

Unnamed: 0,subs,views
Y0000,28045,34515
Y0001,112407,137830
Y0002,75610,75561
Y0003,68566,14567
Y0004,154427,172224
...,...,...
Y0996,35571,51580
Y0997,102212,95143
Y0998,49603,63737
Y0999,8028,2545


#### Select random rows

In [21]:
# Draw 100 distinct rows at random. DataFrame.sample samples without
# replacement by default and works from the frame's own length, so the
# selection cannot fall out of step with a separately maintained row count.
df_sub3 = df.sample(n=100)
df_sub3

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y7934,Y7934,1,58,479,news,English,NGO
Y4974,Y4974,48515,47,25066,food,Russia,NGO
Y8898,Y8898,99912,73,1385,food,Russia,Individual
Y1580,Y1580,37341,13,34180,gaming,Chinese,Goverment
Y4149,Y4149,173878,19,226348,music,Japanese,NGO
...,...,...,...,...,...,...,...
Y8747,Y8747,101132,80,150711,news,Russia,Individual
Y6329,Y6329,114735,80,170020,food,English,Individual
Y0242,Y0242,178802,55,220842,news,Russia,Individual
Y2615,Y2615,219678,1,254137,music,English,Corperate


#### Select random rows with selected columns

In [22]:
# Keep only the 'subs' and 'views' columns, then draw 100 distinct rows at
# random. DataFrame.sample samples without replacement by default and works
# from the frame's own length rather than an external row count.
df_sub4 = df[['subs', 'views']].sample(n=100)
df_sub4

Unnamed: 0,subs,views
Y7633,79223,40161
Y7965,77167,40551
Y4786,28367,35095
Y1991,179690,176777
Y1831,41955,4277
...,...,...
Y8427,93260,4679
Y6860,185962,266192
Y1589,166517,67476
Y3098,64708,2307


### Filtering

#### Super Popular Channel

In [23]:
# Keep only the channels with more than 300000 subscribers.
df_superp = df.loc[df['subs'] > 300000]
df_superp

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type


#### Popular Channel

In [24]:
# Keep only the channels with more than 100000 subscribers.
df_p = df.loc[df['subs'] > 100000]
df_p

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y0001,Y0001,112407,8,137830,food,English,Goverment
Y0004,Y0004,154427,92,172224,gaming,English,NGO
Y0005,Y0005,171466,61,183685,food,English,Individual
Y0008,Y0008,142673,87,154982,food,English,Corperate
Y0013,Y0013,112532,20,74012,gaming,English,Goverment
...,...,...,...,...,...,...,...
Y9993,Y9993,149002,3,140254,gaming,Japanese,Goverment
Y9994,Y9994,100716,70,90956,news,Japanese,NGO
Y9995,Y9995,119716,68,5922,food,English,Goverment
Y9996,Y9996,111044,19,58326,music,Franch,Individual


#### Start up channel

In [25]:
# Keep only the channels with fewer than 100 subscribers.
df_begin = df.loc[df['subs'] < 100]
df_begin

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y0123,Y0123,1,40,0,gaming,English,Individual
Y0189,Y0189,1,17,0,music,English,Individual
Y0198,Y0198,1,47,826,news,Russia,Goverment
Y0221,Y0221,1,45,0,travel,English,NGO
Y0248,Y0248,1,83,885,travel,English,Individual
...,...,...,...,...,...,...,...
Y9769,Y9769,1,42,0,gaming,English,Goverment
Y9791,Y9791,1,64,199,travel,English,Individual
Y9864,Y9864,1,14,54,music,English,Corperate
Y9901,Y9901,1,77,0,music,English,NGO


#### Popular English Channel

In [26]:
# Both conditions must hold: more than 100000 subscribers AND English language.
df_EnglishP = df.loc[
    (df['subs'] > 100000)
    & (df['Language'] == 'English')
]
df_EnglishP

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y0001,Y0001,112407,8,137830,food,English,Goverment
Y0004,Y0004,154427,92,172224,gaming,English,NGO
Y0005,Y0005,171466,61,183685,food,English,Individual
Y0008,Y0008,142673,87,154982,food,English,Corperate
Y0013,Y0013,112532,20,74012,gaming,English,Goverment
...,...,...,...,...,...,...,...
Y9979,Y9979,135756,35,83628,gaming,English,NGO
Y9984,Y9984,152586,41,8405,travel,English,NGO
Y9985,Y9985,115796,8,146382,music,English,Individual
Y9995,Y9995,119716,68,5922,food,English,Goverment


#### Gaming channel with many videos

In [27]:
# Both conditions must hold: category is 'gaming' AND more than 90 videos.
df_gaming_nv = df.loc[
    (df['Category'] == 'gaming')
    & (df['nvideos'] > 90)
]
df_gaming_nv

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y0004,Y0004,154427,92,172224,gaming,English,NGO
Y0039,Y0039,114186,96,145534,gaming,English,Individual
Y0043,Y0043,83225,99,2881,gaming,English,Individual
Y0075,Y0075,54639,95,70018,gaming,Franch,Individual
Y0299,Y0299,119074,99,79042,gaming,English,Individual
...,...,...,...,...,...,...,...
Y9750,Y9750,101044,96,148319,gaming,English,NGO
Y9784,Y9784,100741,91,83593,gaming,Franch,Individual
Y9793,Y9793,83465,99,0,gaming,English,NGO
Y9924,Y9924,71820,92,106613,gaming,English,NGO


#### Non-Corperate and News channel

In [28]:
# Both conditions must hold: category is 'news' AND type is anything but 'Corperate'.
df_nc_news = df.loc[
    (df['Category'] == 'news')
    & (df['Type'] != 'Corperate')
]
df_nc_news

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y0006,Y0006,25407,30,29163,news,English,NGO
Y0009,Y0009,42078,66,42632,news,English,Individual
Y0017,Y0017,182756,86,150412,news,English,NGO
Y0027,Y0027,87038,54,70244,news,English,Individual
Y0050,Y0050,93176,96,43607,news,Franch,NGO
...,...,...,...,...,...,...,...
Y9964,Y9964,32626,86,14193,news,Spanish,Individual
Y9966,Y9966,81409,78,90670,news,Japanese,Individual
Y9969,Y9969,73792,19,105253,news,Spanish,Individual
Y9994,Y9994,100716,70,90956,news,Japanese,NGO


#### Top Subs **and** Top nvideos

In [29]:
# Both conditions must hold: more than 200000 subscribers AND more than 90 videos.
df_subs_and_nvideos = df.loc[
    (df['subs'] > 200000)
    & (df['nvideos'] > 90)
]
df_subs_and_nvideos

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y0457,Y0457,200578,98,179438,music,English,NGO
Y1234,Y1234,270322,91,246792,music,English,Individual
Y1752,Y1752,252441,97,94488,music,Spanish,NGO
Y1966,Y1966,239748,95,234882,gaming,English,Goverment
Y2118,Y2118,203434,94,251128,gaming,Russia,Goverment
Y2264,Y2264,209237,99,214449,news,English,Individual
Y2369,Y2369,215407,91,222893,travel,English,Individual
Y2655,Y2655,216743,98,294638,gaming,Chinese,NGO
Y2788,Y2788,253750,95,41430,food,Russia,Individual
Y3241,Y3241,204562,91,36758,travel,English,NGO


#### Top Subs **or** Top nvideos

In [30]:
# Either condition is enough: more than 200000 subscribers OR more than 90 videos.
df_subs_or_nvideos = df.loc[
    (df['subs'] > 200000)
    | (df['nvideos'] > 90)
]
df_subs_or_nvideos

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y0004,Y0004,154427,92,172224,gaming,English,NGO
Y0010,Y0010,94659,92,19164,music,English,Goverment
Y0018,Y0018,214402,24,213773,gaming,Chinese,NGO
Y0034,Y0034,77831,95,30652,food,English,NGO
Y0039,Y0039,114186,96,145534,gaming,English,Individual
...,...,...,...,...,...,...,...
Y9955,Y9955,87258,96,52558,travel,Franch,Goverment
Y9973,Y9973,120056,98,109683,food,English,NGO
Y9977,Y9977,131093,94,186946,travel,English,Individual
Y9987,Y9987,72528,91,28345,gaming,English,Goverment


### Aggregation

In [31]:
# Group the channels by Category and total the numeric columns per group.
# numeric_only=True limits the aggregation to the integer columns (subs,
# nvideos, views); without it the string columns are passed to sum() as well,
# which newer pandas versions no longer drop silently.
byCategory = df.groupby('Category')
byCategory.sum(numeric_only=True)

  byCategory.sum()


Unnamed: 0_level_0,subs,nvideos,views
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
food,201680961,99402,153058502
gaming,199010103,100746,152632987
music,197883579,99674,149484275
news,198667773,100718,151458880
travel,200028733,98658,150532994


In [32]:
# Per-category averages of the numeric columns. A mean is undefined for the
# string columns, and newer pandas versions raise a TypeError for them instead
# of dropping them silently, so restrict the aggregation explicitly.
byCategory.mean(numeric_only=True)

  byCategory.mean()


Unnamed: 0_level_0,subs,nvideos,views
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
food,100040.159226,49.306548,75921.875992
gaming,99108.617032,50.172311,76012.443725
music,100346.642495,50.544625,75803.384888
news,99632.784855,50.510532,75957.311936
travel,99516.782587,49.083582,74892.036816


In [33]:
# Group the channels by Language and total the numeric columns per group.
# numeric_only=True limits the aggregation to the integer columns (subs,
# nvideos, views); without it the string columns are passed to sum() as well,
# which newer pandas versions no longer drop silently.
byLanguage = df.groupby('Language')
byLanguage.sum(numeric_only=True)

  byLanguage.sum()


Unnamed: 0_level_0,subs,nvideos,views
Language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Chinese,104324228,53327,79330991
English,496591607,250622,376477909
Franch,101118223,49236,76222840
Japanese,106116938,51277,83271965
Russia,92832378,46971,72219624
Spanish,96287775,47765,69644309


In [34]:
# Per-language averages of the numeric columns. A mean is undefined for the
# string columns, and newer pandas versions raise a TypeError for them instead
# of dropping them silently, so restrict the aggregation explicitly.
byLanguage.mean(numeric_only=True)

  byLanguage.mean()


Unnamed: 0_level_0,subs,nvideos,views
Language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Chinese,99831.797129,51.030622,75914.823923
English,99697.170648,50.315599,75582.796426
Franch,101320.864729,49.334669,76375.591182
Japanese,101450.227533,49.021989,79609.909178
Russia,97308.572327,49.235849,75701.91195
Spanish,98655.507172,48.939549,71356.873975


In [35]:
# Group by two keys at once, so each (Language, Type) pair forms its own group.
byLanType = df.groupby(['Language', 'Type'])
# Summary statistics (count, mean, std, min, quartiles, max) per group.
byLanType.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,subs,subs,subs,subs,subs,subs,subs,subs,nvideos,nvideos,nvideos,nvideos,nvideos,views,views,views,views,views,views,views,views
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Language,Type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
Chinese,Corperate,110.0,99457.336364,50733.441208,1.0,62248.0,98675.5,135087.5,237038.0,110.0,46.336364,...,75.5,99.0,110.0,71285.681818,65986.120904,0.0,20212.75,53308.0,97552.0,311890.0
Chinese,Goverment,237.0,102804.552743,52212.092706,1.0,70381.0,100310.0,134932.0,263176.0,237.0,52.801688,...,78.0,98.0,237.0,74188.147679,64197.005122,0.0,23461.0,52070.0,111075.0,294214.0
Chinese,Individual,406.0,95629.667488,50165.313076,1.0,58861.0,96165.5,132292.75,247324.0,406.0,51.647783,...,76.0,99.0,406.0,75170.899015,62639.002587,0.0,24870.0,59563.0,115691.25,336057.0
Chinese,NGO,292.0,103402.729452,51112.394733,1.0,69207.5,101904.5,135814.75,261685.0,292.0,50.503425,...,73.0,99.0,292.0,80094.486301,60962.878111,35.0,33152.5,68132.5,117533.25,294638.0
English,Corperate,509.0,96385.499018,51289.569872,1.0,59867.0,94118.0,130254.0,286875.0,509.0,48.263261,...,73.0,99.0,509.0,71423.113949,60978.184166,0.0,23229.0,57982.0,102267.0,303855.0
English,Goverment,942.0,100659.650743,48636.473989,1.0,66303.0,102239.0,135400.0,239748.0,942.0,51.132696,...,76.0,99.0,942.0,75625.174098,60702.564724,0.0,26101.5,62926.5,112054.25,311153.0
English,Individual,2042.0,99512.581293,49236.312216,1.0,66363.0,100693.0,132786.75,282862.0,2042.0,51.214985,...,76.0,99.0,2042.0,75803.887855,61317.29723,0.0,25055.75,64702.5,113763.5,359679.0
English,NGO,1488.0,100473.995968,49121.816322,1.0,66272.75,100893.0,132323.75,260881.0,1488.0,49.266129,...,73.0,99.0,1488.0,76675.464382,60287.765229,0.0,27884.5,62826.5,113427.5,329993.0
Franch,Corperate,115.0,95224.33913,47015.224298,1.0,63358.5,93194.0,128991.0,255294.0,115.0,51.347826,...,79.0,99.0,115.0,68803.686957,62554.067493,888.0,25039.0,46710.0,103121.0,368409.0
Franch,Goverment,184.0,106364.016304,49287.658376,1.0,68470.25,105015.0,143614.25,224179.0,184.0,47.043478,...,71.25,99.0,184.0,78024.032609,63402.414183,0.0,24841.75,64288.0,116918.5,268350.0


In [36]:
# Per-group summary statistics for the 'subs' and 'views' columns only;
# the columns are picked on the grouped object before describing.
byLanType[['subs', 'views']].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,subs,subs,subs,subs,subs,subs,subs,subs,views,views,views,views,views,views,views,views
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Language,Type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
Chinese,Corperate,110.0,99457.336364,50733.441208,1.0,62248.0,98675.5,135087.5,237038.0,110.0,71285.681818,65986.120904,0.0,20212.75,53308.0,97552.0,311890.0
Chinese,Goverment,237.0,102804.552743,52212.092706,1.0,70381.0,100310.0,134932.0,263176.0,237.0,74188.147679,64197.005122,0.0,23461.0,52070.0,111075.0,294214.0
Chinese,Individual,406.0,95629.667488,50165.313076,1.0,58861.0,96165.5,132292.75,247324.0,406.0,75170.899015,62639.002587,0.0,24870.0,59563.0,115691.25,336057.0
Chinese,NGO,292.0,103402.729452,51112.394733,1.0,69207.5,101904.5,135814.75,261685.0,292.0,80094.486301,60962.878111,35.0,33152.5,68132.5,117533.25,294638.0
English,Corperate,509.0,96385.499018,51289.569872,1.0,59867.0,94118.0,130254.0,286875.0,509.0,71423.113949,60978.184166,0.0,23229.0,57982.0,102267.0,303855.0
English,Goverment,942.0,100659.650743,48636.473989,1.0,66303.0,102239.0,135400.0,239748.0,942.0,75625.174098,60702.564724,0.0,26101.5,62926.5,112054.25,311153.0
English,Individual,2042.0,99512.581293,49236.312216,1.0,66363.0,100693.0,132786.75,282862.0,2042.0,75803.887855,61317.29723,0.0,25055.75,64702.5,113763.5,359679.0
English,NGO,1488.0,100473.995968,49121.816322,1.0,66272.75,100893.0,132323.75,260881.0,1488.0,76675.464382,60287.765229,0.0,27884.5,62826.5,113427.5,329993.0
Franch,Corperate,115.0,95224.33913,47015.224298,1.0,63358.5,93194.0,128991.0,255294.0,115.0,68803.686957,62554.067493,888.0,25039.0,46710.0,103121.0,368409.0
Franch,Goverment,184.0,106364.016304,49287.658376,1.0,68470.25,105015.0,143614.25,224179.0,184.0,78024.032609,63402.414183,0.0,24841.75,64288.0,116918.5,268350.0


In [37]:
# From the per-group statistics, take the 'English' entry of the first index
# level (Language) and keep only the 'subs' and 'views' statistics; the
# remaining row index is Type.
byLanType.describe().loc['English',['subs','views']]

Unnamed: 0_level_0,subs,subs,subs,subs,subs,subs,subs,subs,views,views,views,views,views,views,views,views
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Corperate,509.0,96385.499018,51289.569872,1.0,59867.0,94118.0,130254.0,286875.0,509.0,71423.113949,60978.184166,0.0,23229.0,57982.0,102267.0,303855.0
Goverment,942.0,100659.650743,48636.473989,1.0,66303.0,102239.0,135400.0,239748.0,942.0,75625.174098,60702.564724,0.0,26101.5,62926.5,112054.25,311153.0
Individual,2042.0,99512.581293,49236.312216,1.0,66363.0,100693.0,132786.75,282862.0,2042.0,75803.887855,61317.29723,0.0,25055.75,64702.5,113763.5,359679.0
NGO,1488.0,100473.995968,49121.816322,1.0,66272.75,100893.0,132323.75,260881.0,1488.0,76675.464382,60287.765229,0.0,27884.5,62826.5,113427.5,329993.0


### Manipulation

In [38]:
# Add a derived column: subscribers per video (element-wise division).
# nvideos was drawn with a lower bound of 1, so the divisor is never zero.
df['subpervideo'] = df['subs']/df['nvideos']
df['subpervideo'].describe()

count     10000.000000
mean       5261.265431
std       13647.291270
min           0.010101
25%        1160.077077
50%        1996.801082
75%        3985.608989
max      235418.000000
Name: subpervideo, dtype: float64

In [39]:
# Add a derived column: total views per video (element-wise division).
# nvideos was drawn with a lower bound of 1, so the divisor is never zero.
df['viewspervideo'] = df['views']/df['nvideos']
df['viewspervideo'].describe()

count     10000.000000
mean       3984.010792
std       11965.410055
min           0.000000
25%         533.549567
50%        1348.045304
75%        3020.480612
max      254193.000000
Name: viewspervideo, dtype: float64

In [40]:
# Add a derived column: total views per subscriber (element-wise division).
# Non-positive subscriber counts were replaced by 1 earlier, so the divisor
# is never zero; those rows are also where the very large ratios come from.
df['viewspersub'] = df['views']/df['subs']
df['viewspersub'].describe()

count    10000.000000
mean         8.448969
std         73.398496
min          0.000000
25%          0.376409
50%          0.766664
75%          1.140467
max       1134.000000
Name: viewspersub, dtype: float64

In [41]:
# Show the first ten rows, now including the three derived ratio columns.
df.head(10)

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type,subpervideo,viewspervideo,viewspersub
Y0000,Y0000,28045,61,34515,food,Russia,Individual,459.754098,565.819672,1.230701
Y0001,Y0001,112407,8,137830,food,English,Goverment,14050.875,17228.75,1.226169
Y0002,Y0002,75610,89,75561,food,Franch,Goverment,849.550562,849.0,0.999352
Y0003,Y0003,68566,17,14567,food,English,Individual,4033.294118,856.882353,0.212452
Y0004,Y0004,154427,92,172224,gaming,English,NGO,1678.554348,1872.0,1.115245
Y0005,Y0005,171466,61,183685,food,English,Individual,2810.918033,3011.229508,1.071262
Y0006,Y0006,25407,30,29163,news,English,NGO,846.9,972.1,1.147833
Y0007,Y0007,90808,74,133107,food,Franch,Goverment,1227.135135,1798.743243,1.465807
Y0008,Y0008,142673,87,154982,food,English,Corperate,1639.91954,1781.402299,1.086274
Y0009,Y0009,42078,66,42632,news,English,Individual,637.545455,645.939394,1.013166


## Congratulations! You completed our course: Data Wrangling -- Fundamentals