In [1]:
# Import Dependencies
import os
import csv
import pandas as pd

In [2]:
# Create a path to the csv and read it into a Pandas DataFrame
csv_path = "ted_talks.csv"
ted_df = pd.read_csv(csv_path)

ted_df.head()

Unnamed: 0,comments,description,duration,event,languages,main_speaker,name,title,views
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,Do schools kill creativity?,47227110
1,265,With the same humor and humanity he exuded in ...,977,TED2006,43,Al Gore,Al Gore: Averting the climate crisis,Averting the climate crisis,3200520
2,124,New York Times columnist David Pogue takes aim...,1286,TED2006,26,David Pogue,David Pogue: Simplicity sells,Simplicity sells,1636292
3,200,"In an emotionally charged talk, MacArthur-winn...",1116,TED2006,35,Majora Carter,Majora Carter: Greening the ghetto,Greening the ghetto,1697550
4,593,You've never seen data presented like this. Wi...,1190,TED2006,48,Hans Rosling,Hans Rosling: The best stats you've ever seen,The best stats you've ever seen,12005869


In [14]:
# Figure out the minimum and maximum views for a TED Talk
maxv = ted_df["views"].max()
minv = ted_df["views"].min()

47227110


In [13]:
# Create bins in which to place values based upon TED Talk views
step = int((maxv-minv)/10)
bins = []
for i in range(minv, maxv, step):
    bins.append(i)
print(bins)
print(step)
# Create labels for these bins
labels = ["50443 - 4768109", "4768109 - 9485775", "9485775 - 14203441", "14203441 - 18921107", "18921107 - 23638773",
          "23638773 - 28356439", "28356439 - 33074105", "33074105 - 37791771", "37791771 - 42509437", "42509437 - 47227103"]

[50443, 4768109, 9485775, 14203441, 18921107, 23638773, 28356439, 33074105, 37791771, 42509437, 47227103]
4717666


In [5]:
# Slice the data and place it into bins
pd.cut(ted_df["views"], bins = bins, labels = labels).head()

0                   NaN
1       50443 - 4768109
2       50443 - 4768109
3       50443 - 4768109
4    9485775 - 14203441
Name: views, dtype: category
Categories (10, object): [50443 - 4768109 < 4768109 - 9485775 < 9485775 - 14203441 < 14203441 - 18921107 ... 28356439 - 33074105 < 33074105 - 37791771 < 37791771 - 42509437 < 42509437 - 47227103]

In [6]:
# Place the data series into a new column inside of the DataFrame
ted_df["View group"] = pd.cut(ted_df["views"], bins = bins, labels = labels)
ted_df.head()

Unnamed: 0,comments,description,duration,event,languages,main_speaker,name,title,views,View group
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,Do schools kill creativity?,47227110,
1,265,With the same humor and humanity he exuded in ...,977,TED2006,43,Al Gore,Al Gore: Averting the climate crisis,Averting the climate crisis,3200520,50443 - 4768109
2,124,New York Times columnist David Pogue takes aim...,1286,TED2006,26,David Pogue,David Pogue: Simplicity sells,Simplicity sells,1636292,50443 - 4768109
3,200,"In an emotionally charged talk, MacArthur-winn...",1116,TED2006,35,Majora Carter,Majora Carter: Greening the ghetto,Greening the ghetto,1697550,50443 - 4768109
4,593,You've never seen data presented like this. Wi...,1190,TED2006,48,Hans Rosling,Hans Rosling: The best stats you've ever seen,The best stats you've ever seen,12005869,9485775 - 14203441


In [10]:
# Create a GroupBy object based upon "View Group"
ted_view_df = ted_df.groupby(["View group"])

# Find how many rows fall into each bin
print(ted_view_df["View group"].count())

# Get the average of each column within the GroupBy object
ted_view_df[["comments", "duration", "languages"]].mean()

View group
50443 - 4768109        2435
4768109 - 9485775        78
9485775 - 14203441       15
14203441 - 18921107      11
18921107 - 23638773       6
23638773 - 28356439       0
28356439 - 33074105       1
33074105 - 37791771       1
37791771 - 42509437       0
42509437 - 47227103       1
Name: View group, dtype: int64


Unnamed: 0_level_0,comments,duration,languages
View group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
50443 - 4768109,172.064887,823.062012,26.758111
4768109 - 9485775,434.897436,844.410256,38.0
9485775 - 14203441,779.6,910.0,42.8
14203441 - 18921107,724.909091,950.818182,41.727273
18921107 - 23638773,866.0,861.666667,42.166667
23638773 - 28356439,,,
28356439 - 33074105,1927.0,1219.0,52.0
33074105 - 37791771,1930.0,1084.0,45.0
37791771 - 42509437,,,
42509437 - 47227103,2290.0,1262.0,51.0
