In [1]:
import pandas as pd

In [2]:
# Load the titles table (one row per title, keyed by season `sid`,
# episode `eid` and title id `tid`) from the hosted CSV file.
snl = pd.read_csv(
    "https://github.com/gdv/foundationsCS/raw/main/students/ex-data/snldb/snl_title.csv"
)
snl.head()

Unnamed: 0,sid,eid,tid,title,titleType
0,3,20,1978052013,"""Space is the Place"", ""Space-Loneliness""",Musical Performance
1,2,21,1977051416,,Goodnights
2,3,18,1978042215,,Goodnights
3,3,20,1978052014,,Goodnights
4,3,18,1978042214,Next Week In Review,Show


In [3]:
# Load the episodes table; `aired` is parsed as a datetime column so that
# the `.dt` accessor can be used on it later.
episodes = pd.read_csv(
    "https://github.com/gdv/foundationsCS/raw/main/students/ex-data/snldb/snl_episode.csv",
    parse_dates=["aired"],
)
episodes.head()

Unnamed: 0,sid,eid,year,aired
0,3,20,1977,1978-05-20
1,3,19,1977,1978-05-13
2,3,18,1977,1978-04-22
3,2,21,1976,1977-05-14
4,2,22,1976,1977-05-21


In [4]:
# Load the seasons table, which maps each season id (`sid`) to its `year`.
seasons = pd.read_csv(
    "https://github.com/gdv/foundationsCS/raw/main/students/ex-data/snldb/snl_season.csv"
)
seasons.head()

Unnamed: 0,sid,year
0,1,1975
1,2,1976
2,3,1977
3,4,1978
4,5,1979


## Compute the sketches of the season with year 1978

The condition for selecting the rows is a single equality

In [5]:
# Keep only the season whose year is 1978, then attach every title of that
# season by joining on the season id.
y1978 = seasons.loc[seasons["year"] == 1978].merge(snl, on="sid")
y1978

Unnamed: 0,sid,year,eid,tid,title,titleType
0,4,1978,19,1979051915,,Goodnights
1,4,1978,20,1979052614,,Guest Performance
2,4,1978,18,1979051212,,Goodnights
3,4,1978,17,1979041412,,Goodnights
4,4,1978,17,1979041411,September Song,Miscellaneous
...,...,...,...,...,...,...
239,4,1978,15,197903176,St. Mickey's Knights of Columbus,Sketch
240,4,1978,15,197903173,The Navy,Commercial
241,4,1978,15,197903172,,Monologue
242,4,1978,15,197903174,Fred Garvin: Male Prostitute,Sketch


## Compute the sketches of the seasons with year 1978-1982

In this case the selection condition is more complex: it is the conjunction of two different conditions.
Because of operator precedence (`&` binds more tightly than the comparison operators), each condition must be wrapped in parentheses.

In [6]:
# Both bounds are inclusive. Each comparison needs its own parentheses
# because `&` has higher precedence than `>=` and `<=`.
years = seasons.loc[
    (seasons["year"] >= 1978) & (seasons["year"] <= 1982)
]
years

Unnamed: 0,sid,year
3,4,1978
4,5,1979
5,6,1980
6,7,1981
7,8,1982


In [7]:
# Attach the titles of every selected season by joining on the season id.
years.merge(snl, on="sid")

Unnamed: 0,sid,year,eid,tid,title,titleType
0,4,1978,19,1979051915,,Goodnights
1,4,1978,20,1979052614,,Guest Performance
2,4,1978,18,1979051212,,Goodnights
3,4,1978,17,1979041412,,Goodnights
4,4,1978,17,1979041411,September Song,Miscellaneous
...,...,...,...,...,...,...
1393,8,1982,15,198303128,The Buckwheat Story,Commercial
1394,8,1982,15,198303124,Donny & Marie St. Patrick's Day Special,Show
1395,8,1982,15,198303122,,Monologue
1396,8,1982,15,198303123,Buckwheat Jeans,Commercial


## Compute the sketches aired in 1978 (consider the table snl_episode.csv)

First we merge the episodes with the snl dataframe, so that we have the air date of each sketch.

In [8]:
# Join every title with its episode so that each row carries the air date.
# An episode is identified by the pair (season id, episode id), so both
# columns are used as the join key. The key is passed as a list, which is the
# documented form for multiple columns (a tuple can be read as one label).
# NOTE: the name `all` shadows the Python builtin `all()` for the rest of the
# notebook; it is kept here because later cells refer to it.
all = pd.merge(snl, episodes, on=["sid", "eid"])
all.head()

Unnamed: 0,sid,eid,tid,title,titleType,year,aired
0,3,20,1978052013,"""Space is the Place"", ""Space-Loneliness""",Musical Performance,1977,1978-05-20
1,3,20,1978052014,,Goodnights,1977,1978-05-20
2,3,20,1978052012,The Franken and Davis Show,Show,1977,1978-05-20
3,3,20,1978052011,Bad Conceptual Art,Show,1977,1978-05-20
4,3,20,197805209,More Insects To Worry About,Show,1977,1978-05-20


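Before extracting the relevant sketches, we need the year in which each sketch was aired.

The `year` column coming from the episodes table holds the year of the season (e.g. 1977 for an episode aired on 1978-05-20), so we overwrite it with the year of the air date.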

In [9]:
# Replace the contents of `year` with the calendar year taken from the
# air date of each row.
all = all.assign(year=all["aired"].dt.year)
all

Unnamed: 0,sid,eid,tid,title,titleType,year,aired
0,3,20,1978052013,"""Space is the Place"", ""Space-Loneliness""",Musical Performance,1978,1978-05-20
1,3,20,1978052014,,Goodnights,1978,1978-05-20
2,3,20,1978052012,The Franken and Davis Show,Show,1978,1978-05-20
3,3,20,1978052011,Bad Conceptual Art,Show,1978,1978-05-20
4,3,20,197805209,More Insects To Worry About,Show,1978,1978-05-20
...,...,...,...,...,...,...,...
11693,11,6,198512215,Hildy,Sketch,1985,1985-12-21
11694,11,6,198512214,A Dozen Eggs,Sketch,1985,1985-12-21
11695,11,6,198512212,,Monologue,1985,1985-12-21
11696,11,6,198512213,Critic,Commercial,1985,1985-12-21


Then we extract only the relevant sketches

In [10]:
# Keep only the rows whose `year` value is 1978.
all.loc[all["year"].eq(1978)]

Unnamed: 0,sid,eid,tid,title,titleType,year,aired
0,3,20,1978052013,"""Space is the Place"", ""Space-Loneliness""",Musical Performance,1978,1978-05-20
1,3,20,1978052014,,Goodnights,1978,1978-05-20
2,3,20,1978052012,The Franken and Davis Show,Show,1978,1978-05-20
3,3,20,1978052011,Bad Conceptual Art,Show,1978,1978-05-20
4,3,20,197805209,More Insects To Worry About,Show,1978,1978-05-20
...,...,...,...,...,...,...,...
2371,3,10,197801285,"""Runaway""",Musical Performance,1978,1978-01-28
2372,3,10,197801284,X-Police,Sketch,1978,1978-01-28
2373,3,10,197801282,,Monologue,1978,1978-01-28
2374,3,10,197801283,The Olympia Cafe,Sketch,1978,1978-01-28


## For each season, compute the average number of top 1000 users

In [11]:
# Load the per-episode ratings table from the hosted CSV file.
ratings = pd.read_csv(
    "https://github.com/gdv/foundationsCS/raw/main/students/ex-data/snldb/snl_rating.csv"
)
ratings.head()

Unnamed: 0,sid,eid,1,10,2,3,4,5,6,7,...,Males Aged 45+_avg,Males under 18,Males under 18_avg,Males_avg,Non-US users,Non-US users_avg,Top 1000 voters,Top 1000 voters_avg,US users,US users_avg
0,7,1,0,3,0,0,5,2,10,4,...,7.2,0,,7.1,6,6.4,10,6.4,22,6.8
1,10,1,2,4,0,2,0,4,5,1,...,7.3,0,,6.1,7,5.4,7,6.4,13,6.6
2,6,13,0,3,0,0,1,0,2,9,...,7.8,0,,7.9,5,7.7,9,7.4,17,7.7
3,2,1,0,8,0,2,1,7,8,15,...,7.1,0,,7.2,12,7.4,15,6.7,33,6.9
4,3,20,0,6,0,2,1,3,4,13,...,6.9,0,,7.1,7,7.3,14,7.3,21,6.8


In [12]:
# Group the episodes by season and average the "Top 1000 voters" column.
# Despite the variable name, the values are mean vote counts per season,
# not mean ratings.
avg_rating_of_season = (
    ratings
    .groupby("sid")["Top 1000 voters"]
    .agg("mean")
)
avg_rating_of_season

sid
1     21.125000
2     11.652174
3     13.500000
4     12.250000
5     10.000000
6     10.846154
7      9.750000
8      9.850000
9      9.052632
10     7.888889
11     8.666667
12     6.850000
13     7.615385
14     7.400000
15     8.750000
16    11.050000
17    10.550000
18    11.650000
19    11.700000
20    10.000000
21     7.850000
22     8.300000
23     9.000000
24     9.526316
25    10.300000
26    11.050000
27    10.050000
28     8.450000
29    11.050000
30    14.750000
31    17.052632
32    18.100000
33    17.666667
34    20.500000
35    27.681818
36    31.272727
37    32.772727
38    30.809524
39    26.571429
40    25.619048
41    25.190476
42    18.769231
Name: Top 1000 voters, dtype: float64

##  For each season, compute the difference between the maximum and the minimum ratings

First we extract the max and min rating for each season. Since we will merge the resulting tables by using the `sid` column, the `merge` is simpler if both dataframes retain the `sid` column (that is, it does not become part of the index).

In [13]:
# Per-season minimum of "Top 1000 voters". `as_index=False` keeps `sid` as a
# regular column, which makes the later merge on `sid` straightforward.
mins = (
    ratings[["sid", "Top 1000 voters"]]
    .groupby("sid", as_index=False)
    .min()
)
mins.head()

Unnamed: 0,sid,Top 1000 voters
0,1,14
1,2,8
2,3,8
3,4,8
4,5,7


In [14]:
# Per-season maximum of "Top 1000 voters". `as_index=False` keeps `sid` as a
# regular column, which makes the later merge on `sid` straightforward.
maxs = (
    ratings[["sid", "Top 1000 voters"]]
    .groupby("sid", as_index=False)
    .max()
)
maxs.head()

Unnamed: 0,sid,Top 1000 voters
0,1,39
1,2,15
2,3,17
3,4,18
4,5,14


In [15]:
# Put the minimum and the maximum of each season side by side. Both frames
# have a "Top 1000 voters" column, so the suffixes tell the two apart.
merged = mins.merge(maxs, on="sid", suffixes=("_min", "_max"))
merged.head()

Unnamed: 0,sid,Top 1000 voters_min,Top 1000 voters_max
0,1,14,39
1,2,8,15
2,3,8,17
3,4,8,18
4,5,7,14


In [16]:
# Spread of each season: maximum minus minimum, stored in a `diff` column.
merged["diff"] = merged["Top 1000 voters_max"].sub(merged["Top 1000 voters_min"])
merged.loc[:, ["sid", "diff"]]

Unnamed: 0,sid,diff
0,1,25
1,2,7
2,3,9
3,4,10
4,5,7
5,6,7
6,7,6
7,8,10
8,9,6
9,10,7
