In [1]:
# to scrape data from wikipedia, we need to install the package called lxml
# we can do that here from our notebook or, if we think we will use it often, we could modify our "install_packages"
# shell script to install it automatically each time we start a job in UCloud
%pip install lxml

# import pandas so we can put data in a nice dataframe
# we'll abbreviate pandas as pd, because that's what everybody does
import pandas as pd

Note: you may need to restart the kernel to use updated packages.


## Scraping data from the web
Using `pandas.read_html`, we can read data from websites where data is presented in a table-like format. Wikipedia has lots of these, and is a great source for data to play with. Below, we'll look at data from the [List of Sesame Street Muppets](https://en.wikipedia.org/wiki/List_of_Sesame_Street_Muppets).

In [2]:
# scrape table data from a website

# read_html gives back a list with one dataframe for each table it finds on the page;
# the table we want is the one at index 1
muppets_url = "https://en.wikipedia.org/wiki/List_of_Sesame_Street_Muppets"
rawdata = pd.read_html(muppets_url)
df = rawdata[1]
df

Unnamed: 0,Character,Actor/Muppet performer,Description,Unnamed: 3
0,Abby Cadabby,Leslie Carrara-Rudolph[13],A 4-year-old fairy-in-training with tiny wings...,
1,Alice Snuffleupagus,Judy Sladky[15],"The baby sister of Aloysius Snuffleupagus (""Sn...",
2,Alistair Cookie,Frank Oz[17],"Played by Cookie Monster, he is a parody of Br...",
3,The Amazing Mumford,"Jerry Nelson,[20] John Kennedy[21]","A ""W.C. Fields-esque"" magician whose magic tri...",
4,Anything Muppets,Various[22],Writer Christopher Finch called Anything Muppe...,Writer Christopher Finch called Anything Muppe...
...,...,...,...,...
114,Telly Monster,"Bob Payne (1979), Brian Muehl (1979–1984),[126...","According to Sesame Workshop, ""Telly Monster i...",
115,The Twiddlebugs,"Jim Henson, Frank Oz, Jerry Nelson, Richard Hu...","A family of fuzzy insects (Thomas, Tessie and ...",
116,The Two-Headed Monster,Left Head: Jerry Nelson[40] Right Head: Richar...,"A purple monster with two heads, who generally...",
117,Wes,Bradley Freeman Jr.,A young Muppet boy. His first appearances incl...,


In [3]:
# calling list() on a dataframe gives us its column names
list(df)

['Character', 'Actor/Muppet performer', 'Description', 'Unnamed: 3']

## Removing an unwanted column
Below are two ways to get rid of the final column (there are more ways to do this!). If you find the "axis = 1" part in the first method confusing, well, I do too, and [we are not alone](https://stackoverflow.com/questions/22149584/what-does-axis-in-pandas-mean)! My advice for now is to just accept it and move on.

In [4]:

# method 1: drop the column by its name
# axis = 1 tells pandas that 'Unnamed: 3' is the label of a column (not of a row)
# errors = 'ignore' makes this cell safe to run more than once: without it, running the
# cell again after the column is already gone would stop with a KeyError
df = df.drop(['Unnamed: 3'], axis = 1, errors = 'ignore')
df

Unnamed: 0,Character,Actor/Muppet performer,Description
0,Abby Cadabby,Leslie Carrara-Rudolph[13],A 4-year-old fairy-in-training with tiny wings...
1,Alice Snuffleupagus,Judy Sladky[15],"The baby sister of Aloysius Snuffleupagus (""Sn..."
2,Alistair Cookie,Frank Oz[17],"Played by Cookie Monster, he is a parody of Br..."
3,The Amazing Mumford,"Jerry Nelson,[20] John Kennedy[21]","A ""W.C. Fields-esque"" magician whose magic tri..."
4,Anything Muppets,Various[22],Writer Christopher Finch called Anything Muppe...
...,...,...,...
114,Telly Monster,"Bob Payne (1979), Brian Muehl (1979–1984),[126...","According to Sesame Workshop, ""Telly Monster i..."
115,The Twiddlebugs,"Jim Henson, Frank Oz, Jerry Nelson, Richard Hu...","A family of fuzzy insects (Thomas, Tessie and ..."
116,The Two-Headed Monster,Left Head: Jerry Nelson[40] Right Head: Richar...,"A purple monster with two heads, who generally..."
117,Wes,Bradley Freeman Jr.,A young Muppet boy. His first appearances incl...


In [5]:

# method 2: keep only the first three columns, selecting them by label with .loc
df = df.loc[:, df.columns[:3]]
df

Unnamed: 0,Character,Actor/Muppet performer,Description
0,Abby Cadabby,Leslie Carrara-Rudolph[13],A 4-year-old fairy-in-training with tiny wings...
1,Alice Snuffleupagus,Judy Sladky[15],"The baby sister of Aloysius Snuffleupagus (""Sn..."
2,Alistair Cookie,Frank Oz[17],"Played by Cookie Monster, he is a parody of Br..."
3,The Amazing Mumford,"Jerry Nelson,[20] John Kennedy[21]","A ""W.C. Fields-esque"" magician whose magic tri..."
4,Anything Muppets,Various[22],Writer Christopher Finch called Anything Muppe...
...,...,...,...
114,Telly Monster,"Bob Payne (1979), Brian Muehl (1979–1984),[126...","According to Sesame Workshop, ""Telly Monster i..."
115,The Twiddlebugs,"Jim Henson, Frank Oz, Jerry Nelson, Richard Hu...","A family of fuzzy insects (Thomas, Tessie and ..."
116,The Two-Headed Monster,Left Head: Jerry Nelson[40] Right Head: Richar...,"A purple monster with two heads, who generally..."
117,Wes,Bradley Freeman Jr.,A young Muppet boy. His first appearances incl...


In [6]:
# pull one column out of the dataframe and store its values in a plain Python list

a = df['Character'].tolist()

In [7]:
# the first four items in the list (leaving out the start of a slice means "from the beginning")
a[:4]

['Abby Cadabby',
 'Alice Snuffleupagus',
 'Alistair Cookie',
 'The Amazing Mumford']

In [8]:
# find the last three items in the list
# a negative start counts from the end of the list, so -3 means "third from last"
a[-3:]

['The Two-Headed Monster', 'Wes', 'Zoe']

In [9]:
# find items in the middle of the list
# the slice 7:10 starts at index 7 and stops before index 10, so we get the items at 7, 8 and 9
a[7:10]

['Arlene Frantic', 'Baby Bear', 'Barkley']

In [10]:
# copy the first 20 items of a into a new list called b
b = a[:20]
b


['Abby Cadabby',
 'Alice Snuffleupagus',
 'Alistair Cookie',
 'The Amazing Mumford',
 'Anything Muppets',
 'AM Monsters',
 'Aristotle',
 'Arlene Frantic',
 'Baby Bear',
 'Barkley',
 'Beautiful Day Monster[broken anchor]',
 'Bennett Snerf',
 'Benny',
 'Bert',
 'Betty Lou',
 'Biff',
 'Big Bird',
 'Bip Bippadotta',
 'Bruno',
 'Buster']

In [11]:
# take the last item off the list; pop hands back the item it removed
# (index -1 means "the last item", which is also what pop uses when no index is given)
b.pop(-1)

'Buster'

In [12]:
# display b again: pop() changed the list itself, so the last item ('Buster') is gone
b

['Abby Cadabby',
 'Alice Snuffleupagus',
 'Alistair Cookie',
 'The Amazing Mumford',
 'Anything Muppets',
 'AM Monsters',
 'Aristotle',
 'Arlene Frantic',
 'Baby Bear',
 'Barkley',
 'Beautiful Day Monster[broken anchor]',
 'Bennett Snerf',
 'Benny',
 'Bert',
 'Betty Lou',
 'Biff',
 'Big Bird',
 'Bip Bippadotta',
 'Bruno']

In [13]:
# remove (and return) the item at index 2, i.e. the third item in the list
b.pop(2)

'Alistair Cookie'

In [14]:
# add a new item to the end of the list
b.append('Kermit')

In [15]:
# display b: 'Alistair Cookie' has been removed, and 'Kermit' is now the last item
b

['Abby Cadabby',
 'Alice Snuffleupagus',
 'The Amazing Mumford',
 'Anything Muppets',
 'AM Monsters',
 'Aristotle',
 'Arlene Frantic',
 'Baby Bear',
 'Barkley',
 'Beautiful Day Monster[broken anchor]',
 'Bennett Snerf',
 'Benny',
 'Bert',
 'Betty Lou',
 'Biff',
 'Big Bird',
 'Bip Bippadotta',
 'Bruno',
 'Kermit']

In [16]:
# insert an item into a list at a particular position
# 'Kermit' is placed at index 5, and the items that were at index 5 and later move one place along
b.insert(5, 'Kermit')
b

['Abby Cadabby',
 'Alice Snuffleupagus',
 'The Amazing Mumford',
 'Anything Muppets',
 'AM Monsters',
 'Kermit',
 'Aristotle',
 'Arlene Frantic',
 'Baby Bear',
 'Barkley',
 'Beautiful Day Monster[broken anchor]',
 'Bennett Snerf',
 'Benny',
 'Bert',
 'Betty Lou',
 'Biff',
 'Big Bird',
 'Bip Bippadotta',
 'Bruno',
 'Kermit']

In [17]:
# replace an item in a list
# index 1 is the second item, because Python starts counting at 0
b[1]='Fozzy Bear'
b

['Abby Cadabby',
 'Fozzy Bear',
 'The Amazing Mumford',
 'Anything Muppets',
 'AM Monsters',
 'Kermit',
 'Aristotle',
 'Arlene Frantic',
 'Baby Bear',
 'Barkley',
 'Beautiful Day Monster[broken anchor]',
 'Bennett Snerf',
 'Benny',
 'Bert',
 'Betty Lou',
 'Biff',
 'Big Bird',
 'Bip Bippadotta',
 'Bruno',
 'Kermit']

In [18]:
# use a list comprehension to build a new list by doing the same thing to every item in b
# (this is an ordinary Python loop, just written compactly on one line)
c = [name + ' is a cute monster' for name in b]
c

['Abby Cadabby is a cute monster',
 'Fozzy Bear is a cute monster',
 'The Amazing Mumford is a cute monster',
 'Anything Muppets is a cute monster',
 'AM Monsters is a cute monster',
 'Kermit is a cute monster',
 'Aristotle is a cute monster',
 'Arlene Frantic is a cute monster',
 'Baby Bear is a cute monster',
 'Barkley is a cute monster',
 'Beautiful Day Monster[broken anchor] is a cute monster',
 'Bennett Snerf is a cute monster',
 'Benny is a cute monster',
 'Bert is a cute monster',
 'Betty Lou is a cute monster',
 'Biff is a cute monster',
 'Big Bird is a cute monster',
 'Bip Bippadotta is a cute monster',
 'Bruno is a cute monster',
 'Kermit is a cute monster']

In [19]:
# start over: make b a fresh copy of the first 20 items of a
b = a[:20]
b

['Abby Cadabby',
 'Alice Snuffleupagus',
 'Alistair Cookie',
 'The Amazing Mumford',
 'Anything Muppets',
 'AM Monsters',
 'Aristotle',
 'Arlene Frantic',
 'Baby Bear',
 'Barkley',
 'Beautiful Day Monster[broken anchor]',
 'Bennett Snerf',
 'Benny',
 'Bert',
 'Betty Lou',
 'Biff',
 'Big Bird',
 'Bip Bippadotta',
 'Bruno',
 'Buster']

In [20]:
# use a list comprehension to keep only the items of one list (b) that also appear in another (favs)
# the result follows the order of b, and anything in favs that is not in b is simply left out

favs = ['Barkley',  'Betty Lou', 'Bert', 'Kermit']
c = [muppet for muppet in b if muppet in favs]
c

['Barkley', 'Bert', 'Betty Lou']

In [21]:
# use a list comprehension to keep only the items of one list (b) that do NOT appear in another (favs)
b = a[:20]
favs = ['Barkley',  'Betty Lou', 'Bert']
c = [muppet for muppet in b if muppet not in favs]
c

# this gives us all the monsters in the list that are NOT my favorites

# Not really. I love all monsters.

['Abby Cadabby',
 'Alice Snuffleupagus',
 'Alistair Cookie',
 'The Amazing Mumford',
 'Anything Muppets',
 'AM Monsters',
 'Aristotle',
 'Arlene Frantic',
 'Baby Bear',
 'Beautiful Day Monster[broken anchor]',
 'Bennett Snerf',
 'Benny',
 'Biff',
 'Big Bird',
 'Bip Bippadotta',
 'Bruno',
 'Buster']

## More fun with list comprehensions

In [22]:
# start with the numbers 1 to 5, then build a new list in which each number is 10 larger
d = [1, 2, 3, 4, 5]
d = [number + 10 for number in d]
d

[11, 12, 13, 14, 15]

In [23]:
# halve every number in d (the / operator always gives a float, even for even numbers)
d = [number / 2 for number in d]
d

[5.5, 6.0, 6.5, 7.0, 7.5]

## Dataframe manipulation

In [24]:
# load the student sleep data from a csv file hosted on GitHub
sleep_url = "https://raw.githubusercontent.com/ethanweed/ExPsyLing/master/Data/StudentSleep.csv"
df = pd.read_csv(sleep_url)
df

Unnamed: 0,Student 1,Student 2,Student 3,Student 4,Student 5,Student 6
0,10,8,4,12,10,6
1,7,8,3,14,5,6
2,7,7,5,11,8,8
3,8,9,6,10,9,5
4,2,6,6,12,5,6
5,5,7,5,14,6,7
6,6,7,6,12,9,7


In [25]:
# shape gives the size of the dataframe as (number of rows, number of columns)
df.shape

(7, 6)

In [26]:
# keep the first four rows of the dataframe
df1 = df.head(4)
df1

Unnamed: 0,Student 1,Student 2,Student 3,Student 4,Student 5,Student 6
0,10,8,4,12,10,6
1,7,8,3,14,5,6
2,7,7,5,11,8,8
3,8,9,6,10,9,5


In [27]:
# the first element of shape is the number of rows
df.shape[0]

7

In [28]:
# keep the rows from position 4 up to the end, using the row count as the end of the slice
n_rows = df.shape[0]
df2 = df.iloc[4:n_rows]
df2

Unnamed: 0,Student 1,Student 2,Student 3,Student 4,Student 5,Student 6
4,2,6,6,12,5,6
5,5,7,5,14,6,7
6,6,7,6,12,9,7


In [29]:
# stack df2 on top of df1; note that the rows keep their original index labels
df3 = pd.concat([df2, df1])
df3

Unnamed: 0,Student 1,Student 2,Student 3,Student 4,Student 5,Student 6
4,2,6,6,12,5,6
5,5,7,5,14,6,7
6,6,7,6,12,9,7
0,10,8,4,12,10,6
1,7,8,3,14,5,6
2,7,7,5,11,8,8
3,8,9,6,10,9,5


In [30]:
# iloc selects by position, not by label: position 1 is the second row of df3,
# which here is the row whose index label is 5
df3.iloc[1]

Student 1     5
Student 2     7
Student 3     5
Student 4    14
Student 5     6
Student 6     7
Name: 5, dtype: int64

In [31]:
# stack df1 on top of df2, which puts the rows back in the order they had in df
df3 = pd.concat([df1, df2])
df3

Unnamed: 0,Student 1,Student 2,Student 3,Student 4,Student 5,Student 6
0,10,8,4,12,10,6
1,7,8,3,14,5,6
2,7,7,5,11,8,8
3,8,9,6,10,9,5
4,2,6,6,12,5,6
5,5,7,5,14,6,7
6,6,7,6,12,9,7


In [32]:
# compute the mean of each row (axis = 1 averages across the columns)
# and store the result in a new column called 'average'
row_means = df3.mean(axis = 1)
df3['average'] = row_means
df3

Unnamed: 0,Student 1,Student 2,Student 3,Student 4,Student 5,Student 6,average
0,10,8,4,12,10,6,8.333333
1,7,8,3,14,5,6,7.166667
2,7,7,5,11,8,8,7.666667
3,8,9,6,10,9,5,7.833333
4,2,6,6,12,5,6,6.166667
5,5,7,5,14,6,7,7.333333
6,6,7,6,12,9,7,7.833333


In [33]:
# pop removes the 'average' column from df3 and returns it,
# so m now holds the column and df3 no longer has it
m = df3.pop('average')

In [34]:
# show the popped column as a plain Python list
m.tolist()

[8.333333333333334,
 7.166666666666667,
 7.666666666666667,
 7.833333333333333,
 6.166666666666667,
 7.333333333333333,
 7.833333333333333]

In [35]:
# display df3 to confirm that the 'average' column is gone
df3

Unnamed: 0,Student 1,Student 2,Student 3,Student 4,Student 5,Student 6
0,10,8,4,12,10,6
1,7,8,3,14,5,6
2,7,7,5,11,8,8
3,8,9,6,10,9,5
4,2,6,6,12,5,6
5,5,7,5,14,6,7
6,6,7,6,12,9,7


In [36]:
# mean of each column (one value per student), stored as a plain Python list
column_means = df3.mean().tolist()
column_means

[6.428571428571429,
 7.428571428571429,
 5.0,
 12.142857142857142,
 7.428571428571429,
 6.428571428571429]

In [37]:
# collect the column names of df3 in a list
colnames = df3.columns.tolist()
colnames

['Student 1', 'Student 2', 'Student 3', 'Student 4', 'Student 5', 'Student 6']

In [38]:
# pair each column name with its mean, giving a list of (name, mean) tuples
[(name, mean) for name, mean in zip(colnames, column_means)]

[('Student 1', 6.428571428571429),
 ('Student 2', 7.428571428571429),
 ('Student 3', 5.0),
 ('Student 4', 12.142857142857142),
 ('Student 5', 7.428571428571429),
 ('Student 6', 6.428571428571429)]

In [39]:
# build a dictionary that maps each student's name to that student's mean
student_means = {name: mean for name, mean in zip(colnames, column_means)}
student_means

{'Student 1': 6.428571428571429,
 'Student 2': 7.428571428571429,
 'Student 3': 5.0,
 'Student 4': 12.142857142857142,
 'Student 5': 7.428571428571429,
 'Student 6': 6.428571428571429}

In [40]:
# look up one student's mean in the dictionary and print it rounded to 3 decimals
student_4_mean = round(student_means['Student 4'], 3)
print("Student 4's average:", student_4_mean)

Student 4's average: 12.143


In [41]:
# turn the (name, mean) pairs into a two-column dataframe, naming the columns as we create it
name_mean_pairs = list(zip(colnames, column_means))
df_means = pd.DataFrame(name_mean_pairs, columns = ['Students', 'Sleep Hours'])
df_means

Unnamed: 0,Students,Sleep Hours
0,Student 1,6.428571
1,Student 2,7.428571
2,Student 3,5.0
3,Student 4,12.142857
4,Student 5,7.428571
5,Student 6,6.428571


In [42]:
# flip the dataframe so that rows become columns and columns become rows
# (.T is shorthand for .transpose())
df_transposed = df3.T
df_transposed

Unnamed: 0,0,1,2,3,4,5,6
Student 1,10,7,7,8,2,5,6
Student 2,8,8,7,9,6,7,7
Student 3,4,3,5,6,6,5,6
Student 4,12,14,11,10,12,14,12
Student 5,10,5,8,9,5,6,9
Student 6,6,6,8,5,6,7,7


In [43]:
# the column labels of the transposed dataframe are the old row labels (plain numbers)
colnames = df_transposed.columns.tolist()
colnames

[0, 1, 2, 3, 4, 5, 6]

In [44]:
# make a readable label for each column: the labels start at 0, so add 1 to get 'Day 1', 'Day 2', ...
newcols = [f'Day {label + 1}' for label in colnames]
newcols

['Day 1', 'Day 2', 'Day 3', 'Day 4', 'Day 5', 'Day 6', 'Day 7']

In [45]:
# replace the numeric column labels with the 'Day n' labels in newcols
df_transposed.columns = newcols
df_transposed

Unnamed: 0,Day 1,Day 2,Day 3,Day 4,Day 5,Day 6,Day 7
Student 1,10,7,7,8,2,5,6
Student 2,8,8,7,9,6,7,7
Student 3,4,3,5,6,6,5,6
Student 4,12,14,11,10,12,14,12
Student 5,10,5,8,9,5,6,9
Student 6,6,6,8,5,6,7,7


In [46]:
# give the index (the row labels) a name
df_transposed.index.name = 'student'
df_transposed

Unnamed: 0_level_0,Day 1,Day 2,Day 3,Day 4,Day 5,Day 6,Day 7
student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Student 1,10,7,7,8,2,5,6
Student 2,8,8,7,9,6,7,7
Student 3,4,3,5,6,6,5,6
Student 4,12,14,11,10,12,14,12
Student 5,10,5,8,9,5,6,9
Student 6,6,6,8,5,6,7,7


In [47]:
# move the row labels out of the index and into an ordinary column.
# The new column takes its name from the index name ('student'), and the rows
# get a fresh 0, 1, 2, ... index.
# We assign the result back to the variable rather than using inplace = True:
# reset_index returns a new dataframe, and reassigning it is the recommended pandas style.
df_transposed = df_transposed.reset_index()
df_transposed

Unnamed: 0,student,Day 1,Day 2,Day 3,Day 4,Day 5,Day 6,Day 7
0,Student 1,10,7,7,8,2,5,6
1,Student 2,8,8,7,9,6,7,7
2,Student 3,4,3,5,6,6,5,6
3,Student 4,12,14,11,10,12,14,12
4,Student 5,10,5,8,9,5,6,9
5,Student 6,6,6,8,5,6,7,7


In [48]:
# reshape from wide format (one column per day) to long format (one row per student per day),
# keeping 'student' as the identifier column
df_long = df_transposed.melt(id_vars = 'student')
df_long

Unnamed: 0,student,variable,value
0,Student 1,Day 1,10
1,Student 2,Day 1,8
2,Student 3,Day 1,4
3,Student 4,Day 1,12
4,Student 5,Day 1,10
5,Student 6,Day 1,6
6,Student 1,Day 2,7
7,Student 2,Day 2,8
8,Student 3,Day 2,3
9,Student 4,Day 2,14
