In [None]:
# to scrape data from wikipedia, we need to install the package called lxml
# we can do that here from our notebook or, if we think we will use it often, we could modify our "install_packages"
# shell script to install it automatically each time we start a job in UCloud
%pip install lxml

# import pandas so we can put data in a nice dataframe
# we'll abbreviate pandas as pd, because that's what everybody does
import pandas as pd

## Scraping data from the web
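Using `pandas.read_html`, we can read data from websites where data is presented in a table-like format. Wikipedia has lots of these, and is a great source for data to play with. Below, we'll look at data from the [List of Sesame Street Muppets](https://en.wikipedia.org/wiki/List_of_Sesame_Street_Muppets). Note that `read_html` returns a list of dataframes, one for each table it finds on the page, so we have to pick out the table we want by its position in that list.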

In [None]:
# scrape table data from websites
# read_html gives back a list with one dataframe for each table it finds on the page
MUPPETS_URL = "https://en.wikipedia.org/wiki/List_of_Sesame_Street_Muppets"
rawdata = pd.read_html(MUPPETS_URL)

# keep the second table in the list (Python starts counting at 0)
df = rawdata[1]
df

In [None]:
# list the column names of the dataframe
list(df.columns)

## Removing an unwanted column
Below are two ways to get rid of the final column (there are more ways to do this!). If you find the "axis = 1" part in the first method confusing, well, I do too, and [we are not alone](https://stackoverflow.com/questions/22149584/what-does-axis-in-pandas-mean)! My advice for now is to just accept it and move on.

In [None]:

# method 1: drop the column by its name
# (axis = 1 tells pandas to look for the label among the columns, not the rows)
df = df.drop(labels = ['Unnamed: 3'], axis = 1)
df

In [None]:

# method 2: instead of dropping a column, keep only the first three columns
first_three = list(df.columns)[:3]
df = df.loc[:, first_three]
df

In [None]:
# pull the 'Character' column out of the dataframe and store its values in a plain Python list
a = df['Character'].tolist()

In [None]:
# find the first four items in the list
# (leaving out the start index means "start from the beginning")
a[:4]

In [None]:
# find the last three items in the list
# (a negative start index counts back from the end of the list)
a[-3:]

In [None]:
# find items in the middle of the list
# (this returns indices 7, 8 and 9: the stop index, 10, is not included)
a[7:10]

In [None]:
# select the first 20 items from a list and store them in a new list
b = a[:20]
b


In [None]:
# remove the last item in a list
# pop() changes b in place and returns the item it removed,
# so running this cell again removes one more item
b.pop()

In [None]:
# show b to see the effect of pop()
b

In [None]:
# remove (and return) the item at index 2, i.e. the third item in the list
b.pop(2)

In [None]:
# add an item to the end of a list
b.append('Kermit')

In [None]:
# show b to check the result of pop(2) and append('Kermit')
b

In [None]:
# insert an item into a list at a particular position
# (the new item lands at index 5; the items from there on shift one place to the right)
b.insert(5, 'Kermit')
b

In [None]:
# replace an item in a list
# (index 1 is the second item, because Python starts counting at 0)
b[1]='Fozzy Bear'
b

In [None]:
# use a list comprehension to apply the same change to every item in a list
# (this builds a new list, c, and leaves b untouched)
c = [name + ' is a cute monster' for name in b]
c

In [None]:
# reset b to the original first 20 items, undoing the edits made above
b = a[:20]
b

In [None]:
# use list comprehensions to compare two lists:
# keep only the items from list 1 (b) that also appear in list 2 (favs)
favs = ['Barkley',  'Betty Lou', 'Bert', 'Kermit']
c = [name for name in b if name in favs]
c

In [None]:
# use list comprehensions to compare two lists:
# keep only the items from list 1 (b) that do NOT appear in list 2 (favs)
b = a[:20]
favs = ['Barkley',  'Betty Lou', 'Bert']
c = [name for name in b if name not in favs]
c

# this gives us all the monsters in the list that are NOT my favorites

# Not really. I love all monsters.

## More fun with list comprehensions

In [None]:
# start from a small list of numbers, then build a new list with 10 added to each one
d = [1, 2, 3, 4, 5]
d = [number + 10 for number in d]
d

In [None]:
# divide each number in d by 2
# (the / operator always produces floats, and d is overwritten,
#  so running this cell again halves the numbers once more)
d = [number / 2 for number in d]
d

## Dataframe manipulation

In [None]:
# load the student sleep data straight from GitHub into a dataframe
# (note: this replaces the Muppets dataframe that df held before)
SLEEP_CSV_URL = "https://raw.githubusercontent.com/ethanweed/ExPsyLing/master/Data/StudentSleep.csv"
df = pd.read_csv(SLEEP_CSV_URL)
df

In [None]:
# shape is a tuple: (number of rows, number of columns)
df.shape

In [None]:
# take the first four rows (positions 0, 1, 2 and 3) of the dataframe
df1 = df.iloc[:4]
df1

In [None]:
# the number of rows in the dataframe
len(df)

In [None]:
# take everything from position 4 to the end of the dataframe
# (leaving out the stop index means "keep going to the last row")
df2 = df.iloc[4:]
df2

In [None]:
# stack the two pieces back together, this time with df2 on top of df1
# (concat keeps each row's original index label, so the index is no longer in order)
df3 = pd.concat([df2, df1])
df3

In [None]:
# select a single row by position: iloc[1] is the second row of df3,
# regardless of what its index label is
df3.iloc[1]

In [None]:
# concatenate again with df1 first, which puts the rows back in their original order
df3 = pd.concat([df1, df2])
df3

In [None]:
# add a new column holding the average of each row, i.e. the mean across all the student columns
# (axis = 'columns' is another way of writing axis = 1)
df3['average'] = df3.mean(axis = 'columns')
df3

In [None]:
# pop removes the 'average' column from df3 and returns it, so we can store it in m
m = df3.pop('average')

In [None]:
# show the popped column as a plain Python list
m.tolist()

In [None]:
# show df3 to confirm that the 'average' column is gone
df3

In [None]:
# compute the mean of every column (one value per student) and store them in a list
column_means = df3.mean().tolist()
column_means

In [None]:
# store the column names of df3 in a list
colnames = list(df3.columns)
colnames

In [None]:
# zip pairs up the items of two lists by position;
# wrapping it in list() lets us see the resulting (name, mean) pairs
list(zip(colnames, column_means))

In [None]:
# build a dictionary that maps each column name to that column's mean
student_means = {student: mean for student, mean in zip(colnames, column_means)}
student_means

In [None]:
# look up one student's mean in the dictionary and print it rounded to 3 decimals
# (double quotes around the text mean the apostrophe needs no backslash)
print("Student 4's average:", round(student_means['Student 4'], 3))

In [None]:
# build a two-column dataframe from the (name, mean) pairs, naming the columns as we create it
df_means = pd.DataFrame(
    list(zip(colnames, column_means)),
    columns = ['Students', 'Sleep Hours'],
)
df_means

In [None]:
# flip the dataframe so that rows become columns and columns become rows
# (.T is shorthand for .transpose())
df_transposed = df3.T
df_transposed

In [None]:
# store the column labels of the transposed dataframe in a list
# (note: this overwrites the colnames list made from df3 earlier)
colnames = list(df_transposed.columns)
colnames

In [None]:
# turn each column label into a readable name; adding 1 makes the first column 'Day 1' rather than 'Day 0'
newcols = [f'Day {label + 1}' for label in colnames]
newcols

In [None]:
# replace the column labels with the new names (one name per column, in the same order)
df_transposed.columns = newcols
df_transposed

In [None]:
# give the index (the row labels) a name, so it gets a sensible column name when we reset the index
# rename_axis returns a new dataframe; setting df_transposed.index.name directly would instead
# modify the index object in place, and that object can be shared with df3's columns after a transpose
df_transposed = df_transposed.rename_axis('student')
df_transposed

In [None]:
# move the index into an ordinary column, so it can be used as an id variable when reshaping
# the check makes this cell safe to re-run: without it, a second run would find no named index
# left to move and would add a spurious extra column called 'index'
if 'student' not in df_transposed.columns:
    df_transposed = df_transposed.reset_index()
df_transposed

In [None]:
# reshape from wide to long format: 'student' stays as an identifier column,
# and every other column is unpivoted into 'variable' / 'value' pairs
df_long = df_transposed.melt(id_vars = 'student')
df_long