UCSanDiegoX: DSE200x Python for Data Science

Week 4 - Pandas

In [1]:
import pandas as pd

## Live Code: Why pandas

### Pandas Series

In [2]:
# values and labels shared by both constructions below
ser_values = [100, 'foo', 300, 'bar', 500]
ser_labels = ['tom', 'bob', 'nancy', 'dan', 'eric']

# positional form: data first, index second
ser = pd.Series(ser_values, ser_labels)
print(ser)

# keyword form -- identical result, just spelled out
ser_verbose = pd.Series(data=ser_values, index=ser_labels)
print(ser_verbose)

tom      100
bob      foo
nancy    300
dan      bar
eric     500
dtype: object
tom      100
bob      foo
nancy    300
dan      bar
eric     500
dtype: object


In [3]:
# labels of the series
print(ser.index)

# size of the underlying data; Series.data was removed in pandas 1.0,
# Series.shape reports the same (n,) tuple
print(ser.shape)

# label lookup with plain brackets
print('Nancy:', ser['nancy'])
# several labels at once, still with plain brackets
print(ser[['nancy', 'bob']])

# the same lookups spelled explicitly with .loc
print('Nancy again:', ser.loc['nancy'])
print(ser.loc[['nancy', 'bob']])

# positional lookup: go through .iloc -- plain ser[2] on a label-indexed
# Series relies on an integer-key fallback that pandas has deprecated
print('Nancy is 2:', ser.iloc[2])
print(ser.iloc[[4, 3, 1]])

# other positional accessors: .iat for a single scalar, .take for a list of positions
print('Nancy is 2 (again):', ser.iat[2])
print(ser.take([4, 3, 1]))

Index(['tom', 'bob', 'nancy', 'dan', 'eric'], dtype='object')
(5,)
Nancy: 300
nancy    300
bob      foo
dtype: object
Nancy again: 300
nancy    300
bob      foo
dtype: object
Nancy is 2: 300
eric    500
dan     bar
bob     foo
dtype: object
Nancy is 2 (again): 300
eric    500
dan     bar
bob     foo
dtype: object


In [4]:
# membership is checked against the index labels, so case matters
has_bob = 'bob' in ser
has_Bob = 'Bob' in ser
print(has_bob, has_Bob)

# numpy-style broadcasting: a new series is produced, ser itself is untouched
doubled = ser * 2
print(doubled)

# operate on a label-selected subset only -- again ser itself is untouched
numeric_part = ser.loc[['nancy', 'eric']]
print(numeric_part ** 2)

True False
tom         200
bob      foofoo
nancy       600
dan      barbar
eric       1000
dtype: object
nancy     90000
eric     250000
dtype: object


### Pandas DataFrame

In [5]:
# build a DataFrame from a dict whose values are Series;
# the two series carry different labels, so some cells end up as NaN
ser_one = pd.Series([100., 200., 300.], index=['apple', 'ball', 'clock'])
ser_two = pd.Series([111., 222., 333., 4444.], index=['apple', 'ball', 'cerill', 'dancy'])
d = {'one': ser_one, 'two': ser_two}

df = pd.DataFrame(d)
print(df)

# row labels and column labels of the resulting frame
print('DF index:', df.index)
print('DF columns', df.columns)

          one     two
apple   100.0   111.0
ball    200.0   222.0
cerill    NaN   333.0
clock   300.0     NaN
dancy     NaN  4444.0
DF index: Index(['apple', 'ball', 'cerill', 'clock', 'dancy'], dtype='object')
DF columns Index(['one', 'two'], dtype='object')


In [6]:
# build the frame again, keeping only the listed row labels (in this order)
wanted_rows = ['dancy', 'ball', 'apple']
pd.DataFrame(d, index=wanted_rows)

Unnamed: 0,one,two
dancy,,4444.0
ball,200.0,222.0
apple,100.0,111.0


In [7]:
# labels that do not exist in the source ('whot', 'five') still get a
# row/column in the result -- filled with empty (NaN) cells
wanted_rows = ['dancy', 'whot', 'ball', 'apple']
wanted_cols = ['two', 'five']
pd.DataFrame(d, index=wanted_rows, columns=wanted_cols)

Unnamed: 0,two,five
dancy,4444.0,
whot,,
ball,222.0,
apple,111.0,


DataFrame from list of Python dictionaries

In [8]:
# two records with different keys; each key becomes a column
data = [
    dict(alex=1, joe=2),
    dict(ema=5, dora=10, alice=20),
]

# no index given: rows are numbered 0, 1, ...
frame_default = pd.DataFrame(data)
print(frame_default)

# explicit row labels
frame_labelled = pd.DataFrame(data, index=['orange', 'red'])
print(frame_labelled)

# keep only the listed columns, dropping the rest of the data
frame_trimmed = pd.DataFrame(data, columns=['joe', 'dora','alice'])
print(frame_trimmed)

   alex  alice  dora  ema  joe
0   1.0    NaN   NaN  NaN  2.0
1   NaN   20.0  10.0  5.0  NaN
        alex  alice  dora  ema  joe
orange   1.0    NaN   NaN  NaN  2.0
red      NaN   20.0  10.0  5.0  NaN
   joe  dora  alice
0  2.0   NaN    NaN
1  NaN  10.0   20.0


Basic DataFrame operations

In [9]:
# rebuild the frame from the dictionary of Series
df = pd.DataFrame(d)
print(df)
print('---')

# selecting a column (as Series)
s = df['one']
print(s)

# Update the cell through the frame itself. Assigning via the extracted
# Series (s['cerill'] = 42) only reaches df when s happens to be a view of
# the column; with Copy-on-Write enabled that write stays in s and df is
# left untouched, so it must not be relied on.
df.loc['cerill', 'one'] = 42
print(df)

          one     two
apple   100.0   111.0
ball    200.0   222.0
cerill    NaN   333.0
clock   300.0     NaN
dancy     NaN  4444.0
---
apple     100.0
ball      200.0
cerill      NaN
clock     300.0
dancy       NaN
Name: one, dtype: float64
          one     two
apple   100.0   111.0
ball    200.0   222.0
cerill   42.0   333.0
clock   300.0     NaN
dancy     NaN  4444.0


In [10]:
# start again from the dictionary of Series
df = pd.DataFrame(d)

# derived column: element-wise product of the two existing columns
df['three'] = df['one'].mul(df['two'])
print(df)

# boolean column obtained from an element-wise comparison
df['flag'] = df['one'].gt(250)
print(df)

          one     two    three
apple   100.0   111.0  11100.0
ball    200.0   222.0  44400.0
cerill    NaN   333.0      NaN
clock   300.0     NaN      NaN
dancy     NaN  4444.0      NaN
          one     two    three   flag
apple   100.0   111.0  11100.0  False
ball    200.0   222.0  44400.0  False
cerill    NaN   333.0      NaN  False
clock   300.0     NaN      NaN   True
dancy     NaN  4444.0      NaN  False


In [11]:
# work on a separate copy so df keeps all of its columns
dc = df.copy(deep=True)
print(dc)

# pop removes the column from dc and hands it back as a Series
three = dc.pop('three')
print(three)

# del removes a column without returning anything
del dc['two']

print(dc)

          one     two    three   flag
apple   100.0   111.0  11100.0  False
ball    200.0   222.0  44400.0  False
cerill    NaN   333.0      NaN  False
clock   300.0     NaN      NaN   True
dancy     NaN  4444.0      NaN  False
apple     11100.0
ball      44400.0
cerill        NaN
clock         NaN
dancy         NaN
Name: three, dtype: float64
          one   flag
apple   100.0  False
ball    200.0  False
cerill    NaN  False
clock   300.0   True
dancy     NaN  False


In [12]:
# fresh copy of df to experiment on
dc = df.copy(deep=True)
print(dc)

# place a copy of column 'one' at column position 2
dc.insert(loc=2, column='copy_of_one', value=df['one'])
print(dc)

# take just the first two elements of an existing column, then add them to
# the same DF under a given column name -- missing elements are created empty
first_two = dc['one'].iloc[:2]
dc['one_upper_half'] = first_two
dc

          one     two    three   flag
apple   100.0   111.0  11100.0  False
ball    200.0   222.0  44400.0  False
cerill    NaN   333.0      NaN  False
clock   300.0     NaN      NaN   True
dancy     NaN  4444.0      NaN  False
          one     two  copy_of_one    three   flag
apple   100.0   111.0        100.0  11100.0  False
ball    200.0   222.0        200.0  44400.0  False
cerill    NaN   333.0          NaN      NaN  False
clock   300.0     NaN        300.0      NaN   True
dancy     NaN  4444.0          NaN      NaN  False


Unnamed: 0,one,two,copy_of_one,three,flag,one_upper_half
apple,100.0,111.0,100.0,11100.0,False,100.0
ball,200.0,222.0,200.0,44400.0,False,200.0
cerill,,333.0,,,False,
clock,300.0,,300.0,,True,
dancy,,4444.0,,,False,


# Case Study: Movie Data Analysis

### Pandas: Data Ingestion

Based on the MovieLens 20M dataset (ml-20m.zip) from https://grouplens.org/datasets/movielens/

Use Pandas to Read the Dataset
- **ratings.csv:** *userId*,*movieId*,*rating*, *timestamp*
- **tags.csv:** *userId*,*movieId*, *tag*, *timestamp*
- **movies.csv:** *movieId*, *title*, *genres* <br>

Using the *read_csv* function in pandas, we will ingest these three files.

In [13]:
# load the movie catalogue into a DataFrame
movies = pd.read_csv(filepath_or_buffer='./movielens/movies.csv')
print(type(movies))

# preview the first 15 rows
movies.head(n=15)

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [14]:
# The timestamp column counts seconds elapsed since
# midnight UTC of January 1, 1970 (Unix epoch)
tags = pd.read_csv(filepath_or_buffer='./movielens/tags.csv')

# preview the first rows (head defaults to 5)
tags.head(n=5)

Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,1240597180
1,65,208,dark hero,1368150078
2,65,353,dark hero,1368150079
3,65,521,noir thriller,1368149983
4,65,592,dark hero,1368150078


In [15]:
# The timestamp column holds Unix epoch seconds stored as plain integers.
# read_csv's parse_dates does not turn such values into datetimes, so the
# column is loaded unchanged, the same way tags.csv is read.
# When real dates are needed: pd.to_datetime(ratings['timestamp'], unit='s')
ratings = pd.read_csv('./movielens/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [16]:
# The timestamp column is not needed for the current analysis
# (we will come back to it!), so drop it from both frames.

for frame in (ratings, tags):
    del frame['timestamp']

### Data Structures

In [30]:
# Series: a single row pulled out of a DataFrame

row_0 = tags.iloc[0]
row_type = type(row_0)
print('Row type:', row_type)
print(row_0)

# the row's index is made of the frame's column labels
print('Row index:', row_0.index)
print('A cell value:', row_0.loc['userId'])
has_rating = 'rating' in row_0.index
print('Is there a rating cell in this row?', has_rating)

# the row's name is the label it had in the frame's index
print('Row name:', row_0.name)

# rename returns a new Series carrying the new name
row_0 = row_0.rename('first_row')
print('Renamed row:', row_0.name)

Row type: <class 'pandas.core.series.Series'>
userId              18
movieId           4141
tag        Mark Waters
Name: 0, dtype: object
Row index: Index(['userId', 'movieId', 'tag'], dtype='object')
A cell value: 18
Is there a rating cell in this row? False
Row name: 0
Renamed row: first_row
userId              18
movieId           4141
tag        Mark Waters
Name: Cpt. Paper, dtype: object
userId              18
movieId           4141
tag        Mark Waters
Name: first_row, dtype: object


In [37]:
# DataFrame: preview, row index and column labels

print(tags.head(5))
print('Tags DF index:', tags.index)
print('Tags DF columns:', tags.columns)

# pick the rows at positions 0, 11 and 2000
wanted_positions = [0, 11, 2000]
print(tags.iloc[wanted_positions])

   userId  movieId            tag
0      18     4141    Mark Waters
1      65      208      dark hero
2      65      353      dark hero
3      65      521  noir thriller
4      65      592      dark hero
Tags DF index: RangeIndex(start=0, stop=465564, step=1)
Tags DF columns: Index(['userId', 'movieId', 'tag'], dtype='object')
      userId  movieId                tag
0         18     4141        Mark Waters
11        65     1783      noir thriller
2000     910    68554  conspiracy theory


### Descriptive Statistics

In [38]:
# summary statistics for the rating column only
rating_col = ratings['rating']
rating_col.describe()

count    2.000026e+07
mean     3.525529e+00
std      1.051989e+00
min      5.000000e-01
25%      3.000000e+00
50%      3.500000e+00
75%      4.000000e+00
max      5.000000e+00
Name: rating, dtype: float64

In [39]:
# summary statistics for every column of the DF (default quartiles spelled out)
ratings.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,userId,movieId,rating
count,20000260.0,20000260.0,20000260.0
mean,69045.87,9041.567,3.525529
std,40038.63,19789.48,1.051989
min,1.0,1.0,0.5
25%,34395.0,902.0,3.0
50%,69141.0,2167.0,3.5
75%,103637.0,4770.0,4.0
max,138493.0,131262.0,5.0


In [40]:
# mean of the rating column alone
ratings.loc[:, 'rating'].mean()

3.5255285642993797

In [41]:
# column-wise mean of the whole DF (axis=0 is the default)
ratings.mean(axis=0)

userId     69045.872583
movieId     9041.567330
rating         3.525529
dtype: float64

In [42]:
# smallest rating in the dataset
ratings.loc[:, 'rating'].min()

0.5

In [43]:
# largest rating in the dataset
ratings.loc[:, 'rating'].max()

5.0

In [44]:
# sample standard deviation of the ratings (ddof=1 is the default)
ratings['rating'].std(ddof=1)

1.051988919275684

In [45]:
# most frequent rating value(s); the result is a Series
ratings.loc[:, 'rating'].mode()

0    4.0
dtype: float64

In [46]:
# pairwise Pearson correlation (the default method) between the columns --
# there is not much sense in looking for correlation in this DF
ratings.corr(method='pearson')

Unnamed: 0,userId,movieId,rating
userId,1.0,-0.00085,0.001175
movieId,-0.00085,1.0,0.002606
rating,0.001175,0.002606,1.0


In [51]:
# sanity check: flag every rating above 5, then confirm none was flagged
filter_1 = ratings['rating'].gt(5)
for preview in (filter_1.head(), filter_1.tail()):
    print(preview)
print(filter_1.any())

0    False
1    False
2    False
3    False
4    False
Name: rating, dtype: bool
20000258    False
20000259    False
20000260    False
20000261    False
20000262    False
Name: rating, dtype: bool
False


In [53]:
# sanity check: every rating must be strictly positive
filter_2 = ratings['rating'].gt(0)
print(filter_2.head())

# True only when the condition holds for every row
filter_2.all()

0    True
1    True
2    True
3    True
4    True
Name: rating, dtype: bool


True