In [56]:
import pandas as pd
import numpy as np

In [136]:
# assign a datatype
a=pd.Series([1, 2, 3, 4, 5, 6.2])
b=a.astype("int")
print(a)
print(b)

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
5    6.2
dtype: float64
0    1
1    2
2    3
3    4
4    5
5    6
dtype: int64


In [126]:
# datatypes
print("a: "+str(a.dtypes))
print("b: "+str(b.dtypes))

a: float64
b: int64


In [129]:
# create a pandas series with letters as the index
c = pd.Series(np.random.randn(7), index=['a', 'b', 'c', 'd', 'e', 'f', 'g'])  
print(c)

a   -1.245113
b    0.513797
c   -0.107182
d   -1.405299
e    2.324982
f   -0.404491
g   -0.217021
dtype: float64


In [130]:
# as above with no index statement, refers to the default behaviour which is integers starting from 0
d = pd.Series(np.random.randn(7))  
print(d)

0   -0.344869
1   -0.322142
2   -0.256439
3   -0.908999
4    1.492016
5   -0.313876
6   -0.771880
dtype: float64


In [133]:
# loc refers to the index as an absolute for each row, and iloc refers to numerical counting which will vary if the
# order of rows changes
c.loc["a":"c"]
c.iloc[0:3]

a   -1.245113
b    0.513797
c   -0.107182
dtype: float64

In [140]:
# be aware that null values (i.e. NaN) mean that integers will be cast as floats, 
# and you cannot cast them back to integers
d=pd.Series([1,2,2,np.nan,4])
print(d.dtypes)

float64


In [141]:
# this cell will not run
e=d.astype("int")

ValueError: Cannot convert NA to integer

In [143]:
# a super useful function is value_counts, which tells you how many times a value occurs
d.value_counts()

2.0    2
4.0    1
1.0    1
dtype: int64

In [146]:
# however be aware that the default behaviour of value_counts is to ignore null values, to
# return these include the argument dropna=False (another good one to know is ascending, and also try sort)
d.value_counts(dropna=False, ascending=True)

NaN     1
 1.0    1
 4.0    1
 2.0    2
dtype: int64

In [147]:
f = pd.Series(range(-3, 4))
print(f)

0   -3
1   -2
2   -1
3    0
4    1
5    2
6    3
dtype: int64


In [171]:
# boolean indexing logic allows us to select based on whether a condition is True or False
# the tilda ~ reverses the logic, so False becomes True
# ** note you can achieve this step without the .loc, but it's a good idea to get used to using it
# ** now so you are being explicit about the type of indexing you want to use
print(f.loc[f>0])
print(f.loc[~f>0])

4    1
5    2
6    3
dtype: int64
0   -3
1   -2
dtype: int64


In [151]:
# you can nest these - the OR operator is the pipe | and the AND operator is the ampersand &
# note parentheses are required for each condition
print(f.loc[(f < -1) | (f > 2)])
print(f.loc[(f>0)&(f>1)])

0   -3
1   -2
6    3
dtype: int64
5    2
6    3
dtype: int64


In [158]:
# let's create a dataframe with some assorted datatypes
g = pd.DataFrame(dict(A = np.random.rand(3),
                        B = 1,
                        C = 'foo',
                        D = pd.Timestamp('2001-01-02'),
                        E = pd.Series([1.0]*3).astype('float32'),
                                F = False,
                                G = pd.Series([1]*3,dtype='int8')))
g

Unnamed: 0,A,B,C,D,E,F,G
0,0.383265,1,foo,2001-01-02,1.0,False,1
1,0.785179,1,foo,2001-01-02,1.0,False,1
2,0.139806,1,foo,2001-01-02,1.0,False,1


In [159]:
g.dtypes

A           float64
B             int64
C            object
D    datetime64[ns]
E           float32
F              bool
G              int8
dtype: object

In [160]:
# either of these will achieve the same thing
print(g.get_dtype_counts())
print(g.dtypes.value_counts())

bool              1
datetime64[ns]    1
float32           1
float64           1
int64             1
int8              1
object            1
dtype: int64
bool              1
int64             1
object            1
int8              1
float32           1
float64           1
datetime64[ns]    1
dtype: int64


In [165]:
# a pretty common operation would be string replacements. no need to iterate over rows thankfully
g["H"]=g["C"].replace("foo", "bar")
g

Unnamed: 0,A,B,C,D,E,F,G,H
0,0.383265,1,foo,2001-01-02,1.0,False,1,bar
1,0.785179,1,foo,2001-01-02,1.0,False,1,bar
2,0.139806,1,foo,2001-01-02,1.0,False,1,bar


In [167]:
# you can simply adjust every row without iteration
g["I"]=(g["E"]*2)+50
g

Unnamed: 0,A,B,C,D,E,F,G,H,I
0,0.383265,1,foo,2001-01-02,1.0,False,1,bar,52.0
1,0.785179,1,foo,2001-01-02,1.0,False,1,bar,52.0
2,0.139806,1,foo,2001-01-02,1.0,False,1,bar,52.0


In [168]:
# you can use the apply function to perform an operation on all cells separately,
# or all cells across a row or a column
h = pd.DataFrame(np.random.randn(5, 4), columns=['a', 'b', 'c', 'd'])

In [169]:
h

Unnamed: 0,a,b,c,d
0,-1.141983,1.02509,1.328789,-0.732227
1,1.592377,1.042692,-1.447338,-0.030225
2,-1.167852,1.271577,1.409383,-0.49432
3,1.476038,1.214527,0.830593,0.687835
4,0.409842,1.514581,-1.480655,0.012069


In [170]:
# note the nulls, as some values were negative
h.apply(np.sqrt)

Unnamed: 0,a,b,c,d
0,,1.012468,1.152731,
1,1.261894,1.021123,,
2,,1.127642,1.187174,
3,1.214923,1.102056,0.911368,0.829358
4,0.640189,1.230683,,0.109857


In [172]:
# for eg the mean of each column, then axis=0
h.apply(np.mean, axis=0)

a    0.233684
b    1.213693
c    0.128155
d   -0.111374
dtype: float64

In [174]:
# for the mean of each row, then axis=1
h.apply(np.mean, axis=1)

0    0.119918
1    0.289377
2    0.254697
3    1.052248
4    0.113959
dtype: float64

In [173]:
# note the default is axis=0
h.apply(np.mean)

a    0.233684
b    1.213693
c    0.128155
d   -0.111374
dtype: float64

In [176]:
# whatever operation you want to try will assume you want to perform it to each row without iteration required
h["g"]=np.square(h["a"])+np.square(h["b"])
h

Unnamed: 0,a,b,c,d,g
0,-1.141983,1.02509,1.328789,-0.732227,2.354935
1,1.592377,1.042692,-1.447338,-0.030225,3.622872
2,-1.167852,1.271577,1.409383,-0.49432,2.980788
3,1.476038,1.214527,0.830593,0.687835,3.653762
4,0.409842,1.514581,-1.480655,0.012069,2.461926


In [180]:
# let's bring in a bigger dataset
music=pd.read_csv("Billboard.csv")
music.head()

Unnamed: 0,year,artist.inverted,track,time,genre,date.entered,date.peaked,x1st.week,x2nd.week,x3rd.week,...,x67th.week,x68th.week,x69th.week,x70th.week,x71st.week,x72nd.week,x73rd.week,x74th.week,x75th.week,x76th.week
0,2000,Destiny's Child,Independent Women Part I,3:38,Rock,2000-09-23,2000-11-18,78,63.0,49.0,...,,,,,,,,,,
1,2000,Santana,"Maria, Maria",4:18,Rock,2000-02-12,2000-04-08,15,8.0,6.0,...,,,,,,,,,,
2,2000,Savage Garden,I Knew I Loved You,4:07,Rock,1999-10-23,2000-01-29,71,48.0,43.0,...,,,,,,,,,,
3,2000,Madonna,Music,3:45,Rock,2000-08-12,2000-09-16,41,23.0,18.0,...,,,,,,,,,,
4,2000,"Aguilera, Christina",Come On Over Baby (All I Want Is You),3:38,Rock,2000-08-05,2000-10-14,57,47.0,45.0,...,,,,,,,,,,


In [181]:
# check the shape
music.shape

(317, 83)

In [185]:
# why aren't we displaying all columns above? let's set this as an option in pandas
# we can either manually input an arbitrary number above 83, or why not input the known value using shape
pd.set_option("display.max_columns", music.shape[1])
music.head()

Unnamed: 0,year,artist.inverted,track,time,genre,date.entered,date.peaked,x1st.week,x2nd.week,x3rd.week,x4th.week,x5th.week,x6th.week,x7th.week,x8th.week,x9th.week,x10th.week,x11th.week,x12th.week,x13th.week,x14th.week,x15th.week,x16th.week,x17th.week,x18th.week,x19th.week,x20th.week,x21st.week,x22nd.week,x23rd.week,x24th.week,x25th.week,x26th.week,x27th.week,x28th.week,x29th.week,x30th.week,x31st.week,x32nd.week,x33rd.week,x34th.week,x35th.week,x36th.week,x37th.week,x38th.week,x39th.week,x40th.week,x41st.week,x42nd.week,x43rd.week,x44th.week,x45th.week,x46th.week,x47th.week,x48th.week,x49th.week,x50th.week,x51st.week,x52nd.week,x53rd.week,x54th.week,x55th.week,x56th.week,x57th.week,x58th.week,x59th.week,x60th.week,x61st.week,x62nd.week,x63rd.week,x64th.week,x65th.week,x66th.week,x67th.week,x68th.week,x69th.week,x70th.week,x71st.week,x72nd.week,x73rd.week,x74th.week,x75th.week,x76th.week
0,2000,Destiny's Child,Independent Women Part I,3:38,Rock,2000-09-23,2000-11-18,78,63.0,49.0,33.0,23.0,15.0,7.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,3.0,7.0,10.0,12.0,15.0,22.0,29.0,31.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2000,Santana,"Maria, Maria",4:18,Rock,2000-02-12,2000-04-08,15,8.0,6.0,5.0,2.0,3.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,8.0,15.0,19.0,21.0,26.0,36.0,48.0,47.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2000,Savage Garden,I Knew I Loved You,4:07,Rock,1999-10-23,2000-01-29,71,48.0,43.0,31.0,20.0,13.0,7.0,6.0,4.0,4.0,4.0,6.0,4.0,2.0,1.0,1.0,1.0,2.0,1.0,2.0,4.0,8.0,8.0,12.0,14.0,17.0,21.0,24.0,30.0,34.0,37.0,46.0,47.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2000,Madonna,Music,3:45,Rock,2000-08-12,2000-09-16,41,23.0,18.0,14.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,4.0,8.0,11.0,16.0,20.0,25.0,27.0,27.0,29.0,44.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2000,"Aguilera, Christina",Come On Over Baby (All I Want Is You),3:38,Rock,2000-08-05,2000-10-14,57,47.0,45.0,29.0,23.0,18.0,11.0,9.0,9.0,11.0,1.0,1.0,1.0,1.0,4.0,8.0,12.0,22.0,23.0,43.0,44.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [189]:
# let's isolate some columns so we can deal with a smaller dataframe that is easier to view
music_simple=music[["year", "artist.inverted", "track", "time", "genre", "date.entered", "date.peaked"]]
music_simple.head()

Unnamed: 0,year,artist.inverted,track,time,genre,date.entered,date.peaked
0,2000,Destiny's Child,Independent Women Part I,3:38,Rock,2000-09-23,2000-11-18
1,2000,Santana,"Maria, Maria",4:18,Rock,2000-02-12,2000-04-08
2,2000,Savage Garden,I Knew I Loved You,4:07,Rock,1999-10-23,2000-01-29
3,2000,Madonna,Music,3:45,Rock,2000-08-12,2000-09-16
4,2000,"Aguilera, Christina",Come On Over Baby (All I Want Is You),3:38,Rock,2000-08-05,2000-10-14


In [190]:
# some column names are awkward
music_simple=music_simple.rename(columns={"artist.inverted":"artist"})

In [191]:
music_simple.dtypes

year             int64
artist          object
track           object
time            object
genre           object
date.entered    object
date.peaked     object
dtype: object

In [194]:
# our dates and times are objects, meaning they are strings - if we want to manipulate them we
# need pandas to understand they are actually datetime values (for the dates) or timedeltas(for the track length)
music_simple["entered_dt"]=pd.to_datetime(music_simple["date.entered"])
music_simple["peaked_dt"]=pd.to_datetime(music_simple["date.peaked"])
music_simple.head()

Unnamed: 0,year,artist,track,time,genre,date.entered,date.peaked,entered_dt,peaked_dt
0,2000,Destiny's Child,Independent Women Part I,3:38,Rock,2000-09-23,2000-11-18,2000-09-23,2000-11-18
1,2000,Santana,"Maria, Maria",4:18,Rock,2000-02-12,2000-04-08,2000-02-12,2000-04-08
2,2000,Savage Garden,I Knew I Loved You,4:07,Rock,1999-10-23,2000-01-29,1999-10-23,2000-01-29
3,2000,Madonna,Music,3:45,Rock,2000-08-12,2000-09-16,2000-08-12,2000-09-16
4,2000,"Aguilera, Christina",Come On Over Baby (All I Want Is You),3:38,Rock,2000-08-05,2000-10-14,2000-08-05,2000-10-14


In [195]:
# looks the same but wait!
music_simple.dtypes

year                     int64
artist                  object
track                   object
time                    object
genre                   object
date.entered            object
date.peaked             object
entered_dt      datetime64[ns]
peaked_dt       datetime64[ns]
dtype: object

In [198]:
# we can do some operations like
music_simple["month_entered"]=music_simple["entered_dt"].dt.month
music_simple.head()

Unnamed: 0,year,artist,track,time,genre,date.entered,date.peaked,entered_dt,peaked_dt,month_entered
0,2000,Destiny's Child,Independent Women Part I,3:38,Rock,2000-09-23,2000-11-18,2000-09-23,2000-11-18,9
1,2000,Santana,"Maria, Maria",4:18,Rock,2000-02-12,2000-04-08,2000-02-12,2000-04-08,2
2,2000,Savage Garden,I Knew I Loved You,4:07,Rock,1999-10-23,2000-01-29,1999-10-23,2000-01-29,10
3,2000,Madonna,Music,3:45,Rock,2000-08-12,2000-09-16,2000-08-12,2000-09-16,8
4,2000,"Aguilera, Christina",Come On Over Baby (All I Want Is You),3:38,Rock,2000-08-05,2000-10-14,2000-08-05,2000-10-14,8


In [199]:
# but this doesn't work for the time column because pandas doesn't know if it is HH:MM or MM:SS
music_simple["time_td"]="0:"+music_simple["time"]
music_simple.head()

Unnamed: 0,year,artist,track,time,genre,date.entered,date.peaked,entered_dt,peaked_dt,month_entered,time_td
0,2000,Destiny's Child,Independent Women Part I,3:38,Rock,2000-09-23,2000-11-18,2000-09-23,2000-11-18,9,0:3:38
1,2000,Santana,"Maria, Maria",4:18,Rock,2000-02-12,2000-04-08,2000-02-12,2000-04-08,2,0:4:18
2,2000,Savage Garden,I Knew I Loved You,4:07,Rock,1999-10-23,2000-01-29,1999-10-23,2000-01-29,10,0:4:07
3,2000,Madonna,Music,3:45,Rock,2000-08-12,2000-09-16,2000-08-12,2000-09-16,8,0:3:45
4,2000,"Aguilera, Christina",Come On Over Baby (All I Want Is You),3:38,Rock,2000-08-05,2000-10-14,2000-08-05,2000-10-14,8,0:3:38


In [201]:
# now it works because it interprets the format as HH:MM:SS
# and we can perform an operation to work out the time in seconds
# check pandas documentation for the various functions you can call on timedeltas and datetimes
music_simple["time_td"]=pd.to_timedelta(music_simple["time_td"])
music_simple["time_seconds"]=music_simple["time_td"].dt.total_seconds()
music_simple.head()

Unnamed: 0,year,artist,track,time,genre,date.entered,date.peaked,entered_dt,peaked_dt,month_entered,time_td,time_seconds
0,2000,Destiny's Child,Independent Women Part I,3:38,Rock,2000-09-23,2000-11-18,2000-09-23,2000-11-18,9,00:03:38,218.0
1,2000,Santana,"Maria, Maria",4:18,Rock,2000-02-12,2000-04-08,2000-02-12,2000-04-08,2,00:04:18,258.0
2,2000,Savage Garden,I Knew I Loved You,4:07,Rock,1999-10-23,2000-01-29,1999-10-23,2000-01-29,10,00:04:07,247.0
3,2000,Madonna,Music,3:45,Rock,2000-08-12,2000-09-16,2000-08-12,2000-09-16,8,00:03:45,225.0
4,2000,"Aguilera, Christina",Come On Over Baby (All I Want Is You),3:38,Rock,2000-08-05,2000-10-14,2000-08-05,2000-10-14,8,00:03:38,218.0


In [205]:
# how many songs were entered each day? several ways to do this but we looked at a pivot table, don't worry
# we will go over these in more detail so you can take this step as magic for now

pivot=pd.pivot_table(music_simple, index="entered_dt", values="track", aggfunc="count")
pivot.head()

entered_dt
1999-06-05    1
1999-07-17    1
1999-09-04    1
1999-09-11    1
1999-10-09    2
Name: track, dtype: int64

In [210]:
# does this match
pivot.sum()

317

In [211]:
# seems so yes
music_simple.shape

(317, 12)

In [213]:
# let's also fill the missing value where no songs were entered
date_range=pd.date_range(start=pivot.index.min(), end=pivot.index.max())
pivot2=pivot.reindex(date_range)

In [215]:
# fill the nulls with zeros, as no songs were entered on those days
pivot2=pivot2.fillna(0)
pivot2.head()

1999-06-05    1.0
1999-06-06    0.0
1999-06-07    0.0
1999-06-08    0.0
1999-06-09    0.0
Freq: D, Name: track, dtype: float64

In [216]:
# check matches
pivot2.sum()

317.0

In [219]:
# for returning no null values
no_nulls=music.dropna()

In [221]:
# oh dear! nothing left
no_nulls.shape

(0, 83)

In [230]:
# let's subset
some_nulls=music.dropna(subset=["x1st.week", "x2nd.week"])

In [231]:
some_nulls.shape

(312, 83)

In [234]:
# other useful operations
third_week_null_rows=music.loc[music["x3rd.week"].isnull(),:]
third_week_non_null_rows=music.loc[music["x3rd.week"].notnull(),:]

(10, 83)