In [1]:
import pandas as pd

## 1. Pew Data

Income Distribution Within U.S. Religious Groups ([Link](
https://www.pewforum.org/2009/01/30/income-distribution-within-us-religious-groups/ ))


In [2]:
pew = pd.read_csv("pew.txt", sep = '\t')
pew.head()

Unnamed: 0,religion,<$10k,$10-20k,$20-30k,$30-40k,$40-50k,$50-75k,$75-100k,$100-150k,>150k,Don't know/refused
0,Agnostic,27,34,60,81,76,137,122,109,84,96
1,Atheist,12,27,37,52,35,70,73,59,74,76
2,Buddhist,27,21,30,34,33,58,62,39,53,54
3,Catholic,418,617,732,670,638,1116,949,792,633,1489
4,Don’t know/refused,15,14,15,11,10,35,21,17,18,116


In [3]:
# Melting income into a unique column
pew_tidy = pd.melt(pew, id_vars = ["religion"], var_name = "income", value_name = "freq")
pew_tidy

Unnamed: 0,religion,income,freq
0,Agnostic,<$10k,27
1,Atheist,<$10k,12
2,Buddhist,<$10k,27
3,Catholic,<$10k,418
4,Don’t know/refused,<$10k,15
...,...,...,...
175,Orthodox,Don't know/refused,73
176,Other Christian,Don't know/refused,18
177,Other Faiths,Don't know/refused,71
178,Other World Religions,Don't know/refused,8


In [5]:
pew_tidy[pew_tidy.religion == 'Atheist']

Unnamed: 0,religion,income,freq
1,Atheist,<$10k,12
19,Atheist,$10-20k,27
37,Atheist,$20-30k,37
55,Atheist,$30-40k,52
73,Atheist,$40-50k,35
91,Atheist,$50-75k,70
109,Atheist,$75-100k,73
127,Atheist,$100-150k,59
145,Atheist,>150k,74
163,Atheist,Don't know/refused,76


## 2. Baby Names


In [7]:
babynames_2014 = pd.read_csv("2014-baby-names-illinois.csv")
babynames_2015 = pd.read_csv("2015-baby-names-illinois.csv")

In [8]:
babynames_2014.head()

Unnamed: 0,rank,name,frequency,sex
0,1,Noah,837,Male
1,2,Alexander,747,Male
2,3,William,687,Male
3,4,Michael,680,Male
4,5,Liam,670,Male


In [None]:
babynames_2015.head()

Unnamed: 0,rank,name,frequency,sex
0,1,Noah,863,Male
1,2,Liam,709,Male
2,3,Alexander,703,Male
3,4,Jacob,650,Male
4,5,William,618,Male


In [17]:
# Create new column "year"
babynames_2015["year"] = 2015
babynames_2014["year"] = 2014

# Append datasets into a unique one
babynames = pd.concat([babynames_2014, babynames_2015])
babynames

Unnamed: 0,rank,name,frequency,sex,year
0,1,Noah,837,Male,2014
1,2,Alexander,747,Male,2014
2,3,William,687,Male,2014
3,4,Michael,680,Male,2014
4,5,Liam,670,Male,2014
...,...,...,...,...,...
95,96,Giovanni,168,Male,2015
96,97,Hudson,167,Male,2015
97,98,Camden,165,Male,2015
98,99,Max,164,Male,2015


## 3. Weather

In [20]:
weather = pd.read_csv("weather.txt", sep = "\t")
weather.head()

Unnamed: 0,id,year,month,element,d1,d2,d3,d4,d5,d6,...,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31
0,MX000017004,2010,1,TMAX,,,,,,,...,,,,,,,,,278.0,
1,MX000017004,2010,1,TMIN,,,,,,,...,,,,,,,,,145.0,
2,MX000017004,2010,2,TMAX,,273.0,241.0,,,,...,,299.0,,,,,,,,
3,MX000017004,2010,2,TMIN,,144.0,144.0,,,,...,,107.0,,,,,,,,
4,MX000017004,2010,3,TMAX,,,,,321.0,,...,,,,,,,,,,


In [21]:
# Melt day (d1, d2, ...) into the column "date"
weather = pd.melt(weather, id_vars=['id','year','month', 'element'], var_name='day')
weather.head()

Unnamed: 0,id,year,month,element,day,value
0,MX000017004,2010,1,TMAX,d1,
1,MX000017004,2010,1,TMIN,d1,
2,MX000017004,2010,2,TMAX,d1,
3,MX000017004,2010,2,TMIN,d1,
4,MX000017004,2010,3,TMAX,d1,


In [22]:
weather.dropna()

Unnamed: 0,id,year,month,element,day,value
20,MX000017004,2010,12,TMAX,d1,299.0
21,MX000017004,2010,12,TMIN,d1,138.0
24,MX000017004,2010,2,TMAX,d2,273.0
25,MX000017004,2010,2,TMIN,d2,144.0
40,MX000017004,2010,11,TMAX,d2,313.0
...,...,...,...,...,...,...
631,MX000017004,2010,8,TMIN,d29,153.0
638,MX000017004,2010,1,TMAX,d30,278.0
639,MX000017004,2010,1,TMIN,d30,145.0
674,MX000017004,2010,8,TMAX,d31,254.0


In [23]:
# Delete NaN values and reset index
weather.dropna(inplace=True)
weather.reset_index(drop = True, inplace=True)
weather.head()

Unnamed: 0,id,year,month,element,day,value
0,MX000017004,2010,12,TMAX,d1,299.0
1,MX000017004,2010,12,TMIN,d1,138.0
2,MX000017004,2010,2,TMAX,d2,273.0
3,MX000017004,2010,2,TMIN,d2,144.0
4,MX000017004,2010,11,TMAX,d2,313.0


In [24]:
weather.day
          #.str.replace("d","")

Unnamed: 0,day
0,d1
1,d1
2,d2
3,d2
4,d2
...,...
61,d29
62,d30
63,d30
64,d31


In [25]:
# Detele first letter in "day"
weather.day = weather.day.str.replace("d","")
weather.day = weather.day.astype(int)
weather.head()

Unnamed: 0,id,year,month,element,day,value
0,MX000017004,2010,12,TMAX,1,299.0
1,MX000017004,2010,12,TMIN,1,138.0
2,MX000017004,2010,2,TMAX,2,273.0
3,MX000017004,2010,2,TMIN,2,144.0
4,MX000017004,2010,11,TMAX,2,313.0


In [26]:
# Pivot table in order to obtain TMAX and TMIN
weather = pd.pivot_table(weather, index = ['id','year','month','day'], columns = 'element', values = 'value')
weather.reset_index(drop= False, inplace = True)
weather.head()

element,id,year,month,day,TMAX,TMIN
0,MX000017004,2010,1,30,278.0,145.0
1,MX000017004,2010,2,2,273.0,144.0
2,MX000017004,2010,2,3,241.0,144.0
3,MX000017004,2010,2,11,297.0,134.0
4,MX000017004,2010,2,23,299.0,107.0


In [27]:
# Create "date"
weather['date'] = weather.year.astype(str)+"-"+weather.month.astype(str)+"-"+weather.day.astype(str)
weather.head()

element,id,year,month,day,TMAX,TMIN,date
0,MX000017004,2010,1,30,278.0,145.0,2010-1-30
1,MX000017004,2010,2,2,273.0,144.0,2010-2-2
2,MX000017004,2010,2,3,241.0,144.0,2010-2-3
3,MX000017004,2010,2,11,297.0,134.0,2010-2-11
4,MX000017004,2010,2,23,299.0,107.0,2010-2-23


In [28]:
# Delete extra-columns
weather = weather.drop(['year','month','day'], axis = 1)
weather.head()

element,id,TMAX,TMIN,date
0,MX000017004,278.0,145.0,2010-1-30
1,MX000017004,273.0,144.0,2010-2-2
2,MX000017004,241.0,144.0,2010-2-3
3,MX000017004,297.0,134.0,2010-2-11
4,MX000017004,299.0,107.0,2010-2-23


In [None]:
weather.to_csv("./data/weather_tidy.csv")

## 4. Tubercolosis Records from World Health Organization

In [None]:
tb = pd.read_csv("./data/tb.csv")
tb.head()

Unnamed: 0,iso2,year,new_sp,new_sp_m04,new_sp_m514,new_sp_m014,new_sp_m1524,new_sp_m2534,new_sp_m3544,new_sp_m4554,...,new_sp_f04,new_sp_f514,new_sp_f014,new_sp_f1524,new_sp_f2534,new_sp_f3544,new_sp_f4554,new_sp_f5564,new_sp_f65,new_sp_fu
0,AD,1989,,,,,,,,,...,,,,,,,,,,
1,AD,1990,,,,,,,,,...,,,,,,,,,,
2,AD,1991,,,,,,,,,...,,,,,,,,,,
3,AD,1992,,,,,,,,,...,,,,,,,,,,
4,AD,1993,15.0,,,,,,,,...,,,,,,,,,,


## 5. Billboard Dataset

In [None]:
billboard_tidy = pd.read_csv("./data/billboard.csv", encoding = "unicode-escape")
billboard_tidy.head()

Unnamed: 0,year,artist.inverted,track,time,genre,date.entered,date.peaked,x1st.week,x2nd.week,x3rd.week,...,x67th.week,x68th.week,x69th.week,x70th.week,x71st.week,x72nd.week,x73rd.week,x74th.week,x75th.week,x76th.week
0,2000,Destiny's Child,Independent Women Part I,3:38,Rock,2000-09-23,2000-11-18,78,63.0,49.0,...,,,,,,,,,,
1,2000,Santana,"Maria, Maria",4:18,Rock,2000-02-12,2000-04-08,15,8.0,6.0,...,,,,,,,,,,
2,2000,Savage Garden,I Knew I Loved You,4:07,Rock,1999-10-23,2000-01-29,71,48.0,43.0,...,,,,,,,,,,
3,2000,Madonna,Music,3:45,Rock,2000-08-12,2000-09-16,41,23.0,18.0,...,,,,,,,,,,
4,2000,"Aguilera, Christina",Come On Over Baby (All I Want Is You),3:38,Rock,2000-08-05,2000-10-14,57,47.0,45.0,...,,,,,,,,,,
