In [37]:
import pandas as pd

In [38]:
# Load the Pew survey table: one row per religion, one column per income bracket.
pew = pd.read_csv(filepath_or_buffer='../data/pew.csv')

In [39]:
pew.head()

Unnamed: 0,religion,<$10k,$10-20k,$20-30k,$30-40k,$40-50k,$50-75k,$75-100k,$100-150k,>150k,Don't know/refused
0,Agnostic,27,34,60,81,76,137,122,109,84,96
1,Atheist,12,27,37,52,35,70,73,59,74,76
2,Buddhist,27,21,30,34,33,58,62,39,53,54
3,Catholic,418,617,732,670,638,1116,949,792,633,1489
4,Don’t know/refused,15,14,15,11,10,35,21,17,18,116


The pandas function `melt` will reshape a dataframe into a tidy format.

Parameters:  
* **id_vars** is a container (list, tuple, ndarray) that represents the variables that will remain as is.
* **value_vars** identifies the columns you want to melt down (or *unpivot*). By default, it will melt all columns not in `id_vars`.
* **var_name** is a string for the new column name when the `value_vars` is melted down. By default it will be called `variable`.
* **value_name** is a string for the new column name that represents the values for the `var_name`. By default, it will be called `value`.

In [40]:
# Unpivot every column except `religion`; with no names supplied the melted
# columns get melt's default labels, `variable` and `value`.
pew_long = pew.melt(id_vars='religion')

In [41]:
pew_long.sample(10)

Unnamed: 0,religion,variable,value
92,Buddhist,$50-75k,58
133,Historically Black Prot,$100-150k,81
76,Don’t know/refused,$40-50k,10
48,Muslim,$20-30k,9
106,Other World Religions,$50-75k,7
25,Historically Black Prot,$10-20k,244
119,Mormon,$75-100k,85
74,Buddhist,$40-50k,33
73,Atheist,$40-50k,35
27,Jewish,$10-20k,19


In [42]:
# Same reshape as before, but with descriptive names for the melted columns:
# the former column headers go into `income`, the cell values into `count`.
pew_long = pew.melt(
    id_vars='religion',
    var_name='income',
    value_name='count',
)
pew_long.sample(n=10)

Unnamed: 0,religion,income,count
30,Muslim,$10-20k,7
26,Jehovah's Witness,$10-20k,27
88,Other World Religions,$40-50k,2
168,Hindu,Don't know/refused,37
77,Evangelical Prot,$40-50k,881
7,Historically Black Prot,<$10k,228
149,Evangelical Prot,>150k,414
132,Hindu,$100-150k,48
71,Unaffiliated,$30-40k,365
80,Jehovah's Witness,$40-50k,21


In [43]:
# Load the Billboard chart data: one row per track, with weekly values spread
# across the wk1..wk76 columns.
billboard = pd.read_csv(filepath_or_buffer='../data/billboard.csv')

In [44]:
billboard.sample(10)

Unnamed: 0,year,artist,track,time,date.entered,wk1,wk2,wk3,wk4,wk5,...,wk67,wk68,wk69,wk70,wk71,wk72,wk73,wk74,wk75,wk76
91,2000,"Elliott, Missy ""Misdemeanor""",Hot Boyz,3:51,1999-11-27,36,21.0,13.0,9.0,7.0,...,,,,,,,,,,
189,2000,Madison Avenue,Don't Call Me Baby,3:44,2000-07-08,98,96.0,93.0,93.0,93.0,...,,,,,,,,,,
292,2000,Trick Daddy,Shut Up,4:17,2000-05-20,99,95.0,87.0,87.0,83.0,...,,,,,,,,,,
218,2000,N'Sync,Bye Bye Bye,3:15,2000-01-29,42,20.0,19.0,14.0,13.0,...,,,,,,,,,,
223,2000,Next,Wifey,4:03,2000-05-27,85,61.0,46.0,40.0,36.0,...,,,,,,,,,,
202,2000,"McGraw, Tim",My Next Thirty Years,3:37,2000-10-21,73,62.0,56.0,52.0,46.0,...,,,,,,,,,,
6,2000,A*Teens,Dancing Queen,3:44,2000-07-08,97,97.0,96.0,95.0,100.0,...,,,,,,,,,,
32,2000,Big Punisher,It's So Hard,3:25,2000-04-22,96,87.0,75.0,79.0,81.0,...,,,,,,,,,,
315,2000,Zombie Nation,Kernkraft 400,3:30,2000-09-02,99,99.0,,,,...,,,,,,,,,,
300,2000,Vertical Horizon,Everything You Want,4:01,2000-01-22,70,61.0,53.0,46.0,40.0,...,,,,,,,,,,


In [45]:
# Keep the track-level descriptors fixed and stack every remaining (wk*) column
# into a `week` / `rating` pair, giving one row per track per week.
track_descriptors = ['year', 'artist', 'track', 'time', 'date.entered']
billboard_long = billboard.melt(
    id_vars=track_descriptors,
    var_name='week',
    value_name='rating',
)

In [46]:
billboard_long.sample(10)

Unnamed: 0,year,artist,track,time,date.entered,week,rating
17428,2000,"Worley, Darryl",When You Need My Lov...,3:35,2000-06-17,wk55,
5223,2000,"John, Elton",Someday Out Of The B...,4:41,2000-04-22,wk17,
18576,2000,Madonna,American Pie,4:30,2000-02-19,wk59,
3327,2000,"Keith, Toby",How Do You Like Me N...,3:29,2000-01-29,wk11,31.0
3314,2000,Jay-Z,Big Pimpin',3:55,2000-04-22,wk11,21.0
8824,2000,Sister Hazel,Change Your Mind,4:02,2000-07-15,wk28,
3149,2000,"Urban, Keith",Your Everything,4:10,2000-07-15,wk10,51.0
10679,2000,N'Sync,Bye Bye Bye,3:15,2000-01-29,wk34,
9804,2000,"Tritt, Travis",Best Of Intentions,4:15,2000-08-19,wk31,
13044,2000,"Brooks, Garth",Do What You Gotta Do,2:56,2000-02-19,wk42,


In [47]:
billboard_long.shape

(24092, 7)

In [48]:
# Load the Ebola time series; each column after Date/Day encodes two variables
# in its name (e.g. `Cases_Guinea`).
ebola = pd.read_csv(filepath_or_buffer='../data/country_timeseries.csv')

In [49]:
ebola.columns

Index(['Date', 'Day', 'Cases_Guinea', 'Cases_Liberia', 'Cases_SierraLeone',
       'Cases_Nigeria', 'Cases_Senegal', 'Cases_UnitedStates', 'Cases_Spain',
       'Cases_Mali', 'Deaths_Guinea', 'Deaths_Liberia', 'Deaths_SierraLeone',
       'Deaths_Nigeria', 'Deaths_Senegal', 'Deaths_UnitedStates',
       'Deaths_Spain', 'Deaths_Mali'],
      dtype='object')

In [50]:
ebola.sample(10)

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
52,9/14/2014,176,942.0,2710.0,1673.0,,,,,,601.0,1459.0,562.0,,,,,
117,3/27/2014,5,103.0,8.0,6.0,,,,,,66.0,6.0,5.0,,,,,
76,7/8/2014,108,409.0,142.0,337.0,,,,,,309.0,88.0,142.0,,,,,
23,11/10/2014,233,,6878.0,,,,,,,,2812.0,,,,,,
3,1/2/2015,286,,8157.0,,,,,,,,3496.0,,,,,,
99,4/26/2014,35,224.0,,0.0,,,,,,143.0,,0.0,,,,,
29,10/31/2014,222,,6525.0,,,,,,,,2697.0,,,,,,
51,9/17/2014,179,,3022.0,,,,,,,,1578.0,,,,,,
56,9/7/2014,169,861.0,2081.0,1424.0,21.0,3.0,,,,557.0,1137.0,524.0,8.0,0.0,,,
96,5/5/2014,44,235.0,13.0,0.0,,,,,,157.0,11.0,0.0,,,,,


In [51]:
# Stack all Cases_*/Deaths_* columns; the combined column name lands in
# `variable` and still needs to be split into its two parts.
ebola_long = ebola.melt(id_vars=['Date', 'Day'])
ebola_long.sample(n=10)

Unnamed: 0,Date,Day,variable,value
1327,4/15/2014,24,Deaths_SierraLeone,
1285,8/9/2014,140,Deaths_SierraLeone,315.0
435,7/30/2014,129,Cases_Nigeria,3.0
914,8/20/2014,151,Cases_Mali,
371,12/28/2014,281,Cases_Nigeria,
1095,3/25/2014,3,Deaths_Guinea,60.0
91,5/27/2014,66,Cases_Guinea,281.0
1393,9/17/2014,179,Deaths_Nigeria,
112,4/4/2014,13,Cases_Guinea,143.0
165,10/5/2014,197,Cases_Liberia,


In [52]:
'Cases_Guinea'.split('_')

['Cases', 'Guinea']

In [53]:
# The .str accessor applies the string method to every element of the Series;
# each entry becomes a list such as ['Cases', 'Guinea'].
variable_split = ebola_long['variable'].str.split(pat='_')

In [54]:
type(variable_split)

pandas.core.series.Series

In [55]:
variable_split.sample(10)

932              [Cases, Mali]
1084          [Deaths, Guinea]
1519         [Deaths, Senegal]
221           [Cases, Liberia]
1824           [Deaths, Spain]
1024          [Deaths, Guinea]
1395         [Deaths, Nigeria]
82             [Cases, Guinea]
1162         [Deaths, Liberia]
1635    [Deaths, UnitedStates]
Name: variable, dtype: object

In [56]:
type(variable_split[0])

list

In [57]:
variable_split[0][1]

'Guinea'

In [58]:
# First element of every split list: the status part ('Cases' / 'Deaths').
status_values = variable_split.str[0]
status_values.head(n=5)

0    Cases
1    Cases
2    Cases
3    Cases
4    Cases
Name: variable, dtype: object

In [59]:
# Second element of every split list: the country part (e.g. 'Guinea').
country_values = variable_split.str[1]
country_values.head(n=5)

0    Guinea
1    Guinea
2    Guinea
3    Guinea
4    Guinea
Name: variable, dtype: object

In [60]:
# Store the two parsed pieces alongside the original `variable` column
# (status first, then country, so the column order is unchanged).
for column_name, parsed_values in (
    ('status', status_values),
    ('country', country_values),
):
    ebola_long[column_name] = parsed_values

In [61]:
ebola_long.sample(10)

Unnamed: 0,Date,Day,variable,value,status,country
991,11/30/2014,253,Deaths_Guinea,1327.0,Deaths,Guinea
871,11/23/2014,246,Cases_Mali,7.0,Cases,Mali
1631,10/1/2014,193,Deaths_UnitedStates,0.0,Deaths,UnitedStates
1866,10/18/2014,210,Deaths_Mali,,Deaths,Mali
645,10/19/2014,211,Cases_UnitedStates,3.0,Cases,UnitedStates
1060,6/17/2014,87,Deaths_Guinea,,Deaths,Guinea
472,4/16/2014,25,Cases_Nigeria,,Cases,Nigeria
1695,4/11/2014,20,Deaths_UnitedStates,,Deaths,UnitedStates
225,4/21/2014,30,Cases_Liberia,34.0,Cases,Liberia
1701,3/29/2014,7,Deaths_UnitedStates,,Deaths,UnitedStates


In [62]:
# expand=True returns the pieces as separate DataFrame columns (labelled 0 and 1)
# instead of a Series of lists. Note this rebinds `variable_split` to a DataFrame.
variable_split = ebola_long['variable'].str.split(pat='_', expand=True)

In [63]:
type(variable_split)

pandas.core.frame.DataFrame

In [64]:
variable_split.sample(10)

Unnamed: 0,0,1
811,Cases,Spain
72,Cases,Guinea
319,Cases,SierraLeone
221,Cases,Liberia
723,Cases,UnitedStates
1418,Deaths,Nigeria
1299,Deaths,SierraLeone
1491,Deaths,Senegal
1274,Deaths,SierraLeone
1946,Deaths,Mali


In [65]:
# Replace the default integer labels with descriptive names; the `_expand`
# suffix keeps them distinct from the `status`/`country` columns added earlier.
variable_split.columns = pd.Index(['status_expand', 'country_expand'])

In [66]:
variable_split.head()

Unnamed: 0,status_expand,country_expand
0,Cases,Guinea
1,Cases,Guinea
2,Cases,Guinea
3,Cases,Guinea
4,Cases,Guinea


In [67]:
# Attach the expanded split columns to the long table, side by side (axis=1).
# This cell rebinds `ebola_long` from itself, so any copies of these columns
# left over from a previous execution are dropped first; re-running the cell
# therefore no longer appends duplicate `*_expand` columns.
ebola_long = pd.concat(
    [
        ebola_long.drop(columns=variable_split.columns, errors='ignore'),
        variable_split,
    ],
    axis=1,
)

In [68]:
ebola_long.head()

Unnamed: 0,Date,Day,variable,value,status,country,status_expand,country_expand
0,1/5/2015,289,Cases_Guinea,2776.0,Cases,Guinea,Cases,Guinea
1,1/4/2015,288,Cases_Guinea,2775.0,Cases,Guinea,Cases,Guinea
2,1/3/2015,287,Cases_Guinea,2769.0,Cases,Guinea,Cases,Guinea
3,1/2/2015,286,Cases_Guinea,,Cases,Guinea,Cases,Guinea
4,12/31/2014,284,Cases_Guinea,2730.0,Cases,Guinea,Cases,Guinea


### Variables in both rows and columns...

In [69]:
# Load the weather data: days are spread across d1..dN columns, and the
# `element` column says whether a row holds tmax or tmin.
weather = pd.read_csv(filepath_or_buffer='../data/weather.csv')

In [70]:
weather.shape

(22, 35)

In [71]:
weather.iloc[:5, :11]

Unnamed: 0,id,year,month,element,d1,d2,d3,d4,d5,d6,d7
0,MX17004,2010,1,tmax,,,,,,,
1,MX17004,2010,1,tmin,,,,,,,
2,MX17004,2010,2,tmax,,27.3,24.1,,,,
3,MX17004,2010,2,tmin,,14.4,14.4,,,,
4,MX17004,2010,3,tmax,,,,,32.1,,


In [72]:
# Step 1: move the day columns (d1, d2, ...) into rows, leaving one
# temperature reading per id/year/month/element/day.
weather_melt = weather.melt(
    id_vars=['id', 'year', 'month', 'element'],
    var_name='day',
    value_name='temp',
)

In [57]:
weather_melt.head()

Unnamed: 0,id,year,month,element,day,temp
0,MX17004,2010,1,tmax,d1,
1,MX17004,2010,1,tmin,d1,
2,MX17004,2010,2,tmax,d1,
3,MX17004,2010,2,tmin,d1,
4,MX17004,2010,3,tmax,d1,


In [73]:
# Step 2: spread the `element` values (tmax / tmin) back out into their own
# columns, one row per id/year/month/day. The keys end up in the index.
weather_tidy = pd.pivot_table(
    weather_melt,
    values='temp',
    index=['id', 'year', 'month', 'day'],
    columns='element',
)

In [74]:
type(weather_tidy)

pandas.core.frame.DataFrame

In [75]:
weather_tidy.reset_index().head()

element,id,year,month,day,tmax,tmin
0,MX17004,2010,1,d30,27.8,14.5
1,MX17004,2010,2,d11,29.7,13.4
2,MX17004,2010,2,d2,27.3,14.4
3,MX17004,2010,2,d23,29.9,10.7
4,MX17004,2010,2,d3,24.1,14.4


In [78]:
# Pivot `element` into tmax/tmin columns and, in the same expression, turn the
# id/year/month/day index levels back into ordinary columns.
weather_tidy = pd.pivot_table(
    data=weather_melt,
    values='temp',
    index=['id', 'year', 'month', 'day'],
    columns='element',
).reset_index()

In [79]:
weather_tidy.sample(10)

element,id,year,month,day,tmax,tmin
27,MX17004,2010,11,d5,26.3,7.9
25,MX17004,2010,10,d7,28.1,12.9
17,MX17004,2010,8,d13,29.8,16.5
32,MX17004,2010,12,d6,27.8,10.5
28,MX17004,2010,11,d27,27.7,14.2
10,MX17004,2010,6,d17,28.0,17.5
22,MX17004,2010,10,d14,29.5,13.0
6,MX17004,2010,3,d16,31.1,17.6
1,MX17004,2010,2,d11,29.7,13.4
11,MX17004,2010,6,d29,30.1,18.0


In [80]:
billboard_long.sample(10)

Unnamed: 0,year,artist,track,time,date.entered,week,rating
11815,2000,Drama,"Left, Right, Left",3:37,2000-02-12,wk38,
8442,2000,"McEntire, Reba",What Do You Say,3:26,1999-10-30,wk27,
3407,2000,"Price, Kelly",You Should've Told M...,3:12,2000-09-23,wk11,64.0
6695,2000,Blink-182,All The Small Things,2:52,1999-12-04,wk22,41.0
331,2000,Alice Deejay,Better Off Alone,6:50,2000-04-08,wk2,65.0
5543,2000,Juvenile,U Understand,3:51,2000-02-05,wk18,
5657,2000,Son By Four,A Puro Dolor (Purest...,3:30,2000-04-08,wk18,32.0
1834,2000,"Rogers, Kenny",Buy Me A Rose,3:46,2000-03-11,wk6,46.0
1074,2000,Hoku,Another Dumb Blonde,3:47,2000-02-19,wk4,34.0
23328,2000,M2M,Don't Say You Love M...,3:41,1999-11-20,wk74,


In [81]:
# Every weekly row for one track: the song-level fields repeat on each row,
# which motivates splitting songs and ratings into separate tables.
billboard_long.loc[billboard_long['track'].eq('Loser')]

Unnamed: 0,year,artist,track,time,date.entered,week,rating
3,2000,3 Doors Down,Loser,4:24,2000-10-21,wk1,76.0
320,2000,3 Doors Down,Loser,4:24,2000-10-21,wk2,76.0
637,2000,3 Doors Down,Loser,4:24,2000-10-21,wk3,72.0
954,2000,3 Doors Down,Loser,4:24,2000-10-21,wk4,69.0
1271,2000,3 Doors Down,Loser,4:24,2000-10-21,wk5,67.0
...,...,...,...,...,...,...,...
22510,2000,3 Doors Down,Loser,4:24,2000-10-21,wk72,
22827,2000,3 Doors Down,Loser,4:24,2000-10-21,wk73,
23144,2000,3 Doors Down,Loser,4:24,2000-10-21,wk74,
23461,2000,3 Doors Down,Loser,4:24,2000-10-21,wk75,


In [82]:
# Keep only the fields that describe a song; still one row per track per week
# at this point, so duplicates remain.
song_columns = ['year', 'artist', 'track', 'time']
billboard_songs = billboard_long[song_columns]

In [83]:
billboard_songs.head()

Unnamed: 0,year,artist,track,time
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22
1,2000,2Ge+her,The Hardest Part Of ...,3:15
2,2000,3 Doors Down,Kryptonite,3:53
3,2000,3 Doors Down,Loser,4:24
4,2000,504 Boyz,Wobble Wobble,3:35


In [84]:
billboard_songs.shape

(24092, 4)

In [85]:
# Collapse to one row per distinct song: keep the first occurrence of each
# (year, artist, track, time) combination, preserving the original index labels.
billboard_songs = billboard_songs[~billboard_songs.duplicated()]

In [86]:
billboard_songs.shape

(317, 4)

In [76]:
range(10)

range(0, 10)

In [87]:
billboard_songs.shape[0]

317

In [88]:
len(billboard_songs)

317

In [89]:
# Give every unique song a surrogate integer key, 0 .. len-1, in row order.
# `billboard_songs` was produced from `billboard_long` by column selection
# followed by `drop_duplicates`, so writing a column into it in place can emit
# SettingWithCopyWarning (pandas without copy-on-write). `assign` builds a new
# frame instead, and re-running the cell simply recomputes the same `id` column.
billboard_songs = billboard_songs.assign(id=range(len(billboard_songs)))

In [90]:
billboard_songs.head(10)

Unnamed: 0,year,artist,track,time,id
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,0
1,2000,2Ge+her,The Hardest Part Of ...,3:15,1
2,2000,3 Doors Down,Kryptonite,3:53,2
3,2000,3 Doors Down,Loser,4:24,3
4,2000,504 Boyz,Wobble Wobble,3:35,4
5,2000,98^0,Give Me Just One Nig...,3:24,5
6,2000,A*Teens,Dancing Queen,3:44,6
7,2000,Aaliyah,I Don't Wanna,4:15,7
8,2000,Aaliyah,Try Again,4:03,8
9,2000,"Adams, Yolanda",Open My Heart,5:30,9


In [82]:
# Look up each weekly row's song `id` by joining on the song-level fields.
billboard_ratings = pd.merge(
    left=billboard_long,
    right=billboard_songs,
    on=['year', 'artist', 'track', 'time'],
)

In [83]:
billboard_ratings.head()

Unnamed: 0,year,artist,track,time,date.entered,week,rating,id
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk1,87.0,0
1,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk2,82.0,0
2,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk3,72.0,0
3,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk4,77.0,0
4,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk5,87.0,0


In [84]:
# Drop the repeated song descriptors; `id` now refers to the row in
# `billboard_songs` that carries them.
rating_columns = ['id', 'date.entered', 'week', 'rating']
billboard_ratings = billboard_ratings[rating_columns]

In [85]:
billboard_ratings.head()

Unnamed: 0,id,date.entered,week,rating
0,0,2000-02-26,wk1,87.0
1,0,2000-02-26,wk2,82.0
2,0,2000-02-26,wk3,72.0
3,0,2000-02-26,wk4,77.0
4,0,2000-02-26,wk5,87.0
