## Preppin' Data
### Week 17: The Price of Streaming

https://preppindata.blogspot.com/2022/04/2022-week-17-price-of-streaming.html

### 1. Import libraries and data files

In [54]:
#import pandas
import pandas as pd

In [55]:
#Read the two worksheets of the challenge workbook into separate dataframes
input_path = '2022W17 Input.xlsx'
streaming = pd.read_excel(input_path, sheet_name='Streaming')
pricing = pd.read_excel(input_path, sheet_name='Avg Pricing')

In [56]:
#Preview the first rows of the streaming table
streaming.head()

Unnamed: 0,userID,t,location,content_type,duration
0,3,2021-01-05T21:44:55Z,Essex,Preserved,47
1,3,2021-01-05T21:44:55Z,Essex,Preserved,29
2,3,2021-01-05T21:44:55Z,Essex,Preserved,31
3,3,2021-01-05T21:44:55Z,Essex,Preserved,4
4,3,2021-01-05T21:44:55Z,Essex,Preserved,8


In [57]:
#Preview the first rows of the pricing table
pricing.head()

Unnamed: 0,Month,Avg_Price,Content_Type
0,08 2020,20.92,Primary
1,09 2020,22.9,Primary
2,10 2020,23.41,Primary
3,11 2020,20.66,Primary
4,12 2020,19.61,Primary


### 2. Fix Spelling Errors

In [58]:
#Use .value_counts() to see all location entries and spot misspellings
streaming['location'].value_counts()

Edinurgh      514
Cardiff       319
London        308
Manchester    301
Essex         195
Plymouth      143
Cornwall      128
Nottingham     49
Perth          35
Glasgow        14
Newcastle      12
Norfolk        12
Bristol         7
Kent            6
Name: location, dtype: int64

In [59]:
#Use .replace() to correct the misspelled 'Edinurgh' to 'Edinburgh'
streaming['location'] = streaming['location'].replace({'Edinurgh': 'Edinburgh'})

### 3. Date Field Handling

In [60]:
#Convert the t column from text to datetime values with pd.to_datetime
streaming['t'] = pd.to_datetime(streaming['t'])

### 4. Update Content Type Fields

In [128]:
#Count how many rows currently carry each content_type value
streaming['content_type'].value_counts()

Primary      211
Preserved     94
Secondary     46
Name: content_type, dtype: int64

##### 4a. Use DataFrame.apply and a function to overwrite the content_type column

In [62]:
#Write function to assign values to content_type based on location and previous content_type

def content(df):
    """Return the updated content type for a single streaming row.

    Rows whose location is London, Cardiff or Edinburgh become 'Primary'.
    Any other row keeps 'Preserved' if that is its current content_type and
    is set to 'Secondary' otherwise.

    df: one row of the streaming table (despite the name), exposing
        `location` and `content_type` attributes.
    """
    primary_locations = ('London', 'Cardiff', 'Edinburgh')
    if df.location in primary_locations:
        return 'Primary'
    return 'Preserved' if df.content_type == 'Preserved' else 'Secondary'

In [63]:
#Call content() once per row (axis='columns') and overwrite content_type with the result
streaming['content_type'] = streaming.apply(content, axis='columns')

In [64]:
#Verify the update: only Primary, Preserved and Secondary should remain
streaming['content_type'].value_counts()

Primary      1141
Preserved     594
Secondary     308
Name: content_type, dtype: int64

### 5. Aggregate to view the total duration of each streaming session

In [66]:
#Collapse the rows into one row per streaming session: a session is identified by
#userID, location, t and content_type, and its total duration is the sum over its rows.
#as_index=False keeps the grouping keys as ordinary columns instead of moving them into the index.
streaming = streaming.groupby(['userID', 'location', 't', 'content_type'], as_index=False).sum()
streaming

Unnamed: 0,userID,location,t,content_type,duration
0,1,Cardiff,2022-01-04 18:31:56+00:00,Primary,32
1,1,London,2022-01-04 18:05:22+00:00,Primary,28
2,2,Bristol,2020-10-17 08:43:02+00:00,Secondary,248
3,2,Cardiff,2020-08-20 12:50:58+00:00,Primary,167
4,2,Cardiff,2020-08-20 13:12:28+00:00,Primary,260
...,...,...,...,...,...
346,5,Edinburgh,2022-01-26 10:11:00+00:00,Primary,37
347,5,Edinburgh,2022-01-26 10:26:33+00:00,Primary,25
348,5,Edinburgh,2022-03-07 05:23:45+00:00,Primary,17
349,5,London,2022-01-26 07:03:58+00:00,Primary,49


### 6. Find out when each user's first streaming session was
#### 6a. For primary content, take overall minimum streaming month, regardless of location

In [73]:
#Keep only the Primary sessions; .copy() gives an independent frame rather than a view of streaming
primary = streaming.loc[streaming['content_type'].eq('Primary')].copy()

In [74]:
#Take the column-wise minimum per userID, so t becomes each user's earliest primary session
#across all locations; as_index=False keeps userID as a regular column
msm = primary.groupby('userID', as_index=False).min()

In [75]:
#Drop the columns whose per-user minimum is not needed
msm = msm.drop(columns=['location', 'duration'])
msm

Unnamed: 0,userID,t,content_type
0,1,2022-01-04 18:05:22+00:00,Primary
1,2,2020-08-20 11:26:09+00:00,Primary
2,3,2020-08-21 18:15:46+00:00,Primary
3,4,2022-01-13 01:47:20+00:00,Primary
4,5,2021-09-06 09:41:27+00:00,Primary


In [76]:
#Build the 'month year' label (month is not zero-padded, e.g. '1 2022') from dt.month and dt.year;
#both parts are cast to string so they can be joined with a space
month_part = msm['t'].dt.month.astype('string')
year_part = msm['t'].dt.year.astype('string')
msm['minimum_streaming_month'] = month_part + ' ' + year_part
msm

Unnamed: 0,userID,t,content_type,minimum_streaming_month
0,1,2022-01-04 18:05:22+00:00,Primary,1 2022
1,2,2020-08-20 11:26:09+00:00,Primary,8 2020
2,3,2020-08-21 18:15:46+00:00,Primary,8 2020
3,4,2022-01-13 01:47:20+00:00,Primary,1 2022
4,5,2021-09-06 09:41:27+00:00,Primary,9 2021


In [77]:
#Remove the t column now that the month label has been derived from it
msm = msm.drop(columns='t')
msm

Unnamed: 0,userID,content_type,minimum_streaming_month
0,1,Primary,1 2022
1,2,Primary,8 2020
2,3,Primary,8 2020
3,4,Primary,1 2022
4,5,Primary,9 2021


In [80]:
#Attach each user's minimum streaming month to their primary sessions.
#The inner join on content_type keeps only the rows of streaming that are Primary,
#because msm holds Primary rows only.
primary = streaming.merge(msm, how='inner', on=['userID', 'content_type'])
primary

Unnamed: 0,userID,location,t,content_type,duration,minimum_streaming_month
0,1,Cardiff,2022-01-04 18:31:56+00:00,Primary,32,1 2022
1,1,London,2022-01-04 18:05:22+00:00,Primary,28,1 2022
2,2,Cardiff,2020-08-20 12:50:58+00:00,Primary,167,8 2020
3,2,Cardiff,2020-08-20 13:12:28+00:00,Primary,260,8 2020
4,2,Cardiff,2020-08-20 13:49:58+00:00,Primary,133,8 2020
...,...,...,...,...,...,...
206,5,Edinburgh,2022-01-26 10:11:00+00:00,Primary,37,9 2021
207,5,Edinburgh,2022-01-26 10:26:33+00:00,Primary,25,9 2021
208,5,Edinburgh,2022-03-07 05:23:45+00:00,Primary,17,9 2021
209,5,London,2022-01-26 07:03:58+00:00,Primary,49,9 2021


#### 6b. For other content, find the minimum active month for each user, in each location, for each content type

In [96]:
#Keep every session that is not Primary (i.e. Preserved and Secondary content)
other = streaming.loc[streaming['content_type'].ne('Primary')].copy()
other

Unnamed: 0,userID,location,t,content_type,duration
2,2,Bristol,2020-10-17 08:43:02+00:00,Secondary,248
27,2,Cornwall,2020-12-19 10:57:16+00:00,Secondary,263
28,2,Cornwall,2020-12-19 14:00:20+00:00,Secondary,197
29,2,Cornwall,2020-12-19 14:08:19+00:00,Secondary,191
30,2,Cornwall,2020-12-20 08:39:24+00:00,Secondary,216
...,...,...,...,...,...
302,3,Nottingham,2020-12-26 15:48:37+00:00,Preserved,86
303,3,Plymouth,2021-04-23 21:48:55+00:00,Secondary,224
304,3,Plymouth,2021-04-23 22:11:27+00:00,Secondary,187
305,3,Plymouth,2021-04-23 22:22:03+00:00,Secondary,129


In [97]:
#Group by user, location and content type and take the column-wise minimum, so t becomes the
#first session date of each group; as_index=False keeps the grouping keys as regular columns
msm = other.groupby(['userID', 'location', 'content_type'], as_index=False).min()
msm

Unnamed: 0,userID,location,content_type,t,duration
0,2,Bristol,Secondary,2020-10-17 08:43:02+00:00,248
1,2,Cornwall,Secondary,2020-12-19 10:57:16+00:00,145
2,2,Essex,Preserved,2020-08-21 19:17:17+00:00,184
3,2,Glasgow,Preserved,2020-11-07 14:20:58+00:00,170
4,2,Manchester,Preserved,2020-12-02 10:14:11+00:00,249
5,2,Nottingham,Preserved,2020-12-04 06:05:42+00:00,202
6,2,Perth,Preserved,2020-08-30 16:17:46+00:00,184
7,2,Plymouth,Secondary,2021-02-04 05:12:12+00:00,144
8,3,Cornwall,Secondary,2020-12-19 14:05:11+00:00,132
9,3,Essex,Preserved,2020-12-21 21:54:10+00:00,112


In [98]:
#Drop the duration column; its per-group minimum is not needed
msm = msm.drop(columns='duration')
msm

Unnamed: 0,userID,location,content_type,t
0,2,Bristol,Secondary,2020-10-17 08:43:02+00:00
1,2,Cornwall,Secondary,2020-12-19 10:57:16+00:00
2,2,Essex,Preserved,2020-08-21 19:17:17+00:00
3,2,Glasgow,Preserved,2020-11-07 14:20:58+00:00
4,2,Manchester,Preserved,2020-12-02 10:14:11+00:00
5,2,Nottingham,Preserved,2020-12-04 06:05:42+00:00
6,2,Perth,Preserved,2020-08-30 16:17:46+00:00
7,2,Plymouth,Secondary,2021-02-04 05:12:12+00:00
8,3,Cornwall,Secondary,2020-12-19 14:05:11+00:00
9,3,Essex,Preserved,2020-12-21 21:54:10+00:00


In [99]:
#Extract month and year with Series.dt.month and Series.dt.year; cast both to string and join
#them with a space to match the formatting used for the primary table
msm['minimum_streaming_month'] = (
    msm['t'].dt.month.astype('string')
    + ' '
    + msm['t'].dt.year.astype('string')
)
msm

Unnamed: 0,userID,location,content_type,t,minimum_streaming_month
0,2,Bristol,Secondary,2020-10-17 08:43:02+00:00,10 2020
1,2,Cornwall,Secondary,2020-12-19 10:57:16+00:00,12 2020
2,2,Essex,Preserved,2020-08-21 19:17:17+00:00,8 2020
3,2,Glasgow,Preserved,2020-11-07 14:20:58+00:00,11 2020
4,2,Manchester,Preserved,2020-12-02 10:14:11+00:00,12 2020
5,2,Nottingham,Preserved,2020-12-04 06:05:42+00:00,12 2020
6,2,Perth,Preserved,2020-08-30 16:17:46+00:00,8 2020
7,2,Plymouth,Secondary,2021-02-04 05:12:12+00:00,2 2021
8,3,Cornwall,Secondary,2020-12-19 14:05:11+00:00,12 2020
9,3,Essex,Preserved,2020-12-21 21:54:10+00:00,12 2020


In [100]:
#Drop the t column now that the month label has been derived from it
msm = msm.drop(columns='t')
msm

Unnamed: 0,userID,location,content_type,minimum_streaming_month
0,2,Bristol,Secondary,10 2020
1,2,Cornwall,Secondary,12 2020
2,2,Essex,Preserved,8 2020
3,2,Glasgow,Preserved,11 2020
4,2,Manchester,Preserved,12 2020
5,2,Nottingham,Preserved,12 2020
6,2,Perth,Preserved,8 2020
7,2,Plymouth,Secondary,2 2021
8,3,Cornwall,Secondary,12 2020
9,3,Essex,Preserved,12 2020


In [101]:
#Join minimum streaming information onto the non-primary sessions.
#The left side is re-derived from streaming instead of reusing `other`, so re-running this
#cell never merges an already-merged frame (which would produce duplicate
#minimum_streaming_month_x / _y columns and break the later pricing join).
non_primary = streaming[streaming.content_type != 'Primary']
other = pd.merge(non_primary, msm, on=['userID', 'location', 'content_type'])
other

Unnamed: 0,userID,location,t,content_type,duration,minimum_streaming_month
0,2,Bristol,2020-10-17 08:43:02+00:00,Secondary,248,10 2020
1,2,Cornwall,2020-12-19 10:57:16+00:00,Secondary,263,12 2020
2,2,Cornwall,2020-12-19 14:00:20+00:00,Secondary,197,12 2020
3,2,Cornwall,2020-12-19 14:08:19+00:00,Secondary,191,12 2020
4,2,Cornwall,2020-12-20 08:39:24+00:00,Secondary,216,12 2020
...,...,...,...,...,...,...
135,3,Nottingham,2020-12-26 15:48:37+00:00,Preserved,86,11 2020
136,3,Plymouth,2021-04-23 21:48:55+00:00,Secondary,224,4 2021
137,3,Plymouth,2021-04-23 22:11:27+00:00,Secondary,187,4 2021
138,3,Plymouth,2021-04-23 22:22:03+00:00,Secondary,129,4 2021


### 7. Join the pricing and streaming tables together based on Minimum Streaming Date

#### 7a. Union the primary and other tables

In [102]:
#Union the primary and other tables by stacking their rows with pd.concat.
#DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0; pd.concat gives
#the same result, including keeping each table's original row index.
union = pd.concat([primary, other])
union

Unnamed: 0,userID,location,t,content_type,duration,minimum_streaming_month
0,1,Cardiff,2022-01-04 18:31:56+00:00,Primary,32,1 2022
1,1,London,2022-01-04 18:05:22+00:00,Primary,28,1 2022
2,2,Cardiff,2020-08-20 12:50:58+00:00,Primary,167,8 2020
3,2,Cardiff,2020-08-20 13:12:28+00:00,Primary,260,8 2020
4,2,Cardiff,2020-08-20 13:49:58+00:00,Primary,133,8 2020
...,...,...,...,...,...,...
135,3,Nottingham,2020-12-26 15:48:37+00:00,Preserved,86,11 2020
136,3,Plymouth,2021-04-23 21:48:55+00:00,Secondary,224,4 2021
137,3,Plymouth,2021-04-23 22:11:27+00:00,Secondary,187,4 2021
138,3,Plymouth,2021-04-23 22:22:03+00:00,Secondary,129,4 2021


#### 7b. Join Pricing Data in based on minimum streaming month

*Additional cleanup required on pricing table*

In [118]:
#Remove a single leading zero from 'Month' (e.g. '08 2020' -> '8 2020') so the values match
#the unpadded minimum_streaming_month labels
pricing['Month'] = pricing['Month'].str.replace(r'^0', '', regex=True)
pricing

Unnamed: 0,Month,Avg_Price,Content_Type
0,8 2020,20.92,Primary
1,9 2020,22.9,Primary
2,10 2020,23.41,Primary
3,11 2020,20.66,Primary
4,12 2020,19.61,Primary
5,1 2021,16.05,Primary
6,2 2021,16.43,Primary
7,3 2021,15.99,Primary
8,4 2021,15.83,Primary
9,5 2021,14.83,Primary


In [120]:
#Left-join pricing onto the union table by month label and content type; sessions with no
#matching pricing row are kept and get missing values in the pricing columns
full = union.merge(
    pricing,
    how='left',
    left_on=['minimum_streaming_month', 'content_type'],
    right_on=['Month', 'Content_Type'],
)
full

Unnamed: 0,userID,location,t,content_type,duration,minimum_streaming_month,Month,Avg_Price,Content_Type
0,1,Cardiff,2022-01-04 18:31:56+00:00,Primary,32,1 2022,1 2022,6.56,Primary
1,1,London,2022-01-04 18:05:22+00:00,Primary,28,1 2022,1 2022,6.56,Primary
2,2,Cardiff,2020-08-20 12:50:58+00:00,Primary,167,8 2020,8 2020,20.92,Primary
3,2,Cardiff,2020-08-20 13:12:28+00:00,Primary,260,8 2020,8 2020,20.92,Primary
4,2,Cardiff,2020-08-20 13:49:58+00:00,Primary,133,8 2020,8 2020,20.92,Primary
...,...,...,...,...,...,...,...,...,...
346,3,Nottingham,2020-12-26 15:48:37+00:00,Preserved,86,11 2020,,,
347,3,Plymouth,2021-04-23 21:48:55+00:00,Secondary,224,4 2021,4 2021,14.98,Secondary
348,3,Plymouth,2021-04-23 22:11:27+00:00,Secondary,187,4 2021,4 2021,14.98,Secondary
349,3,Plymouth,2021-04-23 22:22:03+00:00,Secondary,129,4 2021,4 2021,14.98,Secondary


In [121]:
#Remove the join helper columns and the duplicated content type column
full = full.drop(columns=['Month', 'minimum_streaming_month', 'Content_Type'])
full

Unnamed: 0,userID,location,t,content_type,duration,Avg_Price
0,1,Cardiff,2022-01-04 18:31:56+00:00,Primary,32,6.56
1,1,London,2022-01-04 18:05:22+00:00,Primary,28,6.56
2,2,Cardiff,2020-08-20 12:50:58+00:00,Primary,167,20.92
3,2,Cardiff,2020-08-20 13:12:28+00:00,Primary,260,20.92
4,2,Cardiff,2020-08-20 13:49:58+00:00,Primary,133,20.92
...,...,...,...,...,...,...
346,3,Nottingham,2020-12-26 15:48:37+00:00,Preserved,86,
347,3,Plymouth,2021-04-23 21:48:55+00:00,Secondary,224,14.98
348,3,Plymouth,2021-04-23 22:11:27+00:00,Secondary,187,14.98
349,3,Plymouth,2021-04-23 22:22:03+00:00,Secondary,129,14.98


### 8. Set preserved content price

In [123]:
#First check which content types the rows without a price belong to (expected: only Preserved)
full.loc[full['Avg_Price'].isna(), 'content_type'].value_counts()

Preserved    94
Name: content_type, dtype: int64

In [124]:
#Fill in the fixed price for Preserved content.
#Only rows that are Preserved AND have no price are filled, so a missing price on any other
#content type stays visible as NaN instead of silently receiving the Preserved price.
PRESERVED_PRICE = 14.98
needs_price = full['content_type'].eq('Preserved') & full['Avg_Price'].isna()
full['Avg_Price'] = full['Avg_Price'].mask(needs_price, PRESERVED_PRICE)
full

Unnamed: 0,userID,location,t,content_type,duration,Avg_Price
0,1,Cardiff,2022-01-04 18:31:56+00:00,Primary,32,6.56
1,1,London,2022-01-04 18:05:22+00:00,Primary,28,6.56
2,2,Cardiff,2020-08-20 12:50:58+00:00,Primary,167,20.92
3,2,Cardiff,2020-08-20 13:12:28+00:00,Primary,260,20.92
4,2,Cardiff,2020-08-20 13:49:58+00:00,Primary,133,20.92
...,...,...,...,...,...,...
346,3,Nottingham,2020-12-26 15:48:37+00:00,Preserved,86,14.98
347,3,Plymouth,2021-04-23 21:48:55+00:00,Secondary,224,14.98
348,3,Plymouth,2021-04-23 22:11:27+00:00,Secondary,187,14.98
349,3,Plymouth,2021-04-23 22:22:03+00:00,Secondary,129,14.98


### 9. Export to CSV

In [127]:
#Write the final table to CSV without the dataframe index
output_path = 'pandas_solution.csv'
full.to_csv(output_path, index=False)