In [1]:
import pandas as pd
import numpy as np
import sqlite3
import regex as re
import string

In [2]:
# Build SQL Connection, need GuidePod4.sqlite file in your folder
conn = sqlite3.connect('GuidePod4.sqlite')
cur = conn.cursor()

In [3]:
# Get all table names from GuidePod4 file
tables_df = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", conn)
tables_df

Unnamed: 0,name
0,podcast_reviews
1,episode_counts
2,podcast_master
3,genre_master
4,podcast_desc
5,podcast_reviews_temp
6,episode_details


In [4]:
# podcast_master
podcasts_df = pd.read_sql_query("SELECT * FROM podcast_master", conn)

# podcast_reviews
reviews_df = pd.read_sql_query("SELECT * FROM podcast_reviews", conn)

# episode_counts
episodes_df = pd.read_sql_query("SELECT * FROM episode_counts", conn)

# genre_master
genres_df = pd.read_sql_query("SELECT * FROM genre_master", conn)

# podcast_desc
descriptions_df = pd.read_sql_query("SELECT * FROM podcast_desc", conn)

# episode_details
episode_details_df = pd.read_sql_query("SELECT * FROM episode_details", conn)

## Podcast Data Cleanup

In [5]:
podcasts_df

Unnamed: 0,country,rank,name,id,releaseDate,Primary_Genre,Other_Genre,Artist,FeedURL,Explicit_content
0,us,1,Mommy Doomsday,1540849480,2021-02-09,True Crime,"Society & Culture,",NBC News,https://podcastfeeds.nbcnews.com/mommy-doomsday,0
1,us,2,Crime Junkie,1322200189,2020-10-26,True Crime,,audiochuck,https://feeds.megaphone.fm/ADL9840290619,0
2,us,3,The Daily,1200361736,2020-10-28,Daily News,"News,",The New York Times,https://feeds.simplecast.com/54nAGcIl,0
3,us,4,Dateline NBC,1464919521,2020-10-28,True Crime,"News,",NBC News,https://podcastfeeds.nbcnews.com/dateline-nbc,0
4,us,5,The Ben Shapiro Show,1047335260,2020-10-27,News,,The Daily Wire,https://feeds.megaphone.fm/WWO8086402096,0
...,...,...,...,...,...,...,...,...,...,...
22085,us,0,Snapped: Women Who Murder,1145089790,,True Crime,"TV & Film,",Oxygen,https://rss.art19.com/snapped-women-who-murder,-1
22086,us,0,Most Notorious! A True Crime History Podcast,1055044256,,True Crime,"History,",Blue Ewe Media,https://www.spreaker.com/show/4698315/episodes...,-1
22087,us,0,The Daily Crime,1547278397,,True Crime,,VAULT Studios,https://feeds.megaphone.fm/thedailycrime,-1
22088,us,0,Crimeaholics,1501365478,,True Crime,,Kenzi & Holly,https://anchor.fm/s/d4de434/podcast/rss,-1


In [6]:
# Check for duplicates from COUNT(*) and COUNT(unique podcast IDs)
cur.execute("""
SELECT
  COUNT(*)
, COUNT(DISTINCT id)
FROM podcast_master 
"""
)

print(cur.fetchall())

[(22090, 21643)]


In [7]:
# Creates a new dataframe to get clean data from podcasts_master (still duplicating 3 podcasts)

podcasts_clean = pd.read_sql_query("""
WITH rank as 
(
    SELECT 
      *
    , ROW_NUMBER() OVER (PARTITION BY name, id) as ranked
    FROM podcast_master 
    WHERE country = 'us'
    GROUP BY 1,2,3,4,5,6,7,8
)
SELECT
*
FROM rank
WHERE ranked = 1
""", conn)

podcasts_clean = podcasts_clean.drop(columns=['ranked'])
podcasts_clean.head()

Unnamed: 0,country,rank,name,id,releaseDate,Primary_Genre,Other_Genre,Artist,FeedURL,Explicit_content
0,us,0,"""A Tale of Two Cities"" Audiobook (Audio book)",185699549,,Books,"Arts,",Charles Dickens performed by Jane Aker,http://podcasts.loudlit.org/podcasts/twocities...,-1
1,us,0,"""DOPE AS USUAL""",1552587575,,Comedy Interviews,"Comedy,Leisure,",Dope As Yola,https://dopeasusual.libsyn.com/rss,-1
2,us,0,"""Doctor Mom"" Podcast",973303969,,Parenting,"Kids & Family,Health & Fitness,Alternative Hea...","Stephanie Greunke, RD and Dr. Elana Roumell, ND",https://realfoodmamas.libsyn.com/rss,-1
3,us,0,"""Pretty Boy"" Doug Masters: The Kayfabe Korner",1536386337,,Wrestling,"Sports,",Action VR Network,https://www.spreaker.com/show/3605803/episodes...,-1
4,us,0,"""See, The Thing Is...""",1534096260,,Society & Culture,"Comedy,",The Joe Budden Network,https://sttipodcast.libsyn.com/STTIPOD,-1


> Use `podcasts_df` as the base dataset with all info in table

> Use `podcasts_clean` dataset for modeling/analyses

## Episode Counts Cleanup

In [8]:
# Check out Row 417
episodes_df

Unnamed: 0,name,id,author,episode_count
0,Mommy Doomsday,1540849480,NBC News,7
1,Crime Junkie,1322200189,audiochuck,180
2,The Daily,1200361736,The New York Times,1158
3,Dateline NBC,1464919521,NBC News,334
4,The Ben Shapiro Show,1047335260,The Daily Wire,350
...,...,...,...,...
413,Daily Dose,1456761991,Newslaundry.com,656
414,Prime Time with Ravish,1442530475,NDTV,10
415,Detective Mathema's Maths Puzzles for Kids,1184274374,Fun Kids,5
416,RED FM LOVE STORY by RJ PAHI,1289643592,Red FM,39


In [9]:
# Removed the row that contained header value
episodes_df = episodes_df[episodes_df.name != 'name']
episodes_df

Unnamed: 0,name,id,author,episode_count
0,Mommy Doomsday,1540849480,NBC News,7
1,Crime Junkie,1322200189,audiochuck,180
2,The Daily,1200361736,The New York Times,1158
3,Dateline NBC,1464919521,NBC News,334
4,The Ben Shapiro Show,1047335260,The Daily Wire,350
...,...,...,...,...
412,TED Talks Education,470623037,TED,112
413,Daily Dose,1456761991,Newslaundry.com,656
414,Prime Time with Ravish,1442530475,NDTV,10
415,Detective Mathema's Maths Puzzles for Kids,1184274374,Fun Kids,5


In [10]:
# One duplicate name, make sure to join on ID 
episodes_df.describe()

Unnamed: 0,name,id,author,episode_count
count,417,417,417,417
unique,416,417,295,222
top,Motley Fool Money,1537865727,NPR,100
freq,2,1,16,14


In [11]:
# Confirming they are different episodes with different IDs
episodes_df[episodes_df.name == 'Motley Fool Money']

Unnamed: 0,name,id,author,episode_count
141,Motley Fool Money,306106212,The Motley Fool,672
339,Motley Fool Money,1118867383,LiSTNR,380


> `episodes_df` is clean to use for analysis

## Genre Master Cleanup

In [12]:
genres_df

Unnamed: 0,genre
0,True Crime
1,Daily News
2,News
3,Christianity
4,Society & Culture
...,...
70,Leisure
71,Mathematics
72,Nature
73,Pets & Animals


In [13]:
# After Shows has two rows in here for whatever reason
genres_df.groupby('genre').size()

genre
After Shows           2
Alternative Health    1
Arts                  1
Astronomy             1
Basketball            1
                     ..
TV & Film             1
TV Reviews            1
Tech News             1
Technology            1
True Crime            1
Length: 74, dtype: int64

In [14]:
# Removing duplicates and getting unique values only
genres_df = genres_df.drop_duplicates()
genres_df

Unnamed: 0,genre
0,True Crime
1,Daily News
2,News
3,Christianity
4,Society & Culture
...,...
70,Leisure
71,Mathematics
72,Nature
73,Pets & Animals


> `genres_df` is clean to use for analysis

## Episode Details Cleanup

In [15]:
# Need to clean out the duration so we get only the numbers instead of the string field

episode_details_df

Unnamed: 0,podcast_id,title,description,duration,pubDate,cleanDate
0,270054094,"""the red sea""","""by stephen edgar""",174,Sun 21 Mar 2021 17:00:00 +0000,2021-03-21
1,270054094,"""to arielle and the moon""","""by david trinidad""",87,Sat 20 Mar 2021 17:00:00 +0000,2021-03-20
2,270054094,"""late melt""","""by melissa broder""",60,Fri 19 Mar 2021 17:00:00 +0000,2021-03-19
3,270054094,"""life of savage""","""by vijay seshadri""",121,Thu 18 Mar 2021 17:00:00 +0000,2021-03-18
4,270054094,"""misgivings""","""by william matthews""",85,Wed 17 Mar 2021 17:00:00 +0000,2021-03-17
...,...,...,...,...,...,...
2656559,1201374883,"""growing up with transitions i""","""in this episode i explore the concept of tran...",<itunes:duration>903</itunes:duration>,Sat 19 Aug 2006 12:49:00 GMT,2006-08-19
2656560,1201374883,"""random train of thought""","""ok this show is a long one for me 19 47 to be...",<itunes:duration>1187</itunes:duration>,Wed 16 Aug 2006 11:16:00 GMT,2006-08-16
2656561,1201374883,"""and now i'm down to one in the can ""","""any audio pro would scold me for posting such...",<itunes:duration>770</itunes:duration>,Sun 30 Jul 2006 15:03:00 GMT,2006-07-30
2656562,1201374883,"""the man and his plane""","""hey folks i won't even bother to claim that t...",<itunes:duration>559</itunes:duration>,Sun 16 Jul 2006 16:29:00 GMT,2006-07-16


In [16]:
# Cleaning up the bad data
# Create new field that contains itunes or is null, and extracting the numerical duration from string

bad_durations = episode_details_df[episode_details_df['duration'].str.contains("itunes") | episode_details_df['duration'].isnull()]
bad_durations['new_duration'] = bad_durations['duration'].copy()
bad_durations['new_duration'] = bad_durations['new_duration'].str.extract('(\d+)')
bad_durations.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bad_durations['new_duration'] = bad_durations['duration'].copy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bad_durations['new_duration'] = bad_durations['new_duration'].str.extract('(\d+)')


Unnamed: 0,podcast_id,title,description,duration,pubDate,cleanDate,new_duration
99,1512510969,"""review durable trades by rory groves""","""in this episode i review rory groves apos bo...",<itunes:duration>3380</itunes:duration>,Wed 17 Mar 2021 05:00:00 -0600,2021-03-17,3380
100,1512510969,"""why overload is destroying men""","""in this episode we'll talk about how overload...",<itunes:duration>2819</itunes:duration>,Sat 06 Mar 2021 12:00:00 -0700,2021-03-06,2819
101,1512510969,"""essential habits of the resilient man""","""as humans we all without exception face adver...",<itunes:duration>4012</itunes:duration>,Sat 13 Feb 2021 12:00:00 -0700,2021-02-13,4012
102,1512510969,"""interview aaron renn from the masculinist ""","""in this episode i talk with aaron renn from t...",<itunes:duration>3878</itunes:duration>,Thu 04 Feb 2021 05:00:00 -0700,2021-02-04,3878
103,1512510969,"""the art of culture war""","""it's a trying time for christians waking up t...",<itunes:duration>4209</itunes:duration>,Thu 28 Jan 2021 05:00:00 -0700,2021-01-28,4209


In [17]:
# Updating episode_details_df with new_duration
episode_details_df['new_duration'] = episode_details_df['duration']
episode_details_df.update(bad_durations)

In [18]:
# Some values in the bad_durations did not populate because they had no values
# Turning those values into numbers and imputing them with averages
# Taking one outlier and imputing with the average
# Averages were calculated in Excel = 20,200

avg_duration = 20200
episode_details_df.loc[episode_details_df['new_duration'].str.contains('duration', na=False), 'new_duration'] = avg_duration
episode_details_df.loc[episode_details_df['new_duration'] == 4294967295] = avg_duration
episode_details_df.loc[episode_details_df['new_duration'].isna()] = avg_duration
episode_details_df['new_duration'] = pd.to_numeric(episode_details_df['new_duration'])
episode_details_df.loc[episode_details_df['new_duration'] < 100] = avg_duration

In [19]:
episode_details_df.dtypes

podcast_id      float64
title            object
description      object
duration         object
pubDate          object
cleanDate        object
new_duration      int64
dtype: object

In [20]:
episode_details_df.describe()

Unnamed: 0,podcast_id,new_duration
count,2656564.0,2656564.0
mean,1028160000.0,3448.663
std,454414000.0,34853.25
min,20200.0,100.0
25%,737480500.0,1387.0
50%,1153679000.0,2513.0
75%,1434346000.0,3780.0
max,1557399000.0,42687810.0


In [21]:
episode_details_df.sort_values(by=['new_duration'])

Unnamed: 0,podcast_id,title,description,duration,pubDate,cleanDate,new_duration
665448,1.033620e+09,"""apn16 the equanimity game""","""this is a mini-chapter from brian's book a p...",100,Fri 07 Aug 2015 18:00:00 +0000,2015-08-07,100
1995151,2.154199e+08,"""0003 mammoth hot spring terraces overview""",,100,Wed 7 Feb 2007 19:00:00 GMT,2007-02-07,100
1713029,1.272795e+09,"""brady the tribute song greg and the morni...","""the best of greg the morning buzz listen w...",100,Thu 01 Feb 2018 09:55:58 -0500,2018-02-01,100
992003,1.099657e+09,"""letter to bakul""",""" """,<itunes:duration>100</itunes:duration>,Tue 17 Nov 2020 20:00:00 -0500,2020-11-17,100
285355,9.085029e+08,"""37 its all about the entry price""","""in this session jerry robinson explains an im...",<itunes:duration>100</itunes:duration>,Fri 03 Oct 2014 19:00:49 +0000,2014-10-03,100
...,...,...,...,...,...,...,...
27071,1.338733e+09,"""e05 nocturnes shabane ha ""","""nocturneskazuo ishiguro t me cafeketab""",<itunes:duration>11136430</itunes:duration>,Thu 17 Jan 2019 00:00:00 +0000,2019-01-17,11136430
27072,1.338733e+09,"""e04 nocturnes shabane ha ""","""nocturneskazuo ishiguro t me cafeketab""",<itunes:duration>11495086</itunes:duration>,Thu 17 Jan 2019 00:00:00 +0000,2019-01-17,11495086
27073,1.338733e+09,"""e03 nocturnes shabane ha ""","""nocturneskazuo ishiguro t me cafeketab""",<itunes:duration>11626609</itunes:duration>,Thu 17 Jan 2019 00:00:00 +0000,2019-01-17,11626609
27074,1.338733e+09,"""e02 nocturnes shabane ha ""","""nocturneskazuo ishiguro t me cafeketab""",<itunes:duration>12427059</itunes:duration>,Thu 17 Jan 2019 00:00:00 +0000,2019-01-17,12427059


> `episode_details_df` has a `new_duration` column that has the length of duration standardized in ints

In [22]:
# Leaving the episode description, reviews, and reviews for Gary & Harjot's word parsing and 
# cleaning libraries to clean up.

## Reviews Cleanup

In [23]:
reviews_df

Unnamed: 0,id,country,review_title,review_text,review_rating
0,1540849480,us,Good story condescending narration,Its a fascinating story and its covered fairly...,3
1,1540849480,us,Ugh Keith Morrison,I know its just me but I cant stand a podcast ...,2
2,1540849480,us,Great Narrator,Great Narration with a few new bits more than ...,5
3,1540849480,us,Amazing,This seems like straight out of a mystery book...,5
4,1540849480,us,LDS Cult Judged correctly Mormons mad,A tragic story of cult devotees masquerade by...,5
...,...,...,...,...,...
1020606,1195484262,US,Another GREAT show,awesome duo great stories and information appr...,5
1020607,1195484262,US,Love these guys,this one was a bit hard to follow but super in...,5
1020608,1195484262,US,Great show,love it,5
1020609,1195484262,US,Amateur Hour,hard to listen to i couldn t make it through t...,2


In [24]:
us_reviews = reviews_df[reviews_df['id'].isin(podcasts_clean['id']) & reviews_df['review_text'].notnull() & reviews_df['review_title'].notnull()]
us_reviews = us_reviews[['id','review_title','review_text']]
us_reviews['title'] = us_reviews.groupby(['id'], as_index=False)['review_title'].transform(lambda x : ' '.join(x))
us_reviews['text'] = us_reviews.groupby(['id'], as_index=False)['review_text'].transform(lambda x : ' '.join(x))

In [25]:
us_reviews = us_reviews[['id','title','text']].drop_duplicates()
reviews_clean = us_reviews.merge(descriptions_df, left_on = 'id', right_on='podcast_id', how='left')
reviews_clean = reviews_clean[['id','title','text','description']]

In [26]:
cols = ['title', 'text', 'description']
reviews_clean['combined'] = reviews_clean[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
reviews_clean = reviews_clean[['id','combined']]

In [27]:
reviews_score = reviews_df[['id','review_rating']].groupby('id').agg(['count','mean']).reset_index()
reviews_score.columns = ['id','num_reviews','avg_review_score']
reviews_score

Unnamed: 0,id,num_reviews,avg_review_score
0,73329271,34,4.735294
1,73329284,427,3.850117
2,73329293,50,4.120000
3,73329404,500,3.012000
4,73329520,50,4.760000
...,...,...,...
18679,1557373207,3,5.000000
18680,1557381717,15,5.000000
18681,1557385009,2,5.000000
18682,1557394864,2,5.000000


In [28]:
reviews_final = reviews_clean.merge(reviews_score, on = 'id', how = 'left')
reviews_final

Unnamed: 0,id,combined,num_reviews,avg_review_score
0,1540849480,Good story condescending narration Ugh Keith ...,120,3.833333
1,1322200189,AMAZING Five star The best true crime podcast ...,296,4.462838
2,1200361736,Bring the other guy I listen every day Fake ne...,485,3.400000
3,1464919521,Dateline where did you go Love Only getting wo...,115,3.617391
4,1047335260,nope Bad Poggers What I like about this podcas...,525,3.733333
...,...,...,...,...
18203,1145089790,Luv it Make the description match the podcast ...,37,3.756757
18204,1055044256,Most Notorious is a fascinating look at true c...,411,4.824818
18205,1547278397,Good concept poor execution Something daily to...,59,3.762712
18206,1501365478,Two Amazing Women Doing Amazing Things Distrac...,43,4.488372


In [29]:
def onlyWords(s):
    return re.sub(r'[^\w\s]', ' ', s).strip().lower()

In [30]:
reviews_final.loc[:, 'combined'] = reviews_final.loc[:, 'combined'].apply(lambda x: onlyWords(x))

In [31]:
reviews_final

Unnamed: 0,id,combined,num_reviews,avg_review_score
0,1540849480,good story condescending narration ugh keith ...,120,3.833333
1,1322200189,amazing five star the best true crime podcast ...,296,4.462838
2,1200361736,bring the other guy i listen every day fake ne...,485,3.400000
3,1464919521,dateline where did you go love only getting wo...,115,3.617391
4,1047335260,nope bad poggers what i like about this podcas...,525,3.733333
...,...,...,...,...
18203,1145089790,luv it make the description match the podcast ...,37,3.756757
18204,1055044256,most notorious is a fascinating look at true c...,411,4.824818
18205,1547278397,good concept poor execution something daily to...,59,3.762712
18206,1501365478,two amazing women doing amazing things distrac...,43,4.488372


## Episode Count Cleanup

In [32]:
# Get episode counts 
episode_counts = episode_details_df[['podcast_id','title']].groupby('podcast_id').count().reset_index()
episode_counts.columns = ['podcast_id','episode_count']
episode_counts

Unnamed: 0,podcast_id,episode_count
0,2.020000e+04,86868
1,7.332927e+07,170
2,7.332928e+07,50
3,7.332929e+07,297
4,7.332940e+07,10
...,...,...
20821,1.557395e+09,3
20822,1.557397e+09,3
20823,1.557397e+09,19
20824,1.557398e+09,2


In [33]:
podcast_duration = episode_details_df[['podcast_id','new_duration']].groupby('podcast_id').sum().reset_index()
podcast_duration.columns = ['podcast_id','total_duration']
podcast_duration

Unnamed: 0,podcast_id,total_duration
0,2.020000e+04,1754733600
1,7.332927e+07,383033
2,7.332928e+07,141270
3,7.332929e+07,635450
4,7.332940e+07,88290
...,...,...
20821,1.557395e+09,13126
20822,1.557397e+09,3295
20823,1.557397e+09,46800
20824,1.557398e+09,976


In [34]:
episodes_final = episode_counts.merge(podcast_duration, on = 'podcast_id')
episodes_final

Unnamed: 0,podcast_id,episode_count,total_duration
0,2.020000e+04,86868,1754733600
1,7.332927e+07,170,383033
2,7.332928e+07,50,141270
3,7.332929e+07,297,635450
4,7.332940e+07,10,88290
...,...,...,...
20821,1.557395e+09,3,13126
20822,1.557397e+09,3,3295
20823,1.557397e+09,19,46800
20824,1.557398e+09,2,976


## Final Cleanup

In [35]:
# Use podcasts_df as main
# JOIN episodes_df for number of episodes
# JOIN descriptions_df for description
# 

podcasts_cols = podcasts_clean[['id','name','releaseDate','Primary_Genre','Artist']]
#episodes_cols = episodes_df[['id','episode_count']]
#final_df = podcasts_cols.merge(episodes_cols, on = 'id', how = 'left')
final_df = podcasts_cols.merge(episodes_final, left_on = 'id', right_on = 'podcast_id', how = 'inner')
final_df = final_df.merge(reviews_final, on = 'id')
final_df.head()

Unnamed: 0,id,name,releaseDate,Primary_Genre,Artist,podcast_id,episode_count,total_duration,combined,num_reviews,avg_review_score
0,185699549,"""A Tale of Two Cities"" Audiobook (Audio book)",,Books,Charles Dickens performed by Jane Aker,185699500.0,31,58285,excellent very well done all i could want high...,56,4.839286
1,1552587575,"""DOPE AS USUAL""",,Comedy Interviews,Dope As Yola,1552588000.0,7,32102,love this podcast dope as yola podcast is amaz...,50,4.96
2,973303969,"""Doctor Mom"" Podcast",,Parenting,"Stephanie Greunke, RD and Dr. Elana Roumell, ND",973304000.0,218,640393,so supportive and educational thank heaven for...,50,5.0
3,1534096260,"""See, The Thing Is...""",,Society & Culture,The Joe Budden Network,1534096000.0,24,180997,mandi disappointed i m not gonna hold you grea...,212,4.330189
4,1521742221,"""So Look Bro"" Podcast",,Music Commentary,DJ Ghost,1521742000.0,10,33966,ghost gang run it up recommend fire ghost gang...,166,4.945783


In [36]:
dummy = pd.get_dummies(final_df['Primary_Genre'])
dummy.head()

Unnamed: 0,After Shows,Alternative Health,Animation & Manga,Arts,Astronomy,Automotive,Aviation,Baseball,Basketball,Books,...,TV Reviews,Tech News,Technology,Tennis,True Crime,Video Games,Visual Arts,Volleyball,Wilderness,Wrestling
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
model_df = pd.concat([final_df,dummy], axis = 1)
model_df.drop('podcast_id', axis = 'columns', inplace=True)

In [None]:
model_df.head()

In [None]:
test = model_df.iloc[3,8]

In [None]:
test

In [None]:
# model_df.to_csv('model_ready_podcasts.csv')

In [None]:
from sqlalchemy import create_engine

engine = create_engine('sqlite:///podcast_clean.sqlite', echo=True)
sqlite_connection = engine.connect()

In [None]:
sqlite_table = "podcast_model_data"
model_df.to_sql(sqlite_table, sqlite_connection, if_exists='fail')

In [37]:
final_df.to_csv('podcast_final.csv')

In [39]:
max(final_df['total_duration'])/3600

36147.70861111111

In [40]:
max(final_df['num_reviews'])

1000

In [41]:
test = descriptions_df.merge(podcasts_df, left_on = "podcast_id", right_on = "id")
test[['name','Artist','description']]

Unnamed: 0,name,Artist,description
0,Mommy Doomsday,NBC News,The disappearance of two of Lori Vallow’s chil...
1,Mommy Doomsday,NBC News,The disappearance of two of Lori Vallow’s chil...
2,Mommy Doomsday,NBC News,The disappearance of two of Lori Vallow’s chil...
3,Mommy Doomsday,NBC News,The disappearance of two of Lori Vallow’s chil...
4,Mommy Doomsday,NBC News,The disappearance of two of Lori Vallow’s chil...
...,...,...,...
1099,Jensen and Holes: The Murder Squad,Exactly Right,pRetired Cold Case Investigator Paul Holes and...
1100,Noble Blood,iHeartRadio and Grim & Mild,Author Dana Schwartz explores the stories of s...
1101,Ologies with Alie Ward,Alie Ward,pVolcanoes. Trees. Drunk butterflies. Mars mis...
1102,HISTORY This Week,The HISTORY Channel,"pThis week, something momentous happened. Whet..."
