In [None]:
import sqlite3
import numpy as np
import pandas as pd
import re
import time
from collections import Counter

# Custom module
from spell_check import word_level

### Import data

In [None]:
# Open the cleaned SQLite database, load the modelling table into a DataFrame,
# and always release the connection (even if the query fails) so the database
# file handle is not left open for the lifetime of the kernel.
cnx = sqlite3.connect('../data/GuidePod_clean.sqlite')
try:
    df = pd.read_sql_query("SELECT * FROM podcast_model_data;", cnx)
finally:
    cnx.close()

print(f"Original podcast dataset shape = {df.shape}")

In [None]:
# List the columns that contain at least one missing value
has_missing = df.isnull().any()
print(df.columns[has_missing])

In [None]:
# Count the missing values in each of the columns of interest
for column in ("releaseDate", "Primary_Genre", "Artist"):
    print(df[column].isnull().sum())

In [None]:
# Remove rows that have no genre or no artist.
df = df.dropna(subset=['Primary_Genre', 'Artist'])

# Remove the releaseDate and index columns. The `columns=` keyword is used
# because passing the axis positionally (df.drop('col', 1)) was deprecated in
# pandas 1.3 and raises a TypeError from pandas 2.0 onward.
df = df.drop(columns=['releaseDate', 'index'])

In [None]:
# Report the shape after removing rows with missing genre/artist and the unused columns
print(f"Dataset shape = {df.shape}")

### Create new features

In [None]:
# Turn the per-podcast total duration into a per-episode average. The value is
# written back into the same column slot (so column order is unchanged) and the
# column is then renamed to reflect its new meaning.
per_episode_duration = df["total_duration"] / df["episode_count"]
df = (
    df.assign(total_duration=per_episode_duration)
      .rename(columns={"total_duration": "avg_duration"})
)

In [None]:
# Prefix every column from position 9 onward with "genre_".
# NOTE(review): this relies on the genre dummy columns starting at position 9
# at this point in the notebook — confirm against the table schema.
genre_labels = {label: "genre_" + label for label in df.columns[9:]}
df = df.rename(columns=genre_labels)

In [None]:
# Tally how many rows each artist appears in
c = Counter(df["Artist"])

In [None]:
# Names of the indicator columns to keep: one per artist that appears at least
# three times, excluding the "Unknown" placeholder. The names are sorted
# because the order of a list built from a set of strings changes between
# kernel runs (string hash randomisation), which made the resulting column
# order non-reproducible.
artist_dummy = sorted(
    "artist_" + artist
    for artist, n_rows in c.items()
    if n_rows >= 3 and artist != "Unknown"
)

In [None]:
# Number of artist indicator columns selected above
len(artist_dummy)

In [None]:
# One indicator column per distinct artist, each named "artist_<Artist value>"
dummies = pd.get_dummies(df["Artist"], prefix='artist')

In [None]:
# Append only the indicator columns listed in artist_dummy to the dataset
selected_artist_columns = dummies[artist_dummy]
df = pd.concat([df, selected_artist_columns], axis=1)

In [None]:
# Report the shape after appending the artist indicator columns
print(f"Dataset shape = {df.shape}")

In [None]:
# Move the text column to the end.
# (The original first line of this cell was a comment missing its leading "#",
# which made the whole cell a SyntaxError.)
col = df.pop("combined")
df.insert(df.shape[1], col.name, col)

In [None]:
# Normalise every column label to lowercase
lowercase_labels = df.columns.str.lower()
df.columns = lowercase_labels

In [None]:
# How many podcasts have exactly one review
single_review_rows = df.loc[df['num_reviews'] == 1]
print(f"Podcasts with only 1 review: {len(single_review_rows)}")

### Clean text

In [None]:
def onlyWords(s):
    """Reduce a string to lowercase ASCII words separated by single spaces.

    Every run of characters other than A-Z / a-z (digits, punctuation,
    whitespace, non-ASCII letters) is collapsed into one space; the result is
    then stripped of leading/trailing whitespace and lowercased.
    """
    letters_only = re.sub(r'[^A-Za-z]+', ' ', s)
    trimmed = letters_only.strip()
    return trimmed.lower()

# Replace each row's text with its letters-only, lowercased form
df.loc[:, "combined"] = df["combined"].apply(onlyWords)

In [None]:
import requests

# Download the English stopword list (one entry per line).
# A timeout keeps the notebook from hanging indefinitely on a stalled
# connection, and raise_for_status() fails loudly instead of silently using an
# HTTP error page as the stopword list.
r = requests.get(
    "https://raw.githubusercontent.com/stopwords-iso/stopwords-en/master/stopwords-en.txt",
    timeout=30,
)
r.raise_for_status()

# Strip each line and drop blank ones (e.g. from a trailing newline or CRLF
# line endings). A blank entry would become an empty alternative in the
# stopword regex built later in this notebook, where it matches at every word
# boundary and lets the trailing \s* delete the spaces between words.
stopwords = [line.strip() for line in r.text.splitlines() if line.strip()]

In [None]:
start = time.time()

# Escape each stopword so it is matched literally, and skip blank entries: an
# empty alternative would match at every word boundary and the trailing \s*
# would then remove the whitespace between ordinary words.
stop_terms = [re.escape(word.strip()) for word in stopwords if word.strip()]

if stop_terms:
    pattern = re.compile(r'\b(?:' + r'|'.join(stop_terms) + r')\b\s*')
    # Substitute per document rather than on a single "_"-joined string: "_" is
    # a regex word character, so \b never matched between the separator and
    # the first/last word of a document, and stopwords at document edges were
    # left in place.
    df.loc[:, "combined"] = df["combined"].apply(lambda doc: pattern.sub('', doc))

# Vocabulary size (distinct whitespace-separated tokens) after stopword removal
print(len(set(" ".join(df["combined"]).split())))

end = time.time()
print(end - start)

In [None]:
start = time.time()

# Run the custom spell_check.word_level routine over each document's text
df.loc[:, "combined"] = df["combined"].apply(word_level)

end = time.time()
print(end - start)

In [None]:
start = time.time()

# Second stopword pass, run after word_level has rewritten the text.
# Escape each stopword so it is matched literally, and skip blank entries: an
# empty alternative would match at every word boundary and the trailing \s*
# would then remove the whitespace between ordinary words.
stop_terms = [re.escape(word.strip()) for word in stopwords if word.strip()]

if stop_terms:
    pattern = re.compile(r'\b(?:' + r'|'.join(stop_terms) + r')\b\s*')
    # Substitute per document rather than on a single "_"-joined string: "_" is
    # a regex word character, so \b never matched between the separator and
    # the first/last word of a document, and stopwords at document edges were
    # left in place. Working per document also keeps the row count intact even
    # if a document happens to contain "_".
    df.loc[:, "combined"] = df["combined"].apply(lambda doc: pattern.sub('', doc))

# Vocabulary size (distinct whitespace-separated tokens) after stopword removal
print(len(set(" ".join(df["combined"]).split())))

end = time.time()
print(end - start)

### Bin the continuous variables

In [None]:
# Bin edges from 1.0 to 5.0 in steps of 0.5
cut_bins = np.arange(1.0, 5.5, 0.5)

# include_lowest=True closes the first interval on the left. Without it the
# bins are (1.0, 1.5], ..., (4.5, 5.0], so an average score of exactly 1.0
# falls outside every bin, becomes NaN, and ends up with all-zero indicators.
score_bins = pd.cut(
    df['avg_review_score'], bins=cut_bins, right=True, include_lowest=True
)

print(score_bins.value_counts())

# Replace the continuous score with one indicator column per bin
dummies = pd.get_dummies(score_bins, prefix="reviewscore")
df = pd.concat([df, dummies], axis=1)
df = df.drop(columns=['avg_review_score'])

In [None]:
# Bin edges every 20*60 units from 0 up to 120*60, plus an open-ended top bin.
# NOTE(review): the *60 factors suggest avg_duration is in seconds (20-minute
# bins up to 2 hours) — confirm the unit.
cut_bins = np.append(np.arange(0, 140*60, 20*60), np.inf)

# Keep the binned values in a local Series (named as the temporary column used
# to be) instead of adding and then dropping a column on df.
duration_bins = pd.cut(df['avg_duration'], bins=cut_bins).rename('avg_duration_bin')

print(duration_bins.value_counts())

# Append one indicator column per duration bin
dummies = pd.get_dummies(duration_bins, prefix="duration")
df = pd.concat([df, dummies], axis=1)

In [None]:
# Bin edges every 100 episodes from 0 to 400, plus an open-ended top bin
cut_bins = np.append(np.arange(0, 500, 100), np.inf)

# Keep the binned values in a local Series (named as the temporary column used
# to be) instead of adding and then dropping a column on df.
episode_bins = pd.cut(df['episode_count'], bins=cut_bins).rename('episode_count_bin')

print(episode_bins.value_counts())

# Append one indicator column per episode-count bin
dummies = pd.get_dummies(episode_bins, prefix="episodecount")
df = pd.concat([df, dummies], axis=1)

### Export

In [None]:
# Order rows from most- to least-reviewed and renumber the index 0..n-1 so
# rows are easy to reference by position
df = (
    df.sort_values(["num_reviews"], ascending=False)
      .reset_index(drop=True)
)

In [None]:
# Move the text column to the end: popping removes it from its current
# position and re-assigning it appends it as the last column
df["combined"] = df.pop("combined")

In [None]:
# Drop every column from position 9 onward, keeping only the first nine.
# NOTE(review): the cutoff is positional, and by this point the indicator
# columns and the text column moved to the end sit at the tail of the frame,
# so they are removed here — confirm that this is the intended export.
trailing_labels = df.columns[9:]
df.drop(columns=trailing_labels, inplace=True)

In [None]:
# Write the final table to CSV without the row index
output_path = "../data/podcast.csv"
df.to_csv(output_path, index=False)