In [1]:
import pandas as pd
import re
import matplotlib.pyplot as plt

In [2]:
def cleaning(data, column="text"):
    df = data.copy()
    df[column] = df[column].str.lower()
    
    # fastest source of contractions I could find: https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
    contractions = { 
    "ain't": "are not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he had",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that had",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
    }
#   cleaning the text for the contractions
    for k, v in contractions.items():
        df[column] = df[column].str.replace(k.lower(), v.lower())
#   making a list of the neutral characters to delete in the text
    special_characters={
    "-": " ",
    "\'": " ",
    "\"": " ",
    ",": " ",
    "$": " ",
    "%": " ",
    "[": " ",
    "]": " ",
    ".": " ",
    "/": " ",
    "?": " ? "
    }
#   cleaning the data with the special characters
    for k, v in special_characters.items():
        df[column] = df[column].str.replace(k,v)
#   replacing multiple spaces with a single space
    for s in range(1, 8):
        df[column] = df[column].str.replace(" "*s, " ")

#   attempting to strip the HTML code in descriptions
    for i, string in enumerate(df[column]):
        try:
            string = re.sub(r"<[^>]*>", "", string)
            df[column][i] = string

        except:
            pass
        
    # making a column for any left over/missed descriptions with HTML code 
    df["html_uncleaned"] = df[column].str.contains("</")
    # drop the episodes that still have HTML (there should not be contextual information on the episode)
    drop_it = df[df["html_uncleaned"] == 1].index
    df = df.drop(drop_it)
    df = df.drop(columns="html_uncleaned")

    # fill all NAN object cells with empty string
    df = df.fillna("")
    # drop the episodes with no text information (there is no ability to save the data)
    drop_it = df[df[column] == ""].index
    df = df.drop(drop_it)

    # reset the index and drop the original & return
    return df.reset_index(drop=True)

In [3]:
tal = pd.read_csv("raw transcripts/this_american_life.csv")
tal = cleaning(tal)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [4]:
episodes_sample = pd.read_csv("raw data/episodes-sample.csv")
episodes_sample["text"] = episodes_sample["subtitle"] + " "+ episodes_sample["description"] + " "+ episodes_sample["summary"]
episodes_sample = cleaning(episodes_sample)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [15]:
# this is actually the combined dataset for the larger data set. This should be fairly cleaned as well.
podcasts = pd.read_csv("raw data/podcasts.csv")
episodes = pd.read_csv("raw data/episodes.csv")

# making genre dummy columns
genres = list(set([g for g in podcasts["categories"] if not " | " in g]))
for g in genres:
    podcasts[f"{g}_genre"] = [1 if g in cell else 0 for cell in podcasts["categories"]]
genre_cols = [f"{g}_genre" for g in genres]

episodes = episodes[["title","podcast_uuid","audio", "description", "audio_length"]]
merging = podcasts[["uuid", "language", "title", ]+genre_cols]

combined = episodes.merge(merging, left_on="podcast_uuid", right_on="uuid" )
english_pods = combined[combined["language"] == "English"]
final_pods = english_pods[["title_x", "audio", "description", "audio_length", "title_y",]+genre_cols]

In [18]:
final_pods.head(1)

Unnamed: 0,title_x,audio,description,audio_length,title_y,News & Politics_genre,Sports & Recreation_genre,Government & Organizations_genre,Comedy_genre,Arts_genre,...,Games & Hobbies_genre,TV & Film_genre,Society & Culture_genre,Technology_genre,Health_genre,Science & Medicine_genre,Kids & Family_genre,Business_genre,Religion & Spirituality_genre,Education_genre
31,Introduction to Luke,http://www.wgcr.net/images/TimelessTruths/TTT-...,Luke 1:1-4 -,1691,Teaching Timeless Truths,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [30]:
final_pods = final_pods.rename(columns={"title_x":"episode_title", "title_y":"podcast_title", "description":"text"})

In [31]:
final_pods = cleaning(final_pods)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [37]:
final_pods.to_csv("Cleaned data/clean_podcasts.csv", index=False)

In [38]:
tal.to_csv("raw transcripts/this_american_life.csv", index=False)

I need to replace all the html tags to create a clean description.