In [15]:
import pandas as pd
import numpy as np

# Load the raw titles table; `duration` is read with pandas' "string" dtype.
df = pd.read_csv(
    "data/streaming_titles.csv",
    dtype={"duration": "string"},
)

In [16]:
# Removing duplicates: rows sharing a title collapse to the first one encountered

df = df.drop_duplicates(subset='title', keep='first')

In [17]:
# Only keep rows with a positive score (a missing score compares False, so
# those rows are dropped as well)

# .copy() hands the following cells an independent frame to assign into via
# `df.loc[...] = ...`, instead of a slice derived from the previous frame
# (which can trigger SettingWithCopyWarning).
df = df.loc[df['score'] > 0].copy()

In [18]:
# Fixing issue with duration and rating column being swapped
# (some rows carry the duration text, e.g. "93 min" or "2 Seasons", in `rating`)

# "Season" also matches "Seasons", so one alternation covers all three cases.
# na=False turns rows with a missing rating into an explicit False, so the
# result is a clean boolean mask rather than a mix of booleans and NaN.
badcols = df['rating'].str.contains("min|Season", na=False)
df.loc[badcols,'duration'] = df.loc[badcols,'rating']
# np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
df.loc[badcols,'rating'] = np.nan

# Show the repaired rows for a visual check.
df[badcols]

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,streaming_service,score,imdbid
50,s51,Movie,Yashahime: Princess Half-Demon,,,Japan,"October 9, 2021",2020,,2 Seasons,Anime,Long-lost twins Towa and Setsuna reunite after...,hulu,48.0,tt12287748
108,s109,Movie,Maze,,,"Ireland, United Kingdom, Germany, Sweden","October 1, 2021",2017,,93 min,"Action, Adventure, Crime",Based on the incredible true story of the 1983...,hulu,56.0,tt5752606
263,s264,Movie,Trolls: TrollsTopia,,,United States,"September 2, 2021",2020,,4 Seasons,"Family, Kids",Trolls: TrollsTopia! is the next chapter in th...,hulu,28.0,tt11714932
397,s398,Movie,Gunfight at the O.K. Corral,,,United States,"August 1, 2021",1957,,136 min,"Classics, Documentaries, Drama",Dramatization of the legendary battle between ...,hulu,68.0,tt0050468
510,s511,Movie,Citizen Jane: Battle for the City,,,United States,"July 1, 2021",2017,,93 min,Documentaries,Chronicle of activist Jane Jacobs' battle with...,hulu,41.0,tt3699354
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2908,s2909,Movie,The Wrong Mans,,,United Kingdom,"December 24, 2014",2013,,2 Seasons,"Action, Adventure, Comedy",Lowly office workers Sam (Matthew Baynton) and...,hulu,82.0,tt2603596
2955,s2956,Movie,Getting On,,,United Kingdom,"January 7, 2014",2009,,3 Seasons,"Comedy, International, Sitcom",Care for the elderly is the least glamorous ar...,hulu,40.0,tt1468773
2959,s2960,Movie,Packed to the Rafters,,,Australia,"December 3, 2013",2008,,6 Seasons,"Comedy, Drama, International",Packed to the Rafters is an Australian family-...,hulu,24.0,tt1132600
18282,s5542,Movie,Louis C.K. 2017,Louis C.K.,Louis C.K.,United States,"April 4, 2017",2017,,74 min,Movies,"Louis C.K. muses on religion, eternal love, gi...",netflix,78.0,tt6736782


In [19]:
# Converting duration to int

def dur_to_int(row):
    """Extract the numeric part of a row's ``duration`` as an integer.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``)
        Must support ``row['duration']``.

    Returns
    -------
    int or float
        All decimal digits in the duration text, joined and parsed as one
        integer (``"93 min"`` -> ``93``, ``"2 Seasons"`` -> ``2``).
        ``np.nan`` when the value is not a string or contains no digits.
    """
    duration = row['duration']
    # Missing values (None / NaN / pd.NA) are not str instances.
    if not isinstance(duration, str):
        return np.nan
    # isdecimal() keeps only characters int() can parse; isnumeric() would also
    # let through characters such as superscripts or fractions that make int() raise.
    digits = ''.join(ch for ch in duration if ch.isdecimal())
    if not digits:
        # int('') would raise ValueError; treat digit-free text as missing.
        return np.nan
    return int(digits)

# Run dur_to_int once per row and overwrite `duration` with the parsed values.
df['duration'] = df.apply(dur_to_int, axis='columns')

In [20]:
# Reducing and refactoring genre (complete code in genre_exploration.ipynb)

# Maps a raw `listed_in` label to the list of canonical genre names it stands for.
# Labels that are not keys here are kept unchanged by the code that uses this table.
# Lookups are single-step (not chained), so every value must already be a
# canonical name and never another key of this dict.
genre_reduction = {"Action & Adventure" : ["Action","Adventure"],
                   "Action-Adventure" : ["Action","Adventure"],
                   "Animals & Nature" : ["Nature"],
                   "Anime Features" : ["Anime"],
                   "Anime Series" : ["Anime"],
                   "Children & Family Movies" : ["Kids","Family"],
                   "Classic & Cult TV" : ["Classics","Cult"],
                   "Comedies" : ["Comedy"],
                   "Concert Film" : ["Music"],
                   "Crime TV Shows" : ["Crime"],
                   "Cult Movies" : ["Cult"],
                   "Documentaries" : ["Documentary"],
                   "Docuseries" : ["Documentary"],
                   "Dramas" : ["Drama"],
                   "Faith & Spirituality" : ["Faith and Spirituality"],
                   "Game Show / Competition" : ["Game Shows"],
                   "Historical" : ["History"],
                   "Horror Movies" : ["Horror"],
                   "International Movies" : ["International"],
                   "International TV Shows" : ["International"],
                   "Kid's TV" : ["Kids"],
                   "Late Night" : ["Talk Show"],
                   "LGBTQ Movies" : ["LGBTQ"],
                   "LGBTQ+" : ["LGBTQ"],
                   "Lifestyle & Culture": ["Lifestyle","Culture"],
                   "Music & Musicals" : ["Music", "Musical"],
                   "Music Videos and Concerts" : ["Music"],
                   "Reality TV" : ["Reality"],
                   "Romantic Comedy" : ["Romance", "Comedy"],
                   "Romantic TV Shows" : ["Romance"],
                   "Sci-Fi & Fantasy" : ["Science Fiction", "Fantasy"],
                   "Science & Nature TV" : ["Science & Technology","Nature"],
                   "Spanish-Language TV Shows" : ["Latino"],
                   "Sports Movies" : ["Sports"],
                   "Stand-Up Comedy" : ["Stand Up"],
                   "Stand-Up Comedy & Talk Shows" : ["Stand Up","Talk Show"],
                   "TV Action & Adventure" : ["Action", "Adventure"],
                   # Was ["Comedies"], which produced a separate genre.Comedies
                   # column next to genre.Comedy.
                   "TV Comedies" : ["Comedy"],
                   "TV Dramas" : ["Drama"],
                   "TV Horror" : ["Horror"],
                   "TV Mysteries" : ["Mystery"],
                   "TV Sci-Fi & Fantasy" : ["Science Fiction", "Fantasy"],
                   "TV Thrillers" : ["Thriller"],
                   "Talk Show and Variety" : ["Talk Show","Variety"],
                   "Teen TV Shows" : ["Teen"],
                   "Thrillers" : ["Thriller"],

                   # "Arts, Entertainment, and Culture" is a single label, but
                   # splitting `listed_in` on ", " breaks it into these three
                   # fragments; each one maps back to the full label.
                   "Arts" : ["Arts, Entertainment, and Culture"],
                   "Entertainment" : ["Arts, Entertainment, and Culture"],
                   "and Culture" : ["Arts, Entertainment, and Culture"]
                   }

# Work on a clean 0..n-1 index so the flag frame built below aligns row for row.
df = df.reset_index(drop=True)


def _genre_columns(listed_in):
    """Return ``{"genre.<Name>": True}`` for every genre in one ``listed_in`` value.

    The value is split on ", "; each piece is replaced by its entries in
    ``genre_reduction`` when it is a key there, otherwise kept unchanged.
    Spaces in genre names become underscores. Non-string values (missing
    entries) yield an empty dict.
    """
    if not isinstance(listed_in, str):
        return {}
    flags = {}
    for raw_label in listed_in.split(', '):
        for genre in genre_reduction.get(raw_label, [raw_label]):
            flags["genre." + genre.replace(" ", "_")] = True
    return flags


# Build all flag columns in one go instead of enlarging `df` cell by cell with
# `df.loc[i, new_col] = True`, which is slow and fragments the frame.
# Columns appear in order of first occurrence. Cells a row did not set come
# out as NaN, so notna() gives a real boolean frame (True where set, else False).
genre_flags = pd.DataFrame(
    [_genre_columns(value) for value in df['listed_in']],
    index=df.index,
).notna()

df = pd.concat([df, genre_flags], axis=1)

# These source columns are dropped from the cleaned table.
df = df.drop(columns=["streaming_service","show_id","date_added","listed_in"])

print(df.columns)

df.head(3)

Index(['type', 'title', 'director', 'cast', 'country', 'release_year',
       'rating', 'duration', 'description', 'score', 'imdbid', 'genre.Crime',
       'genre.Drama', 'genre.Thriller', 'genre.Action', 'genre.Horror',
       'genre.Science_Fiction', 'genre.Music', 'genre.Reality',
       'genre.Romance', 'genre.Comedy', 'genre.Mystery', 'genre.Documentary',
       'genre.History', 'genre.Teen', 'genre.Health_&_Wellness',
       'genre.Lifestyle', 'genre.Culture', 'genre.Black_Stories', 'genre.News',
       'genre.Latino', 'genre.Adventure', 'genre.Anime', 'genre.Talk_Show',
       'genre.Sketch_Comedy', 'genre.Family', 'genre.Kids', 'genre.Classics',
       'genre.LGBTQ', 'genre.Adult_Animation', 'genre.Sitcom',
       'genre.Cooking_&_Food', 'genre.Sports', 'genre.Game_Shows',
       'genre.International', 'genre.Cartoons', 'genre.Science_&_Technology',
       'genre.Stand_Up', 'genre.Special_Interest', 'genre.Suspense',
       'genre.TV_Shows', 'genre.Arts,_Entertainment,_and_Cult

Unnamed: 0,type,title,director,cast,country,release_year,rating,duration,description,score,...,genre.Coming_of_Age,genre.Anthology,genre.Buddy,genre.Parody,genre.Spy/Espionage,genre.Survival,genre.Soap_Opera_/_Melodrama,genre.Dance,genre.Medical,genre.Disaster
0,Movie,Silent Night,,,,2020,,94.0,"Mark, a low end South London hitman recently r...",56.0,...,False,False,False,False,False,False,False,False,False,False
1,Movie,The Marksman,,,,2021,PG-13,108.0,A hardened Arizona rancher tries to protect an...,57.0,...,False,False,False,False,False,False,False,False,False,False
2,Movie,Gaia,,,,2021,R,97.0,A forest ranger and two survivalists with a cu...,63.0,...,False,False,False,False,False,False,False,False,False,False


In [21]:
# Write the cleaned table to CSV, leaving out the row index

df.to_csv(path_or_buf="data/streaming_titles_clean.csv", index=False)