In [1]:
import pandas as pd
import numpy as np
import os
import regex as re
from scipy.signal import savgol_filter

In [2]:
from statsmodels.nonparametric.smoothers_lowess import lowess

In [3]:
def televisionSeriesCondenser(tvSeries, numberOfEps):
    """Keep only the episodes sitting at evenly spaced quantiles of a series.

    Parameters
    ----------
    tvSeries : DataFrame
        Must contain an 'Episode Number' column.
    numberOfEps : int
        Number of quantile levels (1/n, 2/n, ..., n/n) to sample.

    Returns
    -------
    DataFrame
        The rows of ``tvSeries`` whose 'Episode Number' equals one of the
        floored quantile values. Rows are matched by value, so quantiles
        that floor to the same number (or to a number absent from the
        column) yield fewer than ``numberOfEps`` rows.
    """
    episodeNumbers = tvSeries['Episode Number']

    # Quantile levels 1/n, 2/n, ..., n/n.
    levels = np.linspace(1, numberOfEps, numberOfEps) / numberOfEps

    # np.quantile interpolates between values, so floor the results before
    # matching them against the actual episode numbers.
    targets = np.floor(np.quantile(episodeNumbers, levels))

    keep = np.isin(episodeNumbers, targets)
    return tvSeries.loc[keep, :]

In [4]:
# Show which files are present in the unpacked IMDb dump directory.
os.listdir("./IMDb Data Dump/Unpacked Data")

['name_basics.tsv',
 'title_ratings.tsv',
 'title_crew.tsv',
 'title_principals.tsv',
 'title_akas.tsv',
 'title_episode.tsv',
 'title_basics.tsv']

In [5]:
# Directory prefix used when building the paths of the .tsv files read later.
dataLocation = "./IMDb Data Dump/Unpacked Data"

In [6]:
# Ratings table: the literal string '\N' is treated as missing.
ratingData = pd.read_csv(
    dataLocation + '/title_ratings.tsv', sep='\t', low_memory=False,
).replace('\\N', np.nan)

# Episode table: same '\N' handling, then make season/episode numbers numeric.
episodeData = pd.read_csv(
    dataLocation + '/title_episode.tsv', sep='\t', low_memory=False,
).replace('\\N', np.nan)

episodeData = episodeData.astype({'seasonNumber': 'float64',
                                  'episodeNumber': 'float64'})

# Title table: loaded as-is ('\N' strings are NOT converted here).
titleData = pd.read_csv(
    dataLocation + '/title_basics.tsv', sep='\t', low_memory=False,
)

In [7]:
def wordFinder(potentialWord, theWord):
    """Report whether ``theWord`` occurs anywhere inside ``potentialWord``.

    This is a substring match, so a title that is a comedy *and* something
    else (e.g. "Action,Comedy") counts as a match, wherever the word sits
    in the string.

    Parameters
    ----------
    potentialWord : object
        Value to search in, normally a genre string. Anything that is not
        a ``str`` (e.g. NaN for a missing value) never matches.
    theWord : str
        The word to look for.

    Returns
    -------
    bool
        True when ``theWord`` is contained in ``potentialWord``, else False.
    """
    # Non-string inputs (NaN, None, numbers) cannot contain the word. Checking
    # explicitly replaces the old bare ``except``, which also swallowed the
    # IndexError raised when the word was found at any offset other than 0.
    if not isinstance(potentialWord, str) or not isinstance(theWord, str):
        return False

    return theWord in potentialWord
        
        
    
    

In [8]:
# Boolean mask: True for every title whose genres string contains "Comedy"
# anywhere (literal match, no regex). Missing genres count as False.
theComedies = titleData['genres'].str.contains("Comedy", regex=False, na=False)


In [9]:
# Keep only the title rows flagged by the theComedies mask.
titleData = titleData[theComedies]

In [10]:
# One row per parent series: does it have at least 51 episode rows?
episodesPerSeries = episodeData['parentTconst'].value_counts()
desiredSeries = episodesPerSeries.ge(51).to_frame()

In [11]:
# Name the single boolean column so it can be used as a filter after merging.
desiredSeries.columns = ['keep_series']


In [12]:
# Attach the keep_series flag (indexed by series id) to every episode row.
merged = desiredSeries.merge(
    episodeData,
    how='inner',
    left_index=True,
    right_on='parentTconst',
)

In [13]:
# Episodes belonging to series that passed the episode-count threshold.
wantedSeriesEpisodes = merged[merged['keep_series']]

In [14]:
# Look up each episode's parent series in the title table. Both sides carry a
# 'tconst' column, so the result holds suffixed tconst_x / tconst_y columns.
seriesTitles = titleData[['tconst', 'primaryTitle', 'originalTitle']]

episodeWithSeriesTitle = wantedSeriesEpisodes.merge(
    seriesTitles,
    left_on='parentTconst',
    right_on='tconst',
)

In [15]:
# Label the series-level title columns and restore the episode id as 'tconst'.
# After the parentTconst/tconst merge, tconst_x is the episode's own id and
# tconst_y is the series id (a duplicate of parentTconst), so it is dropped.
episodeWithSeriesTitle = (
    episodeWithSeriesTitle
    .rename(columns={"primaryTitle": "seriesPrimaryTitle",
                     "originalTitle": "seriesOriginalTitle",
                     "tconst_x": "tconst"})
    # errors='ignore' keeps this cell re-runnable once tconst_y is gone.
    .drop(columns='tconst_y', errors='ignore')
)

# Attach each episode's own row from the title table. Both frames key on
# 'tconst', so the result has a single 'tconst' column and no suffixes.
# NOTE(review): titleData has already been narrowed to Comedy rows, so this
# inner merge also drops episodes whose own genres lack "Comedy" - confirm
# that this is intended.
episodeWithAllTitles = (
    episodeWithSeriesTitle
    .merge(titleData, on='tconst')
    .rename(columns={"primaryTitle": "episodePrimaryTitle",
                     "originalTitle": "episodeOriginalTitle"})
)

In [16]:
def savitzkyGolaySmoother(completeShowData):
    """Smooth a show's episode ratings with a cubic Savitzky-Golay filter.

    The window length starts as the mean number of episodes per season
    (at least 5). If that is not smaller than the number of episodes, one
    fifth of the episode count is used instead. The window is then made odd.

    Parameters
    ----------
    completeShowData : DataFrame
        Needs 'seasonNumber' and 'averageRating' columns; rows are smoothed
        in the order given.

    Returns
    -------
    numpy.ndarray
        Smoothed ratings, one per input row. Shows with fewer than 5
        episodes are too short for a cubic fit and are returned unsmoothed.
    """
    polyOrder = 3
    minWindow = 5  # smallest odd window strictly greater than polyOrder

    ratings = np.array(completeShowData['averageRating'])
    episodeCount = completeShowData.shape[0]

    # Too few points for any valid window: hand the ratings back untouched
    # rather than letting savgol_filter raise.
    if episodeCount < minWindow:
        return ratings.astype('float64')

    meanEpisodesPerSeason = int(np.floor(
        completeShowData['seasonNumber'].value_counts().mean()))
    windowLength = max(minWindow, meanEpisodesPerSeason)

    if windowLength >= episodeCount:
        windowLength = episodeCount // 5

    if windowLength % 2 == 0:
        windowLength += 1

    # The one-fifth fallback can fall to or below polyOrder for short shows
    # (e.g. 8 episodes -> window 1), which savgol_filter rejects.
    if windowLength <= polyOrder:
        windowLength = minWindow

    return savgol_filter(ratings, windowLength, polyOrder)
    
    

In [17]:
def lowessSmoother(completeShowData):
    """Smooth a show's episode ratings with LOWESS over row position.

    The frame's index is reset first, so the x-values are simply
    0, 1, 2, ... in the order the rows are given.

    Parameters
    ----------
    completeShowData : DataFrame
        Needs an 'averageRating' column.

    Returns
    -------
    numpy.ndarray
        The second column of the lowess output (the fitted values).
    """
    # Author's note kept from the original: using .1 led to good performance
    # (the call below passes frac=.075).
    reindexed = completeShowData.reset_index(drop=True)

    ratings = np.array(reindexed['averageRating'])
    positions = np.array(reindexed.index)

    fitted = lowess(ratings, positions, frac=.075, is_sorted=True)

    return fitted[:, 1]

In [18]:
# Inner-join the ratings onto the episodes (merging on the columns the two
# frames share), then discard every row that has a missing value anywhere.
finalEpisodeData = pd.merge(episodeWithAllTitles, ratingData, how='inner').dropna()

In [19]:
# Re-apply the 51-episode threshold to the rows that survived the merge and
# dropna, and attach the resulting flag to every episode.
ratedEpisodesPerSeries = finalEpisodeData['parentTconst'].value_counts()

desiredSeriesFinal = ratedEpisodesPerSeries.ge(51).to_frame()
desiredSeriesFinal.columns = ['keep_series_verified']

finalEpisodeData = desiredSeriesFinal.merge(
    finalEpisodeData,
    how='inner',
    left_index=True,
    right_on='parentTconst',
)

In [20]:
# Keep only episodes of series flagged as verified, with a fresh 0..n-1 index.
isVerified = finalEpisodeData['keep_series_verified'].eq(True)
finalEpisodeData = finalEpisodeData.loc[isVerified, :].reset_index(drop=True)

In [21]:
# Order episodes by series, then season, then episode (all ascending), and
# keep only the columns used from here on.
sortKeys = ["parentTconst", 'seasonNumber', 'episodeNumber']
keptColumns = ['tconst', "parentTconst", "seriesOriginalTitle",
               'seasonNumber', 'episodeNumber', 'averageRating']

finalEpisodeData = finalEpisodeData.sort_values(by=sortKeys, ascending=True)
finalEpisodeData = finalEpisodeData[keptColumns].reset_index(drop=True)

In [22]:
### CHOOSE SMOOTHER HERE
smoothedRatings = finalEpisodeData.groupby('parentTconst').apply(lambda x: lowessSmoother(x))

smoothedRatings = smoothedRatings.explode().to_frame(name='smoothedRating')

In [23]:
# Values are assigned by position, which relies on finalEpisodeData already
# being ordered by parentTconst so rows line up with the per-series output.
# explode() leaves the values as object dtype, so cast to float64 to store a
# proper numeric column.
finalEpisodeData['smoothedRating'] = np.array(
    smoothedRatings['smoothedRating'], dtype='float64')

In [24]:
# For every series, the running episode positions 1, 2, ..., n as an array.
seriesLengths = finalEpisodeData.groupby(['parentTconst']).apply(
    lambda show: np.arange(1, len(show) + 1))

In [25]:
# Running 1-based position of each episode within its series, in current row
# order. cumcount() is index-aligned, so it does not depend on the groups
# being contiguous, and it avoids building a NaN-padded frame as wide as the
# longest series. float64 matches what the NaN-padded construction produced
# for series of differing lengths.
finalEpisodeData['seriesEpisodeNumber'] = (
    finalEpisodeData.groupby('parentTconst').cumcount() + 1
).astype('float64')

In [26]:
# Rename to the output schema. Each source column appears exactly once:
# the within-season episodeNumber becomes "Episode", while the running
# seriesEpisodeNumber becomes "Episode Number".
finalEpisodeDataCorrectSchema = finalEpisodeData.rename(
    columns={
        "parentTconst": "Code",
        "seriesOriginalTitle": "Title",
        "seasonNumber": "Season",
        "episodeNumber": "Episode",
        "seriesEpisodeNumber": "Episode Number",
        "averageRating": "Rating",
    })

In [27]:
# Drop the per-episode id. Reassigning (rather than inplace) together with
# errors='ignore' lets this cell be re-run after the column is already gone.
finalEpisodeDataCorrectSchema = finalEpisodeDataCorrectSchema.drop(
    columns='tconst', errors='ignore')

In [28]:
# Number of remaining episode rows per series code, as a one-column frame.
finalEpisodeCount = (
    finalEpisodeDataCorrectSchema['Code']
    .value_counts()
    .to_frame(name='Number of Episodes')
)

In [29]:
# Add each series' episode total to every one of its episode rows.
finalEpisodeDataCorrectSchema = finalEpisodeDataCorrectSchema.join(
    finalEpisodeCount, on='Code')

In [30]:
# Fraction of the way through the series at which each episode falls.
episodePosition = finalEpisodeDataCorrectSchema['Episode Number']
seriesTotal = finalEpisodeDataCorrectSchema['Number of Episodes']

finalEpisodeDataCorrectSchema['Prop Through Series'] = episodePosition / seriesTotal

In [31]:
# Pull out the single series with code tt0386676 and condense it to the
# episodes at 50 evenly spaced quantiles.
isAmericanOffice = finalEpisodeDataCorrectSchema['Code'] == 'tt0386676'
americanOffice = finalEpisodeDataCorrectSchema.loc[isAmericanOffice, :]

condensedOffice = televisionSeriesCondenser(americanOffice, 50)

In [32]:
# Condense every series to its 51-quantile episodes, then discard the outer
# (group key) index level that groupby.apply adds.
condensedShows = (
    finalEpisodeDataCorrectSchema
    .groupby('Code')
    .apply(lambda show: televisionSeriesCondenser(show, 51))
    .reset_index(level=0, drop=True)
)

In [33]:
# Keep at most the first 50 rows of each series; .copy() gives an independent
# frame so the column assignments that follow do not act on a view.
onlyFirstFifty = condensedShows.groupby('Code').head(50).copy()

In [34]:
# Flag rows titled 'The Office' whose running episode number is above 138.
isTheOffice = onlyFirstFifty['Title'].eq('The Office')
afterEpisode138 = onlyFirstFifty['Episode Number'].gt(138)

onlyFirstFifty.loc[:, 'main_character_left'] = isTheOffice & afterEpisode138

In [35]:
# Store the boolean flag as 0/1 integers.
onlyFirstFifty['main_character_left'] = onlyFirstFifty['main_character_left'].astype('int32')

In [36]:
# Write the final table to CSV in the working directory, without the index.
onlyFirstFifty.to_csv('big tv rating data.csv', index=False)