In [None]:
#API request to get the US TV schedule from the TVmaze API
import requests as req
url = 'http://api.tvmaze.com/schedule?country=US'
#A timeout keeps the kernel from hanging indefinitely if the API does not answer
resp = req.get(url, timeout=30)
#Fail here on an HTTP error instead of trying to parse an error page as JSON later
resp.raise_for_status()
response = resp.text

In [None]:
#Sanity check: the response body is still raw text at this point, not parsed JSON
type(response)

In [None]:
#Parse the JSON text into Python objects: json_normalize from pandas works on a
#JSON array, i.e. a list of records, not on the raw string
import json
data = json.loads(response)
#Displayed to confirm that the parsed payload is a list
type(data)

In [None]:
#Load the parsed records into a pandas DataFrame; json_normalize flattens nested
#objects into dot-separated columns (e.g. 'show.name')
import pandas as pd
#pd.json_normalize is the public entry point since pandas 1.0; the old
#pandas.io.json import path is deprecated, so it is only used as a fallback
try:
    json_normalize = pd.json_normalize
except AttributeError:
    from pandas.io.json import json_normalize
df = json_normalize(data)
df.head()

In [None]:
#Get shape (rows, columns) of the generated data frame
df.shape

In [None]:
#List all column names (nested JSON keys appear dot-separated) to select the
#relevant ones for further data processing
df.columns

In [None]:
#Get relevant columns from dataframe on which knn algorithm can be applied(feature selection)
#.copy() makes df_relevant an independent frame, so adding or changing columns on it
#later does not trigger pandas' SettingWithCopyWarning
df_relevant = df[['name','id','show.name','show.genres','show.rating.average','show.type']].copy()

In [None]:
#Preview the first rows of the newly created data frame (selected columns only)
df_relevant.head()

In [None]:
#The reason to merge the name column with the show.name column is that some shows
#have several episodes in the schedule. For example, the Abby Hatcher show had two
#entries with different episode names, viz. Afraid of Cats and Chef Curly. Unique names
#are needed to identify the neighbours of an episode correctly, so the two columns
#are merged into a single "<show name>-<episode name>" column.
#assign() returns a new frame, which avoids setting a column on a slice of df.
df_relevant = df_relevant.assign(
    full_name=df_relevant["show.name"].map(str) + "-" + df_relevant["name"]
)

In [None]:
#Check that the full_name column was added
df_relevant.head()

In [None]:
#full_name now carries both pieces of information, so its two source columns can go
df_relevant = df_relevant.drop(columns=['name', 'show.name'])


In [None]:
#Strip the 'show.' prefix from the column names for ease of understanding.
#Keys that do not match an existing column are ignored by rename.
renamed_columns_dictionary = {
    'show.genres': 'genres',
    'show.language': 'language',
    'show.rating.average': 'rating',
    'show.type': 'type',
}
df_relevant = df_relevant.rename(columns=renamed_columns_dictionary)

In [None]:
#Inspect the frame after renaming the columns
df_relevant

In [None]:
#Many entries have an empty genres list. We have the choice to substitute the
#empty values or to drop those rows entirely.
#Shape of the subset without genres (the first number is the affected row count):
df_relevant.loc[df_relevant['genres'].str.len().eq(0)].shape

In [None]:
#Drop all rows whose genres list is empty for now
#Adding relevant genres based on show type can be a better option later
df_relevant = df_relevant.drop(
    index=df_relevant.index[df_relevant['genres'].str.len() == 0]
)

In [None]:
#Shape after removing the rows without genres: a considerable number of rows were removed
df_relevant.shape

In [None]:
#Work on an independent copy so df_relevant stays available as a backup.
#reset_index renumbers the rows 0..n-1 (rows were dropped above), so index labels
#match the row positions of the feature matrix and neighbour indices built from this frame.
df = df_relevant.copy().reset_index(drop=True)

In [None]:
#Preview the working copy
df.head()

In [None]:
#Count the missing values per column (the rating column had some null values)
df.isna().sum()

In [None]:
#Check the datatype of every column
df.dtypes

In [None]:
#Fill null values in rating column with median of all values in rating column.
#The result is assigned back to the column: fillna(inplace=True) on df["rating"]
#acts on an intermediate Series and is not guaranteed to update df itself.
df["rating"] = df["rating"].fillna(df["rating"].median())

In [None]:
#Preview the frame after filling the missing ratings
df.head()

The genres column has to be one-hot encoded. The pandas get_dummies functions cannot
handle list-valued cells, so each list of genres is first converted into a single
string of comma-separated values.

In [None]:
#Join each genre list into one comma-separated string, e.g. ['Drama', 'Crime'] -> 'Drama,Crime'.
#Values that are already strings are kept as they are, so re-running this cell
#does not split them into single characters.
df['genres'] = df['genres'].apply(
    lambda genres: genres if isinstance(genres, str) else ",".join(genres)
)

In [None]:
#Preview the frame: genres is now a comma-separated string
df.head()

In [None]:
# Preview the one-hot (indicator) encoding of the genres column: str.get_dummies splits
# each string on ',' and produces one 0/1 column per distinct genre
df["genres"].str.get_dummies(sep=',')

We now create a new data frame that is used to build the KNN model. The full_name and id columns are left out since they are not useful for finding neighbours.
The "genres" and "type" columns are one-hot encoded, while the rating column is taken as it is.

In [None]:
# Feature matrix: genre indicators, show-type indicators and the raw rating, side by side
tv_show_features = pd.concat(
    [
        df["genres"].str.get_dummies(sep=","),
        pd.get_dummies(df[["type"]]),
        df[["rating"]],
    ],
    axis="columns",
)

In [None]:
# Inspect the assembled feature matrix
tv_show_features

Since the rating column has values from 0 to 10 while the other columns only take the values
0 and 1, it can bias the distance metric in KNN: features with bigger numbers are weighted heavily while the other features are discounted.
So I ended up using MinMaxScaler from scikit-learn, as it scales every feature to the range 0–1.

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Learn each column's min/max, then rescale every feature with them.
# After this cell tv_show_features holds the scaler's output, not the original DataFrame.
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(tv_show_features)
tv_show_features = min_max_scaler.transform(tv_show_features)

In [None]:
import numpy as np
# Display the scaled features rounded to 2 decimals (display only, nothing is reassigned)
np.round(tv_show_features,2)

In [None]:
# Fit scikit-learn's nearest-neighbour model on the feature matrix, then query it
# with the same matrix to get, for every show, the 6 closest rows and their distances.
from sklearn.neighbors import NearestNeighbors
nbrs = NearestNeighbors(n_neighbors=6, algorithm='ball_tree')
nbrs.fit(tv_show_features)
distances, indices = nbrs.kneighbors(tv_show_features)

In [None]:
# Distances from each show to its 6 nearest neighbours, as returned by kneighbors
distances

In [None]:
#Helper functions to get relevant predictions
def get_index_from_name(name):
    """Return the 0-based row position of the show whose full_name equals `name`.

    The result is a row position (usable with df.iloc and as a row of `indices`),
    not a DataFrame index label: rows are filtered out earlier in the notebook, so
    the frame's labels are not guaranteed to equal row positions.

    If several rows share the name, the first one is returned.
    Raises IndexError if no row has that full_name.
    """
    positions = (df["full_name"] == name).to_numpy().nonzero()[0]
    if positions.size == 0:
        raise IndexError("no show with full_name {!r}".format(name))
    return int(positions[0])

In [None]:
# Plain Python list of every full_name, in row order
all_show_names = df["full_name"].tolist()

In [None]:
def get_id_from_partial_name(partial):
    """Print every show whose full_name contains `partial`, with its position.

    Each match is printed as "<full_name> <position>", where position is the
    0-based offset of that entry in all_show_names. Matching is a case-sensitive
    substring test. Returns None.
    """
    # enumerate gives every entry its own position; list.index(name) would report
    # the first occurrence for each duplicate name and rescan the list per match.
    for position, name in enumerate(all_show_names):
        if partial in name:
            print(name, position)

In [None]:
def print_similar_tvshows(query=None,id=None):
    """Print the full_name of the nearest neighbours of a show.

    Parameters
    ----------
    query : str, optional
        Exact full_name of the show; it is resolved with get_index_from_name and
        the result is used as a row of `indices`.
    id : int, optional
        Row position of the show in `indices`. (The name shadows the builtin
        `id`; it is kept so existing keyword calls keep working.)

    If both arguments are given, the neighbours for `id` are printed first and
    then the neighbours for `query`. Returns None.
    """
    def _print_neighbours(row):
        # Shows with identical features are at distance 0 from each other, so the
        # queried row is not guaranteed to come first in its own neighbour list.
        # Remove it explicitly and print one name fewer than the list holds.
        neighbour_rows = list(indices[row])
        others = [n for n in neighbour_rows if n != row][:len(neighbour_rows) - 1]
        for neighbour in others:
            print(df.iloc[neighbour]["full_name"])

    # Compare against None: row 0 is a valid position but is falsy.
    if id is not None:
        _print_neighbours(id)
    if query:
        _print_neighbours(get_index_from_name(query))

In [None]:
#Show the frame to look up valid full_name values for the calls below
df

In [None]:
#Change values below as per full_name column values
#NOTE(review): the episode name is hard-coded; presumably the schedule returned by
#the API changes between runs, in which case nothing is printed here - confirm and update
get_id_from_partial_name("Days of Our Lives-Ep. #13507")

In [None]:
#Raises IndexError if this full_name is not present in df
get_index_from_name("Days of Our Lives-Ep. #13507")

In [None]:
#Print the neighbours of one episode, looked up by its exact full_name
print_similar_tvshows("Gotham-Trespassers")

In [None]:
#Show the queried episode's own row for comparison with the recommendations
df[df["full_name"]=="Gotham-Trespassers"]

In [None]:
# Look up the feature rows of a hand-picked list of episodes
showrecs = [
    "Fam-Pilot",
    "The Big Bang Theory-The Propagation Proposition",
    "Mom-Hacky Sack and a Beautiful Experience",
    "Young Sheldon-A Tummy Ache and a Whale of a Metaphor",
    "The Orville-Home",
]

df.loc[df['full_name'].isin(showrecs)]