# Movie recommender

In [None]:
import pandas as pd
import numpy as np
from ast import literal_eval
import locale
import re
from urllib import request
import gzip
import shutil
from multiprocessing.pool import ThreadPool
locale.setlocale(locale.LC_TIME, 'de_DE.UTF-8')

## Netflix
- go to your account page.
- in the desired profile, open the Profiles & Parental Controls setting.
- IMPORTANT: ensure the language is set to English!
- open Track History.
- if the list is not fully displayed, select the Show more button.
- Save the file to `NetflixViewingHistory.csv`

Next code section:
1. Loads the data into a DataFrame.
2. Converts the date strings to datetime values.
3. `regList` lists the markers that identify a TV series; they are combined into regexes to determine whether a name belongs to a TV series or a movie.
4. All suffixes in the name (e.g. season, episode) need to be removed for the later IMDb name mapping.

In [None]:
# Load the exported Netflix viewing history. The two columns of the export are
# renamed to the names used throughout this notebook.
netflix = pd.read_csv('NetflixViewingHistory.csv')
netflix.columns = ['name', 'date']
# Dates are parsed as day/month/year.
netflix['date'] = pd.to_datetime(netflix['date'], format='%d/%m/%Y')

# Markers that identify an entry as part of a TV series (season / part /
# volume suffixes). They are inserted into the regexes below as-is.
regList = [
    '-Staffel ',
    '– Staffel',
    '- Staffel',
    ': Season',
    ': Series',
    ': Limited Series:',
    ': Staffel',
    ': Part',
    ' Part',
    ': Vol',
    ': Volume ',
    ': Stranger Things ',
    ': Die komplette 1. Staffel',
    'Staffel',
]
# Matches a whole title that contains any of the markers.
combinedMatchList = "|".join(".*(" + marker + ").*" for marker in regList)
# Matches a marker together with everything that follows it.
combinedDelList = "|".join("(" + marker + ").*" for marker in regList)


def _classify_title(title):
    """Return 'tvseries' if *title* contains a series marker, else 'movie'."""
    if re.match(combinedMatchList, title) is None:
        return 'movie'
    return 'tvseries'


netflix['type'] = netflix['name'].apply(_classify_title)
# Cut the marker and everything after it so only the bare title remains.
netflix['name'] = netflix['name'].str.replace(combinedDelList, '', regex=True)

netflix['service'] = 'Netflix'
netflix.head(4)

## Amazon Prime Video
- sign in to your Amazon Prime account.
- tap on your account.
- find Digital Content & Devices.
- select Prime Video Settings.
- Account & Settings screen opens.
- tap on Watch History.
- View Watch History.
- Manually select your history and copy & paste it into a text file.
- Save it as `PrimeVideoViewingHistory.txt`

Next code section:
1. Since Prime Video doesn't provide a CSV file and we only have a text file, we need to create the columns on our own.
2. We identify each date string and move it to a separate column; then we fill the gaps and convert the strings to dates.
3. As with the Netflix data, we reuse the regexes to distinguish TV series from movies.

In [None]:
# The copied Prime Video watch history is a plain text file; every line is
# read into a single column named 'mixed' (date headers and titles mixed).
amazon = pd.read_table('PrimeVideoViewingHistory.txt', header=None)
amazon.columns = ['mixed']

# Drop the UI captions that are copied along with the history.
skipLines = ['Folgen aus dem Wiedergabeverlauf löschen', 'Angesehene Folgen', 'Film aus dem Wiedergabeverlauf löschen']
# .copy() so that the column assignments below work on an independent frame
# instead of a slice (avoids SettingWithCopyWarning).
amazon = amazon[~amazon['mixed'].isin(skipLines)].copy()

# A date header looks like "<day>. <month name> <year>". The dot is escaped so
# that titles such as "12 Monkeys 2" are not mistaken for a date header.
dateLinePattern = r'^\d+\. \S+ \d+$'
isDateLine = amazon['mixed'].str.match(dateLinePattern, na=False)

# Copy each date header into its own column and carry it forward to the
# titles listed below it. Only the 'date' column is forward-filled.
amazon['date'] = amazon['mixed'].where(isDateLine).ffill()
# %B parses the month name according to the active LC_TIME locale.
amazon['date'] = pd.to_datetime(amazon['date'], format='%d. %B %Y')

# The date header rows themselves are no longer needed.
amazon = amazon[~isDateLine].copy()

# Remove bracketed tags that contain a dot (e.g. "[dt./OV]").
# Raw string: the pattern contains backslash escapes meant for the regex engine.
amazon['mixed'] = amazon['mixed'].str.replace(r'(\[\D+\.\D+\])', '', regex=True)

# Classify each entry with the series markers and strip the marker suffix,
# exactly as is done for the Netflix data.
amazon['type'] = amazon['mixed'].apply(lambda x: 'tvseries' if re.match(combinedMatchList, x) is not None else 'movie')
amazon['name'] = amazon['mixed'].str.replace(combinedDelList, '', regex=True)

# Clean up: the raw column has been split into 'date', 'type' and 'name'.
amazon = amazon.drop(columns=['mixed'], errors='ignore')
amazon['service'] = 'Amazon Prime Video'
amazon.head(4)

Next combine Netflix and Amazon Prime Video dataframes.

In [None]:
# Stack both viewing histories into one frame. DataFrame.append was removed in
# pandas 2.0; pd.concat is the supported equivalent.
history = pd.concat([netflix, amazon], ignore_index=True, sort=True)
# Normalise the titles (lower-case, no surrounding whitespace) so they can be
# compared with other lower-cased title columns.
history["name"] = history["name"].str.lower().str.strip()
history.head(4)

# IMDb Datasets

Download the IMDb datasets from https://www.imdb.com/interfaces/ and combine them into one dataset.

The IMDb DataFrame contains info for each season and episode. Since our data doesn't offer all of that information, we need to drop the duplicate titles.

In [None]:
# IMDb dataset dumps to download. Only the title basics and the ratings are
# enabled; the remaining dumps are kept here, commented out, for reference.
urls = [
    "https://datasets.imdbws.com/title.basics.tsv.gz",
    # "https://datasets.imdbws.com/title.episode.tsv.gz",
    "https://datasets.imdbws.com/title.ratings.tsv.gz",
    # "https://datasets.imdbws.com/title.akas.tsv.gz",
    # "https://datasets.imdbws.com/title.crew.tsv.gz",
    # "https://datasets.imdbws.com/title.principals.tsv.gz",
    # "https://datasets.imdbws.com/name.basics.tsv.gz",
]
          
def download_url(url):
    """Download a gzipped dump and unpack it into the working directory.

    The archive is stored under the last path component of *url*. Its
    decompressed content is written to a file whose name is the part of the
    archive name before ``.tsv`` plus ``.csv``; the content itself is copied
    unchanged.

    Args:
        url: Address of the ``.gz`` archive to fetch.

    Returns:
        The name of the decompressed file that was written.
    """
    print("downloading: ", url)
    file_title = url.rsplit('/', 1)[-1]
    request.urlretrieve(url=url, filename=file_title)

    # "title.basics.tsv.gz" -> "title.basics.csv"
    title = file_title.split('.tsv', 1)[0] + ".csv"

    # Stream the decompressed bytes so the whole dump is never held in memory.
    with gzip.open(file_title, 'rb') as f_in, open(title, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

    return title

# Download all dumps in parallel. imap_unordered only hands back an iterator,
# so it is drained with list() to block until every download has finished;
# otherwise the files read afterwards may be missing or only partially
# written. Draining also re-raises any exception from a worker thread.
# The context manager shuts the worker threads down once all results are in.
with ThreadPool(5) as pool:
    results = list(pool.imap_unordered(download_url, urls))

# Load the unpacked dumps (tab-separated despite the .csv extension; "\N"
# marks missing values) and attach the ratings to every title.
imdb = pd.read_csv('title.basics.csv', sep='\t', na_values=['\\N'], low_memory=False)
imdbRatings = pd.read_csv('title.ratings.csv', sep='\t', na_values=['\\N'], low_memory=False)
imdb = pd.merge(imdb, imdbRatings, how='left', on='tconst')
imdb = imdb.drop(columns=['isAdult', 'endYear'], errors='ignore')
imdb[['startYear']] = imdb[['startYear']].astype(str)
imdb["primaryTitle"] = imdb["primaryTitle"].str.lower()

# Normalise the title types to the two categories used for the viewing
# history ('movie' / 'tvseries'). A whole-value mapping replaces the former
# chain of substring replacements, whose result depended on statement order.
titleTypeMap = {
    'tvminiseries': 'tvseries',
    'short': 'movie',
    'tvmovie': 'movie',
    'tvpilot': 'movie',
    'tvshort': 'movie',
    'tvspecial': 'movie',
    'video': 'movie',
    'videogame': 'movie',
}
imdb["titleType"] = imdb["titleType"].str.lower().replace(titleTypeMap)

# Single episodes are dropped entirely.
imdb = imdb[~imdb['titleType'].isin(['tvepisode'])]
# Keep one row per (title, type): the history is matched on both columns, so
# a movie and a series that share a name must both survive de-duplication.
imdb = imdb.drop_duplicates(subset=['primaryTitle', 'titleType'], keep='first')
imdb.head(4)

#imdb.describe()
#imdb.boxplot()
#imdb.hist()
#imdb.info()
#imdb.isnull().sum()
#imdb.groupby(['titleType']).mean()
#imdb[imdb['tconst'].str.strip()=='tt3556944']

For a high mapping quality, we match not only by name but by the combination of name and type. That should catch most cases.

In [None]:
# Attach the IMDb columns to every history entry. A left join keeps entries
# without an IMDb match; the match uses both the title and the type.
history = history.merge(
    imdb,
    how='left',
    left_on=['name', 'type'],
    right_on=['primaryTitle', 'titleType'],
)
# Remove the IMDb key and detail columns that are not wanted in the history.
imdbDetailColumns = ['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'startYear', 'runtimeMinutes']
history = history.drop(columns=imdbDetailColumns, errors='ignore')
history.head(4)

In [None]:
# Write both frames as zip-compressed CSV files; each archive holds a single
# CSV member named after the frame.
imdbArchive = {'method': 'zip', 'archive_name': 'imdb.csv'}
historyArchive = {'method': 'zip', 'archive_name': 'history.csv'}

imdb.to_csv('imdb.zip', index=False, encoding='utf-8', compression=imdbArchive)
history.to_csv('history.zip', index=False, encoding='utf-8', compression=historyArchive)