### Getting our data:

In [154]:
# Relative paths to the two raw title catalogues loaded by the cells below
# (Netflix: comma-separated, IMDb: tab-separated).
netflix_movie_titles = "netflix/movie_titles.csv"
imdb_movie_titles = "imdb/title.basics.tsv"

In [195]:
import csv
import difflib
import re
from io import StringIO

import numpy as np
import pandas as pd

In [156]:
def manual_sep(old_split):
    """Repair a row that was split into too many fields.

    The first two fields are kept as they are; everything after them is glued
    back together with commas so it ends up as a single third field.

    Parameters
    ----------
    old_split : list of str
        The fields of the offending line, as handed over by ``pd.read_csv``
        when ``on_bad_lines`` is a callable.

    Returns
    -------
    list of str
        ``[field_0, field_1, "field_2,field_3,..."]``.
    """
    leading_fields = old_split[:2]
    rejoined_tail = ",".join(old_split[2:])
    return leading_fields + [rejoined_tail]
    
# Parse the Netflix catalogue (no header row). Rows with too many fields are
# handed to manual_sep instead of being dropped; rows without a year are
# discarded and the year is stored as a nullable integer.
ntfx = (
    pd.read_csv(netflix_movie_titles,
                names=['Movie_Id', 'Year', 'Name'],
                header=None,
                encoding="ISO-8859-1",
                engine='python',
                on_bad_lines=manual_sep)
    .dropna(subset='Year')
    .astype({'Year': "Int64"})
)
print(f'{ntfx.shape = }')
ntfx.head()

ntfx.shape = (17763, 3)


Unnamed: 0,Movie_Id,Year,Name
0,1,2003,Dinosaur Planet
1,2,2004,Isle of Man TT 2004 Review
2,3,1997,Character
3,4,1994,Paula Abdul's Get Up & Dance
4,5,2004,The Rise and Fall of ECW


In [198]:
# Pre-filter the raw IMDb dump line by line so that only the title types of
# interest are ever handed to pandas, then parse the surviving lines.
patrn = re.compile(r"^tt[0-9]*\t(movie|short|tvSeries|tvShort|tvMovie|tvSpecial|video|tvMiniSeries)\t")
with StringIO() as stream:
    # IMDb documents its dumps as UTF-8; decoding explicitly avoids depending
    # on the platform's default encoding.
    with open(imdb_movie_titles, "r", encoding="utf-8") as file:
        stream.write(next(file, ""))  # We want the first line always (column header)
        stream.writelines(line for line in file if patrn.search(line))
    stream.seek(0)
    # The file is plain tab-separated text without quoting, so a literal '"'
    # inside a title must not be interpreted as the start of a quoted field.
    imdb = pd.read_csv(stream,
                       sep='\t',
                       header=0,
                       quoting=csv.QUOTE_NONE)

# Convert both year columns to nullable integers. Anything that is not a number
# (IMDb writes '\N' for missing values) becomes <NA>; rows without a start year
# are dropped, while a missing end year is kept as <NA>.
imdb = (
    imdb
    .assign(startYear=pd.to_numeric(imdb['startYear'], errors='coerce').astype("Int64"),
            endYear=pd.to_numeric(imdb['endYear'], errors='coerce').astype("Int64"))
    .dropna(subset=['startYear'])
)

In [200]:
# Sanity check: dimensions and first rows of the filtered IMDb frame.
print(f'{imdb.shape = }')
imdb.head()

imdb.shape = (2100508, 9)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [201]:
# List the distinct title types present in the loaded IMDb frame.
imdb['titleType'].unique()

array(['short', 'movie', 'tvSeries', 'tvShort', 'tvMovie', 'tvMiniSeries',
       'tvSpecial', 'video'], dtype=object)

In [218]:
# Spot-check one title directly in the raw IMDb TSV (case-insensitive grep).
!cat imdb/title.basics.tsv | grep "Merlin the Magical Puppy" -i

tt1113486	tvSeries	Merlin the Magical Puppy	Merlin the Magical Puppy	0	2001	\N	10	Animation,Family


### Title comparison using word2vec

In [161]:
import gensim
from gensim.models import Word2Vec
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

In [203]:
# Flatten the frames into plain lists of row tuples for the matching loops:
#   imdb_titles -> (primaryTitle, originalTitle, startYear, endYear)
#   ntfx_titles -> (Name, Year)
imdb_title_columns = ('primaryTitle', 'originalTitle', 'startYear', 'endYear')
ntfx_title_columns = ('Name', 'Year')
imdb_titles = list(zip(*(imdb[column].values for column in imdb_title_columns)))
ntfx_titles = list(zip(*(ntfx[column].values for column in ntfx_title_columns)))

In [None]:
# First-draft brute-force matcher: for every Netflix entry pick the unused IMDb
# entry whose primary title has the highest difflib ratio. No year filtering is
# done here, so every Netflix entry is compared against every IMDb entry.
#
# ntfx_titles holds (Name, Year) tuples and imdb_titles holds
# (primaryTitle, originalTitle, startYear, endYear) tuples, so the title string
# is element 0 of each tuple. Only those strings are compared; the full tuples
# are still used as dictionary keys/values.
sm = difflib.SequenceMatcher()
chosen_titles = {}
used = {}
for ntitle in ntfx_titles:
    nname = ntitle[0]
    best_rat = 0.0
    best_title = ''
    for ititle in imdb_titles:
        iname = ititle[0]
        # Skip entries already claimed, and entries whose title is not a string
        # (a value parsed as missing would arrive as a float NaN, which
        # SequenceMatcher cannot iterate over).
        if used.get(ititle, False) or not isinstance(iname, str):
            continue
        sm.set_seqs(nname, iname)
        rat = sm.ratio()
        if rat > best_rat:
            best_rat = rat
            best_title = ititle
    chosen_titles[ntitle] = best_title
    # Only mark a real match as used; '' means nothing scored above zero.
    if best_title:
        used[best_title] = True

In [48]:
def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb with WordNet, then stem the lemma.

    NOTE(review): ``stemmer`` is a global that none of the visible cells
    defines; it has to exist in the kernel before this function is called.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)

def preprocess(text):
    """Tokenize ``text`` and return its normalized content tokens.

    Tokens come from ``gensim.utils.simple_preprocess``. A token is kept only
    if it is not a gensim stopword, is longer than three characters and is not
    the placeholder ``'xxxx'``; every kept token is passed through
    ``lemmatize_stemming``.
    """
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if token not in gensim.parsing.preprocessing.STOPWORDS
        and len(token) > 3
        and token != 'xxxx'
    ]

In [216]:
# Load Google's pretrained word vectors from the word2vec-format binary file
# that is expected next to this notebook.
model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)

In [225]:
# Toy check of mean-vector similarity: compare the tokens of one title (s1)
# against a misspelled variant (s2) and a variant missing its last token (s3).
s1 = ['rise', 'fall', 'ecw']
s2 = ['rise', 'fal', 'ecw']
s3 = ['rise', 'fall']
v1, v2, v3 = (
    model.get_mean_vector(tokens, post_normalize=True)
    for tokens in (s1, s2, s3)
)
model.cosine_similarities(v1, [v2, v3])

array([0.71618056, 1.        ], dtype=float32)

### Comparing based on difflib ratios

In [163]:
import difflib
import numpy as np

In [212]:
# Greedy, year-aware title matching.
#
# For each Netflix (title, year) pair, scan every IMDb row that is still unused
# and whose year window is compatible (one year of slack on either side), score
# the Netflix title against both the primary and the original IMDb title, and
# keep the best score above 0.5. The winner is recorded in `chosen_titles` as
# (imdb primary title, imdb start year, netflix year) and its title is marked as
# used so it cannot be claimed again.
#
# NOTE(review): `used` and `chosen_titles` are keyed by title text only, so two
# different works that share a title cannot both be matched — confirm whether
# that is intended.
sm1 = difflib.SequenceMatcher()
sm2 = difflib.SequenceMatcher()
chosen_titles = {}
used = {}
n = 0 # For testing
print('(ntitle|ititle|nyear|iyear|ratio)') # For testing
for ntitle, nyear in ntfx_titles:
    best_rat = 0.0
    best_title = None
    best_year = None
    for ititle1, ititle2, iyear, iyear_end in imdb_titles:
        # A title that was parsed as missing arrives as a float NaN, which
        # SequenceMatcher cannot iterate over; such rows cannot be matched.
        if not isinstance(ititle1, str) or used.get(ititle1, False):
            continue
        if pd.isna(iyear_end):
            # No end year: the Netflix year must be within one year of the start.
            year_ok = abs(iyear - nyear) <= 1
        else:
            # Has an end year: the Netflix year must fall inside the run,
            # again with one year of slack on each side.
            year_ok = iyear - 1 <= nyear <= iyear_end + 1
        if not year_ok:
            continue
        sm1.set_seqs(ntitle, ititle1)
        rat = sm1.ratio()
        if isinstance(ititle2, str):
            sm2.set_seqs(ntitle, ititle2)
            rat = max(rat, sm2.ratio())
        if rat > 0.5 and rat > best_rat:
            best_rat = rat
            best_title = ititle1
            best_year = iyear
    print(f'({ntitle}|{best_title}|{nyear}|{best_year}|{best_rat})') # For testing
    # Store the year of the winning row, not whatever row the scan ended on.
    chosen_titles[ntitle] = (best_title, best_year, nyear)
    if best_title is not None:
        used[best_title] = True
    n+=1 # For testing
    if (n == 1000): # For testing
        break # For testing
    

(ntitle|ititle|nyear|iyear|ratio)
(Dinosaur Planet|Dinosaur Planet|2003|2003|1.0)
(Isle of Man TT 2004 Review|None|2004|None|0.0)
(Character|Character|1997|1997|1.0)
(Paula Abdul's Get Up & Dance|Get Up and Dance!|1994|1994|0.5777777777777777)
(The Rise and Fall of ECW|The Rise & Fall of ECW|2004|2004|0.9130434782608695)
(Sick|Sick|1997|1997|1.0)
(8 Man|8 Man|1992|1992|1.0)
(What the #$*! Do We Know!?|What the #$*! Do We (K)now!?|2004|2004|0.9629629629629629)
(Class of Nuke 'Em High 2|Class of Nuke 'Em High Part II: Subhumanoid Meltdown|1991|1991|0.6052631578947368)
(Fighter|Fighter|2001|2000|1.0)
(Full Frame: Documentary Shorts|Untitled Al Gore Documentary|1999|2000|0.5862068965517241)
(My Favorite Brunette|My Favorite Brunette|1947|1947|1.0)
(Lord of the Rings: The Return of the King: Extended Edition: Bonus Material|The Lord of the Rings: The Return of the King - Special Extended Edition Scenes|2003|2004|0.7922077922077922)
(Nature: Antarctica|Antarctica|1982|1983|0.7142857142857143

(Cat and the Canary|Fat and the Canary|1927|1927|0.9444444444444444)
(Naked Lies|Naked Lies|1998|1998|1.0)
(Star Trek: Voyager: Season 1|Star Trek: Voyager|1995|1995|0.782608695652174)
(Allergies: A Natural Approach|All Natural 4|2001|2000|0.5714285714285714)
(Lost in the Wild|Christy in the Wild|1993|1993|0.8)
(Goddess of Mercy|Goddess of Mercy|2004|2003|1.0)
(The Tricky Master|The Tricky Master|2000|1999|1.0)
(The Game|The Game|1997|1997|1.0)
(Deepak Chopra: The Way of the Wizard & Alchemy|The Way of the Birds|2000|2000|0.5454545454545454)
(Get Out Your Handkerchiefs|Get Out Your Handkerchiefs|1978|1978|1.0)
(Cannibal Women in the Avocado Jungle of Death|Cannibal Women in the Avocado Jungle of Death|1988|1989|1.0)
(Where Sleeping Dogs Lie|Where Sleeping Dogs Lie|1992|1991|1.0)
(Sweet November|Sweet November|2001|2001|1.0)
(The Edward R. Murrow Collection|The Homegrown Collection|2005|2006|0.6545454545454545)
(Firetrap|Firetrap|2001|2001|1.0)
(Sleepover Nightmare|Sleepover Nightmare|2

(Saudade Do Futuro|Saudade Do Futuro|2000|2000|1.0)
(Touched by an Angel: Season 1|Touched by an Angel|1994|1994|0.7916666666666666)
(The Final Countdown|The Final Countdown|1980|1980|1.0)
(Parenthood|Parenthood|1989|1989|1.0)
(Sex and the City: Season 4|Sex and the City|2001|1998|0.7619047619047619)
(Saludos Amigos|Saludos Amigos|1943|1942|1.0)
(Female Yakuza Tale|Female Yakuza Tale|1973|1973|1.0)
(Taxi|Taxi|2004|2004|1.0)
(Crazy as Hell|Crazy as Hell|2002|2002|1.0)
(Evelyn|Evelyn|2002|2002|1.0)
(Cold Harvest|Cold Harvest|1998|1999|1.0)
(Enigma: MCMXC A.D|Excitant Eye Shot|2003|2002|0.5217391304347826)
(100 Days Before the Command|100 Days Before the Command|1990|1991|1.0)
(Micki and Maude|Micki + Maude|1984|1984|0.8571428571428571)
(Sarah Brightman: In Concert|Sarah Brightman in Concert|1998|1998|0.9433962264150944)
(The Legend|The Legend|1993|1993|1.0)
(Blue Seed: Beyond|Quick Step Beyond|2003|2002|0.6470588235294118)
(If These Walls Could Talk|If These Walls Could Talk|1996|1996|1.

(Rio Lobo|Rio Lobo|1970|1970|1.0)
(Halloween 5: The Revenge of Michael Myers|Halloween 5: The Revenge of Michael Myers|1989|1989|1.0)
(Pan Tadeusz|Pan Tadeusz: The Last Foray in Lithuania|1999|1999|1.0)
(Substitute 4: Failure is Not an Option|The Substitute: Failure Is Not an Option|2000|2001|0.8974358974358975)
(The Shaft|The Shaft|2001|2001|1.0)
(Wings of Desire|Wings of Desire|1987|1987|1.0)
(Hostage|Hostage|2005|2005|1.0)
(The Abductors|The Abductors|1972|1972|1.0)
(Nightbreed|Nightbreed|1990|1990|1.0)
(Godzilla vs. The Sea Monster|Gamera: The Giant Monster|1966|1965|0.6037735849056604)
(Frank Lloyd Wright|Frank Lloyd Wright|1998|1998|1.0)
(Vendetta|Vendetta|1999|1999|1.0)
(Jay Jay the Jet Plane: Adventures in Learning|Jay Jay the Jet Plane|1999|1998|0.6363636363636364)
(Igby Goes Down|Igby Goes Down|2002|2002|1.0)
(Girl|Girl|1999|1998|1.0)
(Reign in Darkness|Reign in Darkness|2002|2002|1.0)
(Elephant|Elephant|2003|2003|1.0)
(Transformers: Season 3: Part 1|Transformers: Five Faces 

(Summer of the Monkeys|Summer of the Monkeys|1998|1998|1.0)
(Return to Horror High|Return to Horror High|1987|1987|1.0)
(You're Invited to Mary-Kate and Ashley's Vacation Parties|You're Invited to Mary-Kate and Ashley's Mall Party|1996|1997|0.8703703703703703)
(Young Einstein|Young Einstein|1988|1988|1.0)
(Drop Dead Fred|Drop Dead Fred|1991|1991|1.0)
(With a Friend Like Harry|With a Friend Like Harry...|2000|2000|0.9411764705882353)
(The Alamo|The Alamo|1960|1960|1.0)
(Sol Goode|Stolen Good|2001|2002|0.8)
(Here is Greenwood|Here Is Greenwood|1991|1991|0.9411764705882353)
(A Crime of Passion|A Crime of Passion|2003|2003|1.0)
(Rumpole of the Bailey: Series 4|Rumpole of the Bailey|1987|1978|0.8076923076923077)
(Ghosts of the Abyss: Bonus Material|Ghosts of the Abyss|2003|2003|0.7037037037037037)
(King Cobra|King Cobra|1998|1999|1.0)
(The Blackout|The Blackout|1997|1997|1.0)
(Roughnecks: The Starship Troopers Chronicles: The Pluto Campaign|Roughnecks: The Starship Troopers Chronicles|1999|

(Veronica 2030|Veronica|2004|2004|0.7619047619047619)
(Highlander: Season 5|Highlander|1996|1992|0.6666666666666666)
(Robin Hood: Prince of Thieves|Robin Hood: Prince of Thieves|1991|1991|1.0)
(The Last House on the Left|The Last House on the Left|1972|1972|1.0)
(Saving Grace|Saving Grace|2000|2000|1.0)
(Who is Cletis Tout?|Who Is Cletis Tout?|2002|2001|0.9473684210526315)
(Rob Roy|Rob Roy|1995|1995|1.0)
(La Femme Nikita: Season 3|La Femme Nikita|1999|1997|0.75)
(Office Killer|Office Killer|1997|1997|1.0)
(Lipstick|Lipstick|1976|1976|1.0)
(The Man Who Came to Dinner|The Man Who Came to Dinner|2000|2000|1.0)
(Gilligan's Island: Season 2|Gilligan's Island|1965|1964|0.7727272727272727)
(Saturday Night Live: The Best of Will Ferrell 2|Saturday Night Live: The Best of Will Ferrell - Volume 2|2004|2004|0.912621359223301)
(Real Kung Fu of Shaolin|Young Hero of Shaolin II|1986|1986|0.6382978723404256)
(He Loves Me, He Loves Me Not|He Loves Me, He Loves Me Not|2002|2002|1.0)
(The Winter People|

(La Vallee|La vallée|1972|1972|0.7777777777777778)
(Clerks|Clerks|1994|1994|1.0)
(Boyz N the Hood|Boyz n the Hood|1991|1991|0.9333333333333333)
(A Single Girl|Single Girls|2000|2001|0.88)
(Don Henley: Live Inside Job|Don Henley: Live Inside Job|2000|2000|1.0)
(Unsolved History: Salem Witch Trials|Unsolved History|2004|2002|0.6153846153846154)
(By Brakhage: An Anthology|By Brakhage: An Anthology, Volume One|2004|2003|0.8064516129032258)
(A Stranger Among Us|A Stranger Among Us|1992|1992|1.0)
(Landmarks of Early Film|Landmarks of Early Film|1997|1997|1.0)
(Rabid|Rabid|1976|1977|1.0)
(Look Back in Anger|Look Back in Anger|1959|1959|1.0)
(Jaws|Jaws|1975|1975|1.0)
(Teen Titans: Season 1|Teen Titans|2003|2003|0.6875)
(Dracula Vs. Frankenstein|Dracula vs. Frankenstein|1971|1971|0.9583333333333334)
(We Know Where You Live. Live!|We Know Where You Live. Live!|2001|2001|1.0)
(Spies|Spies|1928|1928|1.0)
(Soul Assassin|Soul Assassin|2001|2001|1.0)
(Sherlock Holmes: The Scarlet Claw|The Scarlet Cla

(A Hard Day's Night: Collector's Series|A Hard Day's Night|1964|1964|0.6428571428571429)
(Comedian|Comedian|2002|2002|1.0)
(Satanis: The Devil's Mass / Sinthia: The Devil's Doll: Double Feature|None|1968|None|0.0)
(Don't Bother to Knock|Don't Bother to Knock|1952|1952|1.0)
(Gojoe: Spirit War Chronicle|Security Cam Chronicles 1|2004|2004|0.6153846153846154)
(Giant Robo|Giant|2004|2003|0.6666666666666666)
(Taxi Zum Klo|Taxi zum Klo|1981|1980|0.9166666666666666)
(Dodsworth|Dodsworth|1936|1936|1.0)
(Fear Of A Punk Planet|US Off the Planet|2002|2001|0.5789473684210527)
(Danielle Steel's Changes|Valentine's Challenge|1991|1992|0.6222222222222222)
(The Night That Never Happened|The Night That Never Happened|1997|1997|1.0)
(Journeys with George|Journeys with George|2002|2002|1.0)
(Back to the Beach|Back to the Beach|1987|1987|1.0)
(Another Day in Paradise|Another Day in Paradise|1998|1998|1.0)
(Where Are We?|Where Were You?|1992|1993|0.7142857142857143)
(Backyardigans: It's Great to Be a Ghost

In [131]:
# Display the Netflix -> IMDb title mapping currently held in the kernel.
chosen_titles

{}

In [76]:

# Illustrate how difflib scores a near-miss title and a pure-prefix difference.
mr = difflib.SequenceMatcher()
for left, right in (
    ("The Rise and Fall of ECW", 'The Rise and Fall of Books'),
    ('dog is wet', 'is wet'),
):
    mr.set_seqs(left, right)
    print(mr.ratio())

0.84
0.75
