## PRE PROCESS DATA
Convert the MovieLens 100k data files to CSV files.

In [9]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn

In [10]:
# Column labels handed to read_csv (names=...) when the raw MovieLens files
# are loaded in the next cell.
u_cols = [
    'user_id',
    'age',
    'sex',
    'occupation',
    'zip_code',
]
r_cols = [
    'user_id',
    'movie_id',
    'rating',
    'timestamp',
]
m_cols = [
    'movie_id',
    'title',
    'release_date',
    'video_release_date',
    'imdb_url',
]

In [11]:
# Load the three raw MovieLens tables; column labels come from the lists
# defined above.
users = pd.read_csv("ml-100k/u.user", names=u_cols, sep='|')
ratings = pd.read_csv('ml-100k/u.data', names=r_cols, sep='\t')
movies = pd.read_csv(
    'ml-100k/u.item',
    names=m_cols,
    usecols=range(5),  # keep only the first five fields, matching m_cols
    sep='|',
    encoding="ISO-8859-1",
)

In [38]:
# Export the three tables as CSV files with a header row and no index column.
import os

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('processed-data', exist_ok=True)

ratings.to_csv('processed-data/dataratings.csv', index=False, header=True, encoding='utf-8')
movies.to_csv('processed-data/movies.csv', index=False, header=True)
users.to_csv('processed-data/users.csv', index=False, header=True)

## Crawl from IMDB
Crawl more movie details from the IMDb website.

Steps:
1. Get the IMDb ID by searching for the movie name.
2. Get more movie details from the OMDb API (omdbapi).

In [14]:
import html5lib
import requests
import re
from bs4 import BeautifulSoup



class IMDB:
    """Scrape movie details from IMDb web pages.

    Pages are downloaded with ``requests`` and parsed with BeautifulSoup
    (``html5lib`` parser).  The element/class names looked up below
    (``ratingValue``, ``summary_text``, ``titleBar``, ``cast_list`` ...) are
    literal selectors; they presumably match the IMDb page layout at the time
    of writing -- TODO confirm they still match the live site.

    Naming convention: ``get<X>(title)`` takes a movie *name* and resolves it
    to an IMDb id through the site search first, while
    ``get<X>ByImdbId(title)`` takes the IMDb id (e.g. ``'tt0114709'``)
    directly.  In the ``...ByImdbId`` methods the parameter is still called
    ``title`` for backward compatibility.
    """

    baseUrl = "http://www.imdb.com/"
    titleUrl = baseUrl + "title/"
    searchUrl = baseUrl + "find?s=all&q="
    creditsPath = 'fullcredits/'
    # Display title of the most recent successful search hit.  This is a class
    # attribute, so the value is shared by every IMDB instance.
    currentSearchTitle = None
    # Seconds to wait for IMDb before giving up on a request.
    requestTimeout = 10
    # Errors raised when a search page is missing or has no usable result
    # (None soup, missing <td>/<a>, malformed href, non-string title).
    _lookupErrors = (AttributeError, IndexError, KeyError, TypeError)

    def getMovie(self, title):
        """Search IMDb for ``title`` and return the details of the first hit."""
        return self.getMovieByImdbId(self.getIdFromName(title))

    def getMovieByImdbId(self, title):
        """Return a dict of details for the IMDb id ``title``.

        The title page is downloaded once and reused for the feature, rating,
        summary and director lookups; the cast list comes from a second
        request to the full-credits page.
        """
        soup = IMDB.getSoup(title, None)
        movie = IMDB.getFeatures(soup)
        movie['rating'] = self.getRatingByImdbId(title, soup)
        movie['summary'] = self.getSummaryByImdbId(title, soup)
        movie['director'] = self.getDirectorByImdbId(title, soup)
        movie['casting'] = self.getCastingByImdbId(title)
        return movie

    # ------------------------------------------------------------------
    # Rating
    # ------------------------------------------------------------------

    def getRating(self, title):
        """Return the rating text of the first search hit for ``title``."""
        return self.getRatingByImdbId(self.getIdFromName(title))

    def getRatingByImdbId(self, title, soup=None):
        """Return the rating text for IMDb id ``title`` with whitespace removed.

        ``soup`` may be an already parsed title page; it is fetched otherwise.
        """
        soup = IMDB.getSoup(title, soup)
        rating = IMDB.parseHTML(soup, 'div', 'class', 'ratingValue')
        return re.sub(r'\s+', '', rating.strong.span.text)

    # ------------------------------------------------------------------
    # Summary
    # ------------------------------------------------------------------

    def getSummary(self, title):
        """Return the summary text of the first search hit for ``title``."""
        return self.getSummaryByImdbId(self.getIdFromName(title))

    def getSummaryByImdbId(self, title, soup=None):
        """Return the ``summary_text`` block for IMDb id ``title``, cleaned up."""
        soup = IMDB.getSoup(title, soup)
        summary = IMDB.parseHTML(soup, 'div', 'class', 'summary_text')
        return IMDB._dropLineBreaks(summary.text).strip()

    # ------------------------------------------------------------------
    # Director
    # ------------------------------------------------------------------

    def getDirector(self, title):
        """Return the director text of the first search hit for ``title``."""
        return self.getDirectorByImdbId(self.getIdFromName(title))

    def getDirectorByImdbId(self, title, soup=None):
        """Return the first ``credit_summary_item`` text minus its label."""
        soup = IMDB.getSoup(title, soup)
        credit = IMDB.parseHTML(soup, 'div', 'class', 'credit_summary_item')
        director = IMDB._dropLineBreaks(credit.text).strip()
        # Drops the first nine characters -- presumably the "Director:" label.
        # NOTE(review): a longer label such as "Directors:" would leave a
        # stray character behind; confirm against real pages.
        return director[9:]

    # ------------------------------------------------------------------
    # Casting
    # ------------------------------------------------------------------

    def getCasting(self, title, length=10, all=False):
        """Return the cast list of the first search hit for ``title``.

        See ``getCastingByImdbId`` for ``length`` and ``all``.
        """
        return self.getCastingByImdbId(self.getIdFromName(title), length=length, all=all)

    def getCastingByImdbId(self, title, length=10, all=False):
        """Return ``[{'actor': ..., 'role': ...}, ...]`` for IMDb id ``title``.

        Args:
            title: IMDb id of the movie.
            length: maximum number of entries returned when ``all`` is False.
            all: when True, return every row of the cast table.
        """
        table = IMDB.parseHTML(
            IMDB.scrapSite(IMDB.titleUrl + title + "/" + IMDB.creditsPath),
            'table', 'class', 'cast_list')
        castList = []
        for tr in table.find_all('tr'):
            if not all and len(castList) >= length:
                break
            tds = tr.find_all('td')
            # Cells 1 and 3 are read below, so a usable row needs at least
            # four cells; shorter rows are skipped.
            if len(tds) > 3:
                castList.append({
                    'actor': IMDB._cellText(tds[1]),
                    'role': IMDB._cellText(tds[3]),
                })
        return castList

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _dropLineBreaks(text):
        """Remove tab, carriage-return and newline characters from ``text``."""
        return re.sub(r'[\t\r\n]', '', text)

    @staticmethod
    def _cellText(cell):
        """Return all text nodes of ``cell`` joined together and stripped."""
        return IMDB._dropLineBreaks("".join(cell.find_all(text=True))).strip()

    @staticmethod
    def getFeatures(soup):
        """Extract the title-bar features of a parsed title page into a dict.

        The links inside the ``titleBar`` block are read positionally: the
        first one is stored as ``titleYear``, the last one as ``releaseDate``
        and everything in between as the ``gener`` list.
        """
        infobar = soup.find('div', {'class': 'titleBar'})
        links = infobar.find_all('a', {'href': True})
        return {
            'title': IMDB.currentSearchTitle,
            'runTime': IMDB._dropLineBreaks(infobar.find('time').text).strip(),
            'titleYear': links[0].text,
            'releaseDate': IMDB._dropLineBreaks(links[-1].text),
            'gener': [tag.text for tag in links[1:-1]],
            'posterUrl': IMDB.parseHTML(soup, 'div', 'class', 'poster').a.img['src'],
        }

    @staticmethod
    def getSoup(title, soup):
        """Return ``soup`` if given, otherwise fetch the page of id ``title``."""
        if soup is None:
            soup = IMDB.scrapSite(IMDB.titleUrl + title + "/")
        return soup

    @staticmethod
    def parseHTML(soup, ele, idType, idValue):
        """Return the first ``ele`` element whose ``idType`` equals ``idValue``.

        Returns ``""`` (after printing a message) when ``soup`` cannot be
        searched, e.g. because the download failed and ``soup`` is None.
        """
        try:
            return soup.find(ele, {idType: idValue})
        except AttributeError:
            print("Sorry an error accured cant get data extracted")
        return ""

    @staticmethod
    def _firstSearchHit(title):
        """Return ``(displayed title, IMDb id)`` of the first search result.

        Raises one of ``IMDB._lookupErrors`` when the page could not be
        fetched or contains no usable result.
        """
        # Local import: urllib is not part of the notebook's import cell.
        from urllib.parse import quote_plus

        # Encode the title so characters such as '&' or '#' stay inside the
        # q= parameter instead of cutting the query short.
        soup = IMDB.scrapSite(IMDB.searchUrl + quote_plus(title))
        movie = soup.find('td', {'class': 'result_text'}).a
        # NOTE(review): the search uses s=all, so the first hit is not
        # necessarily a title (its id may not start with "tt").
        imdbId = movie['href'].split('/')[2]
        IMDB.currentSearchTitle = movie.text
        return movie.text, imdbId

    def getIdFromName(self, title):
        """Return the IMDb id of the first search hit, or ``""`` on failure."""
        try:
            name, imdbId = IMDB._firstSearchHit(title)
        except IMDB._lookupErrors:
            print("Sorry an error accured cant get data extracted")
            return ""
        print("Movie: " + name)
        return imdbId

    def getIdFromName2(self, title):
        """Return ``[displayed title, IMDb id]`` of the first search hit.

        Returns ``False`` (silently) when the lookup fails.
        """
        try:
            name, imdbId = IMDB._firstSearchHit(title)
        except IMDB._lookupErrors:
            return False
        return [name, imdbId]

    @staticmethod
    def scrapSite(url):
        """Download ``url`` and return it parsed, or None on a network error."""
        try:
            resp = requests.get(url, timeout=IMDB.requestTimeout)
        except requests.exceptions.RequestException:
            print("Problem with the network connection, please check your wifi or lan connection")
            return None
        return BeautifulSoup(resp.text, "html5lib")

In [15]:
# Smoke test: resolve a single title through the IMDb search.
imdb = IMDB()
movieName = 'Shanghai Triad '

# Look the title up once and print the result; every call is a network
# request, so the lookup is not repeated.
print(imdb.getIdFromName2(movieName))

['Yao a yao, yao dao wai po qiao', 'tt0115012']


In [17]:
# Extend the movies dataframe with placeholder columns for the IMDb id and
# the title as displayed by IMDb; both are filled in by the crawl below.
for new_col in ('imdbID', 'imdbName'):
    movies[new_col] = 'N/A'
movies.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,imdbID,imdbName
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,,
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,,
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,,
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,,
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),,


In [420]:
# Iterate over the movie dataframe and look up the IMDb id of every title.

for i, row in movies.iterrows():

    movieName = row['title']
    # Remove a trailing " (YYYY)" release year only when it is really there;
    # titles without that suffix are searched unchanged instead of losing
    # their last seven characters.
    movieNameNoYear = re.sub(r'\s*\(\d{4}\)\s*$', '', movieName)

    # getIdFromName2 returns [imdb title, imdb id] or False on failure.
    imdbID = imdb.getIdFromName2(movieNameNoYear)
    print(imdbID)
    if imdbID:
        movies.loc[i, 'imdbName'] = imdbID[0]
        movies.loc[i, 'imdbID'] = imdbID[1]
    else:
        print("error for " + movieName)
        


['Toy Story', 'tt0114709']
['GoldenEye', 'tt0113189']
['Four Rooms', 'tt0113101']
['Get Shorty', 'tt5761496']
['Copycat', 'tt0112722']
Sorry an error accured cant get data extracted

error forShanghai Triad (Yao a yao yao dao waipo qiao) (1995)
['Twelve Monkeys', 'tt0114746']
['Babe', 'tt0112431']
['Dead Man Walking', 'tt0112818']
['Richard III', 'tt0114279']
['Se7en', 'tt0114369']
['The Usual Suspects', 'tt0114814']
['Mighty Aphrodite', 'tt0113819']
['Il postino', 'tt0110877']
["Mr. Holland's Opus", 'tt0113862']
['Gazon maudit', 'tt0113149']
['From Dusk Till Dawn', 'tt0116367']
['Badkonake sefid', 'tt0112445']
['Antonia', 'tt0112379']
['Angels and Insects', 'tt0112365']
['Muppet Treasure Island', 'tt0117110']
['Braveheart', 'tt0112573']
['Taxi Driver', 'tt0075314']
['Hung fan kui', 'tt0113326']
['The Birdcage', 'tt0115685']
['The Brothers McMullen', 'tt0112585']
['Bad Boys', 'tt0112442']
['Apollo 13', 'tt0112384']
['Batman Forever', 'tt0112462']
['Secret Diary of a Call Girl', 'tt1000

['James and the Giant Peach', 'tt0116683']
['Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb', 'tt0057012']
['Trainspotting', 'tt0117951']
['The First Wives Club', 'tt0116313']
['Matilda', 'tt0117008']
['The Philadelphia Story', 'tt0032904']
['Vertigo', 'tt0052357']
['North by Northwest', 'tt0053125']
['The Apartment', 'tt0053604']
['Some Like It Hot', 'tt0053291']
['Casablanca', 'tt0034583']
['The Maltese Falcon', 'tt0033870']
['My Fair Lady', 'tt0058385']
['Sabrina, the Teenage Witch', 'tt0115341']
['Roman Holiday', 'tt0046250']
['Sunset Blvd.', 'tt0043014']
['Notorious', 'tt0038787']
['To Catch a Thief', 'tt0048728']
['The Adventures of Robin Hood', 'tt0029843']
['East of Eden', 'tt0048028']
['The Thin Man', 'tt0025878']
['His Girl Friday', 'tt0032599']
['Around the World in 80 Days', 'tt0327437']
["It's a Wonderful Life", 'tt0038650']
['Bringing Up Baby', 'tt0029947']
['The African Queen', 'tt0043265']
['Cat on a Hot Tin Roof', 'tt0051459']
['Fly Away Home', 't

['Dazed and Confused', 'tt0106677']
['Naked', 'tt0107653']
['Orlando', 'tt0107756']
['Ruby in Paradise', 'tt0108000']
['Some Folks Call It a Sling Blade', 'tt0108181']
['A Month by the Lake', 'tt0113849']
['Funny Face', 'tt0050419']
['An Affair to Remember', 'tt0050105']
['Little Lord Fauntleroy', 'tt0081062']
['The Inspector General', 'tt0041509']
['Winnie the Pooh and the Blustery Day', 'tt0063819']
['Boychoir', 'tt3302706']
['Mediterraneo', 'tt0102426']
['Passion Fish', 'tt0105107']
['Jerry Garcia', 'nm0305263']
['The Poison Rose', 'tt5862166']
['Fear', 'tt0116287']
['Solo: A Star Wars Story', 'tt3778644']
['The Substitute', 'tt0117774']
["Heaven's Prisoners", 'tt0116508']
['The Trigger Effect', 'tt0117965']
['Mother Night', 'tt0117093']
['Dangerous Ground', 'tt0118927']
['Maximum Risk', 'tt0117011']
["The Rich Man's Wife", 'tt0117473']
['Shadow Conspiracy', 'tt0120107']
['Blood', 'tt8991526']
['Turbulence', 'tt0120390']
['Underworld', 'tt0320691']
['The Beautician and the Beast', '

['The Scarlet Letter', 'tt0114345']
['8 Seconds', 'tt0109021']
['That Darn Cat!', 'tt0059793']
['Ladybird Ladybird', 'tt0110296']
['Bye Bye, Love', 'tt2639596']
['Century', 'tt0106537']
['Ma saison préférée', 'tt0107471']
['Pather Panchali', 'tt0048473']
['Golden Earrings', 'tt0039428']
['Foreign Correspondent', 'tt0032484']
['Lady of Burlesque', 'tt0036094']
['Angel on My Shoulder', 'tt0038300']
['Angel and the Badman', 'tt0039152']
['Outlaw King', 'tt6679794']
['Beat the Devil', 'tt0046414']
['Love Is All There Is', 'tt0116928']
['A Damsel in Distress', 'tt0028757']
['Madame Butterfly', 'tt0023169']
['Sleepover', 'tt0368975']
['Here Comes Cookie', 'tt0026465']
['Aladdin and the King of Thieves', 'tt0115491']
["This Boy's Life", 'tt0108330']
['The Stars Fell on Henrietta', 'tt0114534']
['Last Summer in the Hamptons', 'tt0113612']
["Margaret's Museum", 'tt0113774']
['The Saint of Fort Washington', 'tt0108026']
['A Cure for Wellness', 'tt4731136']
['Tom and Huck', 'tt0112302']
['Gumby 1

In [18]:
movies.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,imdbID,imdbName
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,,
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,,
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,,
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,,
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),,


In [20]:
# Test the OMDb API with a single request.
# NOTE(review): movieName is not used by the request; only imdbID decides
# which record is fetched -- confirm the two are meant to match.
movieName = 'Shanghai Triad'
imdbID = 'tt0047437'
# NOTE(review): API key committed in the notebook; consider loading it from
# the environment instead.
API_KEY = 'acaee3e0'
url2 = "http://www.omdbapi.com/?i="+imdbID+"&plot=full&apikey="+API_KEY
r = requests.get(url2)

# Decode the response first, then print it (printing before the assignment
# raises NameError on a fresh kernel or shows a stale value).
jsonA = r.json()
print(jsonA)

if jsonA['Response'] == 'False':
    print("movieName  ERRORRR")
else:
    print(jsonA['Title'])
    # Kept as `a`: the next cell uses its columns as the imdb_df template.
    a = pd.DataFrame(jsonA)

# Quota: 1,000 requests per day on this key; 100k per day on a paid plan.

{'Title': 'Sabrina', 'Year': '1954', 'Rated': 'Passed', 'Released': '15 Oct 1954', 'Runtime': '113 min', 'Genre': 'Comedy, Drama, Romance', 'Director': 'Billy Wilder', 'Writer': 'Billy Wilder, Samuel A. Taylor, Ernest Lehman', 'Actors': 'Humphrey Bogart, Audrey Hepburn, William Holden', 'Plot': "Linus and David Larrabee are the two sons of a very wealthy family. Linus is all work -- busily running the family corporate empire with no time for a wife and family. David is all play -- technically employed in the family business but never showing up for work, spending all his time entertaining, and having been married and divorced three times. Sabrina Fairchild is the young, shy, and awkward daughter of the household chauffeur, who has been infatuated with David all her life, but whom David hardly notices till she goes away to Paris for two years and returns an elegant, sophisticated, beautiful woman. Suddenly, she finds she's captured David's attention, but just as she does so, she finds h

In [22]:
# Start imdb_df as an empty frame that keeps the column layout of `a`
# (zero rows, same columns).
imdb_df = a.iloc[:0]
imdb_df

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,Language,Country,Awards,Poster,Ratings,Metascore,imdbRating,imdbVotes,imdbID,Type,DVD,BoxOffice,Production,Website,Response


In [22]:
# Iterate over the movie dataframe and fetch the OMDb record of every imdbID.
#------------------------------------------------------
import math

# Display option only; set once instead of on every iteration.
pd.set_option('display.max_columns', 30)

fetched = []  # one single-row frame per successful lookup
try:
    for i, row in movies_edited.iterrows():
        imdbID = row['imdbID']
        print(imdbID)
        # Skip rows without an id: a missing value (NaN/None) or the 'N/A'
        # placeholder the movies frame is initialised with.
        if pd.isna(imdbID) or imdbID == 'N/A':
            continue

        url2 = "http://www.omdbapi.com/?i="+imdbID+"&plot=full&apikey="+API_KEY

        r = requests.get(url2)
        jsonA = r.json()

        if jsonA['Response'] == 'False':
            print("------------------------------Error occur for " + imdbID)
        else:
            print(jsonA['Title'])
            # Build exactly one row per movie.  'Ratings' is a list; passing
            # the raw dict to DataFrame would create one row per rating and
            # zero rows (losing the movie) when that list is empty.
            record = {key: value for key, value in jsonA.items() if key != 'Ratings'}
            a = pd.DataFrame([record])
            fetched.append(a)
finally:
    # Concatenate once (DataFrame.append no longer exists in pandas 2.x).
    # Done in `finally` so rows fetched before an error or interrupt are kept.
    imdb_df = pd.concat([imdb_df] + fetched, ignore_index=True)

tt0116477
Hamlet
tt0118002
Two If by Sea
tt0113097
Forget Paris
tt0113501
Just Cause
tt0114268
Rent-a-Kid
tt3622592
Paper Towns
tt5973164
Fearless
tt0107497
Malice
tt0117108
Multiplicity
tt0117628
She's the One
tt11163028
House Arrest
tt0039420
The Ghost and Mrs. Muir
tt0115580
The Associate
tt0112896
Dracula: Dead and Loving It
tt0114011
Now and Then
tt0117102
Mr. Wrong
tt0111194
A Simple Twist of Fate
tt0104029
Cronos
tt0117283
The Pallbearer
tt0111667
The War
tt0116126
Don't Be a Menace to South Central While Drinking Your Juice in the Hood
tt0115472
The Adventures of Pinocchio
tt0116240
The Evening Star
tt0119815
Four Days in September
tt0113670
A Little Princess
tt1815708
Freelancers
tt0085809
Koyaanisqatsi
tt0112453
Balto
tt0115734
Bottle Rocket
tt0114808
The Star Maker
tt5580392
Amateur
tt0113677
Living in Oblivion
tt0114095
Party Girl
tt0114210
A Pyromaniac's Love Story
tt0111149
Shallow Grave
tt0110950
Reality Bites
tt0110455
A Man of No Importance
tt0110763
The Pagemaster
tt0

1-900
tt0105729
Venice/Venice
tt0116635
Infinity
tt0116167
Ed's Next Move
tt0109823
For the Moment
tt0645145
The Deadly Cure
tt5144366
7 Days in Venice Beach
tt0111622
The Sex Life of the Belgians
tt0117577
The Search for One-eye Jimmy
tt0115531
American Strays
tt0116859
The Leopard Son
tt7245458
Bird of Prey
tt0107274
Johnny One Hundred Pesos
tt0110173
JLG/JLG: Self-Portrait in December
tt0109781
Lesson Faust
tt0110521
Mina Tannenbaum
tt0042354
The Forbidden Christ
tt0110171
I Can't Sleep
tt0110425
La machine
tt0123281
The Stranger
tt0067152
Good Morning
tt0080714
Falling in Love Again
tt0106535
The Cement Garden
tt0119644
Meet Wally Sparks
tt0116565
Hotel de Love
tt0120014
Rhyme & Reason
tt0116931
Love and Other Catastrophes
tt0113314
Hollow Reed
tt0116920
Losing Chase
tt0058985
Le Bonheur
tt0120087
The Second Jungle Book: Mowgli & Baloo
tt0117724
Squeeze
tt0120034
Roseanna's Grave
tt0105569
Tetsuo II: Body Hammer
tt0119098
Fall
tt0116384
Gabbeh
tt0117076
Mondo
tt0113425
The Innocent

Wedding Bell Blues
tt0116949
MURDER and murder
tt3952864
------------------------------Error occur for tt3952864
tt0116379
The Break
tt0107315
Kika
tt0113827
Mirage
tt0056215
Mamma Roma
tt0117781
The Sunchaser
tt0460692
The War at Home
tt0114592
Sweet Nothing
tt0119711
Mother and Son
tt0120594
B. Monkey
tt0120148
Sliding Doors
tt0111804
Martin Lawrence: You So Crazy
tt0102855
Scream of Stone


In [33]:
# Remove fully identical rows in place (all columns compared, first
# occurrence kept), then preview the first ten rows.
imdb_df.drop_duplicates(subset=None, keep='first', inplace=True)
imdb_df.head(n=10)

Unnamed: 0,Actors,Awards,BoxOffice,Country,DVD,Director,Episode,Genre,Language,Metascore,Plot,Poster,Production,Rated,Released,Response,Runtime,Season,Title,Type,Website,Writer,Year,imdbID,imdbRating,imdbVotes,seriesID,totalSeasons
0,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney",Nominated for 3 Oscars. Another 26 wins & 20 n...,,USA,20 Mar 2001,John Lasseter,,"Animation, Adventure, Comedy, Family, Fantasy",English,95.0,A little boy named Andy loves to be in his roo...,https://m.media-amazon.com/images/M/MV5BMDU2ZW...,Buena Vista,G,22 Nov 1995,True,81 min,,Toy Story,movie,,"John Lasseter (original story by), Pete Docter...",1995,tt0114709,8.3,830337,,
1,"Pierce Brosnan, Sean Bean, Izabella Scorupco, ...",Nominated for 2 BAFTA Film Awards. Another 2 w...,,"UK, USA",19 Oct 1999,Martin Campbell,,"Action, Adventure, Thriller","English, Russian, Spanish",65.0,When a deadly satellite weapon system falls in...,https://m.media-amazon.com/images/M/MV5BMzk2OT...,MGM/UA,PG-13,17 Nov 1995,True,130 min,,GoldenEye,movie,,"Ian Fleming (characters), Michael France (stor...",1995,tt0113189,7.2,227904,,
2,"Sammi Davis, Amanda De Cadenet, Valeria Golino...",1 win & 1 nomination.,,USA,20 Apr 1999,"Allison Anders, Alexandre Rockwell, Robert Rod...",,Comedy,English,,This movie features the collaborative director...,https://m.media-amazon.com/images/M/MV5BNDc3Y2...,Miramax Films,R,25 Dec 1995,True,98 min,,Four Rooms,movie,,"Allison Anders, Alexandre Rockwell, Robert Rod...",1995,tt0113101,6.8,93243,,
3,"Carolyn Dodd, Goya Robles, Sean Bridgers, Isaa...",1 nomination.,,USA,,Allen Coulter,,"Comedy, Crime",English,,The adventures of a mobster-turned-movie produ...,https://m.media-amazon.com/images/M/MV5BMTExMT...,,TV-MA,,True,60 min,,Get Shorty,series,,Davey Holmes,2017–,tt5761496,8.2,4340,,3.0
4,"Sigourney Weaver, Holly Hunter, Dermot Mulrone...",2 wins & 1 nomination.,,USA,28 Apr 1998,Jon Amiel,,"Drama, Mystery, Thriller",English,54.0,"In San Francisco, the criminal psychologist He...",https://m.media-amazon.com/images/M/MV5BYWUwND...,Warner Home Video,R,27 Oct 1995,True,123 min,,Copycat,movie,,"Ann Biderman, David Madsen",1995,tt0112722,6.6,51161,,
5,"Li Gong, Baotian Li, Xiaoxiao Wang, Xuejian Li",Nominated for 1 Oscar. Another 5 wins & 3 nomi...,,"France, China",12 Dec 2000,Yimou Zhang,,"Crime, Drama, History, Romance, Thriller",Mandarin,,Uncle Liu brings his cousin to Shanghai to wor...,https://m.media-amazon.com/images/M/MV5BYjQ0OT...,Sony Pictures Home Entertainment,R,22 Dec 1995,True,108 min,,Shanghai Triad,movie,,"Bi Feiyu, Li Xiao (novel)",1995,tt0115012,7.1,4868,,
6,"Joseph Melito, Bruce Willis, Jon Seda, Michael...",Nominated for 2 Oscars. Another 10 wins & 22 n...,,USA,31 Mar 1998,Terry Gilliam,,"Mystery, Sci-Fi, Thriller","English, French",74.0,An unknown and lethal virus has wiped out five...,https://m.media-amazon.com/images/M/MV5BN2Y2OW...,Universal Pictures,R,05 Jan 1996,True,129 min,,12 Monkeys,movie,,"Chris Marker (film La Jetée), David Webb Peopl...",1995,tt0114746,8.0,551055,,
7,"Christine Cavanaugh, Miriam Margolyes, Danny M...",Won 1 Oscar. Another 19 wins & 26 nominations.,,"Australia, USA",23 Sep 2003,Chris Noonan,,"Comedy, Drama, Family",English,83.0,Gentle farmer Arthur Hoggett wins a piglet nam...,https://m.media-amazon.com/images/M/MV5BYjg4Zj...,Universal Pictures,G,04 Aug 1995,True,91 min,,Babe,movie,,"Dick King-Smith (novel), George Miller (screen...",1995,tt0112431,6.7,111602,,
8,"Susan Sarandon, Sean Penn, Robert Prosky, Raym...",Won 1 Oscar. Another 22 wins & 21 nominations.,,"UK, USA",30 Sep 1998,Tim Robbins,,"Crime, Drama",English,80.0,A convicted murderer on Death Row and the nun ...,https://m.media-amazon.com/images/M/MV5BMTM3Nz...,Gramercy Pictures,R,02 Feb 1996,True,122 min,,Dead Man Walking,movie,,"Helen Prejean (book), Tim Robbins",1995,tt0112818,7.5,84294,,
9,"Christopher Bowen, Edward Jewesbury, Ian McKel...",Nominated for 2 Oscars. Another 7 wins & 10 no...,,"UK, USA",15 Aug 2001,Richard Loncraine,,"Drama, Sci-Fi, War",English,,William Shakespeare's classic play is brought ...,https://m.media-amazon.com/images/M/MV5BOWI3Nj...,United Artists,R,29 Dec 1995,True,110 min,,Richard III,movie,,"William Shakespeare (play), Ian McKellen, Rich...",1995,tt0114279,7.4,13170,,


In [31]:
# Save the crawled OMDb records with a header row and no index column.
# 'utf-8-sig' writes a byte-order mark at the start of the file.
imdb_df.to_csv(
    'processed-data/imdb_df.csv',
    encoding='utf-8-sig',
    header=True,
    index=None,
)

## Combine the movie file and imdb file

In [5]:
# Join the MovieLens movie table with the OMDb records on the IMDb id.
# The key is named explicitly so that any other column name the two frames
# happen to share cannot silently become part of the join key.
# Inner join: movies without an OMDb record (and vice versa) are dropped.
combine = pd.merge(movies_edited, imdb_df, on='imdbID', how='inner')
combine.to_csv('processed-data/combine.csv', index=False, header=True, encoding='utf-8-sig')
combine.head()