# Script to get details of the top 250 movies in IMDB from IMDB, Metacritic and Rotten tomatoes

## by Gautam Borgohain


In [None]:
import re
import pandas as pd
import time

pd.options.display.max_columns = None
pd.options.display.max_rows = None

pd.set_option('display.max_colwidth', 1000)

from bs4 import BeautifulSoup
from urllib.parse import quote
import urllib.request as req

user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
headers={'User-Agent':user_agent,}

In [14]:
#
# Function to get the soup objects for a given URL
#
def getSoup(url):
    request= req.Request(url,None,headers)
    try:
        response = req.urlopen(request) 
        time.sleep(2)
        data = response.read()
        soup = BeautifulSoup(data, "html.parser")
        
    except Exception as e:
        print("There was an error retrieving", url, e) 
    finally:
        response.close()
        
    return soup

def cleanText(text):
    return re.sub(r'\n','',text)

url_start = "http://www.imdb.com"

Step 1: Get the Top 250 Movie list from IMDB

In [None]:
imdb250_url = "http://www.imdb.com/chart/top?ref_=nv_mv_250_6"

imdb250_soup = getSoup(imdb250_url)

Step 2: Loop through the 250 movies and extract the movie name, rating and the url to their main page and store the data to a dataframe

In [None]:
table250 = imdb250_soup.find('tbody',{'class': 'lister-list'})
top250_df = pd.DataFrame(columns=['Title','Rating','IMDB_MAIN_URL'])# Initialize an empty dataframe
for movie_row in table250.findAll('tr'):
    title_cell = movie_row.find('td',{'class':'titleColumn'})
    if(title_cell.a):
        movie_url = url_start+ title_cell.a['href']
        movie_name = title_cell.a.text
    rating_cell = movie_row.find('td',{'class':'ratingColumn imdbRating'})
    if(rating_cell):
        rating = cleanText(rating_cell.text)
    top250_df.loc[len(top250_df)] = [movie_name,rating,movie_url]
    
top250_df.head()

Step 3: Loop through the dataframe and go to each movie's main page using the url obtained in the previous step.

From the main page, get Year and Genre information of the movies along with the link to the movie's awards page. Finally add everything into the dataframe

In [None]:
top250_df_copy = top250_df.copy()
# top250_df_copy = top250_df_copy[0:100]

year_list = []
all_genre_list = []
awards_url_list = []
for index, row in top250_df_copy.iterrows():
    temp_url = row['IMDB_MAIN_URL']
    oneMovie = getSoup(temp_url)
    year_soup = oneMovie.find('span',{'id':'titleYear'})
    year = re.sub(r'[()]',"",year_soup.text)
    genre_soup = oneMovie.findAll('span',{'itemprop':'genre'})
    genre_list = []
    for genre in genre_soup:
        genre_list.append(genre.text)
    awards_soup = oneMovie.find(text = 'See more awards')
    if(awards_soup):
        moreawards_url = url_start + awards_soup.parent['href']
    year_list.append(year)
    all_genre_list.append(' '.join(genre_list)) # Using a lit here. Maybe put them in different columns? Dont know if that would be useful
    awards_url_list.append(moreawards_url)
    moreawards_url = " "# Cleaning the variable in case the next movie does not have an awards page
    time.sleep(2)
    if(index%25==0):print("Fetched till ",index)

top250_df_copy.insert(loc = len(top250_df_copy.columns), column = 'YEAR',value = year_list)
top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='Genre',value =all_genre_list)
top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='IMDB_AWARDS_URL',value = awards_url_list)

top250_df_copy.head()

Step 4: Go to each of the awards page in the dataframe and extract the number of awards won by the movie.

The awards that are of interest to use are:
- Oscars - Academy awards
- BAFTA
- SAG
- Golden globes
- Critics Choice

In [None]:
oscar_nominations_list = []
oscar_wins_list = []
gg_nominations_list = []
gg_wins_list = []
sag_nominations_list = []
sag_wins_list = []
bafta_nominations_list = []
bafta_wins_list = []
cc_wins_list = []
cc_nominations_list = []
total_nominations_list = []
total_wins_list = []

def getCount(td):
    return td.get('rowspan') if (td.get('rowspan')) else 0

def toNumber(numberlist):
    return numberlist[0] if(numberlist) else 0

for index, row in top250_df_copy.iterrows():
    temp_url = row['IMDB_AWARDS_URL']
    if(temp_url != ' '): 
        temp_soup = getSoup(temp_url)
        allstats = temp_soup.find('div',{'class':'desc'})
        total_wins = toNumber(re.findall(r'([0-9]*) wins',allstats.text))
        total_nominations = toNumber(re.findall(r'([0-9]*) nominations',allstats.text))
        td_award = temp_soup.findAll('td', {'class': 'title_award_outcome'})
        oscar_win=0
        oscar_nom=0
        BAFTA_win=0
        BAFTA_nom=0
        gg_win=0
        gg_nom=0
        sag_win=0
        sag_nom=0
        cc_win=0
        cc_nom =0
        for td in td_award:
            if 'Won\nOscar' in td.text:
                oscar_win = getCount(td)
            elif 'Nominated\nOscar' in td.text:
                oscar_nom = getCount(td)
            elif 'Won\nBAFTA' in td.text:
                BAFTA_win = getCount(td)
            elif 'Nominated\nBAFTA' in td.text:
                BAFTA_nom = getCount(td)       
            elif 'Won\nGolden Globe' in td.text:
                gg_win = getCount(td)        
            elif 'Nominated\nGolden Globe' in td.text:
                gg_nom = getCount(td)        
            elif 'Won\nActor' in td.text:
                sag_win = getCount(td)        
            elif 'Nominated\nActor' in td.text:
                sag_nom = getCount(td)        
            elif 'Won\nCritics Choice' in td.text:
                cc_win = getCount(td)    
            elif 'Nominated\nCritics Choice' in td.text:
                cc_nom = getCount(td)
        oscar_wins_list.append(oscar_win)
        oscar_nominations_list.append(oscar_nom)
        cc_wins_list.append(cc_win )
        cc_nominations_list.append(cc_nom )
        bafta_wins_list.append(BAFTA_win)
        bafta_nominations_list.append(BAFTA_nom)
        gg_wins_list.append(gg_win)
        gg_nominations_list.append(gg_nom)
        sag_wins_list.append(sag_win)
        sag_nominations_list.append(sag_nom)
        total_nominations_list.append(total_nominations)
        total_wins_list.append(total_wins)
        if(index%25==0):
            print("Fetched till ",index)

movies_withoutAwards = top250_df_copy.loc[top250_df_copy.IMDB_AWARDS_URL == ' ']            
top250_df_copy = top250_df_copy.loc[top250_df_copy.IMDB_AWARDS_URL != ' ']# Remove the movies that did not have awards page
top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='TOTAL_NOM',value = total_nominations_list)
top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='TOTAL_WINS',value = total_wins_list)   
top250_df_copy.insert(loc = len(top250_df_copy.columns), column = 'OSCAR_NOM',value = oscar_nominations_list)
top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='OSCAR_WIN',value =oscar_wins_list)
top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='GG_NOM',value = gg_nominations_list)
top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='GG_WINS',value = gg_wins_list)
top250_df_copy.insert(loc = len(top250_df_copy.columns), column = 'BAFTA_NOM',value = bafta_nominations_list)
top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='BAFTA_WIN',value =bafta_wins_list)
top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='SAG_NOM',value = sag_nominations_list)
top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='SAG_WINS',value = sag_wins_list)
top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='CC_NOM',value = cc_nominations_list)
top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='CC_WINS',value = cc_wins_list)
# top250_df_copy = top250_df_copy.append(movies_withoutAwards)# To append the movies without the awards
top250_df_copy.head()


# top250_df_copy['TOTAL_NOM'] = [toNumber(i) for i in top250_df_copy['TOTAL_NOM']]
# top250_df_copy['TOTAL_WINS'] = [toNumber(i) for i in top250_df_copy['TOTAL_WINS']]

Metacritic

Step 5: Go over all the movies in the data frame, go to Metacritic.com and extract the critic and user ratings

In [None]:
metaSearch_Start_url = 'http://www.metacritic.com/search/movie/'
metaSearch_End_url = "/results"
meta_home = "http://www.metacritic.com/"
critic_score_list = []
user_score_list = []
movie_url_list = []
for index, row in top250_df_copy.iterrows():
    critic_score = 0
    user_score = 0
    movie_name = row['Title']
    movie_name_encoded = quote(movie_name.encode('utf8'))
    metaSearch_url = metaSearch_Start_url+movie_name_encoded+metaSearch_End_url
    results_soup = getSoup(metaSearch_url)
    time.sleep(2)
    firstResult = results_soup.find(text = movie_name) # Search the results for the exact movie name
    if(firstResult):
        movie_home_url_part = firstResult.parent['href']
        movie_home_url = meta_home+movie_home_url_part
        movie_soup = getSoup(movie_home_url) # Go to the movie's metacritic homepage
        critic_shell = movie_soup.findAll('a',{'class':'metascore_anchor'}) # This gives a list of all the scores, we need the first two
        if(critic_shell):
            critic_score = cleanText(critic_shell[0].text)
            user_score = cleanText(critic_shell[1].text)
    movie_url_list.append(metaSearch_url)
    critic_score_list.append(critic_score)
    user_score_list.append(user_score)
    time.sleep(2)
    if(index%25==0):print("Fetched till ",index)

top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='MC_MOVIE_URL',value = movie_url_list)
top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='MC_CRITIC_RATING',value = critic_score_list)
top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='MC_USER_RATING',value = user_score_list)
top250_df_copy.head()

Rotten Tomatoes

Step 6: Iterate through the list of movies and go to Rottentomatoes.com to get the User and critic ratings.

Problem here is that the web site behaves differently for some movies.For some movies, even sending a request via 'search' url, the user is taken directly to the home pagep of the movie. Thus have to create condition to make sure we are in the right page.

In [None]:
rotten_home = "http://www.rottentomatoes.com"
rotten_search_start = "http://www.rottentomatoes.com/search/?search="
critic_score_list = []
user_score_list = []
movie_url_list = []

def getMovieURL(movie_name,results_soup):
    ul = results_soup.find('ul',{'id': 'movie_results_ul'})
    if(ul):
        lis = ul.findAll('li',{'class':'media bottom_divider clearfix'})
        for li in lis:
            if movie_name.lower() in li.text.lower().encode('ascii','ignore').decode('utf8'):
                return li.find('a')['href']
    return False

for index, row in top250_df_copy.iterrows():
    critic_score = 0
    user_score = 0
    movie_name = row['Title'].encode('ascii','ignore').decode('utf8')
    movie_name_encoded = re.sub(' ','+',movie_name)
    rtSearch_url = rotten_search_start+movie_name_encoded
    results_soup = getSoup(rtSearch_url)
    time.sleep(2)
    if(results_soup):
        resultsdiv = results_soup.findAll('div',{'id':'scoreStats'})
        if(resultsdiv): # If it went to the Home Page directly
#             print("Hom page", movie_name)
            text = resultsdiv[0].text
            critic_rating = toNumber(re.findall(r'Average Rating:  ([^/]+)',text))
            resultsdiv = results_soup.findAll('div',{'class':'audience-info hidden-xs superPageFontColor'})
            text = resultsdiv[0].text
            user_rating = toNumber(re.findall(r'Average Rating: ([^/]+)',text))
            critic_score_list.append(critic_rating)
            user_score_list.append(user_rating)
            movie_url_list.append(rtSearch_url)
        else:
#             print("Went to search page", movie_name,rtSearch_url)
            home_url = getMovieURL(movie_name,results_soup)
            if(home_url):
                complete_home_url = rotten_home+home_url
                results_list= getSoup(complete_home_url)
                resultsdiv = results_list.findAll('div',{'id':'scoreStats'})
                if(len(resultsdiv)>0):
#                     print("Now to Home page", movie_name)
                    text = resultsdiv[0].text
                    critic_rating = toNumber(re.findall(r'Average Rating:  ([^/]+)',text))
                    resultsdiv = results_list.findAll('div',{'class':'audience-info hidden-xs superPageFontColor'})
                    text = resultsdiv[0].text
                    user_rating = toNumber(re.findall(r'Average Rating: ([^/]+)',text))
                    critic_score_list.append(critic_rating)
                    user_score_list.append(user_rating)
                    movie_url_list.append(complete_home_url)
                else:
                    print("Didnt find ratings info", resultsdiv,complete_home_url)
                    critic_score_list.append(0)
                    user_score_list.append(0)
                    movie_url_list.append(complete_home_url)
            else:
                print("Movie not in the result list", home_url,rtSearch_url)
                critic_score_list.append(0)
                user_score_list.append(0)
                movie_url_list.append(rtSearch_url)
                
    time.sleep(2)
    if(index%25==0):print("Fetched till ",index)

top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='RT_MOVIE_URL',value = movie_url_list)
top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='RT_CRITIC_RATING',value = critic_score_list)
top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='RT_USER_RATING',value = user_score_list)
top250_df_copy.head()

## Data cleaning and some calculations

In [None]:
top250_df_copy.loc[top250_df_copy.RT_CRITIC_RATING == 'N'] = 0
top250_df_copy.loc[top250_df_copy.RT_USER_RATING == 'N'] = 0

Convert the columns to float datatype

In [None]:
for column in top250_df_copy.columns:
    if column not in ['Genre','IMDB_AWARDS_URL','IMDB_MAIN_URL','Title','RT_MOVIE_URL','MC_MOVIE_URL','YEAR']:
        top250_df_copy[[column]] = top250_df_copy[[column]].astype(float, inplace  = True)

In [None]:
top250_df_copy['AVG_USR_RATING'] = (top250_df_copy.Rating + (2 * top250_df_copy.RT_USER_RATING) + top250_df_copy.MC_USER_RATING) / 3
top250_df_copy['AVG_CRTC_RATING'] = (top250_df_copy.RT_CRITIC_RATING + (top250_df_copy.MC_CRITIC_RATING / 10)) / 2
top250_df_copy['TOTAL_AWARDS'] = (3 * top250_df_copy.TOTAL_WINS + top250_df_copy.TOTAL_NOM) + (3 * (
top250_df_copy.OSCAR_WIN + top250_df_copy.SAG_WINS + top250_df_copy.BAFTA_WIN + top250_df_copy.CC_WINS + top250_df_copy.GG_WINS)) + (
                                 top250_df_copy.BAFTA_NOM + top250_df_copy.GG_NOM + top250_df_copy.CC_NOM + top250_df_copy.OSCAR_NOM + top250_df_copy.SAG_NOM)

top250_df_copy['AVG_RATING'] = (top250_df_copy.AVG_USR_RATING+top250_df_copy.AVG_CRTC_RATING) / 2
top250_df_copy.head()

In [None]:
top250_df_copy = pd.read_csv('/Users/gautamborgohain/Desktop/temp5.csv')
top250_df_copy_backup = top250_df_copy.copy()

In [None]:
top250_df_copy.to_csv('/Users/gautamborgohain/Desktop/step1.csv')

## For the 1000 movie list

In [None]:
top1000_df = pd.DataFrame(columns=['Title','Rating','IMDB_MAIN_URL'])# Initialize an empty dataframe
list_home_url = 'http://www.imdb.com/search/title?groups=top_1000&sort=user_rating&start=@@@&view=simple'
for i in range(1,1000,100):
    pagesoup = getSoup(re.sub('@@@',str(i),list_home_url))
    rows = pagesoup.findAll('tr',{'class':re.compile('even|odd')})
    for row in rows:
        try:
            movie_name = row.find('a').text
            movie_url = url_start+row.find('a')['href']
            rating = row.find('b').text
            top1000_df.loc[len(top1000_df)] = [movie_name,rating,movie_url]
        except Exception as e:
            print(row.text, movie_url,e)
top1000_df.head()

In [5]:
len(top1000_df)

1000

In [6]:
top250_df = top1000_df.copy()

In [7]:
top1000_df.to_clipboard()

In [13]:
top250_df_copy = top250_df
len(top250_df_copy)

1000

In [15]:
try:
    oscar_nominations_list = []
    oscar_wins_list = []
    gg_nominations_list = []
    gg_wins_list = []
    sag_nominations_list = []
    sag_wins_list = []
    bafta_nominations_list = []
    bafta_wins_list = []
    cc_wins_list = []
    cc_nominations_list = []
    total_nominations_list = []
    total_wins_list = []

    def getCount(td):
        return td.get('rowspan') if (td.get('rowspan')) else 0

    def toNumber(numberlist):
        return numberlist[0] if(numberlist) else 0

    for index, row in top250_df_copy.iterrows():
        temp_url = row['IMDB_AWARDS_URL']
        if(temp_url != ' '): 
            temp_soup = getSoup(temp_url)
            allstats = temp_soup.find('div',{'class':'desc'})
            total_wins = toNumber(re.findall(r'([0-9]*) wins',allstats.text))
            total_nominations = toNumber(re.findall(r'([0-9]*) nominations',allstats.text))
            td_award = temp_soup.findAll('td', {'class': 'title_award_outcome'})
            oscar_win=0
            oscar_nom=0
            BAFTA_win=0
            BAFTA_nom=0
            gg_win=0
            gg_nom=0
            sag_win=0
            sag_nom=0
            cc_win=0
            cc_nom =0
            for td in td_award:
                if 'Won\nOscar' in td.text:
                    oscar_win = getCount(td)
                elif 'Nominated\nOscar' in td.text:
                    oscar_nom = getCount(td)
                elif 'Won\nBAFTA' in td.text:
                    BAFTA_win = getCount(td)
                elif 'Nominated\nBAFTA' in td.text:
                    BAFTA_nom = getCount(td)       
                elif 'Won\nGolden Globe' in td.text:
                    gg_win = getCount(td)        
                elif 'Nominated\nGolden Globe' in td.text:
                    gg_nom = getCount(td)        
                elif 'Won\nActor' in td.text:
                    sag_win = getCount(td)        
                elif 'Nominated\nActor' in td.text:
                    sag_nom = getCount(td)        
                elif 'Won\nCritics Choice' in td.text:
                    cc_win = getCount(td)    
                elif 'Nominated\nCritics Choice' in td.text:
                    cc_nom = getCount(td)
            oscar_wins_list.append(oscar_win)
            oscar_nominations_list.append(oscar_nom)
            cc_wins_list.append(cc_win )
            cc_nominations_list.append(cc_nom )
            bafta_wins_list.append(BAFTA_win)
            bafta_nominations_list.append(BAFTA_nom)
            gg_wins_list.append(gg_win)
            gg_nominations_list.append(gg_nom)
            sag_wins_list.append(sag_win)
            sag_nominations_list.append(sag_nom)
            total_nominations_list.append(total_nominations)
            total_wins_list.append(total_wins)
            if(index%25==0):
                print("Fetched till ",index)

    movies_withoutAwards = top250_df_copy.loc[top250_df_copy.IMDB_AWARDS_URL == ' ']            
    top250_df_copy = top250_df_copy.loc[top250_df_copy.IMDB_AWARDS_URL != ' ']# Remove the movies that did not have awards page
    top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='TOTAL_NOM',value = total_nominations_list)
    top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='TOTAL_WINS',value = total_wins_list)   
    top250_df_copy.insert(loc = len(top250_df_copy.columns), column = 'OSCAR_NOM',value = oscar_nominations_list)
    top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='OSCAR_WIN',value =oscar_wins_list)
    top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='GG_NOM',value = gg_nominations_list)
    top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='GG_WINS',value = gg_wins_list)
    top250_df_copy.insert(loc = len(top250_df_copy.columns), column = 'BAFTA_NOM',value = bafta_nominations_list)
    top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='BAFTA_WIN',value =bafta_wins_list)
    top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='SAG_NOM',value = sag_nominations_list)
    top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='SAG_WINS',value = sag_wins_list)
    top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='CC_NOM',value = cc_nominations_list)
    top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='CC_WINS',value = cc_wins_list)

    top250_df_copy.to_csv('/Users/gautamborgohain/Desktop/step2_awards.csv')

except Exception as e:
    print("Failed getting awards", e)
    
try:    
    metaSearch_Start_url = 'http://www.metacritic.com/search/movie/'
    metaSearch_End_url = "/results"
    meta_home = "http://www.metacritic.com/"
    critic_score_list = []
    user_score_list = []
    movie_url_list = []
    for index, row in top250_df_copy.iterrows():
        critic_score = 0
        user_score = 0
        movie_name = row['Title']
        movie_name_encoded = quote(movie_name.encode('utf8'))
        metaSearch_url = metaSearch_Start_url+movie_name_encoded+metaSearch_End_url
        results_soup = getSoup(metaSearch_url)
        time.sleep(1)
        firstResult = results_soup.find(text = movie_name) # Search the results for the exact movie name
        if(firstResult):
            movie_home_url_part = firstResult.parent['href']
            movie_home_url = meta_home+movie_home_url_part
            movie_soup = getSoup(movie_home_url) # Go to the movie's metacritic homepage
            critic_shell = movie_soup.findAll('a',{'class':'metascore_anchor'}) # This gives a list of all the scores, we need the first two
            if(critic_shell):
                critic_score = cleanText(critic_shell[0].text)
                user_score = cleanText(critic_shell[1].text)
        movie_url_list.append(metaSearch_url)
        critic_score_list.append(critic_score)
        user_score_list.append(user_score)
        time.sleep(1)
        if(index%25==0):print("Fetched till ",index)

    top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='MC_MOVIE_URL',value = movie_url_list)
    top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='MC_CRITIC_RATING',value = critic_score_list)
    top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='MC_USER_RATING',value = user_score_list)

    top250_df_copy.to_csv('/Users/gautamborgohain/Desktop/step3_meta.csv')

except Exception as e:
    print("Failed getting Metacritic", e)

try:
    rotten_home = "http://www.rottentomatoes.com"
    rotten_search_start = "http://www.rottentomatoes.com/search/?search="
    critic_score_list = []
    user_score_list = []
    movie_url_list = []

    def getMovieURL(movie_name,results_soup):
        ul = results_soup.find('ul',{'id': 'movie_results_ul'})
        if(ul):
            lis = ul.findAll('li',{'class':'media bottom_divider clearfix'})
            for li in lis:
                if movie_name.lower() in li.text.lower().encode('ascii','ignore').decode('utf8'):
                    return li.find('a')['href']
        return False

    for index, row in top250_df_copy.iterrows():
        critic_score = 0
        user_score = 0
        movie_name = row['Title'].encode('ascii','ignore').decode('utf8')
        movie_name_encoded = re.sub(' ','+',movie_name)
        rtSearch_url = rotten_search_start+movie_name_encoded
        results_soup = getSoup(rtSearch_url)
        time.sleep(1)
        if(results_soup):
            resultsdiv = results_soup.findAll('div',{'id':'scoreStats'})
            if(resultsdiv): # If it went to the Home Page directly
    #             print("Hom page", movie_name)
                text = resultsdiv[0].text
                critic_rating = toNumber(re.findall(r'Average Rating:  ([^/]+)',text))
                resultsdiv = results_soup.findAll('div',{'class':'audience-info hidden-xs superPageFontColor'})
                text = resultsdiv[0].text
                user_rating = toNumber(re.findall(r'Average Rating: ([^/]+)',text))
                critic_score_list.append(critic_rating)
                user_score_list.append(user_rating)
                movie_url_list.append(rtSearch_url)
            else:
    #             print("Went to search page", movie_name,rtSearch_url)
                home_url = getMovieURL(movie_name,results_soup)
                if(home_url):
                    complete_home_url = rotten_home+home_url
                    results_list= getSoup(complete_home_url)
                    resultsdiv = results_list.findAll('div',{'id':'scoreStats'})
                    if(len(resultsdiv)>0):
    #                     print("Now to Home page", movie_name)
                        text = resultsdiv[0].text
                        critic_rating = toNumber(re.findall(r'Average Rating:  ([^/]+)',text))
                        resultsdiv = results_list.findAll('div',{'class':'audience-info hidden-xs superPageFontColor'})
                        text = resultsdiv[0].text
                        user_rating = toNumber(re.findall(r'Average Rating: ([^/]+)',text))
                        critic_score_list.append(critic_rating)
                        user_score_list.append(user_rating)
                        movie_url_list.append(complete_home_url)
                    else:
                        print("Didnt find ratings info", resultsdiv,complete_home_url)
                        critic_score_list.append(0)
                        user_score_list.append(0)
                        movie_url_list.append(complete_home_url)
                else:
                    print("Movie not in the result list", home_url,rtSearch_url)
                    critic_score_list.append(0)
                    user_score_list.append(0)
                    movie_url_list.append(rtSearch_url)

        time.sleep(1)
        if(index%25==0):print("Fetched till ",index)

    top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='RT_MOVIE_URL',value = movie_url_list)
    top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='RT_CRITIC_RATING',value = critic_score_list)
    top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='RT_USER_RATING',value = user_score_list)

    top250_df_copy.to_csv('/Users/gautamborgohain/Desktop/step4_rt.csv')
    
except Exception as e:
    print("Failed getting rottentomatoes", e)

Failed getting awards 'IMDB_AWARDS_URL'
Fetched till  0
Fetched till  25
Fetched till  50
Fetched till  75
Fetched till  100
Fetched till  125
Fetched till  150
Fetched till  175
Fetched till  200
Fetched till  225
Fetched till  250
Fetched till  275
Fetched till  300
Fetched till  325
Fetched till  350
Fetched till  375
Fetched till  400
Fetched till  425
Fetched till  450
Fetched till  475
Fetched till  500
Fetched till  525
Fetched till  550
Fetched till  575
Fetched till  600
Fetched till  625
Fetched till  650
There was an error retrieving http://www.metacritic.com/search/movie/Frost/Nixon/results HTTP Error 404: Not Found
Failed getting Metacritic local variable 'soup' referenced before assignment
Fetched till  0
Didnt find ratings info [] http://www.rottentomatoes.com/m/seven_samurai/
Didnt find ratings info [] http://www.rottentomatoes.com/m/city_of_god_2011/
Didnt find ratings info [] http://www.rottentomatoes.com/m/life_is_beautiful_2012/
Fetched till  25
Movie not in the res

In [22]:

top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='MC_MOVIE_URL',value = movie_url_list)
top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='MC_CRITIC_RATING',value = critic_score_list)
top250_df_copy.insert(loc = len(top250_df_copy.columns),column ='MC_USER_RATING',value = user_score_list)


In [24]:
top250_df_copy.to_csv('/Users/gautamborgohain/Desktop/step3_meta.csv')