### Load the relevant paths and modules

In [2]:
# set the auto-reload for the cells with %aimport
%load_ext autoreload
%autoreload 1

In [3]:
import os
import sys

# get source and data directories
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
data_dir = os.path.join(os.getcwd(), os.pardir, 'data')

# set the source data directory as the current path
src_data_dir = os.path.join(src_dir, 'data')
sys.path.append(src_data_dir)

interim_data_dir = os.path.join(data_dir, 'interim')
processed_data_dir = os.path.join(data_dir, 'processed')

In [4]:
# load the web scraping modules
%aimport feedback
import feedback as fb

In [5]:
%aimport clean
import clean as cl

### Clean the existing csv data files

In [42]:
#audit movie financial csv file
cl.clean_imdb_data(data_dir)

### Scrape movie data and put into separate csv files
I segment the movies this way in case I encounter some kind of error for a particular year, I only disregard that year so as to not lose all of the data from the other years.

In [47]:
import time
import numpy as np

# get a list of movies for a particular year
years = ['2000', '2001', '2002', '2003', '2004', '2005', '2006', \
         '2007', '2008', '2009', '2010', '2011', '2012', '2013', \
         '2014', '2015', '2016']

for year in years:
    print('starting year: ' + str(year))
    movie_urls = []
    movie_data = []
    start_time = time.time()
    
    # get a list of movie urls for each desired year
    movie_urls = fb.get_movie_urls(year)
    
    # scrape the website for the movie data
    movie_data = fb.scrape(movie_urls)
    
    # add the interest score
    avg_rating = np.array([m['rating'] for m in movie_data]).mean()
    avg_num_votes = np.array([m['total_votes'] for m in movie_data]).mean()
    for movie in movie_data:
        movie['interest_score'] = ((avg_rating*avg_num_votes) + (movie['total_votes']*movie['rating']))/(avg_num_votes + movie['total_votes'])
    
    # create a csv for the given year
    fb.convert_to_csv(movie_data, interim_data_dir, str(year))
    
    end_time = time.time() - start_time
    print(end_time/60)
    print('finished year: ' + str(year))

starting year: 2000
 getting movie urls... 
 done!
 scraping data from movie website...
 done!
 creating /Users/dmoton/projects/data_projects/movie-hype/notebooks/../data/interim/2000.csv...
 done!
0.050003015995
finished year: 2000
starting year: 2001
 getting movie urls... 
 done!
 scraping data from movie website...
 done!
 creating /Users/dmoton/projects/data_projects/movie-hype/notebooks/../data/interim/2001.csv...
 done!
0.0656933347384
finished year: 2001
starting year: 2002
 getting movie urls... 
 done!
 scraping data from movie website...
 done!
 creating /Users/dmoton/projects/data_projects/movie-hype/notebooks/../data/interim/2002.csv...
 done!
0.524883568287
finished year: 2002
starting year: 2003
 getting movie urls... 
 done!
 scraping data from movie website...
 done!
 creating /Users/dmoton/projects/data_projects/movie-hype/notebooks/../data/interim/2003.csv...
 done!
1.84589506785
finished year: 2003
starting year: 2004
 getting movie urls... 
 done!
 scraping data fr

ConnectionError: ('Connection aborted.', BadStatusLine("''",))

### Merge the csv files into one csv

In [61]:
import pandas as pd 

# join the interest data into a dataframe
mi = pd.DataFrame([])

for year in years:
    year_csv = os.path.join(interim_data_dir, year + '.csv')
    year_df = pd.read_csv(year_csv, skiprows=0)
    mi = mi.append(year_df)  

In [67]:
# add the interest score
ratings = list(mi['rating'])
num_votes = list(mi['total_votes'])

avg_rating = np.array(ratings).mean()
avg_num_votes = np.array(num_votes).mean()
mi['interest_score'] = ((avg_rating*avg_num_votes) + (mi['total_votes']*mi['rating']))/(avg_num_votes + mi['total_votes'])

output = os.path.join(processed_data_dir, 'movie_interest.csv')
mi.to_csv(output) 