In [75]:
import pandas as pd
import numpy as np
import datetime

In [111]:
class IMDBData:
    """Load, clean, persist and summarize the movies metadata CSV.

    Attributes:
        location: Path of the raw CSV read by ``get_raw``.
        location_clean: Path the cleaned CSV is written to by ``save_clean``
            and read from by ``get_clean``.
        keep_vars: Column names retained by ``clean_raw``.
        current_time: Timestamp captured when the instance was created.
    """

    def __init__(self,
                 location="../data/movies_metadata.csv",
                 location_clean="../data/movies_metadata_clean.csv"):
        """Store the file locations and the list of columns to keep.

        Args:
            location: Path of the raw CSV file.
            location_clean: Path of the cleaned CSV file.
        """
        self.location = location
        self.location_clean = location_clean
        self.keep_vars = ['title', 'release_date',
                          'budget', 'revenue',
                          'runtime', 'genres',
                          'vote_count', 'vote_average'
                         ]
        self.current_time = datetime.datetime.now()

    @staticmethod
    def _to_float(value):
        """Convert ``value`` to float, returning NaN when it cannot be converted."""
        try:
            return float(value)
        except (TypeError, ValueError, OverflowError):
            # Only conversion failures are mapped to NaN; anything else
            # (e.g. KeyboardInterrupt) is allowed to propagate.
            return np.nan

    def get_raw(self):
        """Read the raw CSV at ``self.location`` and return it as a DataFrame."""
        return pd.read_csv(self.location)

    def clean_raw(self):
        """Return a cleaned DataFrame built from the raw CSV.

        Steps:
            * keep only the columns listed in ``self.keep_vars``;
            * convert ``budget`` to float (unconvertible values become NaN);
            * parse ``release_date`` to datetime (unparseable values become NaT);
            * add a ``year`` column holding the release year as a string, or
              NaN when the release date is missing.

        Returns:
            pandas.DataFrame with the ``keep_vars`` columns plus ``year``.
        """
        raw = self.get_raw()
        # Explicit copy: assigning columns on a slice of ``raw`` triggers
        # SettingWithCopyWarning and is not guaranteed to behave as intended.
        narrow = raw[self.keep_vars].copy()
        narrow['budget'] = narrow['budget'].apply(self._to_float)
        narrow['release_date'] = pd.to_datetime(narrow['release_date'], errors='coerce')
        # ``x != np.nan`` is always True, so missing dates used to produce the
        # literal string 'NaT'. ``pd.isna`` detects NaT/NaN correctly.
        narrow['year'] = narrow['release_date'].apply(
            lambda date: np.nan if pd.isna(date) else str(date.year)
        )
        return narrow

    def save_clean(self):
        """Write the cleaned DataFrame to ``self.location_clean`` (no index column)."""
        self.clean_raw().to_csv(self.location_clean, index=False)
        print("INFO: saved to {0}".format(self.location_clean))
        return None

    def get_clean(self):
        """Read the cleaned CSV at ``self.location_clean`` and return it as a DataFrame."""
        return pd.read_csv(self.location_clean)

    def summarize(self, dataset):
        """Return summary statistics for the requested dataset.

        Args:
            dataset: ``'raw'`` to summarize the raw CSV, ``'clean'`` to
                summarize the cleaned CSV.

        Returns:
            dict with a single key ``'describe'`` holding ``DataFrame.describe()``.

        Raises:
            ValueError: if ``dataset`` is neither ``'raw'`` nor ``'clean'``.
        """
        if dataset == 'raw':
            df = self.get_raw()
        elif dataset == 'clean':
            df = self.get_clean()
        else:
            raise ValueError(
                "dataset must be 'raw' or 'clean', got {0!r}".format(dataset)
            )
        return {'describe': df.describe()}

In [116]:
def main():
    """Collect the raw/clean datasets and their summaries into one dictionary.

    Each entry is produced by a fresh ``IMDBData`` instance. The keys of the
    resulting dictionary are printed before it is returned.

    Returns:
        dict with keys ``'raw_data'``, ``'raw_summary'``, ``'clean_summary'``
        and ``'clean_data'``.
    """
    # Dict literals evaluate their entries top to bottom, so the loading
    # order matches the key order below.
    results = {
        'raw_data': IMDBData().get_raw(),
        'raw_summary': IMDBData().summarize(dataset='raw'),
        'clean_summary': IMDBData().summarize(dataset='clean'),
        'clean_data': IMDBData().get_clean(),
    }

    print("INFO: dictionary returned with the following datasets")
    print(results.keys())

    return results

In [118]:
# Run the pipeline; as the last expression in the cell, the returned
# dictionary is rendered as the cell output.
main()

  exec(code_obj, self.user_global_ns, self.user_ns)
  """Entry point for launching an IPython kernel.


INFO: dictionary returned with the following datasets
dict_keys(['raw_data', 'raw_summary', 'clean_summary', 'clean_data'])


{'raw_data':        adult                              belongs_to_collection    budget  \
 0      False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   
 1      False                                                NaN  65000000   
 2      False  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0   
 3      False                                                NaN  16000000   
 4      False  {'id': 96871, 'name': 'Father of the Bride Col...         0   
 5      False                                                NaN  60000000   
 6      False                                                NaN  58000000   
 7      False                                                NaN         0   
 8      False                                                NaN  35000000   
 9      False  {'id': 645, 'name': 'James Bond Collection', '...  58000000   
 10     False                                                NaN  62000000   
 11     False                                       