In [134]:
import pandas as pd
import numpy as np
from crawler import *
%reload_ext autoreload
%autoreload 2
import json
import pickle

In [2]:
# Load the raw movie metadata. The TSV has no header row, so pandas labels
# the columns with integers.
df_mov = pd.read_csv('../data/movie.metadata.tsv', sep='\t', header=None)

In [135]:
# Preview the first three rows.
# NOTE(review): the saved output of this cell already shows columns that are
# only created further down (time, year, details, ...), so it appears to have
# been executed after the rest of the notebook - confirm with Restart & Run All.
df_mov.head(n=3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,time,year,details,credits,ids,revenue,budget
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",2001-08-24,2001,"{'adult': False, 'backdrop_path': '/anSbunnEMI...","[{'adult': False, 'gender': 1, 'id': 57395, 'k...","{'id': 10016, 'imdb_id': 'tt0228333', 'wikidat...",14010832.0,28000000.0
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp...",2000-02-16,2000,"{'adult': False, 'backdrop_path': None, 'belon...",,"{'id': 784579, 'imdb_id': 'tt0245916', 'wikida...",,
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D...",1988-01-01,1988,"{'adult': False, 'backdrop_path': '/6nGFaOiHOo...","[{'adult': False, 'gender': 2, 'id': 1014342, ...","{'id': 396302, 'imdb_id': 'tt0094806', 'wikida...",,


## Crawling
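Nearly 90% of the revenue entries (column `4`) are missing in the original metadata, which is why additional information is crawled from TMDB below.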

In [3]:
# Fraction of rows whose original revenue (column 4) is missing.
float(df_mov[4].isna().sum()) / len(df_mov)

0.8972241592346558

### Preprocessing

In [5]:
# Parse the release date (column 3) into datetimes. Values that cannot be
# parsed (or are missing) become NaT and are then replaced by a sentinel date.
# The fill is chained onto the parsed Series and assigned back, rather than
# calling fillna(inplace=True) on a column selection: that pattern is
# deprecated in pandas 2.x and has no effect under Copy-on-Write.
filler = pd.to_datetime('2040-01-01')
df_mov['time'] = pd.to_datetime(df_mov[3], errors='coerce', format='mixed').fillna(filler)

# Release year as a 64-bit integer (rows without a date get the sentinel year).
df_mov['year'] = df_mov['time'].dt.year.astype('int64')

# Create new, empty columns. They are given object dtype because the crawling
# cells below store strings in them; float NaN columns would have to be
# upcast on the first string assignment, which pandas warns about.
for column in ('details', 'credits', 'ids'):
    df_mov[column] = pd.Series(np.nan, index=df_mov.index, dtype=object)

### Crawling (Multithread)

In [10]:
# Split the dataframe into (start, stop) row ranges of at most chunk_size rows.
# NOTE(review): the ranges begin at row 1000, so rows 0-999 are never
# dispatched from this cell - presumably they were crawled separately;
# confirm before reusing this cell on a fresh dataframe.
chunk_size = 6000
chunks = [
    (start, min(start + chunk_size, len(df_mov)))
    for start in range(1000, len(df_mov), chunk_size)
]

# Run process_chunk on every range in a thread pool.
# Executor.map only re-raises a worker exception when its result iterator is
# consumed, so the previous fire-and-forget call silently dropped failures.
# Each future is now checked and a failing chunk is reported, without
# aborting the remaining chunks.
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = {
        executor.submit(process_chunk, start, stop): (start, stop)
        for start, stop in chunks
    }
    for future in concurrent.futures.as_completed(futures):
        start, stop = futures[future]
        try:
            future.result()
        except Exception as error:
            print(f'chunk {start}-{stop} failed: {error}')

  df_mov.loc[i, 'details'] = str(details)
  df_mov.loc[i, 'credits'] = str(credits)
  df_mov.loc[i, 'ids'] = str(ids)


iteration: 31000iteration: 7000  details:  None  ids:  None
  details:  None  ids:  None
iteration: 43000  details:  None  ids:  None
iteration: 19000  details:  None  ids:  None
iteration: 67000  details:  None  ids:  None
iteration: 49000  details:  {'adult': False, 'backdrop_path': None, 'belongs_to_collection': None, 'budget': 0, 'genres': [{'id': 28, 'name': 'Action'}], 'homepage': '', 'id': 76969, 'imdb_id': 'tt2084939', 'original_language': 'ta', 'original_title': 'சிவப்பதிகாரம்', 'overview': "A man is working on a project with a smart professor whose daughter starts to like the man. During the legislature elections some mysterious killings take place. It's up to the guy to trace the killer.", 'popularity': 2.157, 'poster_path': '/kJgnNe912Jrn0FyrP6wEXu1gjFW.jpg', 'production_companies': [{'id': 124777, 'logo_path': None, 'name': 'Screen Play Entertainment', 'origin_country': ''}], 'production_countries': [], 'release_date': '2006-11-24', 'revenue': 0, 'runtime': 135, 'spoken_la

### Crawling (Missing)

In [41]:
# Rows whose 'credits' cell is still NaN; only the index of this shallow copy
# is used by the re-crawl loop.
missings = df_mov.loc[df_mov['credits'].isna(), :].copy(deep=False)

In [39]:
# Sequentially re-crawl every row listed in `missings`.
for i in missings.index:
    name = df_mov.loc[i, 2]
    year = df_mov.loc[i, 'year']
    try:
        details, credits, ids = get_movie(name, year)

        # Store each payload as its string representation, in column order.
        for column, payload in (('details', details), ('credits', credits), ('ids', ids)):
            df_mov.loc[i, column] = str(payload)

        # Progress message whenever the row label is a multiple of 100.
        if i % 100 == 0:
            print(f"iteration: {i}"," details: ", details, " ids: ", ids)

        # Checkpoint the whole dataframe whenever the row label is a multiple of 1000.
        if i % 1000 == 0:
            with open(f"crawling_{i}.obj", 'wb') as checkpoint:
                pickle.dump(df_mov, checkpoint)
    except Exception as exc:
        # Best effort: report the failing row and carry on with the next one.
        print(f'iteration: {i}, error: {exc}')

iteration: 0  details:  {'adult': False, 'backdrop_path': '/anSbunnEMI0TSmizqUSRACoe18l.jpg', 'belongs_to_collection': None, 'budget': 28000000, 'genres': [{'id': 28, 'name': 'Action'}, {'id': 27, 'name': 'Horror'}, {'id': 878, 'name': 'Science Fiction'}], 'homepage': 'http://www.theofficialjohncarpenter.com/ghost-of-mars/', 'id': 10016, 'imdb_id': 'tt0228333', 'original_language': 'en', 'original_title': 'Ghosts of Mars', 'overview': 'In 2176, a Martian police unit is sent to pick up a highly dangerous criminal at a remote mining post. Upon arrival, the cops find the post deserted and something far more dangerous than any criminal — the original inhabitants of Mars, hellbent on getting their planet back.', 'popularity': 17.683, 'poster_path': '/i2zztssCIbahGES1fdfWFmDXian.jpg', 'production_companies': [{'id': 51312, 'logo_path': None, 'name': 'Animationwerks', 'origin_country': ''}, {'id': 3287, 'logo_path': '/bz6GbCQQXGNE56LTW9dwgksW0Iw.png', 'name': 'Screen Gems', 'origin_country': 

## Cleanup & Extraction

In [85]:
# Fill in pd.NA instead of None (change initial function if reused)
# Run changeNA over every crawled column so that missing payloads end up as
# pd.NA instead of None (adjust the crawling function instead if it is reused).
for column in ('details', 'credits', 'ids'):
    df_mov[column] = df_mov[column].apply(changeNA)

In [117]:
# Derive the budget and revenue columns from the crawled 'details' payload.
df_mov['budget'] = df_mov['details'].apply(extractBudget)
# Revenue additionally goes through zeroToNA after extraction.
df_mov['revenue'] = df_mov['details'].apply(extractRevenue).apply(zeroToNA)

## Results

In [148]:
# Boolean mask of rows where the crawled revenue is present but the original
# revenue (column 4) is missing. Computed with vectorized Series operations
# instead of a Python-level row-wise apply over the whole frame.
new_info = df_mov['revenue'].notna() & df_mov[4].isna()

In [149]:
# Number of rows without any crawled details.
print('empty details: ', df_mov['details'].isna().sum())
# NOTE(review): this subtracts the missing count of the original revenue
# column (4) from the missing count of the crawled *budget* column - verify
# whether 'revenue' was intended here.
print('TMDB - initial info: ', df_mov['budget'].isna().sum() - df_mov[4].isna().sum())
# Number of rows flagged in new_info.
print('new info from TMDB: ', new_info.sum())


empty details:  19338
TMDB - initial info:  -1479
new info from TMDB:  3593


## Saving

In [124]:
# Pickle the dataframe to disk for later use.
with open('crawling.obj', mode='wb') as checkpoint:
    pickle.dump(df_mov, checkpoint)

In [130]:
# Reload the pickled dataframe into `tst`.
# Only unpickle files you created yourself: pickle.load can run arbitrary code.
with open('crawling.obj', mode='rb') as saved:
    tst = pickle.load(saved)

In [131]:
# Display the reloaded dataframe (the rendered output elides the middle rows).
tst

Unnamed: 0,0,1,2,3,4,5,6,7,8,time,year,details,credits,ids,revenue,budget
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",2001-08-24,2001,"{'adult': False, 'backdrop_path': '/anSbunnEMI...","[{'adult': False, 'gender': 1, 'id': 57395, 'k...","{'id': 10016, 'imdb_id': 'tt0228333', 'wikidat...",14010832,28000000
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp...",2000-02-16,2000,"{'adult': False, 'backdrop_path': None, 'belon...",,"{'id': 784579, 'imdb_id': 'tt0245916', 'wikida...",,
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D...",1988-01-01,1988,"{'adult': False, 'backdrop_path': '/6nGFaOiHOo...","[{'adult': False, 'gender': 2, 'id': 1014342, ...","{'id': 396302, 'imdb_id': 'tt0094806', 'wikida...",,
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...",1987-01-01,1987,"{'adult': False, 'backdrop_path': '/6aLSnAgqOy...","[{'adult': False, 'gender': 2, 'id': 18181, 'k...","{'id': 33592, 'imdb_id': 'tt0094320', 'wikidat...",,
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}",1983-01-01,1983,"{'adult': False, 'backdrop_path': '/n12jS98GML...","[{'adult': False, 'gender': 1, 'id': 37343, 'k...","{'id': 11192, 'imdb_id': 'tt0083949', 'wikidat...",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81736,35228177,/m/0j7hxnt,Mermaids: The Body Found,2011-03-19,,120.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/07s9rl0"": ""Drama""}",2011-03-19,2011,"{'adult': False, 'backdrop_path': '/zwXUPj4D1x...","[{'adult': False, 'gender': 2, 'id': 86320, 'k...","{'id': 117124, 'imdb_id': 'tt1816585', 'wikida...",,
81737,34980460,/m/0g4pl34,Knuckle,2011-01-21,,96.0,"{""/m/02h40lc"": ""English Language""}","{""/m/03rt9"": ""Ireland"", ""/m/07ssc"": ""United Ki...","{""/m/03bxz7"": ""Biographical film"", ""/m/07s9rl0...",2011-01-21,2011,"{'adult': False, 'backdrop_path': '/xXMLr75S7m...","[{'adult': False, 'gender': 2, 'id': 561927, '...","{'id': 71771, 'imdb_id': 'tt1606259', 'wikidat...",,
81738,9971909,/m/02pygw1,Another Nice Mess,1972-09-22,,66.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06nbt"": ""Satire"", ""/m/01z4y"": ""Comedy""}",1972-09-22,1972,"{'adult': False, 'backdrop_path': '/p5bkA8EN7E...","[{'adult': False, 'gender': 2, 'id': 146138, '...","{'id': 285337, 'imdb_id': 'tt0362411', 'wikida...",,
81739,913762,/m/03pcrp,The Super Dimension Fortress Macross II: Lover...,1992-05-21,,150.0,"{""/m/03_9r"": ""Japanese Language""}","{""/m/03_3d"": ""Japan""}","{""/m/06n90"": ""Science Fiction"", ""/m/0gw5n2f"": ...",1992-05-21,1992,,,,,
