https://github.com/EthanRosenthal/rec-a-sketch

In [1]:
import os
import requests
import pandas as pd
from PIL import Image
from tqdm import tqdm
from joblib import Parallel, delayed

# 1. magic to print version
# 2. magic so that the notebook will reload external python modules
%load_ext watermark
%load_ext autoreload 
%autoreload 2
%watermark -a 'Ethen' -d -t -v -p requests,pandas,tqdm,joblib

Ethen 2017-04-01 20:16:27 

CPython 3.5.2
IPython 5.1.0

requests 2.11.1
pandas 0.19.2
tqdm 4.11.2
joblib 0.11


In [2]:
# load the pipe-separated likes file and discard exact duplicate rows in one pass
path = os.path.join('data', 'model_likes_anon.psv')
df = pd.read_csv(path, sep = '|', quotechar = '\\').drop_duplicates()
df.head()

Unnamed: 0,modelname,mid,uid
0,3D fanart Noel From Sora no Method,5dcebcfaedbd4e7b8a27bd1ae55f1ac3,7ac1b40648fff523d7220a5d07b04d9b
1,3D fanart Noel From Sora no Method,5dcebcfaedbd4e7b8a27bd1ae55f1ac3,2b4ad286afe3369d39f1bb7aa2528bc7
2,3D fanart Noel From Sora no Method,5dcebcfaedbd4e7b8a27bd1ae55f1ac3,1bf0993ebab175a896ac8003bed91b4b
3,3D fanart Noel From Sora no Method,5dcebcfaedbd4e7b8a27bd1ae55f1ac3,6484211de8b9a023a7d9ab1641d22e7c
4,3D fanart Noel From Sora no Method,5dcebcfaedbd4e7b8a27bd1ae55f1ac3,1109ee298494fbd192e27878432c718a


In [3]:
def get_mid_data(mid, timeout = 10):
    """
    Fetch the metadata of a single model from the Sketchfab endpoint.

    Parameters
    ----------
    mid : str
        Model id, interpolated into ``https://sketchfab.com/i/models/{mid}``.

    timeout : float or tuple, default 10
        Forwarded to ``requests.get``. Without a timeout a stalled
        connection would block the calling worker indefinitely.

    Returns
    -------
    mid_data : dict
        Dictionary with the keys 'thumbnail', 'name', 'url' and 'category'.
        All four values are None when the request fails (connection error,
        timeout) or the response status is not 200. 'category' is None when
        the model lists no categories and 'thumbnail' is None when no
        200 x 200 image is listed.
    """
    mid_data = {'thumbnail': None, 'name': None, 'url': None, 'category': None}

    try:
        response = requests.get(
            'https://sketchfab.com/i/models/{}'.format(mid), timeout = timeout)
    except requests.RequestException:
        # a transport failure is handled like a non-200 answer, so a single
        # unreachable model does not abort a whole batch of lookups
        return mid_data

    if response.status_code != 200:
        return mid_data

    payload = response.json()
    mid_data['name'] = payload['name']
    mid_data['url'] = payload['viewerUrl']

    # only the first listed category is kept
    categories = payload['categories']
    if categories:
        mid_data['category'] = categories[0]['name']

    # only the first 200 x 200 thumbnail is kept
    thumbnails = [x['url'] for x in payload['thumbnails']['images']
                  if x['width'] == 200 and x['height'] == 200]
    if thumbnails:
        mid_data['thumbnail'] = thumbnails[0]

    return mid_data


def get_all_mid_data(all_mids, verbose = True, n_jobs = -1, pre_dispatch = '2 * n_jobs'):
    """
    Fetch the metadata of many models in parallel and collect it in a DataFrame.

    Parameters
    ----------
    all_mids : iterable of str
        Model ids; each one is passed to ``get_mid_data``.

    verbose, n_jobs, pre_dispatch
        Forwarded unchanged to ``joblib.Parallel``.

    Returns
    -------
    df : DataFrame
        One row per element of ``all_mids``, in the same order, with the
        columns 'mid', 'name', 'thumbnail', 'url' and 'category'.
    """
    # materialize once: the ids are iterated to dispatch the jobs and used
    # again for the 'mid' column, so a generator would already be exhausted
    # and a Series would be index-aligned instead of assigned by position
    all_mids = list(all_mids)

    parallel = Parallel(n_jobs = n_jobs, verbose = verbose, pre_dispatch = pre_dispatch)
    output = parallel(delayed(get_mid_data)(mid) for mid in all_mids)

    # explicit columns keep the layout stable even when there are no ids at all
    df = pd.DataFrame(output, columns = ['name', 'thumbnail', 'url', 'category'])
    df.insert(0, 'mid', all_mids)
    return df

In [4]:
# look up the metadata once per distinct model id; get_all_mid_data issues
# one HTTP request per id, so this cell needs network access
all_mids = df['mid'].unique()
mid_data = get_all_mid_data(all_mids)
mid_data.head()

[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 416 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 866 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 1416 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 2066 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 2816 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 3666 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done 4616 tasks      | elapsed:   14.8s
[Parallel(n_jobs=-1)]: Done 5666 tasks      | elapsed:   18.1s
[Parallel(n_jobs=-1)]: Done 6816 tasks      | elapsed:   21.7s
[Parallel(n_jobs=-1)]: Done 8066 tasks      | elapsed:   25.5s
[Parallel(n_jobs=-1)]: Done 9416 tasks      | elapsed:   29.6s
[Parallel(n_jobs=-1)]: Done 10866 tasks      | elapsed:   34.2s
[Parallel(n_jobs=-1)]: Done 12416 tasks      | elapsed:   39.0s
[Parallel(n_jobs=-1)]: Done 14066 tasks      | elapsed:   44.0s
[Parallel(n_jobs=-1)]: Done 15816 tasks      | elapsed:

Unnamed: 0,mid,name,thumbnail,url,category
0,5dcebcfaedbd4e7b8a27bd1ae55f1ac3,3D fanart Noel From Sora no Method,,https://sketchfab.com/models/5dcebcfaedbd4e7b8...,Characters
1,0c2c222919944c1fb879764b03f424ea,Riot art contest Caitlyn,,https://sketchfab.com/models/0c2c222919944c1fb...,Characters
2,8609caf1cd8c452eb7b6d4ca4228fcd0,Firewatch Fan Art,https://dg5bepmjyhz9h.cloudfront.net/urls/8609...,https://sketchfab.com/models/8609caf1cd8c452eb...,Architecture
3,96340701c2ed4d37851c7d9109eee9c0,Scifi Girl v.01,,https://sketchfab.com/models/96340701c2ed4d378...,Characters
4,311d052a9f034ba8bce55a1a8296b6f9,Van gogh Room,https://dg5bepmjyhz9h.cloudfront.net/urls/311d...,https://sketchfab.com/models/311d052a9f034ba8b...,Architecture


In [5]:
# keep only the models that have both a thumbnail and a category,
# then renumber the rows from 0
mid_data_subset = (mid_data
                   .dropna(subset = ['thumbnail', 'category'])
                   .reset_index(drop = True))
print(mid_data_subset.shape)

(16402, 5)


In [6]:
def download_img(mid, thumbnail, save_folder = None, timeout = 10):
    """
    Download one thumbnail and store it as ``<mid>.jpg``.

    Parameters
    ----------
    mid : str
        Model id, used as the file name (without extension).

    thumbnail : str
        Url of the image to download.

    save_folder : str, default None
        Folder the file is written to. None keeps the file name relative
        to the current working directory.

    timeout : float or tuple, default 10
        Forwarded to ``requests.get`` so a stalled connection cannot
        block the worker indefinitely.

    Returns
    -------
    None
        Nothing is written when the request fails or the response status
        is not 200, so no placeholder file is left behind for that model.
    """
    img_path = mid + '.jpg'
    if save_folder is not None:
        img_path = os.path.join(save_folder, img_path)

    try:
        response = requests.get(thumbnail, timeout = timeout)
    except requests.RequestException:
        # one unreachable image must not abort the whole batch
        return

    if response.status_code != 200:
        # the body of an error response is not image data
        return

    with open(img_path, 'wb') as handler:
        handler.write(response.content)


def download_all_img(mid_data, save_folder, verbose = True, n_jobs = -1, pre_dispatch = '2*n_jobs'):
    """
    Download the thumbnail of every row of ``mid_data`` in parallel.

    Parameters
    ----------
    mid_data : DataFrame
        Must provide the columns 'mid' and 'thumbnail'.

    save_folder : str
        Folder the images are written to; created when missing.

    verbose, n_jobs, pre_dispatch
        Forwarded unchanged to ``joblib.Parallel``.
    """
    os.makedirs(save_folder, exist_ok = True)

    # hand every job an absolute target folder instead of changing the
    # working directory: the location no longer depends on the cwd of
    # whichever process runs the job, and the caller's cwd is never touched,
    # even when a job raises
    save_folder = os.path.abspath(save_folder)

    mids = mid_data['mid']
    thumbnails = mid_data['thumbnail']
    parallel = Parallel(n_jobs = n_jobs, verbose = verbose, pre_dispatch = pre_dispatch)
    parallel(delayed(download_img)(mid, thumbnail, save_folder)
             for mid, thumbnail in zip(mids, thumbnails))

In [7]:
# download the thumbnail of every remaining model into a local folder
img_dir = 'thumbnails'
download_all_img(mid_data_subset, save_folder = img_dir)

[Parallel(n_jobs=-1)]: Done  66 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 416 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 866 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 1416 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 2066 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 2816 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 3666 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 4616 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 5666 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 6816 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 8066 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 9416 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done 10866 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done 12416 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done 14066 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done 16019 out of 16402 | elapse

In [8]:
def remove_damaged_img(img_dir):
    """
    Delete every file in ``img_dir`` that PIL cannot open as an image.

    Parameters
    ----------
    img_dir : str
        Folder holding the downloaded images.

    Returns
    -------
    img_names : list of str
        Directory listing of ``img_dir`` after the damaged files were removed.
    """
    for img_name in tqdm(os.listdir(img_dir)):
        img_path = os.path.join(img_dir, img_name)
        if not os.path.isfile(img_path):
            # sub-directories are not images and cannot be deleted with os.remove
            continue

        try:
            # Image.open leaves the underlying file open; the context
            # manager closes it again right after the check
            with Image.open(img_path):
                pass
        except OSError:
            os.remove(img_path)

    # list the folder again so the result reflects the deletions
    return os.listdir(img_dir)

In [9]:
# drop the downloads that PIL cannot open and keep the names of the files that remain
img_names = remove_damaged_img(img_dir)

100%|██████████| 16402/16402 [00:17<00:00, 940.99it/s]


In [10]:
# a model id is the part of the image file name in front of the first dot
valid_mid = [img_name.partition('.')[0] for img_name in img_names]

# keep the likes of models with a usable image, swap the raw model name
# for the fetched metadata and persist the result
df_subset = (df
             .loc[df['mid'].isin(valid_mid)]
             .drop('modelname', axis = 1)
             .merge(mid_data_subset, on = 'mid', how = 'inner'))
df_subset.to_csv('model_likes.csv', index = False, quotechar = '\\')
print('dimension: ', df_subset.shape)
df_subset.head()

dimension:  (379336, 6)


Unnamed: 0,mid,uid,name,thumbnail,url,category
0,8609caf1cd8c452eb7b6d4ca4228fcd0,c3d22af246056d258c3493582fbf378f,Firewatch Fan Art,https://dg5bepmjyhz9h.cloudfront.net/urls/8609...,https://sketchfab.com/models/8609caf1cd8c452eb...,Architecture
1,8609caf1cd8c452eb7b6d4ca4228fcd0,6e795101ca8f455070b908a193500410,Firewatch Fan Art,https://dg5bepmjyhz9h.cloudfront.net/urls/8609...,https://sketchfab.com/models/8609caf1cd8c452eb...,Architecture
2,8609caf1cd8c452eb7b6d4ca4228fcd0,21f84161b0a59f1f05d298c523002fcd,Firewatch Fan Art,https://dg5bepmjyhz9h.cloudfront.net/urls/8609...,https://sketchfab.com/models/8609caf1cd8c452eb...,Architecture
3,8609caf1cd8c452eb7b6d4ca4228fcd0,21f75b7330fb937f8d905e54f903c1b4,Firewatch Fan Art,https://dg5bepmjyhz9h.cloudfront.net/urls/8609...,https://sketchfab.com/models/8609caf1cd8c452eb...,Architecture
4,8609caf1cd8c452eb7b6d4ca4228fcd0,be8276038689010a47e0c512b5247d13,Firewatch Fan Art,https://dg5bepmjyhz9h.cloudfront.net/urls/8609...,https://sketchfab.com/models/8609caf1cd8c452eb...,Architecture
