In [180]:
%matplotlib inline
import os
import numpy as np
import pandas as pd
import cStringIO, urllib
import tqdm


In [149]:
# Snagged from Probabilistic-Programming-and-Bayesian-Methods-for-Hackers Ch. 4
def intervals(u, d):
    """Summarise a like/dislike ratio with a Beta(1 + u, 1 + d) distribution.

    Parameters
    ----------
    u : number or array-like
        Count of positive votes.
    d : number or array-like
        Count of negative votes.

    Returns
    -------
    tuple
        ``(mean, margin)`` where ``mean`` is the Beta mean ``a / (a + b)``
        and ``margin`` is 1.65 times the Beta standard deviation.
    """
    alpha = 1. + u
    beta = 1. + d
    total = alpha + beta
    posterior_mean = alpha / total
    # Variance of a Beta(alpha, beta) distribution.
    variance = (alpha * beta) / (total ** 2 * (total + 1.))
    margin = 1.65 * np.sqrt(variance)
    return (posterior_mean, margin)

def pornPercentage(dataframe):
    """Compute the quality score for one record.

    ``dataframe`` is indexed by the keys 'likes', 'dislikes' and 'views'
    (it is applied row-wise with ``df.apply(..., axis=1)``).

    The score is the pessimistic rating ``mean - margin`` returned by
    ``intervals``; for records with at least 10 views it is additionally
    multiplied by ``log10(views)``.
    """
    mean, margin = intervals(dataframe['likes'], dataframe['dislikes'])
    lower_bound = mean - margin
    views = dataframe['views']
    if views < 10:
        # Below 10 views the rating is returned unscaled.
        return lower_bound
    return lower_bound * (np.log10(views))

In [239]:
def rgb_to_hsv(arr):
    """
    Convert RGB values in a numpy array to HSV values.

    Parameters
    ----------
    arr : numpy.ndarray
        Array of shape (M, N, 3) holding the R, G, B channels on the last
        axis. Any numeric dtype is accepted; it is converted to float.

    Returns
    -------
    numpy.ndarray
        Float array of shape (M, N, 3) with channels on the last axis:
        hue in [0, 1), saturation as ``(max - min) / max`` (0 where the
        maximum channel is 0), and value as the maximum channel in the
        same units as the input (it is NOT rescaled to [0, 1]).
    """
    # Work in floating point so integer input (e.g. uint8 image data)
    # cannot wrap around in the channel subtractions below.
    arr = arr.astype('float')
    # Builtin ``float`` instead of the removed ``np.float`` alias.
    out = np.zeros(arr.shape, dtype=float)
    arr_max = arr.max(-1)
    ipos = arr_max > 0
    # ``np.ptp`` (max - min per pixel); the ndarray method form is gone in
    # recent NumPy releases while the function form works everywhere.
    delta = np.ptp(arr, -1)
    s = np.zeros_like(delta)
    s[ipos] = delta[ipos] / arr_max[ipos]
    # Hue is only defined where the channels differ; elsewhere it stays 0.
    ipos = delta > 0
    # red is max
    idx = (arr[:, :, 0] == arr_max) & ipos
    out[idx, 0] = (arr[idx, 1] - arr[idx, 2]) / delta[idx]
    # green is max
    idx = (arr[:, :, 1] == arr_max) & ipos
    out[idx, 0] = 2. + (arr[idx, 2] - arr[idx, 0]) / delta[idx]
    # blue is max
    idx = (arr[:, :, 2] == arr_max) & ipos
    out[idx, 0] = 4. + (arr[idx, 0] - arr[idx, 1]) / delta[idx]
    # Map the sextant position into [0, 1); the modulo wraps negative hues.
    out[:, :, 0] = (out[:, :, 0] / 6.0) % 1.0
    out[:, :, 1] = s
    out[:, :, 2] = arr_max
    return out

In [3]:
# Names given to the 11 pipe-separated fields of each record.
header = [
    'iframe', 'thumbnail', 'samples', 'title', 'tags', 'more_tags',
    'unknown', 'length', 'views', 'likes', 'dislikes',
]
# Subset of fields that is actually loaded.
goodcols = ['thumbnail', 'samples', 'title', 'tags', 'views', 'likes', 'dislikes']
# Text fields are read as str, the counters as int.
dataTypes = {
    'title': str,
    'thumbnail': str,
    'samples': str,
    'tags': str,
    'views': int,
    'likes': int,
    'dislikes': int,
}
# error_bad_lines=False skips records with an unexpected field count
# instead of aborting the load.
df = pd.read_csv(
    'Data/pornhub.com-db.csv',
    delimiter='|',
    names=header,
    error_bad_lines=False,
    usecols=goodcols,
    dtype=dataTypes,
)

Skipping line 127805: expected 11 fields, saw 14

Skipping line 140316: expected 11 fields, saw 14
Skipping line 140329: expected 11 fields, saw 14
Skipping line 148476: expected 11 fields, saw 14
Skipping line 149319: expected 11 fields, saw 14
Skipping line 153266: expected 11 fields, saw 14
Skipping line 154063: expected 11 fields, saw 14

Skipping line 637308: expected 11 fields, saw 14

Skipping line 772093: expected 11 fields, saw 14

Skipping line 852659: expected 11 fields, saw 14

Skipping line 1008972: expected 11 fields, saw 14

Skipping line 1328323: expected 11 fields, saw 20

Skipping line 1587956: expected 11 fields, saw 14

Skipping line 1643203: expected 11 fields, saw 14
Skipping line 1658114: expected 11 fields, saw 14

Skipping line 1759694: expected 11 fields, saw 14

Skipping line 2124713: expected 11 fields, saw 14



In [150]:
# Score every record row-wise and store the result as the `quality` column.
#
# assign() appends `quality` when it is missing and overwrites it when it is
# already there, so this cell yields the same 8-column frame on a fresh
# kernel and on every re-run. The previous concat/drop-by-position/rename
# sequence dropped the freshly computed scores (position 7) on a fresh
# kernel and only worked when a stale `quality` column already existed.
percents = df.apply(pornPercentage, axis=1)
df = df.assign(quality=percents)
# Pin the final column layout explicitly.
df = df[['thumbnail', 'samples', 'title', 'tags', 'views', 'likes', 'dislikes', 'quality']]

In [105]:
# Serialize `df` to the file 'df.msg' in msgpack format.
# NOTE(review): DataFrame.to_msgpack was deprecated in pandas 0.25 and removed
# in 1.0 — confirm the pandas version this notebook runs on still provides it.
df.to_msgpack('df.msg')

In [163]:
# Order the records from highest to lowest `quality`; the following cells
# address rows by position (df.iloc), so position 0 is the top-scored record.
df = df.sort_values(by='quality', ascending=False)

In [None]:
# NOTE(review): this cell uses PIL.Image, but the import cell visible in this
# notebook does not import PIL — `import PIL.Image` must have run beforehand.
num_videos = 3600
bad_links = []
# Videos are saved only when their best sample scores below this cutoff.
score_cutoff = .3


def _fetch_image(url):
    """Download `url` and return it decoded as an RGB PIL image.

    IOError raised by the download or by the decode propagates to the caller.
    """
    response = urllib.urlopen(url)
    try:
        payload = response.read()
    finally:
        # Always release the connection, even when read() fails.
        response.close()
    # Force three 8-bit channels: greyscale, palette and RGBA images would
    # otherwise not have the (M, N, 3) shape rgb_to_hsv requires.
    return PIL.Image.open(cStringIO.StringIO(payload)).convert('RGB')


def _score_image(image):
    """Score an RGB image; higher means more saturated, more contrast and
    a mean brightness closer to mid-grey."""
    hsv_image = rgb_to_hsv(np.asarray(image))
    brightness = hsv_image[:, :, 2]
    # 123.5 is used as the mid-grey reference of the 8-bit value channel.
    saturation = 1 - np.median(hsv_image[:, :, 1])
    std_dev = 1 - np.std(brightness) / 123.5
    mean = abs(np.mean(brightness) - 123.5) / 123.5
    return 1 - (saturation + std_dev + mean) / 3


# urlretrieve does not create the target directory.
if not os.path.isdir('Images'):
    os.makedirs('Images')

# Never index past the end of the frame.
for c in tqdm.tqdm(range(0, min(num_videos, len(df)))):
    row = df.iloc[c]
    # Reset per video so nothing leaks in from the previous iteration.
    bestScore = 0
    bestUrl = None
    for url in row['samples'].split(';'):
        try:
            image = _fetch_image(url)
        except IOError:
            # Sample link unusable: fall back to the video's main thumbnail.
            try:
                url = row['thumbnail']
                image = _fetch_image(url)
            except IOError:
                bad_links.append(c)
                continue
        score = _score_image(image)
        if score > bestScore:
            bestUrl = url
            bestScore = score
    # Gate on the score of the image that is actually saved. The original
    # tested whichever sample happened to be scored last, which could even
    # be a leftover from the previous video when every link failed.
    # NOTE(review): the original `< .3` direction is kept, i.e. only videos
    # whose best sample scores below the cutoff are saved — confirm that this
    # is the intended selection.
    if bestUrl is not None and bestScore < score_cutoff:
        new_filename = 'Images/thumb' + "_" + str(c).zfill(7) + ".jpg"
        urllib.urlretrieve(bestUrl, new_filename)


|#########-| 442/467  94% [elapsed: 23:25 left: 01:19,  0.31 iters/sec]

In [None]:
# Build one record per saved thumbnail: a 4x4 YCbCr signature of the image
# and the mean of its luma (Y) channel as `distance`.
#
# NOTE(review): the original reopened the `file` buffer left over from the
# download cell on every iteration, so all rows described the same image.
# This version assumes the thumbnails saved as Images/thumb_XXXXXXX.jpg by
# the download cell are the intended input — confirm.
records = []
video_ids = []
for c in tqdm.tqdm(range(0, num_videos)):
    path = 'Images/thumb' + "_" + str(c).zfill(7) + ".jpg"
    if not os.path.exists(path):
        # Not every video index has a saved thumbnail.
        continue
    image = PIL.Image.open(path)
    image = image.resize((4, 4), PIL.Image.ANTIALIAS)
    ycbcr = image.convert('YCbCr')
    # image.size is (width, height); the array is laid out (rows, cols, 3).
    np_image = np.ndarray((image.size[1], image.size[0], 3), 'u1', ycbcr.tobytes())
    records.append({"np_image": np_image, "distance": np.mean(np_image[:, :, 0])})
    video_ids.append(c)

# Build the frame once from the collected records: DataFrame.append returns a
# new frame (the original discarded it, leaving df_dist empty) and growing a
# frame row by row is quadratic. The index is the video position `c`.
df_dist = pd.DataFrame(records, index=video_ids, columns=["np_image", "distance"])

In [None]:
# Serialize `df_dist` to the file 'df_dist.msg' in msgpack format.
# NOTE(review): DataFrame.to_msgpack was deprecated in pandas 0.25 and removed
# in 1.0 — confirm the pandas version this notebook runs on still provides it.
df_dist.to_msgpack('df_dist.msg')