# Data preparation : 1c, picture processing

In this notebook, we store the appartments pictures for the highly rented full AirBnB appartments, then :

- compute the brightness qnd contrast, 
- convert them into greyscale numpy array and reduce its dimensionality,
- compute the top 5 RGB colours for each picture.

Finally, all those infos are stored into a Dataframe.

In [123]:
%matplotlib inline

from os import listdir
from os.path import isfile, join

import matplotlib.pyplot as plt

from pylab import rcParams
import numpy as np
import pandas as pd
from ggplot import *

import scipy
import scipy.misc
import scipy.cluster

from sklearn.decomposition import PCA
from sklearn.preprocessing import scale


from PIL import Image, ImageFilter
import urllib

## Create a list of appart ID / picture urls

In [124]:
df_listing = pd.read_csv('../../data/listing_cleansed.csv')

In [125]:
df_listing['listings_per_host']=df_listing.groupby(["host_id"])["listing_id"].transform("count")

In [126]:
df_listing= df_listing[df_listing.availability_90>0]

In [86]:
df_listing

Unnamed: 0,listing_id,host_id,host_name,host_since,host_response_time,host_response_rate,host_is_superhost,host_listings_count,host_has_profile_pic,host_identity_verified,...,text_experiences_offered,text_neighborhood_overview,text_notes,text_transit,text_access,text_interaction,text_house_rules,text_host_about,text_language,listings_per_host
0,16682014,110413588,Julia,200.0,1.0,100.0,0,3.0,1,0,...,none,,,,,,,,de,1
1,15767215,39537907,Andre,733.0,4.0,100.0,0,1.0,1,0,...,none,,"Bitte beachten Sie, dass in Berlin Gäste zusä...",,,,,,de,1
2,115576,584750,Axel,2267.0,,,0,1.0,1,0,...,none,,,,,,non smoking area! Garden is usable for guests....,"I love my wife and three kids, my little dog (...",en,1
3,7568544,39056052,Claudia,738.0,4.0,100.0,0,1.0,1,1,...,none,Die Wohnung befindet sich über einem tollen kl...,,Sehr gute Anbindung an den öffentlichen Nahver...,Ihr dürft die komplette Wohnung einschließlic...,Während Deines Aufenthaltes in unserer Wohnung...,"Rücksichtsvoller Umgang mit allen Sachen: so, ...",Ich war früher eine Weltenbummlerin und immer ...,de,1
8,16926322,25707959,Anja,933.0,4.0,100.0,0,2.0,1,1,...,none,,,,,,,,de,2
9,10402942,53565483,Burghardt,568.0,24.0,100.0,0,1.0,1,0,...,none,,,Haltestellen der Straßenbahnlinie M1 und der B...,,,,,de,1
10,10027737,50795723,Victoria,597.0,24.0,67.0,0,1.0,1,0,...,none,In unmittelbarer Umgebung befinden sich einige...,,M1 Straßenbahn (fährt direkt bis zur Friedrich...,Ihr habt zur gesamten Wohnung Zugang,,,,de,1
13,16153479,105373097,Vanessa,242.0,1.0,100.0,0,1.0,1,0,...,none,Pankow ist ein familienfreundlicher grüner Sta...,,Vom Apartment fahren Sie in etwa 20 Minuten zu...,,,,,de,1
14,13920566,3351408,Oliver,1799.0,,,0,1.0,1,0,...,none,,,,"You are able to use the Home-Cinema System, Wi...",You can reach me via Mobile-Phone if you have ...,,Wir sind eine 7-köpfige Familie aus Berlin und...,en,1
15,17783667,121453952,Thomas,130.0,,,0,1.0,1,0,...,none,Dinner? > Trattoria Pasta Degli Angeli - (URL ...,,Tram next station 2 minutes is Heinrich-Böll-...,,,,,en,1


In [5]:
df_listing_all = pd.read_csv('../../data/insideAirBnB/listings.csv')

col_url = [col for col in df_listing_all.columns if 'url' in col]
col_reviews = [col for col in df_listing.columns if 'review' in col]

df_listing_all[df_listing_all.id==4106583][col_url].transpose()

Unnamed: 0,18254
listing_url,https://www.airbnb.com/rooms/4106583
thumbnail_url,https://a0.muscache.com/im/pictures/52427497/3...
medium_url,https://a0.muscache.com/im/pictures/52427497/3...
picture_url,https://a0.muscache.com/im/pictures/52427497/3...
xl_picture_url,https://a0.muscache.com/im/pictures/52427497/3...
host_url,https://www.airbnb.com/users/show/20998252
host_thumbnail_url,https://a0.muscache.com/im/users/20998252/prof...
host_picture_url,https://a0.muscache.com/im/users/20998252/prof...


In [6]:
df_listing_all[df_listing_all.id==4106583][col_url].xl_picture_url.values

array([ 'https://a0.muscache.com/im/pictures/52427497/3b3b7391_original.jpg?aki_policy=x_large'], dtype=object)

In [7]:
print df_listing.shape
print df_listing_all.shape

(4887, 221)
(20576, 95)


In [8]:
df_pictures_url = pd.merge(df_listing[['listing_id']], df_listing_all[['id']+col_url], left_on='listing_id',right_on='id')

In [9]:
df_pictures_url.drop('id',inplace=True,axis=1)

In [10]:
df_pictures_url.to_csv('../../data/listings_active_pictures_url.csv')

## Scrap all appartment pictures for high rented flats

In [11]:
def scrap_pictures(id,url):
   
    try:
        urllib.urlretrieve(url, "../../data/pictures/"+  str(id )+".jpg")
    except Exception as e:
        print e
        print "cannot scrap {}".format(url)


In [12]:
def check_pictures_files_is_local(list_url):
    local_files =  [f for f in listdir('../../data/pictures') if isfile(join('../../data/pictures', f))]
    local_files =[int(f.split(".")[0]) for f in local_files]
    
    server_files = [f for f in list_url if f[0] not in local_files]
    
    
    return server_files

In [13]:
list_url = df_pictures_url[['listing_id','picture_url']].sort_values(by='listing_id').values.tolist()
list_url = check_pictures_files_is_local(list_url)

In [14]:
len(list_url)

0

In [15]:
for url in list_url:
    scrap_pictures(url[0],url[1])

## Extract pictures features

In [16]:
#Pictures paramters
SIZE = (64, 64)
FORMAT = 'L'

### Compute contrast and brightness for each picture

In [17]:
def compute_contrast_and_brightness(pic_file) :
    pic_id = pic_file.split('.')[0]
    img = Image.open('../../data/pictures/'+pic_file).convert('RGB')
    img = img.resize(SIZE, Image.ANTIALIAS) 
    img.load()
    data = np.asarray(img, dtype="int32" )
    R = data[:,:,0]
    G = data[:,:,1]
    B = data[:,:,2]
    LuminanceA = (0.2126*R) + (0.7152*G) + (0.0722*B)
    
    return np.asarray((pic_id,R.std(),G.std(),B.std(),LuminanceA.mean(),LuminanceA.std()),dtype="int32")


In [18]:
def create_contrast_brightness_matrix():
    all_pics =  [f for f in listdir('../../data/pictures') if isfile(join('../../data/pictures', f))]
    
    i=0
    for pic_file in all_pics:
        
        try: 
            npdata = compute_contrast_and_brightness(pic_file)
        except Exception as e:
            print "{0} for {1}".format(e,pic_file)
            continue
               
        if i==0:
            npdata_all = npdata
            i=1
        else :
            npdata_all = np.vstack((npdata_all,npdata))
        
    return npdata_all

In [19]:
df_contrasts_brightness = pd.DataFrame(create_contrast_brightness_matrix())

cannot identify image file '../../data/pictures/15959427.jpg' for 15959427.jpg
cannot identify image file '../../data/pictures/16543634.jpg' for 16543634.jpg
image file is truncated (0 bytes not processed) for 5284687.jpg


In [20]:
df_contrasts_brightness.columns =['listing_id','R_contrast','G_contrast','B_contrast','Brightness_mean','Brightness_std']

In [21]:
df_contrasts_brightness.describe()

Unnamed: 0,listing_id,R_contrast,G_contrast,B_contrast,Brightness_mean,Brightness_std
count,4884.0,4884.0,4884.0,4884.0,4884.0,4884.0
mean,9545633.0,55.72543,58.972973,61.555078,135.855446,57.74181
std,5974547.0,11.893245,11.247503,12.002715,33.488546,11.025406
min,1971.0,14.0,14.0,13.0,25.0,14.0
25%,3863223.0,48.0,51.0,54.0,111.0,50.0
50%,10083410.0,56.0,59.0,62.0,134.0,58.0
75%,15243650.0,64.0,67.0,70.0,161.0,65.0
max,18106940.0,109.0,105.0,103.0,230.0,98.0


In [22]:
df_contrasts_brightness.to_csv('../../data/df_contrasts_brightness.csv', index=False)

## Convert all pictures to greyscale array and reduce dimensionality

In [23]:
def load_image(infilename) :
    img = Image.open( infilename).convert(FORMAT)
    img = img.resize(SIZE, Image.ANTIALIAS) 
    img.load()
    data = np.asarray(img, dtype="int32" )
    
    #save_image(data)
    data = data.reshape(1,-1)
    return data

def reconstruct_image( npdata) :
    clipped_ndata = np.asarray(np.clip(npdata,0,255), dtype="uint8")  
    img = Image.fromarray(clipped_ndata, FORMAT )
    img.show()

In [25]:
def create_matrix_greyscale_pictures():
    all_pics =  [f for f in listdir('../../data/pictures') if isfile(join('../../data/pictures', f))]
    
    i=0
    for pic_file in all_pics:
        pic_id = pic_file.split('.')[0]
        try: 
            npdata = load_image('../../data/pictures/'+pic_file)
        except Exception as e:
            print "{0} for {1}".format(e,pic_file)
            continue
        npdata = np.append([pic_id],npdata)
        
        if i==0:
            
            npdata_all = npdata
            i=1
        else :
            npdata_all = np.vstack((npdata_all,npdata))
        
    return npdata_all
        

In [26]:
npdata_all = create_matrix_greyscale_pictures()
df_pictures = pd.DataFrame(npdata_all)

cannot identify image file '../../data/pictures/15959427.jpg' for 15959427.jpg
cannot identify image file '../../data/pictures/16543634.jpg' for 16543634.jpg
image file is truncated (0 bytes not processed) for 5284687.jpg


In [27]:
df_pictures.shape

(4884, 4097)

In [28]:
cols = ["pix_"+ str(i) for i in range(0,df_pictures.shape[1]-1)]

In [29]:
df_pictures.columns = ['listing_id']+ cols

In [30]:
df_pictures.to_csv('../../data/df_pictures_listing.csv')

### Convert pictures array to PC

In [31]:
pca = PCA(n_components=50)

X_train = scale(df_pictures[cols].values)

X_transformed = pca.fit_transform(X_train)

df_pictures_PCA = pd.DataFrame(X_transformed)
df_pictures_PCA.columns = ['pictures_PC_'+str(i) for i in range(1,51)]



In [32]:
df_pictures_PCA= pd.concat([df_pictures.listing_id,df_pictures_PCA],axis=1)

In [35]:
df_pictures.head()

Unnamed: 0,listing_id,pix_0,pix_1,pix_2,pix_3,pix_4,pix_5,pix_6,pix_7,pix_8,...,pix_4086,pix_4087,pix_4088,pix_4089,pix_4090,pix_4091,pix_4092,pix_4093,pix_4094,pix_4095
0,4725997,90,72,57,43,29,30,32,34,50,...,47,43,30,37,61,86,99,103,106,108
1,16013696,34,35,36,35,37,43,31,20,36,...,47,51,54,54,62,44,90,132,39,53
2,9379127,189,188,188,187,186,185,185,183,182,...,10,10,151,128,16,40,50,53,51,24
3,14159874,100,109,121,118,127,136,134,106,114,...,2,26,33,30,32,16,19,8,28,26
4,14195566,255,255,235,188,229,255,255,255,254,...,38,30,49,57,53,40,48,49,49,52


## Extract top 5 colours in each pictures

In [36]:


NUM_CLUSTERS = 5

def extract_top_colour_in_picture(pic_file):
    
    pic_id = np.asarray([pic_file.split('.')[0]])
     
    
    im = Image.open('../../data/pictures/'+pic_file)
    im = im.resize((150, 150))      # optional, to reduce time
    ar = scipy.misc.fromimage(im)
    ar = ar.astype(float)
    shape = ar.shape
    ar = ar.reshape(scipy.product(shape[:2]), shape[2])

   
    codes, dist = scipy.cluster.vq.kmeans(ar, NUM_CLUSTERS)
    #print 'cluster centres:\n', codes

    vecs, dist = scipy.cluster.vq.vq(ar, codes)         # assign codes
    counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences

    index_max = scipy.argmax(counts)                    # find most frequent
    peak = codes[index_max]
    
    

    peak = peak.astype(int)
    colour = ''.join(chr(c) for c in peak).encode('hex')
    #print 'most frequent is %s (#%s) \n' % (peak, colour)
    
    i=0
    
    for c in range (NUM_CLUSTERS):
        c_centroid=  np.asarray(codes[c])
        c_counts =  np.asarray([counts[c]])
        c_hex = ''.join(chr(c) for c in c_centroid.astype(int)).encode('hex')
        c_hex = np.asarray([c_hex])
        
        if i==0:
            npdata = np.concatenate((pic_id, c_centroid,c_counts,c_hex))#.astype('int32')
            i=1
        else :
            npdata = np.hstack((npdata,np.concatenate((c_centroid,c_counts,c_hex))))#.astype('int32')))
        
    return npdata 

In [37]:
extract_top_colour_in_picture('81081.jpg')

array(['81081', '31.6452863962', '22.5784606205', '9.80071599045', '3322',
       '1f1609', '159.360507246', '100.082729469', '22.1467391304', '1658',
       '9f6416', '200.760361552', '181.243606702', '162.800485009', '4475',
       'c8b5a2', '114.980956599', '92.7852081488', '88.1346324181', '2267',
       '725c58', '226.319685923', '216.115442139', '208.327631333',
       '10778', 'e2d8d0'], 
      dtype='|S32')

In [38]:
def create_colour_clusters_matrix():
    all_pics =  [f for f in listdir('../../data/pictures') if isfile(join('../../data/pictures', f))]
    
    i=0
    for pic_file in all_pics:
        
        try: 
            npdata = extract_top_colour_in_picture(pic_file)
        except Exception as e:
            print "{0} for {1}".format(e,pic_file)
            continue
        
        if i==0:
            
            npdata_all = npdata
            i=1
        else :
            npdata_all = np.vstack((npdata_all,npdata))
        
    return npdata_all

In [39]:
df_colors_cluster = pd.DataFrame(create_colour_clusters_matrix())

cannot identify image file '../../data/pictures/15959427.jpg' for 15959427.jpg
cannot identify image file '../../data/pictures/16543634.jpg' for 16543634.jpg
index 4 is out of bounds for axis 0 with size 4 for 16655645.jpg
index 4 is out of bounds for axis 0 with size 4 for 14854731.jpg
image file is truncated (0 bytes not processed) for 5284687.jpg


In [40]:
df_colors_cluster.shape

(4882, 26)

In [41]:
df_colors_cluster.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,4725997,141.703960015,144.572472126,138.253748558,5279,8d908a,214.950200089,240.074255225,248.261671854,4510,...,47.3173852573,43.6328233658,34.3418636996,3573,2f2b22,108.118238994,102.139622642,88.213836478,3891,6c6658
1,16013696,246.606832098,245.868514341,249.586206897,3103,f6f5f9,95.1568676822,82.3886440891,78.5413571698,7986,...,53.4909971251,47.0118020881,46.4018762294,6530,352f2e,198.73357016,197.71107164,200.609236234,1689,c6c5c8
2,9379127,202.747633216,194.681769002,185.733432513,7410,cac2b9,69.00514259,56.8728377747,35.0028050491,2118,...,243.102139582,240.491085074,236.175751401,3948,f3f0ec,168.038702929,156.757147838,145.44735007,5703,a89c91
3,14159874,28.3095276479,27.9174592939,19.6803159761,6244,1c1b13,134.807058367,133.694589878,101.550901687,5014,...,81.83408,85.3752,64.94256,6398,515540,229.466666667,222.991515152,181.739090909,3273,e5deb5
4,14195566,19.1839876828,23.6712856043,28.5681293303,2616,13171c,106.288870924,118.415386688,119.656965777,7430,...,229.982880536,236.532192036,237.121324898,2673,e5eced,71.5329465396,83.0358474927,76.409241608,4888,47534c


In [42]:
df_colors_cluster.columns = ['listing_id','Centroid_R_1','Centroid_G_1','Centroid_B_1','Centroid_Count_1','Centroid_hex_1',
                            'Centroid_R_2','Centroid_G_2','Centroid_B_2','Centroid_Count_2','Centroid_hex_2',
                          'Centroid_R_3','Centroid_G_3','Centroid_B_3','Centroid_Count_3','Centroid_hex_3',
                         'Centroid_R_4','Centroid_G_4','Centroid_B_4','Centroid_Count_4','Centroid_hex_4',
                           'Centroid_R_5','Centroid_G_5','Centroid_B_5','Centroid_Count_5','Centroid_hex_5']

In [43]:
cols_int = [col for col in df_colors_cluster.columns if 'hex' not in col]

In [44]:
df_colors_cluster[cols_int] = df_colors_cluster[cols_int].apply(pd.to_numeric)

In [45]:
df_colors_cluster.to_csv('../../data/df_colours_clusters.csv',index=False)

# Combine all (brightness/contrast, PCA of picture, colors clusters)

## Select only new column 

In [46]:
df_contrasts_brightness= df_contrasts_brightness[['listing_id',
 'R_contrast',
 'G_contrast',
 'B_contrast',
 'Brightness_mean',
 'Brightness_std']]

In [112]:
df_pictures_PCA.listing_id = pd.to_numeric(df_pictures_PCA.listing_id)
df_contrasts_brightness.listing_id = pd.to_numeric(df_contrasts_brightness.listing_id)
df_colors_cluster.listing_id = pd.to_numeric(df_colors_cluster.listing_id)

In [128]:
df_listing = pd.merge(df_listing[['listing_id']],df_pictures_PCA, on='listing_id')

In [129]:
df_listing.head()

Unnamed: 0,listing_id,pictures_PC_1,pictures_PC_2,pictures_PC_3,pictures_PC_4,pictures_PC_5,pictures_PC_6,pictures_PC_7,pictures_PC_8,pictures_PC_9,...,pictures_PC_41,pictures_PC_42,pictures_PC_43,pictures_PC_44,pictures_PC_45,pictures_PC_46,pictures_PC_47,pictures_PC_48,pictures_PC_49,pictures_PC_50
0,16682014,44.891212,12.452659,6.97422,-5.184285,-6.731535,7.200861,0.510843,6.50311,-3.68169,...,0.682334,-4.030088,3.763776,-1.164527,0.981281,2.797508,-1.978749,1.575972,3.760916,2.553579
1,15767215,-13.490105,-12.614338,-2.372806,1.94937,11.773108,-16.377617,3.137355,-4.526477,3.566227,...,-0.869935,0.591841,1.417125,2.625522,4.016214,-0.275299,-1.638685,-8.802399,-1.819419,3.519856
2,115576,70.972239,5.991331,4.960066,-6.315522,24.580115,-0.503351,-2.934977,4.063151,5.987481,...,3.898138,0.664644,1.542807,-1.076519,-3.881148,2.896477,2.806097,-0.469314,-1.662616,1.910708
3,7568544,18.405457,9.907819,32.082884,6.662087,-6.1883,-2.909725,-7.448552,6.885595,6.643106,...,-2.307442,11.075362,1.057746,3.687718,6.816297,-6.136876,4.444175,-0.225182,-4.88658,6.408059
4,16926322,45.456981,-9.00968,1.653136,-1.455836,-21.874762,1.806398,10.596269,2.150038,0.668536,...,0.196205,-0.733661,-6.514989,0.235094,1.769292,-0.171437,1.263812,1.044665,-1.200203,-0.555247


In [130]:
df_listing = pd.merge(df_listing,df_colors_cluster, on='listing_id')

In [131]:
df_listing.head()

Unnamed: 0,listing_id,pictures_PC_1,pictures_PC_2,pictures_PC_3,pictures_PC_4,pictures_PC_5,pictures_PC_6,pictures_PC_7,pictures_PC_8,pictures_PC_9,...,Centroid_R_4,Centroid_G_4,Centroid_B_4,Centroid_Count_4,Centroid_hex_4,Centroid_R_5,Centroid_G_5,Centroid_B_5,Centroid_Count_5,Centroid_hex_5
0,16682014,44.891212,12.452659,6.97422,-5.184285,-6.731535,7.200861,0.510843,6.50311,-3.68169,...,177.469172,132.889456,92.004013,2768,b1845c,148.382132,101.487544,55.720868,7556,946537
1,15767215,-13.490105,-12.614338,-2.372806,1.94937,11.773108,-16.377617,3.137355,-4.526477,3.566227,...,33.548015,27.437394,22.637258,2949,211b16,174.857235,154.274522,117.711881,3093,ae9a75
2,115576,70.972239,5.991331,4.960066,-6.315522,24.580115,-0.503351,-2.934977,4.063151,5.987481,...,151.552294,139.262752,126.100917,2658,978b7e,42.008374,38.863547,35.771921,3886,2a2623
3,7568544,18.405457,9.907819,32.082884,6.662087,-6.1883,-2.909725,-7.448552,6.885595,6.643106,...,101.47619,86.259857,63.597798,3864,65563f,213.969355,232.059677,233.895968,1246,d5e8e9
4,16926322,45.456981,-9.00968,1.653136,-1.455836,-21.874762,1.806398,10.596269,2.150038,0.668536,...,82.833176,48.248068,30.432422,5374,52301e,130.570991,80.278447,50.572012,4887,825032


In [132]:
df_listing = pd.merge(df_listing,df_contrasts_brightness, 
                      on='listing_id')

In [133]:
df_listing.head()

Unnamed: 0,listing_id,pictures_PC_1,pictures_PC_2,pictures_PC_3,pictures_PC_4,pictures_PC_5,pictures_PC_6,pictures_PC_7,pictures_PC_8,pictures_PC_9,...,Centroid_R_5,Centroid_G_5,Centroid_B_5,Centroid_Count_5,Centroid_hex_5,R_contrast,G_contrast,B_contrast,Brightness_mean,Brightness_std
0,16682014,44.891212,12.452659,6.97422,-5.184285,-6.731535,7.200861,0.510843,6.50311,-3.68169,...,148.382132,101.487544,55.720868,7556,946537,37,36,33,91,36
1,15767215,-13.490105,-12.614338,-2.372806,1.94937,11.773108,-16.377617,3.137355,-4.526477,3.566227,...,174.857235,154.274522,117.711881,3093,ae9a75,58,59,62,148,59
2,115576,70.972239,5.991331,4.960066,-6.315522,24.580115,-0.503351,-2.934977,4.063151,5.987481,...,42.008374,38.863547,35.771921,3886,2a2623,64,63,61,69,63
3,7568544,18.405457,9.907819,32.082884,6.662087,-6.1883,-2.909725,-7.448552,6.885595,6.643106,...,213.969355,232.059677,233.895968,1246,d5e8e9,68,65,54,118,64
4,16926322,45.456981,-9.00968,1.653136,-1.455836,-21.874762,1.806398,10.596269,2.150038,0.668536,...,130.570991,80.278447,50.572012,4887,825032,51,51,55,88,50


In [134]:
df_listing.to_csv('../../data/df_listing_active_with_pic.csv',index=False)

In [135]:
df_listing.shape

(4882, 81)