In [2]:
%pylab inline --no-import-all
import seaborn as sns
import cv2
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


In [3]:
import csv
import re
import os

def read_data(data_file, is_train=True):
    data = []
    header = []
    with open(data_file) as f:
        for i, line in enumerate(f):
            if i == 0:
                header = np.array(re.split(r"\s*,\s*", line.strip()))
                continue
            line_data = re.split(r"\s*,\s*", line.strip())
            data.append(line_data)
            
    X = np.array(data)
    header = np.array(header)
    
    if is_train: # extract targets
        y = X[:, 1].astype(int)
        X = np.hstack((X[:, 0].reshape(-1, 1), X[:, 2:]))
        header = np.hstack((header[0], header[2:]))
    else:
        y = None

    X = pd.DataFrame(X, columns=header).set_index("Id")
    
    return X, y

X, y = read_data("task_2_data/train.csv")

In [856]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
import copy

class DataTransformer:
    def __init__(self, images_path):
        self._impath = images_path
        self._lang_encoder = (LabelEncoder(), OneHotEncoder())
        self._country_encoder = (LabelEncoder(), OneHotEncoder())
        self._rating_encoder = (LabelEncoder(), OneHotEncoder())
        self._bins = 8
        
    def fit_encoder(self, encoder, strings):
        classes = encoder[0].fit_transform(strings).reshape(-1, 1)
        encoder[1].fit(classes)
        
    def encode_strings(self, encoder, strings):
        classes = encoder[0].transform(strings).reshape(-1, 1)
        return encoder[1].transform(classes).toarray()
        
    def fit(self, X, y, **fit_params):
        self.fit_encoder(self._lang_encoder, X['Language']) 
        self.fit_encoder(self._country_encoder, X['Country'])
        self.fit_encoder(self._rating_encoder, X['Rating'])
        return self
     
    def transform(self, X, **transform_params):        
#         langs = pd.DataFrame(self.encode_strings(self._lang_encoder, X['Language']), index=X.index)
#         countries = pd.DataFrame(self.encode_strings(self._country_encoder, X['Country']), index=X.index)
#         ratings = pd.DataFrame(self.encode_strings(self._rating_encoder, X['Rating']), index=X.index)
        
        X = X.copy()
#         X.loc[X['Language'] == 'None', 'Language'] = None
#         X.loc[X['Language'] == '', 'Language'] = None
        
#         X.loc[X['Country'] == '', 'Country'] = None
        
#         X.loc[X['Rating'] == 'Not Rated', 'Rating'] = None
#         X.loc[X['Rating'] == 'Unrated', 'Rating'] = None
#         X.loc[X['Rating'] == '', 'Rating'] = None

#         print(langs.shape)
#         print(countries.shape)
#         print(ratings.shape)

        hists = pd.DataFrame(
            np.zeros((X.shape[0], 3 * self._bins)),
            columns=["bin%d" % i for i in range(3 * self._bins)]
        )
        hists.index = X.index

        for i in range(X.shape[0]):
            img = plt.imread(os.path.join(self._impath, X.iloc[i, -1]))
            hists.iloc[i, :] = np.array([
                cv2.calcHist([channel], [0], None, [8], [0, 256])
                for channel in cv2.split(img)
            ]).reshape((1, -1))
                                    
        new_data = pd.concat((
            X[['Duration']],
            X.loc[:, 'Action':'Western'],
  #          langs,
#              countries,
#              ratings,
            hists
        ), axis=1, join_axes=[X.index])
                
        return new_data

In [854]:
pipe = Pipeline([("transformer", DataTransformer("task_2_data/posters/")), ("ada", AdaBoostClassifier())])
pipe.fit(X, y)

(3635, 56)
(3635, 16)


Pipeline(memory=None,
     steps=[('transformer', <__main__.DataTransformer object at 0x7fec01ae10f0>), ('ada', AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None))])

In [855]:
testX, testY = read_data("task_2_data/test.csv", False)
pipe.predict(testX)
pass

ValueError: y contains new labels: ['' 'Libya' 'Panama']

In [813]:
pipe.score(X, y)

(3635, 40)
(3635, 56)
(3635, 13)


0.82338376891334253

In [4]:
import keras
from keras.models import Sequential
from keras.layers import Conv2D, Dense, Input, Activation, ZeroPadding2D, MaxPooling2D, Dropout, Flatten

Using TensorFlow backend.


In [5]:
def get_posters(X, impath = 'task_2_data/posters/'):
    images = []
    for i in range(X.shape[0]):
        img = plt.imread(os.path.join(impath, X.iloc[i, -1]))
        images.append(img)
        
    return np.array(images)


trainX = get_posters(X)
trainY = y
img_shape = trainX[0].shape
print(img_shape)

(268, 182, 3)


In [6]:
model = Sequential([
    Conv2D(32, (3, 3), padding='same', input_shape=img_shape),
    MaxPooling2D(),
    Conv2D(32, (3, 3), padding='same'),
    MaxPooling2D(),
    Dropout(0.25),

    Flatten(),
    Dense(120, activation='relu', kernel_initializer='random_uniform'),
    Dropout(0.5),
    Dense(128, activation='relu', kernel_initializer='random_uniform'),
    Dropout(0.5),
    Dense(1, activation='sigmoid', kernel_initializer='random_uniform')
])

In [7]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
trainY.reshape((-1, 1)).shape
print(trainY.dtype)

int64


In [9]:
model.fit(trainX, trainY)

Epoch 1/10
 192/3635 [>.............................] - ETA: 267s - loss: 6.6319 - acc: 0.5885

KeyboardInterrupt: 