In [37]:
import zipfile
import os

base_dir = "/kaggle/input/cian-datafest-2019/train.zip"

with zipfile.ZipFile(base_dir, 'r') as z:
    z.extractall()

In [3]:
import cv2
img = cv2.imread("./train/indoor/3.jpg")

In [26]:
height, width, channels = img.shape

In [35]:
import numpy as np
import pandas as pd

### Mean

In [141]:
def mean(image, *, channels=3):
    result = [0 for _ in range(channels)]

    for i in range(channels):
        result[i] = np.mean(image[:, :, i])
    
    return result

In [54]:
mean(img)

[96.78, 100.79594594594595, 108.32621621621621]

### Mean square deviation

In [142]:
def std(image, *, channels=3):
    res = [0 for _ in range(channels)]

    for i in range(channels):
        res[i] = np.std(image[:, :, i])

    return res

In [51]:
std(img)

[67.73071665336474, 68.09813852118964, 67.6133407934451]

### Median

In [157]:
def median(image, *, channels=3):
    res = [0 for _ in range(channels)]

    for i in range(channels):
        res[i] = np.median(image[:, :, i])

    return res

In [158]:
median(img)

[211.0, 209.0, 207.0]

### DataFrame

In [45]:
_, _, files_indoor = next(os.walk("./train/indoor"))
_, _, files_outdoor = next(os.walk("./train/outdoor"))
files_count = (len(files_indoor), len(files_outdoor))

In [46]:
files_count

(28106, 27216)

In [81]:
df = pd.DataFrame({'filename': pd.Series(dtype='str'),
                   'mean_red': pd.Series(dtype='float'),
                   'mean_green': pd.Series(dtype='float'),
                   'mean_blue': pd.Series(dtype='float'),
                   'std_red': pd.Series(dtype='float'),
                   'std_green': pd.Series(dtype='float'),
                   'std_blue': pd.Series(dtype='float'),
                   'type': pd.Series(dtype='int')})

In [143]:
def append_img_data_to_df(df: pd.DataFrame, type_: str):
    directory = os.fsencode(f"./train/{type_}/")
    d_type_ = 0 if type_ == "indoor" else 1

    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        if filename.endswith(".jpg"):
            img = cv2.imread(f"./train/{type_}/{filename}")
            mean_img = mean(img)
            std_img = std(img)
            df = df.append({'filename': filename,
                            'mean_red': mean_img[0], 'mean_green': mean_img[1], 'mean_blue': mean_img[2],
                            'std_red': std_img[0], 'std_green': std_img[1], 'std_blue': std_img[2],
                            'type': d_type_}, ignore_index=True)
        else:
            continue
    return df

In [144]:
df_indoor = append_img_data_to_df(df, "indoor")

In [145]:
df_outdoor = append_img_data_to_df(df, "outdoor")

In [146]:
df_final = pd.concat([df_indoor, df_outdoor], ignore_index=True)

In [147]:
df_final

Unnamed: 0,filename,mean_red,mean_green,mean_blue,std_red,std_green,std_blue,type
0,51904.jpg,66.741786,83.814643,109.880179,45.505440,45.752630,51.868199,0
1,13727.jpg,105.630000,114.147067,131.441867,54.403579,55.406162,50.990754,0
2,80821.jpg,125.974464,147.038571,164.409107,68.255271,65.312143,59.750361,0
3,63459.jpg,95.371867,122.251600,145.406400,46.505979,53.537810,54.377938,0
4,1230.jpg,133.168393,158.979821,179.304643,39.897351,40.628800,42.330406,0
...,...,...,...,...,...,...,...,...
55317,41449.jpg,73.577162,85.392568,98.067162,62.009720,51.436385,41.907739,1
55318,20125.jpg,72.160533,89.904400,81.934533,63.433952,59.002574,62.178016,1
55319,75459.jpg,114.858571,122.889643,123.809643,57.076221,55.114630,54.849259,1
55320,46531.jpg,77.625323,115.280000,99.340161,39.399305,39.601495,36.371833,1


In [148]:
df_final.loc[:, 'mean_red':'std_blue']

Unnamed: 0,mean_red,mean_green,mean_blue,std_red,std_green,std_blue
0,66.741786,83.814643,109.880179,45.505440,45.752630,51.868199
1,105.630000,114.147067,131.441867,54.403579,55.406162,50.990754
2,125.974464,147.038571,164.409107,68.255271,65.312143,59.750361
3,95.371867,122.251600,145.406400,46.505979,53.537810,54.377938
4,133.168393,158.979821,179.304643,39.897351,40.628800,42.330406
...,...,...,...,...,...,...
55317,73.577162,85.392568,98.067162,62.009720,51.436385,41.907739
55318,72.160533,89.904400,81.934533,63.433952,59.002574,62.178016
55319,114.858571,122.889643,123.809643,57.076221,55.114630,54.849259
55320,77.625323,115.280000,99.340161,39.399305,39.601495,36.371833


### Classification

In [152]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

In [153]:
X = df_final.loc[:, "mean_red":"std_blue"]
y = df_final.loc[:, "type"]

In [154]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [155]:
reg = LogisticRegression(random_state=42).fit(X, y)

In [156]:
cross_val_score(reg, X, y, cv=kf, scoring="accuracy")

array([0.80153638, 0.79692725, 0.79238973, 0.80133767, 0.79663774])

### With Median

In [159]:
df_median = pd.DataFrame({'filename': pd.Series(dtype='str'),
                          'mean_red': pd.Series(dtype='float'),
                          'mean_green': pd.Series(dtype='float'),
                          'mean_blue': pd.Series(dtype='float'),
                          'std_red': pd.Series(dtype='float'),
                          'std_green': pd.Series(dtype='float'),
                          'std_blue': pd.Series(dtype='float'),
                          'median_red': pd.Series(dtype='float'),
                          'median_green': pd.Series(dtype='float'),
                          'median_blue': pd.Series(dtype='float'),
                          'type': pd.Series(dtype='int')})

In [160]:
def append_img_data_to_df_median(df: pd.DataFrame, type_: str):
    directory = os.fsencode(f"./train/{type_}/")
    d_type_ = 0 if type_ == "indoor" else 1

    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        if filename.endswith(".jpg"):
            img = cv2.imread(f"./train/{type_}/{filename}")
            mean_img = mean(img)
            std_img = std(img)
            median_img = median(img)
            df = df.append({'filename': filename,
                            'mean_red': mean_img[0], 'mean_green': mean_img[1], 'mean_blue': mean_img[2],
                            'std_red': std_img[0], 'std_green': std_img[1], 'std_blue': std_img[2],
                            'median_red': median_img[0], 'median_green': median_img[1], 'median_blue': median_img[2],
                            'type': d_type_}, ignore_index=True)
        else:
            continue
    return df

In [162]:
df_median_indoor = append_img_data_to_df_median(df_median, "indoor")

In [163]:
df_median_outdoor = append_img_data_to_df_median(df_median, "outdoor")

In [165]:
df_median_final = pd.concat([df_median_indoor, df_median_outdoor], ignore_index=True)

In [166]:
X = df_median_final.loc[:, "mean_red":"median_blue"]
y = df_median_final.loc[:, "type"]

In [173]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [174]:
reg = LogisticRegression(random_state=42).fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [175]:
cross_val_score(reg, X, y, cv=kf, scoring="accuracy")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

array([0.80921826, 0.80587438, 0.80097614, 0.80477223, 0.80676067])