# Import & Read

### import

In [2]:
from calendar import month_abbr

import pandas as pd
pd.set_option("display.max_columns", None)
import numpy as np

from sklearn.preprocessing import LabelEncoder, MinMaxScaler

import tensorflow as tf
from tensorflow.keras.layers import Dense, Flatten, Embedding, Input, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

gpus = tf.config.list_physical_devices(device_type="GPU")
tf.config.experimental.set_memory_growth(gpus[0], True)

### func

In [85]:
def drop_useless_rows(df):
    df = df.dropna(subset=["battery_date", "Standard Sensor size"])
    df = df[~df["phone"].isin(["Black Shark 5", "Vivo X80 Pro"])]
    return df

def get_month_no(df):
    df[["launch_month", "launch_year"]] = df["battery_date"].str.split(" ", expand=True)
    df["launch_month"] = df["launch_month"]\
                                    .str[:3].apply(lambda x: np.where(np.array(month_abbr) == x)[0][0])\
                                    .apply(lambda x: str(x).zfill(2))

    df["month_no"] = df[["launch_year", "launch_month"]].apply(lambda x: f"{x['launch_year']}{x['launch_month']}", axis=1)
    df = df.drop(["battery_date", "launch_year", "launch_month"], axis=1)
    return df

def deal_resistence(df):
    df["Resistance certificates (dust, water)"] = df["Resistance certificates (dust, water)"].fillna("")
    df["temp"] = df["Resistance certificates (dust, water)"].str.split(",")

    df["Mil Spec"] = df["temp"].apply(lambda x: x[1] if len(x) == 2 else np.nan)
    df["IP Grade"] = df["temp"].apply(lambda x: x[0])
    return df

def deal_battery_Operating_System(df):
    def func(x):
        if "(" in x:
            x = x.split("(")[1]
        
        x = " ".join(x.split(" ")[:2])
        x = x.replace(")", "")
        return x
    
    df["battery_Operating System"] = df["battery_Operating System"].str.split("\n").str[-1].apply(lambda x: func(x))
    return df

def deal_columns(df):
    df = df.rename(columns={"battery_Google Services (official)":"Google Services(official)",
                            "battery_Operating System":"Operating System",
                            "battery_brand":"Brand",
                            "battery_Capacity":"Battery Capacity",
                            "battery_Type":"Battery Type",
                            "battery_Fast charge":"Battery Fast charge"})
    for col in df.columns:
        df[col] = df[col].str.strip()

    df = df.drop(["Resistance certificates (dust, water)", "temp", "Colors", "Unnamed: 66", "Others", "battery_Others", "battery_Extra", "Score"], axis=1)
    return df

### basic tidy

In [86]:
df_general_spec = pd.read_csv("C:/Users/user/Desktop/park/회사 업무/PhoneData/crawler/result.csv").drop("Unnamed: 0", axis=1)
df_battery_spec = pd.read_csv("C:/Users/user/Desktop/park/회사 업무/PhoneData/crawler/battery.csv").drop("Unnamed: 0", axis=1)
df_battery_spec.columns = ["battery_"+col for col in df_battery_spec.columns]

# concat data
df_spec_raw = pd.concat([df_general_spec, df_battery_spec], axis=1)
print(df_general_spec.shape)
print(df_battery_spec.shape)
print(df_spec_raw.shape)

# basic tidy data
df_spec_raw = drop_useless_rows(df_spec_raw)
df_spec_raw = get_month_no(df_spec_raw)
df_spec_raw = deal_resistence(df_spec_raw)
df_spec_raw = deal_battery_Operating_System(df_spec_raw)
df_spec_raw = deal_columns(df_spec_raw)

df_spec_raw.head(2)

(1484, 97)
(1484, 11)
(1484, 108)


Unnamed: 0,phone,Size,Weight,Usable surface,Materials,Diagonal,Type,Aspect Ratio,Resolution,Density,Audio,Model,CPU,Nanometer,Frequency,64 Bits,GPU,RAM,Capacity,SD Slot,Fingerprint,Proximity,Light sensor,Accelerometer,Compass,Gyroscope,RGB,Notifications LED,Cooling system,Gaming buttons,Extra,Standard Resolution,Standard Sensor,Standard Type,Standard Aperture,Standard ISO,Standard Pixel size,Standard Pixel Binning,Standard Sensor size,Wide Angle lens Resolution,Wide Angle lens Sensor,Wide Angle lens Type,Wide Angle lens Aperture,Wide Angle lens Pixel size,Wide Angle lens Pixel Binning,Wide Angle lens Sensor size,Portrait mode (depth) Resolution,Portrait mode (depth) Sensor,Portrait mode (depth) Type,Portrait mode (depth) Aperture,Portrait mode (depth) Pixel size,Portrait mode (depth) Pixel Binning,Portrait mode (depth) Sensor size,Hall,Macro lens Aperture,Macro lens Pixel size,Macro lens Resolution,Macro lens Sensor,Macro lens Sensor size,Macro lens Type,Ultrasonic proximity virtual,Barometer,Geomagnetic,Telephoto + portrait Aperture,Telephoto + portrait Pixel size,Telephoto + portrait Resolution,Telephoto + portrait Sensor,Telephoto + portrait Sensor size,Telephoto + portrait Type,Cooling System,Heart Rate,Gravity,Magnetometer,LiDAR Scanner,Red Core 1 Game chip,Display Pressure,Touch,BLE S Pen,Hardware,Utrasonic Fingerprint,Pedometer,Telephoto + portrait Pixel Binning,Altimeter,Temperature,360° ambient light sensor,Iris scanner,Thermometer,Coulometer,Secondary,UV light sensor,Humidity,Hall IC,Brand,Battery Capacity,Battery Type,Battery Fast charge,Operating System,Google Services(official),battery_Widevine L1,battery_Widevine L3,month_no,Mil Spec,IP Grade
0,Vivo iQOO 9 Pro,75.2 mm • 164.8 mm • 9.2 mm,204 g,89 %,Aluminium alloy,"6.78""",UFS Storage 3.1,20:9,1440 x 3200 px • WQHD+,518 ppi •,Stereo Speakers,Qualcomm Snapdragon 8 Gen1,1x3.0GHz Cortex X2 + 3x2.5GHz Cortex A710 + 4x...,4 nm,3 GHz,Yes,Adreno 730,12 GB,256 GB,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,--,Yes,No,Dual linear motorDual-control pressure sensing...,50 Mpx,Samsung GN5,ISOCELL,ƒ/ 1.75,--,1.00 µm,1-4 (2x2),1/1.57,50 Mpx,Samsung S5KJN1,ISOCELL,ƒ/ 2.27,0.64 µm,1-4 (2x2),1/2.76,16 Mpx,Samsung S5K3P9,ISOCELL,ƒ/ 2.23,1.00 µm,1-4 (2x2),1/3.1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Vivo,4700 mAh\nSee more details,Lithium,"Yes , 120.0W",Android 12,No,,,202201,,
1,Vivo iQOO 9 Pro,75.2 mm • 164.8 mm • 9.2 mm,204 g,89 %,Aluminium alloy,"6.78""",UFS Storage 3.1,20:9,1440 x 3200 px • WQHD+,518 ppi •,Stereo Speakers,Qualcomm Snapdragon 8 Gen1,1x3.0GHz Cortex X2 + 3x2.5GHz Cortex A710 + 4x...,4 nm,3 GHz,Yes,Adreno 730,8 GB,256 GB,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,--,Yes,No,Dual linear motorDual-control pressure sensing...,50 Mpx,Samsung GN5,ISOCELL,ƒ/ 1.75,--,1.00 µm,1-4 (2x2),1/1.57,50 Mpx,Samsung S5KJN1,ISOCELL,ƒ/ 2.27,0.64 µm,1-4 (2x2),1/2.76,16 Mpx,Samsung S5K3P9,ISOCELL,ƒ/ 2.23,1.00 µm,1-4 (2x2),1/3.1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Vivo,4700 mAh\nSee more details,Lithium,"Yes , 120.0W",Android 12,No,,,202201,,


# Data Cleanse

### cleanse

In [87]:
def drop_old_phones(df, start_month=None, end_month=None):
    if not start_month: start_month = df["month_no"].min()
    if not end_month: end_month = df["month_no"].max()

    df = df[
        (df["month_no"] >= start_month)
        &(df["month_no"] <= end_month)
    ]
    return df

def deal_GBs(df):
    for col in ["RAM", "Capacity"]:
        df[f"{col}_"] = df[f"{col}"].apply(lambda x: float(x.replace("TB", ""))*1024 if "TB" in x else float(x.replace("GB", "")))
        df[f"{col}_max"] = df.groupby("phone")[f"{col}_"].transform(max)
        df = df[df[f"{col}_"] == df[f"{col}_max"]]
        df = df.drop([f"{col}_", f"{col}_max"], axis=1)
    return df

def deal_google_services(df):
    df = df.drop("Google Services(official)", axis=1)
    return df

def deal_operating_sys(df):
    df["temp"] = df["Operating System"].str.split(" ").str[1].str[:3].astype(float)
    df["max"] = df.groupby("phone")["temp"].transform(max)
    df = df[df["temp"] == df["max"]]
    df = df.drop(["temp", "max"], axis=1)
    return df

def deal_diagonal(df):
    df["temp"] = df["Diagonal"].str.replace('"', "").astype(float)
    df["max"] = df.groupby("phone")["temp"].transform(max)
    df = df[df["temp"] == df["max"]]
    df = df.drop(["temp", "max"], axis=1)
    return df

def deal_model(df):
    df = df[
        ~((df["phone"] == "Samsung Galaxy A71 5G") & (df["Model"] == "Qualcomm Snapdragon 765G"))
        &~((df["phone"] == "Samsung Galaxy Note10") & (df["Model"] == "Qualcomm Snapdragon 855"))
        &~((df["phone"] == "Samsung Galaxy Note20 Ultra") & (df["Model"] == "Qualcomm Snapdragon 865+"))
        &~((df["phone"] == "Samsung Galaxy S10") & (df["Model"] == "Qualcomm Snapdragon 855"))
        &~((df["phone"] == "Samsung Galaxy S10 Plus") & (df["Model"] == "Qualcomm Snapdragon 855"))
        &~((df["phone"] == "Samsung Galaxy S10e") & (df["Model"] == "Qualcomm Snapdragon 855"))
        &~((df["phone"] == "Samsung Galaxy S20 Ultra") & (df["Model"] == "Qualcomm Snapdragon 865"))
        &~((df["phone"] == "Samsung Galaxy S21 Ultra") & (df["Model"] == "Qualcomm Snapdragon 888"))
        &~((df["phone"] == "Samsung Galaxy S22") & (df["Model"] == "Qualcomm Snapdragon 8 Gen1"))
        &~((df["phone"] == "Samsung Galaxy S22 Ultra") & (df["Model"] == "Qualcomm Snapdragon 8 Gen1"))
        &~((df["phone"] == "Samsung Galaxy S22+") & (df["Model"] == "Qualcomm Snapdragon 8 Gen1"))
        &~((df["phone"] == "Samsung Galaxy S8") & (df["Model"] == "Qualcomm Snapdragon 835"))
        &~((df["phone"] == "Samsung Galaxy S9") & (df["Cooling system"] == "Yes"))
        &~((df["phone"] == "Xiaomi Redmi Note 10 Pro") & (df["Standard Sensor"] == "Samsung S5KHM2"))
        &~((df["phone"] == "Motorola Moto G9 Play") & (df["Compass"] == "Yes"))
        &~((df["phone"] == "Motorola One Vision") & (df["Resolution"] == "1080 x 2520 px • FHD+"))
        &~((df["phone"] == "Samsung Galaxy M12") & (df["Wide Angle lens Resolution"] == "5 Mpx"))
        &~((df["phone"] == "Vivo X60 Pro+") & (df["Gravity"] == "Yes"))
        &~((df["phone"] == "Xiaomi Redmi Note 9 Pro") & (df["Notifications LED"] == "LED Notifications"))
        &~((df["phone"] == "realme 6 Pro") & (df["Geomagnetic"] == "Yes"))
    ]
    return df

def deal_score(df):
    df["temp"] = df["Score"].astype(float)
    df["max"] = df.groupby("phone")["temp"].transform(max)
    df = df[df["temp"] == df["max"]]
    df = df.drop(["temp", "max"], axis=1)
    return df

def check_duplicates(df):
    duplicate_phone_li = df.groupby("phone").size()
    duplicate_phone_li = duplicate_phone_li[duplicate_phone_li > 1].index.to_list()
    print(len(duplicate_phone_li))

    valid_idx_li = []
    is_stop = False
    for phone in duplicate_phone_li:
        df = df[df["phone"] == phone]
        idx = df.index

        for col in df.columns:
            val = df[col].values
            if len(set(val)) != 1: # 중복이 있는 컬럼을 처리
                print(col)
                display(df[["phone", col]])
                is_stop = True
            
        if is_stop:
            break

In [89]:
df_spec = df_spec_raw.copy()

# cleanse
df_spec = drop_old_phones(df_spec, start_month="201701")
df_spec = deal_GBs(df_spec)
df_spec = deal_google_services(df_spec)
df_spec = deal_operating_sys(df_spec)
df_spec = deal_diagonal(df_spec)
df_spec = deal_model(df_spec)

df_spec = df_spec.drop_duplicates().reset_index(drop=True)

print(df_spec.shape)
check_duplicates(df_spec)


(424, 102)
0


### deal numeric

In [101]:
def size_func(x):
    x = x.replace("mm", "").split("•")
    res = 1
    for i in x:
        res *= float(i)
    return res

def resolution_func(x):
    x = x["Resolution"].split("•")

    # resolution1
    x[0] = x[0].replace("px", "").split("x")
    x[0] = float(x[0][0]) * float(x[0][1])

    # resolution2
    try :
        x[0] = x[1].strip()
    except IndexError:
        x.append("")
    
    return x

def capacity_func(x):
    if "TB" in x :
        x = float(x.replace("TB", ""))
        x *= 1024
    elif "GB" in x:
        x = float(x.replace("GB", ""))
    return x

def battery_fast_charge_func(x):
    x = x["Battery Fast charge"].split(",")
    x[0] = x[0].strip()

    try:
        x[1] = float(x[1].replace("W", ""))
    except IndexError:
        x.append(0)
    return x

def month_diff_func(x):
    x["month_no"] = pd.to_datetime(x["month_no"], format="%Y%m")
    x["Month Max"] = pd.to_datetime(x["Month Max"], format="%Y%m")
    x["Month Diff"] = np.round((x["Month Max"] - x["month_no"]) / np.timedelta64(1, "M"))

    return x["Month Diff"]

In [103]:
df_final = df_spec.copy()

df_final["Size"] = df_final["Size"].apply(lambda x: size_func(x)) # width X height X depth 값
df_final["Weight"] = df_final["Weight"].apply(lambda x: float(x.replace("g", "")))
df_final["Usable surface"] = df_final["Usable surface"].apply(lambda x: float(x.replace("%", "")))
df_final["Diagonal"] = df_final["Diagonal"].apply(lambda x: float(x.replace('"', "")))
df_final[["Resolution1", "Resolution2"]] = df_final.apply(lambda x: resolution_func(x), axis=1, result_type="expand")
df_final["Density"] = df_final["Density"].apply(lambda x: float(x.replace("ppi", "").replace("•", "")))
df_final["Nanometer"] = df_final["Nanometer"].apply(lambda x: float(x.replace("nm", "")))
df_final["Frequency"] = df_final["Frequency"].apply(lambda x: float(x.replace("GHz", "")))
df_final["RAM"] = df_final["RAM"].apply(lambda x: float(x.replace("GB", "")))
df_final["Capacity"] = df_final["Capacity"].apply(lambda x: capacity_func(x))
df_final["Battery Capacity"] = df_final["Battery Capacity"].apply(lambda x: float(x.split("\n")[0].replace("mAh", "")))
df_final[["Fast charge", "Charge Watt"]] = df_final.apply(lambda x: battery_fast_charge_func(x), axis=1, result_type="expand")
df_final["Month Max"] = df_final["month_no"].max()
df_final["Month Diff"] = df_final.apply(lambda x: month_diff_func(x), axis=1)

df_final = df_final.drop(["Resolution", "Battery Fast charge", "month_no", "Month Max"], axis=1)
print(df_final.shape)
df_final.head(3)

(424, 104)


Unnamed: 0,phone,Size,Weight,Usable surface,Materials,Diagonal,Type,Aspect Ratio,Density,Audio,Model,CPU,Nanometer,Frequency,64 Bits,GPU,RAM,Capacity,SD Slot,Fingerprint,Proximity,Light sensor,Accelerometer,Compass,Gyroscope,RGB,Notifications LED,Cooling system,Gaming buttons,Extra,Standard Resolution,Standard Sensor,Standard Type,Standard Aperture,Standard ISO,Standard Pixel size,Standard Pixel Binning,Standard Sensor size,Wide Angle lens Resolution,Wide Angle lens Sensor,Wide Angle lens Type,Wide Angle lens Aperture,Wide Angle lens Pixel size,Wide Angle lens Pixel Binning,Wide Angle lens Sensor size,Portrait mode (depth) Resolution,Portrait mode (depth) Sensor,Portrait mode (depth) Type,Portrait mode (depth) Aperture,Portrait mode (depth) Pixel size,Portrait mode (depth) Pixel Binning,Portrait mode (depth) Sensor size,Hall,Macro lens Aperture,Macro lens Pixel size,Macro lens Resolution,Macro lens Sensor,Macro lens Sensor size,Macro lens Type,Ultrasonic proximity virtual,Barometer,Geomagnetic,Telephoto + portrait Aperture,Telephoto + portrait Pixel size,Telephoto + portrait Resolution,Telephoto + portrait Sensor,Telephoto + portrait Sensor size,Telephoto + portrait Type,Cooling System,Heart Rate,Gravity,Magnetometer,LiDAR Scanner,Red Core 1 Game chip,Display Pressure,Touch,BLE S Pen,Hardware,Utrasonic Fingerprint,Pedometer,Telephoto + portrait Pixel Binning,Altimeter,Temperature,360° ambient light sensor,Iris scanner,Thermometer,Coulometer,Secondary,UV light sensor,Humidity,Hall IC,Brand,Battery Capacity,Battery Type,Operating System,battery_Widevine L1,battery_Widevine L3,Mil Spec,IP Grade,Resolution1,Resolution2,Fast charge,Charge Watt,Month Diff
0,Vivo iQOO 9 Pro,114015.232,204.0,89.0,Aluminium alloy,6.78,UFS Storage 3.1,20:9,518.0,Stereo Speakers,Qualcomm Snapdragon 8 Gen1,1x3.0GHz Cortex X2 + 3x2.5GHz Cortex A710 + 4x...,4.0,3.0,Yes,Adreno 730,12.0,512.0,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,--,Yes,No,Dual linear motorDual-control pressure sensing...,50 Mpx,Samsung GN5,ISOCELL,ƒ/ 1.75,--,1.00 µm,1-4 (2x2),1/1.57,50 Mpx,Samsung S5KJN1,ISOCELL,ƒ/ 2.27,0.64 µm,1-4 (2x2),1/2.76,16 Mpx,Samsung S5K3P9,ISOCELL,ƒ/ 2.23,1.00 µm,1-4 (2x2),1/3.1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Vivo,4700.0,Lithium,Android 12,,,,,WQHD+,WQHD+,Yes,120.0,7.0
1,Vivo iQOO 9,105984.06,206.0,87.0,Aluminium alloy,6.78,UFS Storage 3.1,20:9,388.0,Stereo Speakers,Qualcomm Snapdragon 8 Gen1,1x3.0GHz Cortex X2 + 3x2.5GHz Cortex A710 + 4x...,4.0,3.0,Yes,Adreno 730,12.0,512.0,No,Yes,Yes,Yes,Yes,Yes,Yes,,--,Yes,No,Dual linear motorDual-control pressure sensing...,50 Mpx,Samsung GN5,ISOCELL,ƒ/ 1.75,--,1.00 µm,1-4 (2x2),1/1.57,13 Mpx,Samsung S5K3L6,ISOCELL,ƒ/ 2.2,1.12 µm,,1/3.1,12 Mpx,Sony IMX663,CMOS,ƒ/ 1.98,1.22 µm,,1/2.93,Yes,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Vivo,4700.0,Lithium,Android 12,,,,,FHD+,FHD+,Yes,120.0,7.0
2,Xiaomi Mi 11 Pro,104182.63,208.0,91.0,Glass,6.81,UFS Storage 3.1,20:9,515.0,Hi-Res AudioStereo Speakers3 microphones,Qualcomm Snapdragon 888,1x Cortex X1 2.84GHz + 3x Cortex A78 2.42GHz +...,5.0,2.84,Yes,Adreno 660,12.0,256.0,No,Yes,Yes,Yes,Yes,Yes,Yes,,--,Yes,No,X-axis Linear motorLiquid-cooled VC three-dime...,50 Mpx,Samsung GN2,ISOCELL,ƒ/ 1.95,--,0.70 µm,1-4 (2x2),1/1.12,13 Mpx,Omnivision OV13B10,PureCel,ƒ/ 2.2,1.12 µm,,1/3.06,,,,,,,,,,,,,,,,,,,,,,,,,Yes,,,,,,,,,,,,,,,,,,,,,,Xiaomi,5000.0,Li-Ion Polymer,Android 12,Yes,,,IP68,WQHD+,WQHD+,Yes,67.0,17.0


# Model

In [109]:
df_train = df_final.drop("phone", axis=1).copy()

# cat_cols num_cols split
cat_cols = [col for col in df_train.columns if df_train[col].dtype == "O"]
num_cols = [col for col in df_train.columns if df_train[col].dtype != "O"]
print(cat_cols)
print(num_cols)

# label encode categorical columns
for col in cat_cols:
    encoder = LabelEncoder()
    df_train[col] = encoder.fit_transform(df_train[col].astype(str))

# scale numerical columns
tepm = df_train.copy()
scaler = MinMaxScaler()
df_train[num_cols] = scaler.fit_transform(df_train[num_cols])

# make data_li
cat_train_li = [df_train[col].values.reshape(-1,1) for col in cat_cols]
cat_train = df_train[cat_cols]
num_train = df_train[num_cols]

data_li = cat_train_li + [num_train]

# make label_li
data_label_li = []
for i in cat_train_li:
    data_label_li.append(to_categorical(i))
data_label_li = data_label_li + [num_train]

['Materials', 'Type', 'Aspect Ratio', 'Audio', 'Model', 'CPU', '64 Bits', 'GPU', 'SD Slot', 'Fingerprint', 'Proximity', 'Light sensor', 'Accelerometer', 'Compass', 'Gyroscope', 'RGB', 'Notifications LED', 'Cooling system', 'Gaming buttons', 'Extra', 'Standard Resolution', 'Standard Sensor', 'Standard Type', 'Standard Aperture', 'Standard ISO', 'Standard Pixel size', 'Standard Pixel Binning', 'Standard Sensor size', 'Wide Angle lens Resolution', 'Wide Angle lens Sensor', 'Wide Angle lens Type', 'Wide Angle lens Aperture', 'Wide Angle lens Pixel size', 'Wide Angle lens Pixel Binning', 'Wide Angle lens Sensor size', 'Portrait mode (depth) Resolution', 'Portrait mode (depth) Sensor', 'Portrait mode (depth) Type', 'Portrait mode (depth) Aperture', 'Portrait mode (depth) Pixel size', 'Portrait mode (depth) Pixel Binning', 'Portrait mode (depth) Sensor size', 'Hall', 'Macro lens Aperture', 'Macro lens Pixel size', 'Macro lens Resolution', 'Macro lens Sensor', 'Macro lens Sensor size', 'Macro 

In [122]:
# deal categorical variables
cat_input_li = []
embedded_li = []

for col in cat_cols :
    total_vocab = df_train[col].nunique()
    input = Input(shape=(1,))
    embedding = Embedding(input_dim=total_vocab, output_dim=1024)(input)
    flatten = Flatten()(embedding)

    cat_input_li.append(input)
    embedded_li.append(flatten)

# deal total input
num_input = Input(shape=(num_train.shape[1],), name="num_input")
total_input_li = cat_input_li + [num_input]

# concat layers
concat = tf.keras.layers.concatenate(embedded_li+[num_input])
print(concat.shape)

# encoder
dense = Dense(2**10, activation="selu")(concat)
dense = tf.keras.layers.BatchNormalization()(dense)
dense = Dense(2**9, activation="selu")(concat)
dense = tf.keras.layers.BatchNormalization()(dense)
encoder = Dense(2**8, activation="selu")(concat)
encoder = tf.keras.layers.BatchNormalization()(dense)

# decoder
dense = Dense(2**8, activation="selu")(concat)
dense = tf.keras.layers.BatchNormalization()(dense)
dense = Dense(2**9, activation="selu")(concat)
dense = tf.keras.layers.BatchNormalization()(dense)
dense = Dense(2*10, activation="selu")(concat)
dense = tf.keras.layers.BatchNormalization()(dense)

# deal cat_output_li
cat_output_li = []
for col in cat_cols:
    total_vocab = df_train[col].nunique()
    cat_output = Dense(total_vocab, activation="softmax")(dense)
    cat_output_li.append(cat_output)

# deal num_ouput
num_output = Dense(num_train.shape[1], activation="sigmoid", name="num_output")(dense)

# deal total_output
total_output_li = cat_output_li + [num_output]

# deal loss function
loss_func = ["categorical_crossentropy" for i in cat_cols] + ["binary_crossentropy"]

# model
model = Model(inputs=total_input_li, outputs=total_output_li)
model.compile(loss=loss_func, optimizer=tf.keras.optimizers.Nadam())
lr_schedule = tf.keras.callbacks.ReduceLROnPlateau(patience=5, min_delta=0.01, min_lr=1e-5, monitor="num_output_loss")
early_stopping = tf.keras.callbacks.EarlyStopping(patience=10, min_delta=0.01, monitor="num_output_loss")
model.fit(x=data_li, y=data_label_li, epochs=1000, callbacks=[lr_schedule, early_stopping], batch_size=16)

(None, 93196)
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000


<keras.callbacks.History at 0x1f0d47b39c8>

In [123]:
import io

new_model = Model(inputs=total_input_li, outputs=encoder)
pred = new_model.predict(data_li)

out_v = io.open("vectors.tsv", "w", encoding="utf-8")
out_m = io.open("metadata.tsv", "w", encoding="utf-8")

phone_name = df_spec["phone"]
for n, (vec, phone) in enumerate(zip(pred, phone_name)):
    out_v.write("\t".join([str(x) for x in vec]) + "\n")
    out_m.write(phone + "\n")

out_v.close()
out_m.close()

In [125]:
from sklearn.metrics.pairwise import cosine_similarity

res = pd.DataFrame(pred, index=phone_name)
res = pd.DataFrame(cosine_similarity(res, res), index=phone_name, columns=phone_name)

col = [i for i in res.columns if "Samsung Galaxy S22 Ultra" in i]
print(col)

res[col].sort_values(col, ascending=False)

['Samsung Galaxy S22 Ultra']


phone,Samsung Galaxy S22 Ultra
phone,Unnamed: 1_level_1
Samsung Galaxy S22 Ultra,0.999999
Samsung Galaxy S21 Ultra,0.860662
Samsung Galaxy S22+,0.837840
Samsung Galaxy S22,0.837545
Samsung Galaxy Note20 Ultra,0.830560
...,...
Xiaomi Redmi Note 10 Pro,0.521004
TCL 30,0.512490
Doogee S59 Pro,0.506948
Doogee S59,0.505341


In [128]:
pred_li = model.predict(data_li)
pred_cols = cat_cols.copy()
pred_cols.append(num_cols)

res_li = []
for n, (col, data) in enumerate(zip(pred_cols, pred_li)):
    if n < len(pred_cols) - 1:
        data = [i.argmax() for i in data]
        res = pd.Series(data, name=col)
    else:
        res = pd.DataFrame(data, columns=col)
    res_li.append(res)

df_res = pd.concat(res_li, axis=1)[df_final.drop("phone", axis=1).columns]
df_res[num_cols] = scaler.inverse_transform(df_res[num_cols])

df_res.iloc[[65, 30, 34]]

Unnamed: 0,Size,Weight,Usable surface,Materials,Diagonal,Type,Aspect Ratio,Density,Audio,Model,CPU,Nanometer,Frequency,64 Bits,GPU,RAM,Capacity,SD Slot,Fingerprint,Proximity,Light sensor,Accelerometer,Compass,Gyroscope,RGB,Notifications LED,Cooling system,Gaming buttons,Extra,Standard Resolution,Standard Sensor,Standard Type,Standard Aperture,Standard ISO,Standard Pixel size,Standard Pixel Binning,Standard Sensor size,Wide Angle lens Resolution,Wide Angle lens Sensor,Wide Angle lens Type,Wide Angle lens Aperture,Wide Angle lens Pixel size,Wide Angle lens Pixel Binning,Wide Angle lens Sensor size,Portrait mode (depth) Resolution,Portrait mode (depth) Sensor,Portrait mode (depth) Type,Portrait mode (depth) Aperture,Portrait mode (depth) Pixel size,Portrait mode (depth) Pixel Binning,Portrait mode (depth) Sensor size,Hall,Macro lens Aperture,Macro lens Pixel size,Macro lens Resolution,Macro lens Sensor,Macro lens Sensor size,Macro lens Type,Ultrasonic proximity virtual,Barometer,Geomagnetic,Telephoto + portrait Aperture,Telephoto + portrait Pixel size,Telephoto + portrait Resolution,Telephoto + portrait Sensor,Telephoto + portrait Sensor size,Telephoto + portrait Type,Cooling System,Heart Rate,Gravity,Magnetometer,LiDAR Scanner,Red Core 1 Game chip,Display Pressure,Touch,BLE S Pen,Hardware,Utrasonic Fingerprint,Pedometer,Telephoto + portrait Pixel Binning,Altimeter,Temperature,360° ambient light sensor,Iris scanner,Thermometer,Coulometer,Secondary,UV light sensor,Humidity,Hall IC,Brand,Battery Capacity,Battery Type,Operating System,battery_Widevine L1,battery_Widevine L3,Mil Spec,IP Grade,Resolution1,Resolution2,Fast charge,Charge Watt,Month Diff
65,114454.992188,205.683304,87.383995,3,6.650398,4,3,407.142578,21,104,31,4.667629,2.635421,1,23,5.508705,649.459717,1,1,0,0,0,1,0,0,0,0,0,50,0,38,2,9,0,4,1,2,1,22,3,5,4,1,1,7,18,5,1,10,1,6,0,1,5,5,13,2,5,1,0,0,0,6,5,7,5,5,3,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,4,1,1,0,35,2773.960205,2,2,0,1,2,4,7,7,1,65.224525,49.655731
30,110693.164062,210.692932,86.568939,0,6.150591,5,4,436.927307,34,92,31,7.839993,2.659733,1,23,9.271391,474.482452,0,1,0,0,0,0,0,1,0,0,0,50,1,38,2,9,0,12,2,2,10,23,0,1,8,1,1,7,18,5,1,10,1,6,0,1,5,5,13,2,5,1,0,0,0,6,5,7,5,5,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,4,1,1,0,35,4066.591309,0,2,0,1,2,4,7,7,1,32.316998,43.2174
34,131040.742188,241.553055,89.33889,0,6.555308,5,3,449.682953,31,89,31,6.185626,2.721784,1,23,10.256947,528.847778,0,1,0,0,0,0,0,1,0,1,0,50,0,38,2,2,0,3,2,2,6,22,3,5,4,0,1,7,18,5,1,10,1,6,0,1,5,5,13,2,5,1,0,0,3,6,5,7,5,5,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,4,1,1,0,35,3690.789062,2,2,0,1,2,4,7,7,1,38.450424,44.768291


In [129]:
df_final[df_final["phone"].isin(["Samsung Galaxy Note20 Ultra", "Samsung Galaxy S21 Ultra", "Samsung Galaxy S22 Ultra"])].sort_values("phone").drop_duplicates(subset=["phone"])

Unnamed: 0,phone,Size,Weight,Usable surface,Materials,Diagonal,Type,Aspect Ratio,Density,Audio,Model,CPU,Nanometer,Frequency,64 Bits,GPU,RAM,Capacity,SD Slot,Fingerprint,Proximity,Light sensor,Accelerometer,Compass,Gyroscope,RGB,Notifications LED,Cooling system,Gaming buttons,Extra,Standard Resolution,Standard Sensor,Standard Type,Standard Aperture,Standard ISO,Standard Pixel size,Standard Pixel Binning,Standard Sensor size,Wide Angle lens Resolution,Wide Angle lens Sensor,Wide Angle lens Type,Wide Angle lens Aperture,Wide Angle lens Pixel size,Wide Angle lens Pixel Binning,Wide Angle lens Sensor size,Portrait mode (depth) Resolution,Portrait mode (depth) Sensor,Portrait mode (depth) Type,Portrait mode (depth) Aperture,Portrait mode (depth) Pixel size,Portrait mode (depth) Pixel Binning,Portrait mode (depth) Sensor size,Hall,Macro lens Aperture,Macro lens Pixel size,Macro lens Resolution,Macro lens Sensor,Macro lens Sensor size,Macro lens Type,Ultrasonic proximity virtual,Barometer,Geomagnetic,Telephoto + portrait Aperture,Telephoto + portrait Pixel size,Telephoto + portrait Resolution,Telephoto + portrait Sensor,Telephoto + portrait Sensor size,Telephoto + portrait Type,Cooling System,Heart Rate,Gravity,Magnetometer,LiDAR Scanner,Red Core 1 Game chip,Display Pressure,Touch,BLE S Pen,Hardware,Utrasonic Fingerprint,Pedometer,Telephoto + portrait Pixel Binning,Altimeter,Temperature,360° ambient light sensor,Iris scanner,Thermometer,Coulometer,Secondary,UV light sensor,Humidity,Hall IC,Brand,Battery Capacity,Battery Type,Operating System,battery_Widevine L1,battery_Widevine L3,Mil Spec,IP Grade,Resolution1,Resolution2,Fast charge,Charge Watt,Month Diff
65,Samsung Galaxy Note20 Ultra,103052.736,209.0,92.0,Glass,6.9,UFS Storage 3.1,19.5:9,494.0,Dolby AtmosDolby Digital Plus,Samsung Exynos 990,2x Exynos M5 2.73 GHz + 2x Cortex•A76 2.5 GHz ...,7.0,2.73,Yes,ARM Mali-G77 MP11,12.0,512.0,Yes,Yes,Yes,Yes,Yes,,Yes,Yes,--,No,No,,108 Mpx,Samsung Bright S5KHM1,ISOCELL,ƒ/ 1.8,--,0.80 µm,1-9 (3x3),1/1.33,12.2 Mpx,Samsung S5K2L3,ISOCELL,ƒ/ 2.2,1.40 µm,,1/7.6,,,,,,,,Yes,,,,,,,,Yes,Yes,,,,,,,,,,,,,,,Bluetooth enabledLithium Titanate Battery: Up ...,,,,,,,,,,,,,,,Samsung,4500.0,Li-Polymer,Android 12,Yes,,,IP68,WQHD+,WQHD+,Yes,45.0,24.0
30,Samsung Galaxy S21 Ultra,111085.884,227.0,89.0,Glass,6.8,UFS Storage 3.1,20:9,516.0,Dolby AtmosHi-Res AudioNoise cancellation micr...,Samsung Exynos 2100,"1x Cortex•X1 2,91 GHz + 3x Cortex•A78 2,81 GHz...",5.0,2.91,Yes,ARM Mali-G78 MC14,16.0,512.0,No,Yes,Yes,Yes,Yes,Yes,Yes,,--,No,No,,108 Mpx,Samsung S5KHM3,ISOCELL,ƒ/ 1.8,--,0.80 µm,1-9 (3x3),1/1.33,,,,,,,,,,,,,,,Yes,,,,,,,,Yes,Yes,ƒ/ 2.4,1.22 µm,10 Mpx,Samsung S5K3J1,1/2.65,ISOCELL,,,,,,,,,,,,,,,,,,,,,,,,Samsung,5000.0,Li-Polymer,Android 12,,,,IP68,WQHD+,WQHD+,Yes,25.0,19.0
34,Samsung Galaxy S22 Ultra,113217.523,227.0,89.0,Aluminium alloy,6.8,UFS Storage 3.1,19.5:9,501.0,Dolby AtmosHi-Res AudioNoise cancellation micr...,Samsung Exynos 2200,1x2.8 GHz Cortex•X2 + 3x2.496 GHz Cortex•A710 ...,4.0,2.8,Yes,AMD Xclipse 920 555MHz,12.0,1024.0,No,Yes,Yes,Yes,Yes,Yes,Yes,,--,Yes,No,,108 Mpx,Samsung S5KHM3,ISOCELL,ƒ/ 1.8,--,0.80 µm,1-9 (3x3),1/1.33,12.2 Mpx,Sony IMX563,CMOS,ƒ/ 2.2,1.40 µm,,1/2.55,,,,,,,,Yes,,,,,,,,Yes,Yes,ƒ/ 2.4,1.12 µm,12 Mpx,Sony IMX754,1/3.52,CMOS BSI,,,,,,,,,,,,,,,,,,,,,,,,Samsung,5000.0,Li-Ion,Android 12,,,,IP68,WQHD+,WQHD+,Yes,45.0,6.0
