In [1]:
'''
数据分析
'''
from pyspark.sql import SparkSession
import pandas as pd


# Reuse the active Spark session if there is one; otherwise start a new one.
spark = SparkSession.builder.getOrCreate()


def _image_key(origin, marker='/imgs/'):
    """Return the dataset-relative image path (``imgs/...``) for a Spark ``origin`` value.

    The key is located by searching for the last ``marker`` directory instead of
    slicing at a fixed character offset, so it does not depend on the absolute
    location (and therefore the length) of the dataset path on this machine.

    Args:
        origin: The ``image.origin`` string produced by Spark's image source.
        marker: Directory marker that starts the relative key.

    Returns:
        The substring of ``origin`` starting at ``imgs/``.

    Raises:
        ValueError: If ``marker`` does not occur in ``origin``.
    """
    position = origin.rfind(marker)
    if position < 0:
        raise ValueError(f'Cannot find {marker!r} in image origin {origin!r}')
    # Skip the leading '/' of the marker so keys look like 'imgs/train/<file>.png'.
    return origin[position + 1:]


def _index_by_image_key(images_spark):
    """Collect an image DataFrame to pandas and index it by its ``imgs/...`` key.

    Args:
        images_spark: Spark DataFrame loaded with ``spark.read.format('image')``.

    Returns:
        A pandas DataFrame with columns ``origin`` and ``data``, indexed by ``Image``.
    """
    frame = images_spark.select('image.origin', 'image.data').toPandas()
    frame['Image'] = frame['origin'].map(_image_key)
    return frame.set_index('Image')


# Read the training images with Spark and index them by relative path.
train_images_spark = spark.read.format("image").load("../Datasets/cifar_100/imgs/train/")
train_images = _index_by_image_key(train_images_spark)

# Attach the labels from the CSV (also indexed by 'Image') and keep only pixels + label.
train_labels = pd.read_csv('../Datasets/cifar_100/cifar100_train.csv', index_col='Image')
train_data = train_images.join(train_labels)[['data', 'Label']]

# Read the test images the same way; they have no labels to join.
test_images_spark = spark.read.format('image').load('../Datasets/cifar_100/imgs/test/')
test_images = _index_by_image_key(test_images_spark)

21/10/03 19:50:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/10/03 19:51:37 WARN SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.
                                                                                

In [2]:
# Print a summary of the joined training frame (index range, columns, non-null counts, dtypes).
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50000 entries, imgs/train/computer_keyboard_s_000712.png to imgs/train/adriatic_s_001723.png
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   data    50000 non-null  object
 1   Label   50000 non-null  object
dtypes: object(2)
memory usage: 2.2+ MB


In [3]:
# Print a summary of the test image frame (index range, columns, non-null counts, dtypes).
test_images.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, imgs/test/computer_keyboard_s_002225.png to imgs/test/beer_bottle_s_000236.png
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   origin  10000 non-null  object
 1   data    10000 non-null  object
dtypes: object(2)
memory usage: 234.4+ KB


In [5]:
'''
数据预处理
'''
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Spark's image source stores pixels row by row with interleaved channels,
# i.e. in (height, width, channel) order. The later cells reshape each flat
# vector to (3, 32, 32), so the bytes are rearranged to channel-first here.
IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS = 32, 32, 3

# Fixed seed so the train/validation split is the same on every run.
SPLIT_SEED = 42


def _to_channel_first(raw_pixels):
    """Convert one image's raw bytes into a flat, channel-first float32 vector.

    Args:
        raw_pixels: Bytes-like pixel buffer from the ``data`` column, expected to
            hold IMAGE_HEIGHT * IMAGE_WIDTH * IMAGE_CHANNELS unsigned bytes.

    Returns:
        1-D float32 array of the same length, ordered (channel, height, width).

    Raises:
        ValueError: If the buffer length does not match the expected image size.
    """
    pixels = np.frombuffer(raw_pixels, dtype=np.uint8)
    image_hwc = pixels.reshape(IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS)
    return image_hwc.transpose(2, 0, 1).reshape(-1).astype('float32')


# Extract the image features of the training and test sets, one row per image.
X_train = np.stack([_to_channel_first(feature) for feature in train_data['data'].values])
X_test = np.stack([_to_channel_first(feature) for feature in test_images['data'].values])

# Encode the class labels as integer ids.
le = LabelEncoder()
y_train = le.fit_transform(train_data['Label'])

# Standardize every feature; the scaler is fitted on the training images only
# and then applied unchanged to the test images.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Hold out 20% of the training images as a validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, train_size=0.8, random_state=SPLIT_SEED)

In [6]:
'''
采用卷积神经网络，并且在验证集上进行模型寻优。
'''
import paddle
from paddle import nn, optimizer, metric


# Hyperparameters.
NUM_CLASSES = 100
EPOCHS = 5
BATCH_SIZE = 64
LEARNING_RATE = 1e-3
DROPOUT_RATE = 0.2

# Layers of the convolutional network, in forward order.
# For a 3x32x32 input the spatial size goes 32 -> 30 -> 15 -> 13 -> 6 -> 4,
# so the flattened vector has 64 * 4 * 4 = 1024 features.
cnn_layers = [
    nn.Conv2D(in_channels=3, out_channels=32, kernel_size=3),
    nn.ReLU(),
    nn.MaxPool2D(kernel_size=2, stride=2),
    nn.Conv2D(in_channels=32, out_channels=64, kernel_size=3),
    nn.ReLU(),
    nn.MaxPool2D(kernel_size=2, stride=2),
    nn.Conv2D(in_channels=64, out_channels=64, kernel_size=3),
    nn.Flatten(),
    nn.Dropout(DROPOUT_RATE),
    nn.Linear(in_features=1024, out_features=NUM_CLASSES),
]
paddle_model = nn.Sequential(*cnn_layers)

# Wrap the network in Paddle's high-level Model API.
model = paddle.Model(paddle_model)

# Configure training: Adam optimizer, cross-entropy loss, accuracy metric.
adam = optimizer.Adam(learning_rate=LEARNING_RATE, parameters=model.parameters())
model.prepare(
    optimizer=adam,
    loss=nn.CrossEntropyLoss(),
    metrics=metric.Accuracy(),
)

sysctl: unknown oid 'machdep.cpu.leaf7_features'


In [7]:
from paddle.io import TensorDataset


def _as_image_tensor(flat_features):
    """Reshape flat feature rows to (N, 3, 32, 32) and convert them to a Paddle tensor."""
    return paddle.to_tensor(flat_features.reshape([-1, 3, 32, 32]))


# Build the training and validation datasets from image tensors and encoded labels.
X_train = _as_image_tensor(X_train)
train_dataset = TensorDataset([X_train, y_train])

X_val = _as_image_tensor(X_val)
val_dataset = TensorDataset([X_val, y_val])

# Train the model, evaluating on the validation set; checkpoints go to save_dir.
model.fit(
    train_dataset,
    val_dataset,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    save_dir='../Checkpoints/cifar_100',
    verbose=1,
)

# Save the model's current weights, i.e. its state after the last epoch.
# NOTE(review): this does not select the best epoch on the validation set.
model.save('../Checkpoints/cifar_100/test')

The loss value printed in the log is the current step, and the metric is the average value of previous steps.
Epoch 1/5


  return (isinstance(seq, collections.Sequence) and


save checkpoint at /Users/michael_fan/ML-Kaggle-2nd-Edition-Gitee/Checkpoints/cifar_100/0
Eval begin...
Eval samples: 10000
Epoch 2/5
save checkpoint at /Users/michael_fan/ML-Kaggle-2nd-Edition-Gitee/Checkpoints/cifar_100/1
Eval begin...
Eval samples: 10000
Epoch 3/5
save checkpoint at /Users/michael_fan/ML-Kaggle-2nd-Edition-Gitee/Checkpoints/cifar_100/2
Eval begin...
Eval samples: 10000
Epoch 4/5
save checkpoint at /Users/michael_fan/ML-Kaggle-2nd-Edition-Gitee/Checkpoints/cifar_100/3
Eval begin...
Eval samples: 10000
Epoch 5/5
save checkpoint at /Users/michael_fan/ML-Kaggle-2nd-Edition-Gitee/Checkpoints/cifar_100/4
Eval begin...
Eval samples: 10000
save checkpoint at /Users/michael_fan/ML-Kaggle-2nd-Edition-Gitee/Checkpoints/cifar_100/final


In [8]:
'''
使用最优的模型，依据测试数据的特征进行类别预测。
'''
# Reshape the flat test features to (N, 3, 32, 32) and wrap them in a dataset.
X_test = X_test.reshape([-1, 3, 32, 32])
X_test = paddle.to_tensor(X_test)
test_dataset = TensorDataset([X_test])

# Restore the weights saved at this path by the training cell.
model.load('../Checkpoints/cifar_100/test')

# Predict in mini-batches rather than one sample at a time, and let Paddle
# stack the per-batch outputs into a single array per network output.
results = model.predict(test_dataset, batch_size=BATCH_SIZE, stack_outputs=True)

# results[0] is the network's only output: one row of class scores per image.
# Taking the argmax over each row keeps every prediction regardless of batch size.
class_scores = results[0]
predictions = le.inverse_transform(np.argmax(class_scores, axis=1))
test_images['Prediction'] = predictions

Predict begin...
Predict samples: 10000


In [9]:
# Load the test listing, indexed by image path like the prediction frame.
test_ids = pd.read_csv('../Datasets/cifar_100/cifar100_test.csv', index_col='Image')

# Align predictions with the listing and expose them under the 'Label' column.
submission_df = test_images.join(test_ids)
submission_df = submission_df.assign(Label=submission_df['Prediction'])

# Write the 'Label' column, with the 'Image' index, as the submission file.
label_column = submission_df['Label']
label_column.to_csv('../Kaggle_submissions/cifar100_submission.csv')