In [1]:
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import time

# Locations of the training and test CSV files, both under one data directory.
DATA_DIR = "Data/5001 kaggle"
TRAINING = DATA_DIR + "/train.csv"
TEST = DATA_DIR + "/test.csv"


In [2]:
# Read the labelled training table and the unlabelled test table.
train_data = pd.read_csv(TRAINING)
test_x = pd.read_csv(TEST)
print(train_data.shape)  # (87, 13)
print(test_x.shape)      # (59, 12)

# Separate features from the target by column name instead of by position,
# so the split keeps working if the column order of the CSV ever changes.
train_x = train_data.drop(columns="label")
train_y = train_data["label"]
print(train_x.shape)  # (87, 12)
print(train_y.shape)  # (87,)
print(test_x.shape)
# NOTE(review): train_x still contains the "id" column, so the model is
# trained on it as if it were a feature -- confirm that this is intended.

(87, 13)
(59, 12)
(87, 12)
(87,)
(59, 12)


# Viewing Data

In [3]:
# Show summary statistics (count, mean, std, min, quartiles, max) for the
# columns of the training table.
train_data.describe()
# train_data["label"].describe()

Unnamed: 0,id,MO HLADR+ MFI (cells/ul),Neu CD64+MFI (cells/ul),CD3+T (cells/ul),CD8+T (cells/ul),CD4+T (cells/ul),NK (cells/ul),CD19+ (cells/ul),CD45+ (cells/ul),Age,Sex 0M1F,Mono CD64+MFI (cells/ul),label
count,87.0,86.0,86.0,87.0,87.0,87.0,87.0,87.0,87.0,87.0,87.0,86.0,87.0
mean,43.0,1264.244186,290.383721,982.570115,479.34092,494.904023,212.732874,118.78092,1325.096437,40.218391,0.482759,2066.534884,0.333333
std,25.258662,765.452376,490.283499,617.332545,344.326452,311.836604,173.553264,96.218344,791.602538,10.461919,0.502599,1198.401364,0.474137
min,0.0,112.0,30.0,74.4,36.61,39.59,0.0,4.2,209.25,19.0,0.0,72.0,0.0
25%,21.5,685.5,77.5,549.39,237.92,272.745,78.815,52.425,780.615,33.0,0.0,1461.25,0.0
50%,43.0,1108.5,124.5,871.71,423.27,459.72,188.78,89.79,1179.27,41.0,0.0,1757.5,0.0
75%,64.5,1602.25,244.5,1268.085,624.45,624.36,262.845,155.45,1617.725,49.5,1.0,2238.25,1.0
max,86.0,4145.0,3124.0,3791.23,2548.1,1517.81,878.04,485.86,4757.28,60.0,1.0,7515.0,1.0


In [4]:
# The label distribution is imbalanced: count how many rows carry each label.
train_data["label"].value_counts()

0    58
1    29
Name: label, dtype: int64

# Handling missing data

### Using the column mean to fill NaN values in the training dataset

In [5]:
# Count the missing values in each column of the training features.
train_x.isnull().sum(axis = 0)

id                          0
MO HLADR+ MFI (cells/ul)    1
Neu CD64+MFI (cells/ul)     1
CD3+T (cells/ul)            0
CD8+T (cells/ul)            0
CD4+T (cells/ul)            0
NK (cells/ul)               0
CD19+ (cells/ul)            0
CD45+ (cells/ul)            0
Age                         0
Sex 0M1F                    0
Mono CD64+MFI (cells/ul)    1
dtype: int64

In [6]:
# Columns of the training features whose missing entries are mean-imputed.
columns_with_nan = [
    "MO HLADR+ MFI (cells/ul)",
    "Neu CD64+MFI (cells/ul)",
    "Mono CD64+MFI (cells/ul)",
]
# Replace every missing entry with the mean of its own column.
fill_values = {column: train_x[column].mean() for column in columns_with_nan}
train_x = train_x.fillna(value=fill_values)
# Open question: impute with separate means for positive and negative samples?

In [7]:
#train_x.isnull().sum(axis = 0)

In [8]:
# If the test set has missing values, fill each one with the last valid value
# above it in the same column (forward fill). A missing value with no valid
# value above it stays missing.
test_x.isnull().sum(axis = 0)  # per-column NaN count; not displayed because it is not the last expression
# DataFrame.ffill replaces the deprecated fillna(method='pad') spelling.
test_x = test_x.ffill(axis=0)

# Random Forest

### Modeling

In [9]:
from sklearn.model_selection import train_test_split

# Experiment log (2022/05/11):
#   v1: use the mean to fill NaN in the training set, then split it into a
#       training set and a validation set (80%:20%;
#       accuracy_val: 0.88888, score_test: 0.96551).
#   v2: use the whole training set to train (score_test: 0.89655).
#   v3: train with sample weights 0.4 / 0.6 because the labels are imbalanced.
#   v4: random forest with sample weights 0.5 / 1; test set item 30 = 1 is the
#       only difference from the v3 result.

# Hold-out split from v1. The active (v4) fit below does not use it; it is
# kept so that the shapes can be inspected and the validation run re-enabled.
x_train, x_val, y_train, y_val = train_test_split(train_x, train_y, test_size=0.2, random_state=100)
print(x_train.shape)  # (69, 12)
print(x_val.shape)    # (18, 12)
print(y_train.shape)  # (69,)
print(y_val.shape)    # (18,)

# Other constructor options that were considered: max_features, criterion='gini'.
clf = RandomForestClassifier(n_estimators=15, max_depth=5, random_state=0)

# Per-class sample weights: rows with label 0 get w0, every other row gets w1.
w0 = 0.5
w1 = 1

# Vectorised equivalent of [w0 if r == 0 else w1 for r in train_y].
sample_weight = np.where(train_y == 0, w0, w1)
clf = clf.fit(train_x, train_y, sample_weight=sample_weight)  # the whole training set

# To measure validation accuracy instead, fit on the hold-out split:
# clf = clf.fit(x_train, y_train, sample_weight=np.where(y_train == 0, w0, w1))
# predict_y_val = clf.predict(x_val)
# accuracy = accuracy_score(y_val, predict_y_val)
# print("accuracy in validation set: ", accuracy)

(69, 12)
(18, 12)
(69,)
(18,)


### Prediction

In [11]:
# Predict a label for every row of the test set.
predict_y_test = clf.predict(test_x)
print(predict_y_test.shape)
size = len(predict_y_test)
print(size)
# Sequential ids 0 .. size-1, one per prediction.
# NOTE(review): this name shadows the built-in `id`; presumably test_x has its
# own id column that could be used instead -- confirm.
id = np.arange(size)

(59,)
59


In [10]:
#print(predict_y_val.shape)
#print(type(predict_y_val))

In [12]:
# ############## training set accuracy ############
# predict_y_train=clf.predict(train_x)
# accuracy_train = accuracy_score(train_y, predict_y_train)
# print("accuracy in training set: ", accuracy_train)

In [16]:
# Build the submission table: one sequential id (0 .. n-1) per prediction.
# The ids are computed here rather than read from the global `id`. If that
# assignment has not run, `id` is the built-in function and pandas would
# silently broadcast it into the column.
output = pd.DataFrame({
    'id': np.arange(len(predict_y_test)),
    'label': predict_y_test,
})
# Show how many test rows were predicted for each label.
print(output['label'].value_counts())

0    47
1    12
Name: label, dtype: int64


In [15]:
# Write the submission table to disk without the DataFrame index column.
submission_path = "Data/5001 kaggle/output_sample_weight_0511v4.csv"
output.to_csv(submission_path, index=False)