In [1]:
import numpy as np
import pandas as pd

In [4]:
data = pd.read_csv('data/train.csv')
data.head()

Unnamed: 0,id,bone_length,rotting_flesh,hair_length,has_soul,color,type
0,0,0.354512,0.350839,0.465761,0.781142,clear,Ghoul
1,1,0.57556,0.425868,0.531401,0.439899,green,Goblin
2,2,0.467875,0.35433,0.811616,0.791225,black,Ghoul
3,4,0.776652,0.508723,0.636766,0.884464,black,Ghoul
4,5,0.566117,0.875862,0.418594,0.636438,green,Ghost


In [5]:
data.isnull().sum()

id               0
bone_length      0
rotting_flesh    0
hair_length      0
has_soul         0
color            0
type             0
dtype: int64

In [8]:
data.dtypes

id                 int64
bone_length      float64
rotting_flesh    float64
hair_length      float64
has_soul         float64
color             object
type              object
dtype: object

In [10]:
data = pd.get_dummies(data, columns=['color'], drop_first=True)

In [12]:
data.drop('id', axis=1, inplace=True)

In [13]:
data.head()

Unnamed: 0,bone_length,rotting_flesh,hair_length,has_soul,type,color_blood,color_blue,color_clear,color_green,color_white
0,0.354512,0.350839,0.465761,0.781142,Ghoul,0,0,1,0,0
1,0.57556,0.425868,0.531401,0.439899,Goblin,0,0,0,1,0
2,0.467875,0.35433,0.811616,0.791225,Ghoul,0,0,0,0,0
3,0.776652,0.508723,0.636766,0.884464,Ghoul,0,0,0,0,0
4,0.566117,0.875862,0.418594,0.636438,Ghost,0,0,0,1,0


In [14]:
X = data.drop('type', axis=1)
y = data['type']

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [97]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=1000, min_samples_leaf=10)

model.fit(X_train, y_train)

print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

0.8451178451178452
0.7432432432432432


In [44]:
from sklearn.model_selection import KFold 

kf = KFold(n_splits=5) 

In [118]:
for i,j in kf.split(X):
    print(f"{i.shape} --- {j.shape}")

(296,) --- (75,)
(297,) --- (74,)
(297,) --- (74,)
(297,) --- (74,)
(297,) --- (74,)


In [98]:
train_accuracy = []
test_accuracy = []

for train_index, test_index in kf.split(X):
    
    X_train = X.loc[train_index]
    X_test = X.loc[test_index]
    y_train = y.loc[train_index]
    y_test = y.loc[test_index]
    
    train_accuracy.append(model.score(X_train, y_train))
    test_accuracy.append(model.score(X_test, y_test))
    
print(np.mean(np.array(train_accuracy)))
print(np.mean(np.array(test_accuracy)))

0.8247952497952497
0.8247567567567566


In [108]:
data_pred = pd.read_csv('data/test.csv')

In [109]:
data_pred.head()

Unnamed: 0,id,bone_length,rotting_flesh,hair_length,has_soul,color
0,3,0.471774,0.387937,0.706087,0.698537,black
1,6,0.427332,0.645024,0.565558,0.451462,white
2,9,0.549602,0.491931,0.660387,0.449809,black
3,10,0.638095,0.682867,0.471409,0.356924,white
4,13,0.361762,0.583997,0.377256,0.276364,black


In [110]:
data_pred = pd.get_dummies(data_pred, columns=['color'], drop_first=True)

In [111]:
X_pred = data_pred.drop('id', axis=1)

In [112]:
X_pred.head()

Unnamed: 0,bone_length,rotting_flesh,hair_length,has_soul,color_blood,color_blue,color_clear,color_green,color_white
0,0.471774,0.387937,0.706087,0.698537,0,0,0,0,0
1,0.427332,0.645024,0.565558,0.451462,0,0,0,0,1
2,0.549602,0.491931,0.660387,0.449809,0,0,0,0,0
3,0.638095,0.682867,0.471409,0.356924,0,0,0,0,1
4,0.361762,0.583997,0.377256,0.276364,0,0,0,0,0


In [120]:
predictions = model.predict(X_pred)

submission = pd.DataFrame({'Id': data_pred['id'].values, 'type': predictions})

In [114]:
submission.to_csv('submission.csv', index=False)

In [121]:
submission.head()

Unnamed: 0,Id,type
0,3,Ghoul
1,6,Goblin
2,9,Ghoul
3,10,Ghost
4,13,Ghost
