# 載入數據

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored on Drive can be opened through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
# Load the training table (its last column is the label) and the test table
# (its first column is an id) from the mounted Drive folder.
data_train = pd.read_csv('/content/drive/MyDrive/training_data/train.csv')
data_test = pd.read_csv('/content/drive/MyDrive/training_data/test.csv')

# 各特徵與 price_range 之間的相關性

In [None]:
# Print the Pearson correlation of every column with the price_range label
# (the label's correlation with itself is printed as well).
price_range = data_train['price_range']
for column_name in data_train.columns:
    coefficient = data_train[column_name].corr(price_range, method='pearson')
    print(column_name, coefficient)

battery_power 0.20072261211373094
blue 0.0205728540614185
clock_speed -0.006605690881732072
dual_sim 0.01744447923722472
fc 0.021998207776904255
four_g 0.014771711417239368
int_memory 0.04443495938898744
m_dep 0.0008530365050864314
mobile_wt -0.030302171314386412
n_cores 0.004399274799457278
pc 0.03359930021353949
px_height 0.14885755500042175
px_width 0.16581750172625515
ram 0.9170457362649905
sc_h 0.022986073167424428
sc_w 0.038711271664484175
talk_time 0.021858871162374796
three_g 0.023611216880045034
touch_screen -0.03041107189821805
wifi 0.018784812012788994
price_range 1.0


# 資料正規化（Min-Max Normalization）

In [None]:
from sklearn import preprocessing

# Min-max normalise the training features.
X = data_train.iloc[:, :-1]    # the last column is the label, so it is not a feature
Y = data_train.iloc[:, -1]
feature_min = X.min()
# A constant column has a zero range; divide by 1 instead so it maps to 0
# rather than producing NaN from 0/0.
feature_range = (X.max() - feature_min).replace(0, 1)
normalized_X = (X - feature_min) / feature_range

# Normalise the test features with the TRAINING min/range, not the test set's
# own statistics: the models are fitted on training-scaled values, so the test
# rows must go through the identical transformation. Test values may fall
# slightly outside [0, 1]; that is expected.
# NOTE(review): pandas aligns this arithmetic by column label, so test.csv is
# assumed to use the same feature column names as train.csv - confirm.
X_test = data_test.iloc[:, 1:] # the first column is the id and is not used for prediction
normalized_X_test = (X_test - feature_min) / feature_range

# 將Train.csv分為Training set和Validation set兩組資料

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the normalised training rows as a validation set; the fixed
# random_state makes the split repeatable.
x_train, x_vali, y_train, y_vali = train_test_split(
    normalized_X,
    Y,
    test_size=0.3,
    random_state=42,
)

# 定義性能指標函式

In [None]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
def get_score(y_true, y_pred):
    """Print a classification report for one set of predictions.

    Prints the overall accuracy, then the precision, recall, F-score and
    support values returned by ``precision_recall_fscore_support``, and
    finally the confusion matrix.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, in the same order as ``y_true``.

    Returns:
        None. All results are written to standard output.
    """
    # `score` yields (precision, recall, fscore, support) in this order, which
    # matches the order of the templates below.
    metrics = score(y_true, y_pred)
    templates = ('Precision: {}', 'Recall: {}', 'F1_score: {}', 'Support: {}')

    print("Accuracy : ", accuracy_score(y_true, y_pred))
    for template, values in zip(templates, metrics):
        print(template.format(values))
    print("Confusion Matrix   :\n", confusion_matrix(y_true, y_pred))

# SVM

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn import svm
# The string literal below is a disabled grid search kept for reference only.
# It is evaluated and discarded, so none of the code inside it runs.
'''
# 透過Grid search自動找出合適的參數設定
parameter_candidates = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]
clf = GridSearchCV(estimator=svm.SVC(), param_grid=parameter_candidates, n_jobs=-1)
clf.fit(x_train, y_train)

print('Best score for training data:', clf.best_score_)
print('Best `C`:',clf.best_estimator_.C)
print('Best kernel:',clf.best_estimator_.kernel)
print('Best `gamma`:\n',clf.best_estimator_.gamma)
'''
# Train the SVM with the selected hyper-parameters
# (presumably the ones the grid search above reported - verify).
m_svm = svm.SVC(C=1000, kernel='linear', gamma='scale')
m_svm.fit(x_train, y_train)
model_svm = m_svm    # fit() returns the estimator itself, so both names are the same model

# Evaluate on the held-out validation set.
y_pred = model_svm.predict(x_vali)
print('Predict Validation Set:\n',y_pred)
get_score(y_vali, y_pred)

# Predict labels for the normalised test set.
y_test = model_svm.predict(normalized_X_test)
print('Predict Testing Set:\n',y_test)


Predict Validation Set:
 [0 2 1 3 1 1 2 0 3 1 0 0 2 3 3 2 3 3 1 0 0 2 1 2 0 1 3 2 2 0 0 0 3 0 1 1 2
 0 3 0 1 3 2 0 2 2 2 1 3 1 3 1 0 0 1 1 1 3 0 0 1 3 3 1 0 0 3 3 1 2 2 2 0 1
 2 0 1 3 2 2 3 2 1 0 1 3 1 3 3 0 3 3 2 1 3 2 2 3 1 1 0 0 1 0 1 3 2 0 1 1 0
 0 3 1 3 2 3 2 0 2 1 3 2 1 3 3 0 2 0 2 3 0 2 2 0 3 1 0 0 2 2 1 2 2 0 0 0 1
 1 2 3 1 1 0 2 2 0 1 0 2 2 3 3 3 1 0 1 2 2 3 3 0 1 0 3 1 1 2 1 0 0 0 0 0 3
 2 0 3 0 0 0 0 1 3 3 1 0 1 1 1 1 2 2 3 3 3 1 2 0 0 0 2 1 1 3 1 1 2 1 1 3 2
 3 0 0 2 1 3 0 1 2 0 2 3 2 0 1 3 3 0 1 3 3 3 0 3 1 2 3 3 2 1 1 3 3 1 3 3 3
 3 3 0 1 2 2 2 3 0 2 3 2 2 2 1 0 2 0 3 3 1 3 1 1 3 1 2 0 0 3 0 1 2 3 3 3 1
 1 0 1 3 3 0 1 2 2 0 3 3 2 3 2 3 2 0 2 1 1 1 0 0 0 3 3 3 1 0 1 0 2 2 3 0 3
 3 2 1 3 0 0 2 1 3 2 0 1 1 1 1 1 3 2 0 0 3 3 0 3 0 0 2 0 1 2 2 2 3 0 3 2 3
 3 3 3 2 1 1 0 3 1 3 3 0 2 3 2 3 3 3 0 0 2 3 0 0 2 3 2 1 1 2 1 3 0 3 1 2 0
 0 1 0 2 0 1 0 2 2 3 2 1 1 2 1 1 3 1 0 0 3 0 1 0 0 2 2 3 0 2 0 1 1 3 3 1 2
 0 2 0 0 3 3 0 2 2 2 3 1 1 0 1 3 1 0 3 1 0 0 3 2 3 1 0 2 1 0 1 2 3 2 1 1 0


# KNN

In [None]:
from sklearn import neighbors
import numpy as np
import matplotlib.pyplot as plt
# The string literal below is a disabled search over k (with an error-rate
# plot) kept for reference only; it is evaluated and discarded, never executed.
'''
#用for-loop，尋找合適的k值
error_rate = []
for i in range(550,650):
    knn = neighbors.KNeighborsClassifier(n_neighbors=i)
    knn.fit(x_train.fillna(0), y_train)
    pred_i = knn.predict(x_vali)
    error_rate.append(np.mean(pred_i != y_vali))
    
#將k=550~650的錯誤率製圖畫出。
plt.figure(figsize=(10,10))
plt.plot(range(550,650),error_rate,color='blue', linestyle='dashed', marker='o', markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
'''
# Train the k-nearest-neighbours classifier with the selected k.
m_knn = neighbors.KNeighborsClassifier(n_neighbors=605)
model_knn = m_knn.fit(x_train, y_train)

# Evaluate on the held-out validation set. `predict_knn` is kept as an alias
# of `y_pred` because the original cell defined both names.
y_pred = predict_knn = model_knn.predict(x_vali)
print('Predict Validation Set:\n',y_pred)
get_score(y_vali, y_pred)

# Predict the test set with the KNN model itself, so the printed output
# reflects this classifier rather than the SVM fitted in an earlier cell.
y_test = model_knn.predict(normalized_X_test)
print('Predict Testing Set:\n',y_test)


Predict Validation Set:
 [0 2 0 2 1 2 2 0 3 1 1 1 3 2 3 3 3 3 1 0 0 1 1 2 0 2 3 2 3 0 1 0 3 0 2 1 2
 1 3 0 2 3 2 1 2 1 1 0 3 1 3 1 1 0 0 2 2 3 0 1 1 3 2 2 0 1 3 3 2 2 0 1 0 1
 2 1 0 3 1 1 3 1 1 0 1 2 2 2 3 1 3 3 2 1 3 2 2 3 2 1 0 0 1 1 0 2 2 0 0 1 0
 0 3 1 2 1 3 2 0 2 1 2 2 2 3 3 0 2 0 2 3 0 1 2 1 1 0 0 1 1 2 0 2 1 0 0 1 0
 2 2 3 1 1 0 2 3 1 1 0 2 2 3 2 1 1 1 1 2 2 2 2 0 0 1 3 1 2 1 2 1 0 0 1 0 1
 2 0 2 0 1 0 0 2 3 2 2 0 2 2 1 0 3 3 3 2 2 1 3 1 1 1 1 2 1 2 2 0 2 2 1 3 1
 2 0 1 2 1 3 0 1 1 1 1 2 2 0 0 2 3 0 0 2 2 3 0 3 2 3 3 3 2 1 0 3 3 1 3 2 2
 3 3 1 1 2 2 2 2 1 2 2 2 2 2 1 0 2 0 3 3 1 3 0 0 3 1 1 0 1 2 0 1 2 3 2 3 1
 0 1 0 3 3 0 1 2 2 0 3 2 2 3 1 3 2 0 3 1 1 1 0 1 1 2 3 3 2 0 1 1 2 2 2 1 2
 3 2 1 2 0 1 3 1 2 1 1 1 1 1 1 1 3 1 0 0 3 3 0 2 1 0 2 0 1 1 2 2 3 1 3 1 3
 3 3 3 1 1 1 1 3 0 2 3 1 2 3 2 3 3 3 1 1 2 3 0 0 1 2 2 1 2 2 1 1 1 2 2 1 0
 0 2 0 1 0 0 1 2 3 3 2 1 1 2 0 1 3 2 0 0 3 0 0 1 1 2 2 3 0 3 1 1 0 2 3 0 2
 1 2 0 0 2 3 0 2 3 2 1 0 1 1 0 3 1 0 3 2 0 0 2 2 2 2 0 2 0 0 0 1 3 1 1 0 0


# Decision Tree(DT)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Train a decision tree. The seed makes the fitted tree repeatable between
# runs, consistent with the seeded split and random forest in this notebook.
DT_model = DecisionTreeClassifier(random_state=42)
DT_model.fit(x_train,y_train)

# Evaluate on the held-out validation set.
y_pred = DT_model.predict(x_vali)
print('Predict Validation Set:\n',y_pred)
get_score(y_vali,y_pred)

# Predict the test set with the decision tree itself, so the printed output
# reflects this classifier rather than the SVM fitted in an earlier cell.
y_test = DT_model.predict(normalized_X_test)
print('Predict Testing Set:\n',y_test)


Predict Validation Set:
 [0 2 1 3 1 1 2 0 2 2 0 1 1 3 2 2 3 3 1 0 0 1 1 2 0 2 2 2 2 0 0 0 3 0 1 1 2
 1 3 0 2 3 2 0 2 2 1 1 3 1 3 1 0 0 0 0 1 3 0 0 1 3 3 1 0 0 3 3 2 2 2 2 0 1
 3 0 1 3 2 2 3 2 1 0 1 3 2 3 3 0 2 3 2 1 3 2 2 3 1 2 0 0 1 1 0 3 3 0 1 1 0
 0 2 1 2 2 3 2 0 2 1 3 2 1 3 3 0 2 0 2 3 0 2 2 0 2 1 1 0 2 3 1 2 3 0 0 0 1
 2 2 3 1 1 0 2 2 0 1 1 1 2 3 3 3 1 0 1 2 2 3 3 1 0 0 3 1 2 2 1 0 0 0 0 0 3
 2 0 3 0 0 0 0 1 3 2 2 0 1 1 1 1 1 2 2 3 3 1 2 0 0 0 2 1 1 3 1 0 2 1 1 3 2
 3 0 0 1 1 2 0 0 1 0 2 3 2 1 1 3 3 0 2 3 3 3 0 3 1 2 3 3 2 1 1 3 3 0 3 3 3
 3 3 0 1 2 2 1 3 0 2 3 2 2 2 0 0 2 0 2 3 1 3 1 0 3 1 2 0 0 2 0 1 3 3 3 3 0
 1 0 1 3 3 0 1 2 2 0 3 3 2 3 2 3 2 0 2 1 1 1 0 0 0 3 2 3 2 0 1 0 1 3 2 0 2
 3 2 1 2 0 0 3 1 3 2 0 2 1 1 0 2 3 1 0 0 3 3 0 3 0 0 1 0 1 2 2 2 3 1 3 2 2
 3 3 3 3 1 1 0 3 2 3 3 0 2 3 2 3 3 3 1 0 2 3 0 0 3 3 1 2 1 2 1 3 1 3 1 2 0
 0 1 0 1 0 1 0 1 3 3 2 1 1 2 1 0 3 1 0 1 3 0 2 1 0 1 3 3 0 2 0 1 1 3 3 0 3
 0 2 0 0 3 3 0 2 2 1 3 1 2 0 1 3 1 0 3 1 0 0 3 2 3 2 0 3 0 0 1 2 3 2 1 0 0


# Random Forest(RF)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest of 1000 trees with a fixed seed for repeatability.
model = RandomForestClassifier(n_estimators=1000,random_state=42)
model.fit(x_train,y_train)

# Evaluate on the held-out validation set.
y_pred = model.predict(x_vali)
print('Predict Validation Set:\n',y_pred)
get_score(y_vali,y_pred)

# Predict the test set with the random forest itself, so the printed output
# reflects this classifier rather than the SVM fitted in an earlier cell.
y_test = model.predict(normalized_X_test)
print('Predict Testing Set:\n',y_test)

Predict Validation Set:
 [0 2 1 3 1 2 2 0 3 1 0 1 2 3 2 2 3 3 1 0 0 1 1 2 0 1 3 2 2 0 0 0 3 0 1 1 2
 0 3 0 2 3 2 0 3 2 1 1 3 1 3 1 0 0 1 1 1 2 0 0 0 3 3 1 0 0 3 3 1 2 2 3 0 1
 2 0 0 3 2 2 3 2 1 0 1 3 2 3 3 0 3 3 2 1 3 2 2 3 2 1 0 0 1 0 0 3 2 0 1 1 0
 0 3 1 2 2 3 3 0 2 1 3 2 1 3 3 0 3 0 2 3 0 2 2 0 3 1 0 0 2 3 0 2 2 0 0 0 1
 1 2 3 1 1 0 2 2 0 1 0 1 2 3 3 3 1 0 0 2 2 3 3 1 0 0 3 1 2 2 1 0 0 0 0 0 3
 2 0 3 0 0 0 0 1 3 3 1 0 1 2 1 1 2 2 2 3 3 1 2 0 0 0 2 1 1 3 1 0 2 1 1 3 1
 3 0 0 2 1 2 0 0 2 0 1 3 2 0 1 3 3 0 1 3 3 3 0 3 1 2 3 3 2 1 1 3 3 1 3 3 3
 3 3 0 1 2 2 2 2 0 2 3 2 2 2 1 0 2 0 2 3 1 3 1 0 3 1 2 0 0 3 0 1 2 3 3 3 1
 0 0 1 3 3 0 1 1 2 0 3 3 2 3 1 3 2 0 2 1 1 1 0 0 0 3 2 3 1 0 1 0 2 2 2 0 3
 3 2 1 3 0 0 3 1 3 2 0 1 1 2 1 1 3 1 0 0 3 3 0 3 0 0 2 0 0 2 2 2 3 0 3 2 2
 3 3 3 2 1 2 0 3 1 3 3 0 2 3 2 3 3 3 0 0 2 3 0 0 2 3 2 1 1 2 1 3 1 3 1 2 0
 0 1 0 1 0 2 0 2 2 3 2 1 1 3 1 0 3 1 0 0 3 0 1 0 0 1 3 3 0 2 1 1 1 3 3 0 2
 0 2 0 0 3 3 0 2 2 1 3 1 1 0 1 3 1 0 3 1 0 0 3 2 3 2 0 3 1 0 1 2 3 2 1 1 0
