In [1]:
import pandas as pd
from numpy import arange
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

In [2]:
fake_set = pd.read_csv('news-temporal-series-fake.csv', sep='\t')
not_fake_set = pd.read_csv('news-temporal-series-notFake.csv', sep='\t')

In [3]:
fake_set['is fake'] = 1
not_fake_set['is fake'] = 0

In [4]:
dataset = pd.concat([fake_set, not_fake_set]).sample(frac=1)
dataset = dataset.drop(columns='news_id')

In [5]:
X = dataset.iloc[:, 0:3]
y = dataset.iloc[:, 8]

In [19]:
classifiers = [
    KNeighborsClassifier(3),
    KNeighborsClassifier(5),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    RandomForestClassifier(max_depth=10, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    GaussianNB(),
    XGBClassifier(random_state=1,learning_rate=0.01),
    XGBClassifier(random_state=1,learning_rate=0.1),
    XGBClassifier(random_state=0,learning_rate=0.01)
#     XGBClassifier(random_state=1,learning_rate=0.01),
#     XGBClassifier(random_state=1,learning_rate=0.1)
]

In [21]:
for clf in classifiers:
    scores = cross_val_score(clf, X, y, cv=10)
    score = max(scores)
    
    print(clf, score)
    print()

KNeighborsClassifier(n_neighbors=3) 0.9

KNeighborsClassifier() 0.9

SVC(C=0.025, kernel='linear') 0.8833333333333333

SVC(C=1, gamma=2) 0.75

RandomForestClassifier(max_depth=5, max_features=1, n_estimators=10) 0.9333333333333333

RandomForestClassifier(max_depth=10, max_features=1, n_estimators=10) 0.9

MLPClassifier(alpha=1, max_iter=1000) 0.9166666666666666

GaussianNB() 0.7166666666666667

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=0.01, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              random_state=1, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parame

## Tuning KNN

In [28]:
max_score = 0

for n in range(1,11):
    for w in ['uniform', 'distance']:
        for a in ['auto', 'ball_tree', 'kd_tree', 'brute']:
            for p in range(1,4):
                if a in ['ball_tree', 'kd_tree']:
                    for l in [20, 30, 50, 100]:
                        clf = KNeighborsClassifier(n_neighbors=n, weights=w, algorithm=a, leaf_size=l, p=p)
                        
                        scores = cross_val_score(clf, X, y, cv=10)
                        score = max(scores)
                        
                        if score > max_score:
                            max_score = score
                            best_clf = clf
                            print(best_clf, max_score)
                        
                else:
                    clf = KNeighborsClassifier(n_neighbors=n, weights=w, algorithm=a, p=p)
                    
                    scores = cross_val_score(clf, X, y, cv=10)
                    score = max(scores)
                    
                    if score > max_score:
                            max_score = score
                            best_clf = clf
                            print(best_clf, max_score)

KNeighborsClassifier(n_neighbors=1, p=1) 0.8333333333333334
KNeighborsClassifier(n_neighbors=1) 0.8666666666666667
KNeighborsClassifier(n_neighbors=3, p=1) 0.9
KNeighborsClassifier(algorithm='ball_tree', leaf_size=50, n_neighbors=3, p=1) 0.9166666666666666
KNeighborsClassifier(n_neighbors=4, weights='distance') 0.9333333333333333
KNeighborsClassifier(algorithm='ball_tree', leaf_size=100, n_neighbors=10, p=1) 0.95


In [31]:
max_score = 0

for n in range(1,21):
    for w in ['uniform', 'distance']:
        for p in range(1,3):
            for l in range(60, 80):
                clf = KNeighborsClassifier(n_neighbors=n, weights=w, algorithm='ball_tree', leaf_size=l, p=p)

                scores = cross_val_score(clf, X, y, cv=10)
                score = max(scores)

                if score > max_score:
                    max_score = score
                    best_clf = clf
                    print(best_clf, max_score)

KNeighborsClassifier(algorithm='ball_tree', leaf_size=60, n_neighbors=1, p=1) 0.8166666666666667
KNeighborsClassifier(algorithm='ball_tree', leaf_size=68, n_neighbors=1, p=1) 0.8333333333333334
KNeighborsClassifier(algorithm='ball_tree', leaf_size=60, n_neighbors=1) 0.8666666666666667
KNeighborsClassifier(algorithm='ball_tree', leaf_size=60, n_neighbors=3, p=1) 0.9166666666666666
KNeighborsClassifier(algorithm='ball_tree', leaf_size=60, n_neighbors=4,
                     weights='distance') 0.9333333333333333
KNeighborsClassifier(algorithm='ball_tree', leaf_size=68, n_neighbors=10, p=1) 0.95
KNeighborsClassifier(algorithm='ball_tree', leaf_size=68, n_neighbors=13, p=1) 0.9666666666666667
KNeighborsClassifier(algorithm='ball_tree', leaf_size=68, n_neighbors=13, p=1,
                     weights='distance') 0.9833333333333333


## Tuning Random Forests

In [None]:
max_score = 0

for n in range(50, 150, 10):
    for c in ['gini', 'entropy']:
        for mss in arange(0.1, 1.1, 0.1):
            for msl in arange(0.1, 0.6, 0.1):
                for mwfl in arange(0, 0.5, 0.1):
                    for mf in ['auto', 'sqrt', 'log2']:
                        for mid in arange(0, 0.5, 0.1):
                            for b in [True]:
                                
                                for os in [True, False]:
                                    
                                    clf = RandomForestClassifier(
                                        n_estimators=n, 
                                        criterion=c, 
                                        min_samples_split=mss, 
                                        min_samples_leaf=msl, 
                                        min_weight_fraction_leaf=mwfl,
                                        max_features=mf,
                                        min_impurity_decrease=mid,
                                        bootstrap=b,
                                        oob_score=os
                                    )

                                    scores = cross_val_score(clf, X, y, cv=10)
                                    score = max(scores)

                                    if score > max_score:
                                        max_score = score
                                        best_clf = clf
                                        print(best_clf, max_score)

RandomForestClassifier(min_samples_leaf=0.1, min_samples_split=0.1,
                       n_estimators=50, oob_score=True) 0.9333333333333333
RandomForestClassifier(max_features='sqrt', min_samples_leaf=0.1,
                       min_samples_split=0.1, n_estimators=50) 0.95
RandomForestClassifier(max_features='log2', min_samples_leaf=0.1,
                       min_samples_split=0.30000000000000004,
                       min_weight_fraction_leaf=0.1, n_estimators=110,
                       oob_score=True) 0.9666666666666667


## Tuning Neural Network

## Tuning XGBoost 