In [94]:
import pandas as pd
import numpy as np

In [95]:
# Load the per-site dataset; the first CSV column is used as the row index.
fn_df = pd.read_csv('FakeReal_News_Sites.csv', index_col=[0])
# Preview the first rows to check that the columns parsed as expected.
fn_df.head()
# Disabled alternative for missing values (kept for reference only): turn +/-inf
# into NaN and zero-fill every column. A per-column fillna is used in a later cell.
# fn_df.replace([np.inf, -np.inf], np.nan, inplace=True)
# fn_df.fillna(0, inplace=True)


Unnamed: 0,Domain,Page rank integer,Page rank decimal,Site Rank,Domain registrar,Postal code,Country of origin,Harmless,Malicious,Suspicious,Undetected,Timeout,No. of Local links,No. of Outbound links,Total links,Links to fake news site?,Wordpress?,Site Rating
0,actionnews3.com,3,2.82,7390112.0,"GoDaddy.com, LLC",85284,US,86,0,0,10,0,16,1,17,False,True,Fake
1,abcnews-us.com,3,2.89,6579810.0,NamePal.com #8009 Inc.,32256,US,86,0,0,10,0,0,1,1,False,False,Fake
2,21stcenturywire.com,5,4.57,66742.0,"GoDaddy.com, LLC",85284,US,83,0,0,13,0,183,36,219,False,True,Fake
3,100percentfedup.com,4,4.15,778723.0,"GoDaddy.com, LLC",85284,US,84,1,0,11,0,49,8,57,False,True,Fake
4,abcnews.com,5,4.78,32787.0,"CSC CORPORATE DOMAINS, INC.",10023-6298,US,83,0,0,13,0,0,144,144,False,False,Fake


In [96]:
# Per column: does it contain at least one missing value?
fn_df.isna().any()

Domain                      False
Page rank integer           False
Page rank decimal           False
Site Rank                    True
Domain registrar             True
Postal code                  True
Country of origin            True
Harmless                    False
Malicious                   False
Suspicious                  False
Undetected                  False
Timeout                     False
No. of Local links          False
No. of Outbound links       False
Total links                 False
Links to fake news site?    False
Wordpress?                  False
Site Rating                 False
dtype: bool

In [97]:
# Fill missing values column by column: a missing 'Site Rank' becomes the number 0,
# while missing registrar / postal code / country fields become the string 'None'.
# NOTE(review): a Site Rank of 0 sorts ahead of every real rank - confirm that this
# is the intended encoding for a missing rank.
missing_value_defaults = {
    'Site Rank': 0,
    'Domain registrar': 'None',
    'Postal code': 'None',
    'Country of origin': 'None',
}
fn_df = fn_df.fillna(missing_value_defaults)


In [98]:
# Re-check after filling: every column should now report False.
fn_df.isna().any()

Domain                      False
Page rank integer           False
Page rank decimal           False
Site Rank                   False
Domain registrar            False
Postal code                 False
Country of origin           False
Harmless                    False
Malicious                   False
Suspicious                  False
Undetected                  False
Timeout                     False
No. of Local links          False
No. of Outbound links       False
Total links                 False
Links to fake news site?    False
Wordpress?                  False
Site Rating                 False
dtype: bool

In [99]:
# Number of rows whose 'Site Rating' label is 'Fake'.
is_fake = fn_df['Site Rating'] == 'Fake'
int(is_fake.sum())

149

In [100]:
# Collapse every positive 'Malicious' value to 1; non-positive values are left as is.
has_malicious_hit = fn_df['Malicious'] > 0
fn_df['Malicious'] = fn_df['Malicious'].mask(has_malicious_hit, 1)

In [101]:
# Feature columns handed to the models. 'Links to fake news site?', 'Suspicious'
# and 'No. of Outbound links' are deliberately left out as unnecessary.
feature_columns = [
    'Domain registrar',
    'Postal code',
    'Wordpress?',
    'Page rank decimal',
    'Site Rank',
    'Country of origin',
    'Malicious',
    'No. of Local links',
    'Total links',
]
X = fn_df.loc[:, feature_columns]

# Target: the rating label of each site.
y = fn_df['Site Rating']


In [102]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
import joblib

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

xgb = XGBClassifier()

# Encode the string ratings as integers. The labels are read straight from the
# frame instead of from the current `y`, so re-running this cell never re-encodes
# already-encoded integers and `le.classes_` always keeps the original label names.
le = LabelEncoder()
y = le.fit_transform(fn_df['Site Rating'])

# Column groups fed to the preprocessor. Columns of X that appear in none of these
# groups are not passed to the models (ColumnTransformer drops the remainder by default).
cat_cols = ['Postal code', 'Country of origin']
bin_cols = ['Wordpress?', 'Malicious']
num_cols = ['Page rank decimal','Site Rank', 'Total links']

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4.
# Try the current keyword first and fall back to the old one on older installs,
# where the unknown keyword raises TypeError.
try:
    one_hot = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:
    one_hot = OneHotEncoder(sparse=False, handle_unknown="ignore")

# Scale numeric columns, one-hot encode categorical ones (categories unseen during
# fit are ignored at transform time), and ordinal-encode the two-valued columns.
ct = ColumnTransformer([('standard_scaler', StandardScaler(), num_cols),
                        ('one_hot_encoder', one_hot, cat_cols),
                        ('label_encoder', OrdinalEncoder(), bin_cols)])

# One pipeline per candidate classifier. All of them hold the SAME `ct` object, so
# fitting any pipeline also refits the preprocessor used by the others.
#SVM
clfSVM = Pipeline(steps = [('preprocessor', ct), ('SVM', SVC())])
#logistic regression
clfLOGREG = Pipeline(steps = [('preprocessor', ct), ('Logistic Regression', LogisticRegression())])
#XGBoost
clfXGB = Pipeline(steps = [('preprocessor', ct), ('XGBoost', xgb)])
#Naive Bayes
clfNB = Pipeline(steps = [('preprocessor', ct), ('Naive Bayes', GaussianNB())])
#Random forest
clfRFC = Pipeline(steps = [('preprocessor', ct), ('RFC', RandomForestClassifier())])
# TODO: add a neural-network classifier.

# NOTE: `ct` has not been fitted at this point, so the object written here only
# captures the preprocessing configuration; it cannot transform data until fitted.
joblib.dump(ct, "SitePreprocessor.joblib")



['SitePreprocessor.joblib']

In [111]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, f1_score
models = [clfSVM, clfLOGREG, clfXGB, clfNB, clfRFC]
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)

# Maps classifier step name -> (accuracy, MAE, MSE, F1), all measured on the hold-out set.
model_scores={}

for idx, model in enumerate(models):
    model.fit(X_train, y_train)
    # The last pipeline step is the classifier; its step name labels the results
    # and the saved model file.
    name = model.steps[-1][0]

    score = model.score(X_test, y_test)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    # f1_score uses its default positive class (label 1).
    # NOTE(review): with LabelEncoder's sorted classes that is presumably the
    # non-'Fake' rating - confirm this is the class of interest.
    f1 = f1_score(y_test, y_pred)

    model_scores[name] = score, mae, mse, f1

    # Persist the whole fitted pipeline (preprocessor + classifier).
    joblib.dump(model, f'{name}Classifier.joblib')
    print(f'model {idx+1} done')

# The copy of SitePreprocessor.joblib written when the pipelines were defined holds
# an unfitted transformer. Overwrite it with the preprocessor fitted on X_train so
# the file can actually transform new data. The trailing ';' keeps the notebook
# from echoing the returned path list.
joblib.dump(models[0].named_steps['preprocessor'], "SitePreprocessor.joblib");
    



model 1 done
model 2 done
model 3 done
model 4 done
model 5 done


In [112]:
# Tuple order per model: (accuracy, mean absolute error, mean squared error, F1 score).
# With 0/1 integer labels, MAE and MSE both equal the misclassification rate
# (1 - accuracy), so they add no information beyond the accuracy.
model_scores

{'SVM': (0.9148936170212766,
  0.0851063829787234,
  0.0851063829787234,
  0.8666666666666666),
 'Logistic Regression': (0.9787234042553191,
  0.02127659574468085,
  0.02127659574468085,
  0.962962962962963),
 'XGBoost': (0.9574468085106383,
  0.0425531914893617,
  0.0425531914893617,
  0.9285714285714286),
 'Naive Bayes': (0.425531914893617,
  0.574468085106383,
  0.574468085106383,
  0.4905660377358491),
 'RFC': (0.9574468085106383,
  0.0425531914893617,
  0.0425531914893617,
  0.9285714285714286)}

In [None]:
# Rebuild the same 80/20 hold-out split (same seed) and refit only the XGBoost pipeline.
split_parts = train_test_split(X, y, test_size=0.2, random_state=45)
X_train, X_test, y_train, y_test = split_parts

clfXGB.fit(X_train, y_train)

# Accuracy of the refitted pipeline on the hold-out rows.
xgb_holdout_score = clfXGB.score(X_test, y_test)
print('Score: ', xgb_holdout_score)