# Feature Extraction

In [29]:
import os
import pandas as pd

# Load the pre-cleaned news dataset from ./dataset relative to the working directory.
# The path is built from separate components so os.path.join inserts the correct
# separator on every OS; a backslash embedded in the literal only resolves on Windows.
clean_news_df = pd.read_csv(os.path.join(os.getcwd(), "dataset", "clean_news_df.csv"))


In [30]:
# Show the loaded DataFrame as this cell's output for a quick visual check.
clean_news_df

Unnamed: 0,true_or_fake,text,cleaned_text
0,true,"As U.S. budget fight looms, Republicans flip t...",budget fight loom republican flip fiscal scrip...
1,true,U.S. military to accept transgender recruits o...,military accept transgender recruit monday pen...
2,true,Senior U.S. Republican senator: 'Let Mr. Muell...,senior republican senator let mueller job wash...
3,true,FBI Russia probe helped by Australian diplomat...,fbi russia probe help australian diplomat tip ...
4,true,Trump wants Postal Service to charge 'much mor...,trump want postal service charge much amazon s...
...,...,...,...
44893,fake,McPain: John McCain Furious That Iran Treated ...,mcpain john mccain furious iran treat sailor w...
44894,fake,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,justice yahoo settle mail privacy class action...
44895,fake,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,sunnistan ally safe zone plan take territorial...
44896,fake,How to Blow $700 Million: Al Jazeera America F...,blow million jazeera america finally call quit...


In [31]:
# Discard every row that has a missing value in any column (the frame is modified in place)
clean_news_df.dropna(axis=0, how='any', inplace=True)

In [32]:
from sklearn.preprocessing import LabelEncoder

# Replace the string labels with integer codes. LabelEncoder sorts the classes,
# so 'fake' becomes 0 and 'true' becomes 1.
label_column = 'true_or_fake'

le = LabelEncoder()
clean_news_df[label_column] = le.fit_transform(clean_news_df[label_column])

# Target vector used for modelling
y = clean_news_df[label_column]

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
 
# Build TF-IDF features from the cleaned text: uni-, bi- and tri-grams, English
# stop words removed, and only terms that occur in at least 5% of documents kept.
# NOTE(review): the vectorizer is fitted on the whole corpus before the train/test
# split, so the IDF weights are learned from test documents too — consider fitting
# on the training portion only.
Vectorizer = TfidfVectorizer(ngram_range=(1,3),stop_words='english',min_df=0.05)
X = Vectorizer.fit_transform(clean_news_df['cleaned_text'])
feature_names = Vectorizer.get_feature_names_out()

# Keep the shape as the final expression so the notebook actually displays it;
# as a mid-cell statement it is evaluated and silently discarded.
X.shape

In [34]:
# Turn the sparse TF-IDF matrix into a dense DataFrame: one column per feature
# name, rows labelled with the same index as clean_news_df.
tdf_df = pd.DataFrame(
    data=X.toarray(),
    index=clean_news_df.index,
    columns=feature_names,
)
tdf_df

Unnamed: 0,able,accept,access,accord,account,accuse,act,action,actually,add,...,word,work,worker,world,write,wrong,year,year old,york,young
0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.038326,0.0,0.034452,...,0.000000,0.028959,0.0,0.000000,0.000000,0.0,0.068259,0.0,0.000000,0.047749
1,0.000000,0.252875,0.050489,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.0,0.048115,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.026584,0.0,0.090052,0.000000
3,0.000000,0.000000,0.000000,0.045455,0.0,0.060150,0.000000,0.055979,0.0,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.033233,0.0,0.168862,0.000000
4,0.000000,0.000000,0.000000,0.123876,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.0,0.027330,0.030471,0.0,0.108681,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44893,0.080065,0.000000,0.000000,0.114215,0.0,0.000000,0.136638,0.140659,0.0,0.000000,...,0.000000,0.000000,0.0,0.125993,0.000000,0.0,0.000000,0.0,0.070717,0.000000
44894,0.000000,0.113565,0.113373,0.000000,0.0,0.000000,0.000000,0.088682,0.0,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000
44895,0.000000,0.000000,0.000000,0.017645,0.0,0.011675,0.021110,0.000000,0.0,0.000000,...,0.023696,0.041049,0.0,0.038930,0.000000,0.0,0.045153,0.0,0.021851,0.000000
44896,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.102757,0.0,0.000000,0.000000,0.0,0.040368,0.0,0.000000,0.000000


In [35]:
# Target vector: the integer-encoded label column, shown for inspection
y = clean_news_df.loc[:, 'true_or_fake']
y

0        1
1        1
2        1
3        1
4        1
        ..
44893    0
44894    0
44895    0
44896    0
44897    0
Name: true_or_fake, Length: 44889, dtype: int32

# Dataset Partition

In [36]:
# Hold out 20% of the samples as the test set; the remaining 80% is used for training.
# The split is shuffled, stratified on the label so both parts keep the same class
# balance, and seeded with random_state=42 for reproducibility.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)

# Model Building

## Logistic Regression

## Naive Bayes

## Random Forest

## Gradient Boosting

In [37]:
from sklearn.ensemble import GradientBoostingClassifier

# Baseline gradient-boosting model with default hyper-parameters.
# random_state is pinned to the same seed as the train/test split so that repeated
# runs of the notebook produce the same fitted model and score.
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(X_train, y_train)

# Mean accuracy on the held-out test set
gbc.score(X_test,y_test)

0.9944308309200267

In [38]:
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score

# Predict once on the held-out set, then report every metric from the same predictions
gbc_preds = gbc.predict(X_test)

_metric_reports = (
    ("Accuracy: ", accuracy_score),
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1-Score: ", f1_score),
)
for _title, _metric_fn in _metric_reports:
    print(_title, _metric_fn(y_true=y_test, y_pred=gbc_preds))

# ravel() flattens the binary confusion matrix row by row: TN, FP, FN, TP
print("Confusion Matrix: \n", confusion_matrix(y_true=y_test, y_pred=gbc_preds).ravel())


Accuracy:  0.9944308309200267
Precision:  0.9918661399023937
Recall:  0.9964977819285548
F1-Score:  0.9941765665036104
Confusion Matrix: 
 [4660   35   15 4268]
