# Fake news classification
![](https://static.scientificamerican.com/sciam/cache/file/DFD3D397-B132-4516-9D8D0C7B4F07763B_source.jpg?w=690&h=930&2A4ADDE6-C642-4536-8CBD4821540D9D7F)

In [42]:
import pandas as pd

# Only these columns are parsed from the CSV; "type" is the label column used
# further down, the rest are candidate model inputs.
columns = [
    "text", "language", "thread_title",
    "spam_score",
    "replies_count", "participants_count",
    "likes", "comments", "shares",
    "type",
]
DATA_PATH = "../input/fake-news-dataset/fake_news_dataset.csv"
df = pd.read_csv(DATA_PATH, usecols=columns)

In [43]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,text,language,thread_title,spam_score,replies_count,participants_count,likes,comments,shares,type
0,Print They should pay all the back all the mon...,english,Muslims BUSTED: They Stole Millions In Gov’t B...,0.0,0,1,0,0,0,bias
1,Why Did Attorney General Loretta Lynch Plead T...,english,Re: Why Did Attorney General Loretta Lynch Ple...,0.0,0,1,0,0,0,bias
2,Red State : \nFox News Sunday reported this mo...,english,BREAKING: Weiner Cooperating With FBI On Hilla...,0.0,0,1,0,0,0,bias
3,Email Kayla Mueller was a prisoner and torture...,english,PIN DROP SPEECH BY FATHER OF DAUGHTER Kidnappe...,0.068,0,0,0,0,0,bias
4,Email HEALTHCARE REFORM TO MAKE AMERICA GREAT ...,english,FANTASTIC! TRUMP'S 7 POINT PLAN To Reform Heal...,0.865,0,0,0,0,0,bias


In [44]:
# Keep English-language rows only, then discard every row that has a missing
# value in any column.
is_english = df["language"].eq("english")
df = df.loc[is_english].dropna()

In [45]:
# "language" holds a single value after the English-only filter, so remove it.
df = df.drop(columns=["language"])

In [46]:
# Registry mapping each registered name to an integer id, assigned in order of
# first registration.
feature_map = {}
features = 0  # next id to hand out; equals the number of registered names


def add_feature(name):
    """Register ``name`` in ``feature_map`` under the next free integer id.

    A name that is already registered keeps its existing id and the
    module-level ``features`` counter is left unchanged.
    """
    global features
    if name in feature_map:
        return
    feature_map[name] = features
    features += 1


# Registration order fixes the ids: "fake" -> 0, "real" -> 1.
for label_name in ("fake", "real"):
    add_feature(label_name)

In [47]:
def article_type(row):
    """Return the numeric label for ``row``: whether the article is fake or real.

    The id registered for "fake" is returned when ``row["type"]`` is exactly
    ``"fake"``; every other value maps to the id registered for "real".
    """
    label = "fake" if row["type"] == "fake" else "real"
    return feature_map[label]

In [48]:
# Replace the string label with its numeric id, evaluated row by row.
df["type"] = df.apply(func=article_type, axis="columns")

In [49]:
from sklearn.model_selection import train_test_split

# Fix the shuffle seed so the train/test partition, and every score computed
# from it, is identical on each Restart & Run All.
# NOTE(review): if the "fake" class turns out to be rare, consider also passing
# stratify=df["type"] so both splits contain it — confirm class counts first.
SPLIT_SEED = 42
df_train, df_test = train_test_split(df, random_state=SPLIT_SEED)

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Article bodies and thread titles each get their own vectorizer instance
# (default settings), so their vocabularies are learned independently.
vectorizer_text, vectorizer_title = TfidfVectorizer(), TfidfVectorizer()


In [51]:
# Fit the body-text vectorizer on the training rows only and encode them.
# The column is read rather than popped, so df_train is left intact and
# re-running this cell does not raise KeyError.
vectorized_text = vectorizer_text.fit_transform(df_train["text"].values)

In [52]:
# Fit the title vectorizer on the training rows only and encode them.
# The column is read rather than popped, so df_train is left intact and
# re-running this cell does not raise KeyError.
vectorized_title = vectorizer_title.fit_transform(df_train["thread_title"].values)

In [53]:
from scipy import sparse

# Turn each numeric training column into a single-column sparse matrix: the
# 1-D values become one sparse row, which is then transposed into a column so
# it can be stacked next to the TF-IDF matrices.
(
    spam_score_train,
    replies_count_train,
    participants_count_train,
    likes_train,
    comments_train,
    shares_train,
) = (
    sparse.csr_matrix(df_train[column_name].values).transpose()
    for column_name in (
        "spam_score",
        "replies_count",
        "participants_count",
        "likes",
        "comments",
        "shares",
    )
)

In [54]:
from scipy.sparse import hstack

# Concatenate the two TF-IDF blocks and the six numeric columns side by side
# into one sparse training matrix. The column order here must match the order
# used when building the test matrix.
X_train = hstack(
    [
        vectorized_text,
        vectorized_title,
        spam_score_train,
        replies_count_train,
        participants_count_train,
        likes_train,
        comments_train,
        shares_train,
    ]
)
# Read the label column instead of popping it: df_train is left intact, so
# this cell can be re-run without a KeyError.
y_train = df_train["type"].values

In [55]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so its bootstrap sampling and per-split feature sampling,
# and hence the reported score, are repeatable across runs.
clf = RandomForestClassifier(random_state=42)

In [56]:
# Train on the stacked training matrix; fit() returns the estimator, which is
# what the cell displays.
clf.fit(X=X_train, y=y_train)

RandomForestClassifier()

In [57]:
# Encode the held-out rows with the vectorizers fitted on the training data
# (transform only, so no vocabulary or IDF statistics are learned from the
# test rows). Columns are read rather than popped, so df_test is left intact
# and re-running this cell does not raise KeyError.
vectorized_text_test = vectorizer_text.transform(df_test["text"].values)
vectorized_title_test = vectorizer_title.transform(df_test["thread_title"].values)

In [58]:

# Turn each numeric test column into a single-column sparse matrix: the 1-D
# values become one sparse row, which is then transposed into a column so it
# can be stacked next to the TF-IDF matrices.
(
    spam_score_test,
    replies_count_test,
    participants_count_test,
    likes_test,
    comments_test,
    shares_test,
) = (
    sparse.csr_matrix(df_test[column_name].values).transpose()
    for column_name in (
        "spam_score",
        "replies_count",
        "participants_count",
        "likes",
        "comments",
        "shares",
    )
)

In [59]:

# Concatenate the test-side blocks in exactly the same column order as the
# training matrix, so each column index means the same feature for the
# fitted classifier.
X_test = hstack(
    [
        vectorized_text_test,
        vectorized_title_test,
        spam_score_test,
        replies_count_test,
        participants_count_test,
        likes_test,
        comments_test,
        shares_test,
    ]
)
# Read the label column instead of popping it: df_test is left intact, so
# this cell can be re-run without a KeyError.
y_test = df_test["type"].values

In [60]:

# Mean accuracy on the held-out split.
# NOTE(review): the "real" label covers every type other than "fake" (see
# article_type). If that leaves "fake" as a small minority, accuracy alone can
# be very high for a model that rarely predicts "fake" — TODO confirm with the
# class counts in y_test and a per-class metric.
clf.score(X=X_test, y=y_test)

0.9983803045027535