## import datasets

In [3]:
import pandas as pd
true=pd.read_csv("True.csv")
fake=pd.read_csv("Fake.csv")

## Data Preview

In [4]:
true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [5]:
fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [6]:
#collect summary about the dataset
true.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB


In [7]:
fake.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
dtypes: object(4)
memory usage: 733.9+ KB


## cleaning data

In [8]:
#missing value
true.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [9]:
fake.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

#### the is no missing value

In [10]:
#see the duplication
true.duplicated().sum()

206

In [11]:
fake.duplicated().sum()

3

In [12]:
# Remove duplicates
true = true.drop_duplicates()
fake = fake.drop_duplicates()

In [13]:
#again see the duplication
true.duplicated().sum()
fake.duplicated().sum()

0

In [14]:
# Add a 'label' column to distinguish between true and fake news
true['label'] = 1  # True news
fake['label'] = 0

In [15]:
true.head(4)

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1


In [16]:
fake.head(4)

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0


## Combine the datasets

In [17]:
# Combine the datasets
data = pd.concat([true, fake], axis=0).reset_index(drop=True)

In [18]:
# Drop unnecessary columns
data = data.drop(columns=['title', 'subject', 'date'])

In [19]:
data

Unnamed: 0,text,label
0,WASHINGTON (Reuters) - The head of a conservat...,1
1,WASHINGTON (Reuters) - Transgender people will...,1
2,WASHINGTON (Reuters) - The special counsel inv...,1
3,WASHINGTON (Reuters) - Trump campaign adviser ...,1
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,1
...,...,...
44684,21st Century Wire says As 21WIRE reported earl...,0
44685,21st Century Wire says It s a familiar theme. ...,0
44686,Patrick Henningsen 21st Century WireRemember ...,0
44687,21st Century Wire says Al Jazeera America will...,0


## training and testing model

In [20]:
# Define the independent (X) and dependent (y) variables
X = data['text']  
y = data['label'] 

In [21]:
# Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
# Preprocessing the text data using TF-IDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

## Apply Machine Learning Algorithms ###

In [23]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
lr_model = LogisticRegression()
lr_model.fit(X_train_tfidf, y_train)
y_pred_lr = lr_model.predict(X_test_tfidf)
print("Logistic Regression")
print(f'Accuracy: {accuracy_score(y_test, y_pred_lr) * 100:.2f}%')
print(classification_report(y_test, y_pred_lr))

Logistic Regression
Accuracy: 98.43%
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      4648
           1       0.98      0.98      0.98      4290

    accuracy                           0.98      8938
   macro avg       0.98      0.98      0.98      8938
weighted avg       0.98      0.98      0.98      8938



#### The Logistic Regression model is very accurate, correctly predicting whether news is fake or true 98.43% of the time. It does a great job at identifying both fake and true news, with very few mistakes. The results show the model is reliable and balanced in its predictions.

In [24]:
from sklearn.tree import DecisionTreeClassifier
# Decision Tree Classifier
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train_tfidf, y_train)
y_pred_dt = dt_model.predict(X_test_tfidf)
print("\nDecision Tree Classifier")
print(f'Accuracy: {accuracy_score(y_test, y_pred_dt) * 100:.2f}%')
print(classification_report(y_test, y_pred_dt))


Decision Tree Classifier
Accuracy: 99.44%
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      4648
           1       1.00      0.99      0.99      4290

    accuracy                           0.99      8938
   macro avg       0.99      0.99      0.99      8938
weighted avg       0.99      0.99      0.99      8938



#### The Decision Tree Classifier achieved an impressive accuracy of 99.54%, meaning it correctly classified nearly all news articles. The precision, recall, and F1-scores for both fake news (class 0) and true news (class 1) are extremely high, close to 1.00. This indicates that the model almost perfectly distinguishes between fake and true news, making very few errors overall.

In [25]:
# Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
gb_model = GradientBoostingClassifier()
gb_model.fit(X_train_tfidf, y_train)
y_pred_gb = gb_model.predict(X_test_tfidf)
print("\nGradient Boosting Classifier")
print(f'Accuracy: {accuracy_score(y_test, y_pred_gb) * 100:.2f}%')
print(classification_report(y_test, y_pred_gb))


Gradient Boosting Classifier
Accuracy: 99.49%
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      4648
           1       0.99      1.00      0.99      4290

    accuracy                           0.99      8938
   macro avg       0.99      0.99      0.99      8938
weighted avg       0.99      0.99      0.99      8938



In [26]:
# Random Forest Classifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
rf_model = RandomForestClassifier()
rf_model.fit(X_train_tfidf, y_train)
y_pred_rf = rf_model.predict(X_test_tfidf)
print("\nRandom Forest Classifier")
print(f'Accuracy: {accuracy_score(y_test, y_pred_rf) * 100:.2f}%')
print(classification_report(y_test, y_pred_rf))


Random Forest Classifier
Accuracy: 99.13%
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4648
           1       0.99      0.99      0.99      4290

    accuracy                           0.99      8938
   macro avg       0.99      0.99      0.99      8938
weighted avg       0.99      0.99      0.99      8938



### Manual Testing with a Sample Entry ###



In [27]:
# Example news text for manual testing
manual_test_data = [
    "The government announced a new policy that will help reduce unemployment by 20% over the next five years."
]



In [28]:
# Preprocess manual entry using the same TF-IDF vectorizer
manual_test_tfidf = vectorizer.transform(manual_test_data)

# Test using the Logistic Regression model (or any other model)
manual_pred = lr_model.predict(manual_test_tfidf)



In [29]:
# Output the result
print("\nManual Test")
if manual_pred[0] == 1:
    print("The news is predicted to be TRUE.")
else:
    print("The news is predicted to be FAKE.")


Manual Test
The news is predicted to be FAKE.


In [30]:
# Example of another manual test data
manual_test_data = [
    "The president announced a new plan to boost healthcare funding by 20% in the next fiscal year, aiming to improve medical infrastructure and accessibility."
]

# Preprocess manual entry using the same TF-IDF vectorizer
manual_test_tfidf = vectorizer.transform(manual_test_data)

# Test using the Logistic Regression model (or any other model)
manual_pred = lr_model.predict(manual_test_tfidf)

# Output the result
print("\nManual Test for True News")
if manual_pred[0] == 1:
    print("The news is predicted to be TRUE.")
else:
    print("The news is predicted to be FAKE.")


Manual Test for True News
The news is predicted to be TRUE.


In [33]:
# this data i randomly pick in CNN news
manual_test_data = [
    "Vice President Kamala Harris went on the offensive against former President Donald Trump on immigration Friday during her visit to the southern border in Arizona as she tries to turn a political vulnerability on its head.Immigration has featured prominently in the 2024 presidential election, with polls showing voters placing more trust in Trump to handle the issue than Harris. Democrats, grappling with years of border crises, have tried to gain ground by pointing to the bipartisan border measure that congressional Republicans blocked earlier this year after Trump came out against it. Harris on Friday lambasted Trump for his role in stymying that bill. “It was the strongest border security bill we have seen in decades. It was endorsed by the Border Patrol union. And it should be in effect today, producing results in real time, right now, for our country,” she said at a rally in Douglas, a town on the US-Mexico border. “But Donald Trump tanked it. He picked up the phone and called some friends in Congress and said, ‘Stop the bill,’” she said. “He prefers to run on a problem instead of fixing a problem. And the American people deserve a president who cares more about border security than playing political games and their personal political future.” She said she would ask Congress to pass the measure if she is elected, and would sign it into law. She also laid out a series of proposals that she said were “not just about some rhetoric at a rally,” but would help stem the flow of migrants into the United States."
]
print("text data are for testing: ", manual_test_data)

# Preprocess manual entry using the same TF-IDF vectorizer
manual_test_tfidf = vectorizer.transform(manual_test_data)

# Test using the Logistic Regression model (or any other model)
manual_pred = lr_model.predict(manual_test_tfidf)

# Output the result
print("\nManual Test for True News")
if manual_pred[0] == 1:
    print("The news is predicted to be TRUE.")
else:
    print("The news is predicted to be FAKE.")

text data are for testing:  ['Vice President Kamala Harris went on the offensive against former President Donald Trump on immigration Friday during her visit to the southern border in Arizona as she tries to turn a political vulnerability on its head.Immigration has featured prominently in the 2024 presidential election, with polls showing voters placing more trust in Trump to handle the issue than Harris. Democrats, grappling with years of border crises, have tried to gain ground by pointing to the bipartisan border measure that congressional Republicans blocked earlier this year after Trump came out against it. Harris on Friday lambasted Trump for his role in stymying that bill. “It was the strongest border security bill we have seen in decades. It was endorsed by the Border Patrol union. And it should be in effect today, producing results in real time, right now, for our country,” she said at a rally in Douglas, a town on the US-Mexico border. “But Donald Trump tanked it. He picked 