In [3]:
!pip install kagglehub



In [4]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("stevenpeutz/misinformation-fake-news-text-dataset-79k")

print("Path to dataset files:", path)

Path to dataset files: /Users/jacksonglass/Documents/Kaggle/datasets/stevenpeutz/misinformation-fake-news-text-dataset-79k/versions/2


In [69]:
import kagglehub
import os
import pandas as pd
import numpy as np

dataset_path = kagglehub.dataset_download('stevenpeutz/misinformation-fake-news-text-dataset-79k')

print("Path to dataset files:", dataset_path)

misinfo_fake = os.path.join(dataset_path, 'DataSet_Misinfo_FAKE.csv')
df_fake = pd.read_csv(misinfo_fake)

misinfo_true = os.path.join(dataset_path, 'DataSet_Misinfo_TRUE.csv')
df_true = pd.read_csv(misinfo_true)

print(df_fake.head())
print(df_true.head())

Path to dataset files: /Users/jacksonglass/Documents/Kaggle/datasets/stevenpeutz/misinformation-fake-news-text-dataset-79k/versions/2
   Unnamed: 0                                               text
0           0  Donald Trump just couldn t wish all Americans ...
1           1  House Intelligence Committee Chairman Devin Nu...
2           2  On Friday, it was revealed that former Milwauk...
3           3  On Christmas day, Donald Trump announced that ...
4           4  Pope Francis used his annual Christmas Day mes...
   Unnamed: 0                                               text
0           0  The head of a conservative Republican faction ...
1           1  Transgender people will be allowed for the fir...
2           2  The special counsel investigation of links bet...
3           3  Trump campaign adviser George Papadopoulos tol...
4           4  President Donald Trump called on the U.S. Post...


In [70]:
# Combine datasets with label

df_fake['label'] = 0
df_true['label'] = 1

df = pd.concat([df_fake, df_true], ignore_index=True)

df = df.drop(columns=['Unnamed: 0'])

df.head()

Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0


In [71]:
sample_1 = df.sample(n=1000)
sample_1

Unnamed: 0,text,label
64887,Venezuela accused France on Wednesday of joini...,1
3283,Donald Trump s ties to Vladimir Putin have bee...,0
13180,Because Bob Price at Breitbart News had the co...,0
72624,American cooks have frequent affairs with spir...,1
54043,The ongoing U.S.-Canadian trade “irritant” ove...,1
...,...,...
31671,Share on Facebook Russia is hoping for a “new ...,0
74015,Los Angeles (AFP) — Retired unbeaten boxing...,1
34843,Constitutional Law Expert: Comey Did NOT Viola...,0
24786,"Sonntag, 20. November 2016 Stiftung Warentest ...",0


In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [73]:
vectorizer = TfidfVectorizer(
    max_features=1000,
    stop_words='english',
    max_df=0.85,
    min_df=2,
    ngram_range=(1, 2),
    strip_accents='unicode',
    lowercase=True
)

In [74]:
tfidf_matrix = vectorizer.fit_transform(sample_1['text'])
feature_names = vectorizer.get_feature_names_out()

In [None]:
def find_optimal_clusters(tfidf_matrix, max_clusters=10):
    """
    Standard implementation of elbow method and silhouette score to find optimal number of clusters.
    """
    inertias = []
    silhouette_scores = []
    K_range = range(2, min(max_clusters + 1, (tfidf_matrix.shape)[0]))
    
    print("\nEvaluating optimal number of clusters...")
    for k in K_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(tfidf_matrix)
        inertias.append(kmeans.inertia_)
        
        # Calculate silhouette score
        score = silhouette_score(tfidf_matrix, kmeans.labels_, sample_size=min(1000, (tfidf_matrix.shape)[0]))
        silhouette_scores.append(score)
        print(f"  k={k}: silhouette={score:.3f}")
    
    # Suggest k with highest silhouette score
    best_k = K_range[np.argmax(silhouette_scores)]
    print(f"\nSuggested number of clusters: {best_k}")
    
    return best_k

In [76]:
(tfidf_matrix.shape)[0]

1000

In [77]:
n_clusters = find_optimal_clusters(tfidf_matrix, max_clusters=15)

kmeans = KMeans(n_clusters=n_clusters, random_state=1, n_init=10)
cluster_labels = kmeans.fit_predict(tfidf_matrix)


Evaluating optimal number of clusters...
  k=2: silhouette=0.011
  k=3: silhouette=0.010
  k=4: silhouette=0.013
  k=5: silhouette=0.015
  k=6: silhouette=0.017
  k=7: silhouette=0.019
  k=8: silhouette=0.016
  k=9: silhouette=0.018
  k=10: silhouette=0.018
  k=11: silhouette=0.021
  k=12: silhouette=0.023
  k=13: silhouette=0.025
  k=14: silhouette=0.024
  k=15: silhouette=0.027

Suggested number of clusters: 15


In [78]:
cluster_labels
sample_1['cluster'] = cluster_labels

In [79]:
sample_1[sample_1['cluster'] == 5]

Unnamed: 0,text,label,cluster
12255,The man running Hillary Clinton s campaign is ...,0,5
15191,You can t have people walking around with guns...,0,5
16768,Hillary Clinton hasn t given up all the e-mails?,0,5
28882,Why Can't Hillary Clinton Stop Telling Stupid ...,0,5
51169,Hillary Clinton has one of the strongest resum...,1,5
...,...,...,...
8175,Eric Garner s daughter has released a new vide...,0,5
1857,It can easily be said that Donald Trump has ab...,0,5
34469,Hillary Clinton Waiting In Wings Of Stage Sinc...,0,5
15880,Every day we learn of a new security breach or...,0,5


In [81]:
for cluster in range(n_clusters):
    cluster_mask = sample_1['cluster'] == cluster
    cluster_data = sample_1[cluster_mask]
    size = len(cluster_data)
    fake_fraction = (cluster_data['label']==0).sum() / size

    print(f'Cluster {cluster}: Size={size}, Fake Fraction={fake_fraction:.2f}')

Cluster 0: Size=44, Fake Fraction=0.55
Cluster 1: Size=55, Fake Fraction=0.29
Cluster 2: Size=61, Fake Fraction=0.28
Cluster 3: Size=28, Fake Fraction=0.89
Cluster 4: Size=133, Fake Fraction=0.32
Cluster 5: Size=67, Fake Fraction=0.75
Cluster 6: Size=28, Fake Fraction=0.04
Cluster 7: Size=20, Fake Fraction=0.25
Cluster 8: Size=29, Fake Fraction=0.21
Cluster 9: Size=25, Fake Fraction=0.84
Cluster 10: Size=210, Fake Fraction=0.65
Cluster 11: Size=41, Fake Fraction=0.85
Cluster 12: Size=25, Fake Fraction=0.16
Cluster 13: Size=147, Fake Fraction=0.67
Cluster 14: Size=87, Fake Fraction=0.77
