In [3]:
!pip install kagglehub



In [3]:
import kagglehub
import os
import pandas as pd
import numpy as np

dataset_path = kagglehub.dataset_download('stevenpeutz/misinformation-fake-news-text-dataset-79k')

print("Path to dataset files:", dataset_path)

misinfo_fake = os.path.join(dataset_path, 'DataSet_Misinfo_FAKE.csv')
df_fake = pd.read_csv(misinfo_fake)

misinfo_true = os.path.join(dataset_path, 'DataSet_Misinfo_TRUE.csv')
df_true = pd.read_csv(misinfo_true)

print(df_fake.head())
print(df_true.head())

Path to dataset files: /Users/jacksonglass/.cache/kagglehub/datasets/stevenpeutz/misinformation-fake-news-text-dataset-79k/versions/2
   Unnamed: 0                                               text
0           0  Donald Trump just couldn t wish all Americans ...
1           1  House Intelligence Committee Chairman Devin Nu...
2           2  On Friday, it was revealed that former Milwauk...
3           3  On Christmas day, Donald Trump announced that ...
4           4  Pope Francis used his annual Christmas Day mes...
   Unnamed: 0                                               text
0           0  The head of a conservative Republican faction ...
1           1  Transgender people will be allowed for the fir...
2           2  The special counsel investigation of links bet...
3           3  Trump campaign adviser George Papadopoulos tol...
4           4  President Donald Trump called on the U.S. Post...


In [4]:
# Combine datasets with label

df_fake['label'] = 0
df_true['label'] = 1

df = pd.concat([df_fake, df_true], ignore_index=True)

df = df.drop(columns=['Unnamed: 0'])

df.head()

Unnamed: 0,text,label
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0


In [12]:
sample_1 = df.sample(n=1000)
sample_1

Unnamed: 0,text,label
58216,Turkish police have launched operations to tra...,1
49552,Airlines operating at Cairo airport were offic...,1
32804,"Welcome Back to a Bipolar World, Courtesy of a...",0
67569,"On the Friday edition of Breitbart News Daily,...",1
23268,"Wed, 26 Oct 2016 23:00 UTC Authorities in Nort...",0
...,...,...
2215,"On Thursday, Fox News hack and Donald Trump te...",0
65336,"On Thursday’s episode of “The Dr. Oz Show,” Do...",1
26258,source Add To The Conversation Using Facebook ...,0
48645,A group holding more than $10 billion of Puert...,1


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [7]:
vectorizer = TfidfVectorizer(
    max_features=1000,
    stop_words='english',
    max_df=0.85,
    min_df=2,
    ngram_range=(1, 2),
    strip_accents='unicode',
    lowercase=True
)

In [13]:
sample_1 = sample_1.dropna()

In [14]:
tfidf_matrix = vectorizer.fit_transform(sample_1['text'])
feature_names = vectorizer.get_feature_names_out()

In [15]:
def find_optimal_clusters(tfidf_matrix, max_clusters=10):
    """
    Standard implementation of elbow method and silhouette score to find optimal number of clusters.
    """
    inertias = []
    silhouette_scores = []
    K_range = range(2, min(max_clusters + 1, (tfidf_matrix.shape)[0]))
    
    print("\nEvaluating optimal number of clusters...")
    for k in K_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(tfidf_matrix)
        inertias.append(kmeans.inertia_)
        
        # Calculate silhouette score
        score = silhouette_score(tfidf_matrix, kmeans.labels_, sample_size=min(1000, (tfidf_matrix.shape)[0]))
        silhouette_scores.append(score)
        print(f"  k={k}: silhouette={score:.3f}")
    
    # Suggest k with highest silhouette score
    best_k = K_range[np.argmax(silhouette_scores)]
    print(f"\nSuggested number of clusters: {best_k}")
    
    return best_k

In [16]:
(tfidf_matrix.shape)[0]

1000

In [17]:
n_clusters = find_optimal_clusters(tfidf_matrix, max_clusters=30)

kmeans = KMeans(n_clusters=n_clusters, random_state=1, n_init=10)
cluster_labels = kmeans.fit_predict(tfidf_matrix)


Evaluating optimal number of clusters...
  k=2: silhouette=0.013
  k=3: silhouette=0.010
  k=4: silhouette=0.014
  k=5: silhouette=0.016
  k=6: silhouette=0.017
  k=7: silhouette=0.020
  k=8: silhouette=0.018
  k=9: silhouette=0.021
  k=10: silhouette=0.021
  k=11: silhouette=0.023
  k=12: silhouette=0.020
  k=13: silhouette=0.023
  k=14: silhouette=0.026
  k=15: silhouette=0.026
  k=16: silhouette=0.024
  k=17: silhouette=0.027
  k=18: silhouette=0.028
  k=19: silhouette=0.026
  k=20: silhouette=0.025
  k=21: silhouette=0.028
  k=22: silhouette=0.028
  k=23: silhouette=0.023
  k=24: silhouette=0.030
  k=25: silhouette=0.027
  k=26: silhouette=0.027
  k=27: silhouette=0.023
  k=28: silhouette=0.029
  k=29: silhouette=0.030
  k=30: silhouette=0.024

Suggested number of clusters: 29


In [18]:
cluster_labels
sample_1['cluster'] = cluster_labels

In [23]:
sample_1[sample_1['cluster'] == 6]

Unnamed: 0,text,label,cluster
23268,"Wed, 26 Oct 2016 23:00 UTC Authorities in Nort...",0,6
53457,The White House will likely make a decision by...,1,6
48778,The U.S. State Department released its annual ...,1,6
63738,"Important areas of Hong Kong s One Country, ...",1,6
63502,Hundreds of Tunisians protested on Saturday in...,1,6
18091,Hollywood police spokeswoman Miranda Grossman ...,0,6
46255,U.S. investigators examining money laundering ...,1,6
46004,A former U.S. Justice Department official has ...,1,6
49625,Two lawmakers and six consumer advocacy groups...,1,6
9322,"With all of the stories of, TV hosts involved ...",0,6


In [20]:
for cluster in range(n_clusters):
    cluster_mask = sample_1['cluster'] == cluster
    cluster_data = sample_1[cluster_mask]
    size = len(cluster_data)
    fake_fraction = (cluster_data['label']==0).sum() / size

    print(f'Cluster {cluster}: Size={size}, Fake Fraction={fake_fraction:.2f}')

Cluster 0: Size=10, Fake Fraction=0.80
Cluster 1: Size=47, Fake Fraction=0.15
Cluster 2: Size=102, Fake Fraction=0.64
Cluster 3: Size=16, Fake Fraction=0.12
Cluster 4: Size=23, Fake Fraction=0.43
Cluster 5: Size=28, Fake Fraction=0.50
Cluster 6: Size=28, Fake Fraction=0.36
Cluster 7: Size=26, Fake Fraction=0.58
Cluster 8: Size=99, Fake Fraction=0.47
Cluster 9: Size=22, Fake Fraction=0.05
Cluster 10: Size=20, Fake Fraction=0.15
Cluster 11: Size=69, Fake Fraction=0.86
Cluster 12: Size=26, Fake Fraction=0.65
Cluster 13: Size=25, Fake Fraction=0.88
Cluster 14: Size=60, Fake Fraction=0.45
Cluster 15: Size=21, Fake Fraction=0.67
Cluster 16: Size=35, Fake Fraction=0.51
Cluster 17: Size=32, Fake Fraction=0.72
Cluster 18: Size=20, Fake Fraction=0.75
Cluster 19: Size=36, Fake Fraction=0.75
Cluster 20: Size=25, Fake Fraction=0.12
Cluster 21: Size=25, Fake Fraction=0.92
Cluster 22: Size=24, Fake Fraction=0.58
Cluster 23: Size=57, Fake Fraction=0.30
Cluster 24: Size=24, Fake Fraction=0.29
Cluster 2

In [24]:
feature_names

array(['000', '10', '100', '11', '12', '13', '14', '15', '17', '18', '20',
       '2008', '2012', '2013', '2014', '2015', '2016', '2017', '21', '25',
       '28', '30', '40', '50', '60', 'able', 'abortion', 'access',
       'according', 'account', 'accused', 'act', 'action', 'actions',
       'actually', 'added', 'adding', 'address', 'administration',
       'adviser', 'agencies', 'agency', 'agenda', 'ago', 'agreed',
       'agreement', 'ahead', 'air', 'al', 'aleppo', 'allegations',
       'alleged', 'allies', 'allow', 'allowed', 'america', 'american',
       'americans', 'announced', 'anti', 'appeared', 'appears', 'april',
       'arabia', 'area', 'areas', 'armed', 'army', 'arrested', 'article',
       'ask', 'asked', 'asking', 'attack', 'attacks', 'attempt',
       'attention', 'attorney', 'attorney general', 'august',
       'authorities', 'authority', 'available', 'away', 'backed', 'bad',
       'ballot', 'ban', 'bank', 'barack', 'barack obama', 'base', 'based',
       'battle', 'b