In [3]:
!pip install kagglehub



In [1]:
import kagglehub
import os
import pandas as pd
import numpy as np

dataset_path = kagglehub.dataset_download('saurabhshahane/fake-news-classification')

print("Path to dataset files:", dataset_path)

path_join = os.path.join(dataset_path, 'WELFake_Dataset.csv')
df = pd.read_csv(path_join)

print(df.head())

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /Users/jacksonglass/.cache/kagglehub/datasets/saurabhshahane/fake-news-classification/versions/77
   Unnamed: 0                                              title  \
0           0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1           1                                                NaN   
2           2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3           3  Bobby Jindal, raised Hindu, uses story of Chri...   
4           4  SATAN 2: Russia unvelis an image of its terrif...   

                                                text  label  
0  No comment is expected from Barack Obama Membe...      1  
1     Did they post their votes for Hillary already?      1  
2   Now, most of the demonstrators gathered last ...      1  
3  A dozen politically active pastors came here f...      0  
4  The RS-28 Sarmat missile, dubbed Satan 2, will...      1  


In [19]:
sample_1 = df.sample(n=10000)
sample_1

Unnamed: 0.1,Unnamed: 0,title,text,label
48973,48973,There Is Something Extraordinary Happening In ...,Most of us haven’t quite realized there is som...,1
15766,15766,Trump Tells Reporters: ‘Walls Work - Just Ask ...,President Trump stood by his campaign promise ...,0
7270,7270,EXCLUSIVE – Islamic State Supporters: ‘Soon We...,"Writing in an encrypted chat room, Islamic Sta...",0
20783,20783,Liz Warren Launches Book Hours Before Being To...,"In an extraordinary coincidence, Sen. Elizabet...",0
30246,30246,“It’s Fucking Christmas Now So Start Buying Sh...,0 Add Comment \nTHE RETAIL industry has wasted...,1
...,...,...,...,...
67750,67750,Trump SCOTUS Nominee: ’2nd Amendment Protects ...,President Trump’s nominee to the Supreme Court...,0
50230,50230,Robert Reich HAMMERS Trump’s Unpresidential B...,The former Labor Secretary penned a short mess...,1
42901,42901,Trump Gets Laughed Off Twitter After Saying C...,"On Sunday, Donald Trump once again embarrassed...",1
60424,60424,Anderson Cooper Demands Answers After Trump’s...,"On Friday, Donald Trump broke a 37-year old pl...",1


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [21]:
vectorizer = TfidfVectorizer(
    max_features=2000,
    stop_words='english',
    max_df=0.85,
    min_df=2,
    ngram_range=(1, 2),
    strip_accents='unicode',
    lowercase=True
)

In [22]:
sample_1 = sample_1.dropna()

In [23]:
tfidf_matrix = vectorizer.fit_transform(sample_1['text'])
feature_names = vectorizer.get_feature_names_out()

In [24]:
np.random.choice(feature_names, size=500, replace=False)

array(['discussion', 'marriage', 'raised', 'april', 'floor', 'pic', 'fed',
       'graham', '2016', 'bomb', 'congressman', 'challenge', 'thinks',
       'power', 'saturday', 'similar', 'wants', '28', 'changes', 'church',
       'source', 'prior', 'park', 'parenthood', 'spending', 'seeking',
       'help', 'ability', 'george bush', '2010', 'increase',
       'investigating', 'wanted', 'articles', 'read', 'false', 'gets',
       'leading', 'wins', 'ask', 'launched', 'details', 'lie', 'manager',
       'journalist', 'hearing', 'access', 'european', 'mike', 'knows',
       'wednesday', 'northern', 'voted', 'cancer', 'based', 'income',
       'president', 'children', 'try', 'controlled', 'trump said',
       'fellow', 'press', 'vice', 'turkish', 'physical', 'told',
       'southern', 'wikileaks', 'coming', 'killed', 'flight',
       'intelligence', 'send', 'foundation', 'fall', 'sanctions',
       'completely', 'brexit', 'civilians', 'making', 'extremely',
       'standard', 'conservative',

In [26]:
tfidf_matrix.shape

(9930, 2000)

In [112]:
def find_optimal_clusters(tfidf_matrix, max_clusters=20):
    """
    Standard implementation of elbow method and silhouette score to find optimal number of clusters.
    """
    inertias = []
    silhouette_scores = []
    K_range = range(2, min(max_clusters + 1, (tfidf_matrix.shape)[0]))
    
    print("\nEvaluating optimal number of clusters...")
    for k in K_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(tfidf_matrix)
        inertias.append(kmeans.inertia_)
        
        # Calculate silhouette score
        score = silhouette_score(tfidf_matrix, kmeans.labels_, sample_size=min(1000, (tfidf_matrix.shape)[0]))
        silhouette_scores.append(score)
        print(f"  k={k}: silhouette={score:.3f}")
    
    # Suggest k with highest silhouette score
    best_k = K_range[np.argmax(silhouette_scores)]
    print(f"\nSuggested number of clusters: {best_k}")
    
    return best_k

In [113]:
(tfidf_matrix.shape)[0]

9930

In [114]:
n_clusters = find_optimal_clusters(tfidf_matrix, max_clusters=20)

kmeans = KMeans(n_clusters=n_clusters, random_state=1, n_init=10)
cluster_labels = kmeans.fit_predict(tfidf_matrix)


Evaluating optimal number of clusters...
  k=2: silhouette=0.011
  k=3: silhouette=0.008
  k=4: silhouette=0.012
  k=5: silhouette=0.013
  k=6: silhouette=0.013
  k=7: silhouette=0.019
  k=8: silhouette=0.017
  k=9: silhouette=0.012
  k=10: silhouette=0.017
  k=11: silhouette=0.016
  k=12: silhouette=0.022
  k=13: silhouette=0.020
  k=14: silhouette=0.020
  k=15: silhouette=0.019
  k=16: silhouette=0.016
  k=17: silhouette=0.020
  k=18: silhouette=0.020
  k=19: silhouette=0.021
  k=20: silhouette=0.021

Suggested number of clusters: 12


In [115]:
cluster_labels
sample_1['cluster'] = cluster_labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_1['cluster'] = cluster_labels


In [116]:
sample_1[sample_1['cluster'] == 10]

Unnamed: 0.1,Unnamed: 0,title,text,label,cluster
15766,15766,Trump Tells Reporters: ‘Walls Work - Just Ask ...,President Trump stood by his campaign promise ...,0,10
71620,71620,Vicious Trump Fans Send BLOODCURDLING Threats...,Several mosques in California have received th...,1,10
30846,30846,Is Trump EMBRACING HITLER? Has Thousands Of F...,What the HELL just happened in Orlando? Donald...,1,10
35900,35900,"U.S. Women's Open must leave Trump National, s...",(Reuters) - Fourteen years after criticizing A...,0,10
40678,40678,Rivals pile on Trump in Republican candidates’...,The calamity brought upon the Republican Party...,0,10
...,...,...,...,...,...
47598,47598,Why House Democrats think Donald Trump can del...,There's a sense of growing optimism among Demo...,0,10
71835,71835,Trump tries to soothe Polish-Americans' concer...,CHICAGO (Reuters) - Republican presidential no...,0,10
18189,18189,TRUMP RUFFLES FEATHERS With New And Bold Presi...,Who knew that members of a presidential admini...,1,10
50230,50230,Robert Reich HAMMERS Trump’s Unpresidential B...,The former Labor Secretary penned a short mess...,1,10


In [117]:
for cluster in range(n_clusters):
    cluster_mask = sample_1['cluster'] == cluster
    cluster_data = sample_1[cluster_mask]
    size = len(cluster_data)
    fake_fraction = (cluster_data['label']==0).sum() / size

    print(f'Cluster {cluster}: Size={size}, Fake Fraction={fake_fraction:.2f}')

Cluster 0: Size=1500, Fake Fraction=0.16
Cluster 1: Size=714, Fake Fraction=0.30
Cluster 2: Size=183, Fake Fraction=0.90
Cluster 3: Size=726, Fake Fraction=0.83
Cluster 4: Size=368, Fake Fraction=0.95
Cluster 5: Size=216, Fake Fraction=0.27
Cluster 6: Size=538, Fake Fraction=0.87
Cluster 7: Size=439, Fake Fraction=0.54
Cluster 8: Size=2418, Fake Fraction=0.41
Cluster 9: Size=325, Fake Fraction=0.70
Cluster 10: Size=1461, Fake Fraction=0.31
Cluster 11: Size=1042, Fake Fraction=0.82


In [118]:
# Get top terms per cluster
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for cluster in range(n_clusters):
    top_terms = [terms[ind] for ind in order_centroids[cluster, :10]]
    print(f"Cluster {cluster} top terms: {', '.join(top_terms)}")

    

Cluster 0 top terms: twitter, com, trump, pic, 2016, pic twitter, twitter com, news, https, 2017
Cluster 1 top terms: clinton, hillary, hillary clinton, trump, campaign, sanders, election, democratic, said, state
Cluster 2 top terms: korea, north korea, north, korean, nuclear, missile, china, said, south, south korea
Cluster 3 top terms: house, tax, said, senate, trump, republican, white house, white, republicans, obamacare
Cluster 4 top terms: mr, mr trump, said, trump, ms, new, president, mrs clinton, mrs, like
Cluster 5 top terms: fbi, comey, clinton, investigation, emails, trump, fbi director, hillary, director, james comey
Cluster 6 top terms: said, eu, minister, european, party, israel, prime minister, prime, britain, government
Cluster 7 top terms: russia, russian, putin, moscow, trump, said, intelligence, syria, president, ukraine
Cluster 8 top terms: said, police, people, like, just, new, year, time, black, state
Cluster 9 top terms: court, supreme court, supreme, judge, said,