In [1]:
pip install pandas torch transformers scikit-learn ipywidgets numpy sentence_transformers


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/opt/homebrew/Cellar/jupyterlab/4.2.5_1/libexec/bin/python -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import torch
from transformers import GPT2Tokenizer, GPT2Model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sentence_transformers import SentenceTransformer

In [3]:
# Load the dataset
df = pd.read_csv('WELFakeDataset5000.csv')

# Drop rows where the label is NaN
df = df.dropna(subset=['label'])


text_data = df['text'].fillna('').tolist()
labels = df['label'].tolist()  # 1 = real, 0 = fake

# Load GPT model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
print(df['label'].value_counts())


  df = pd.read_csv('WELFakeDataset5000.csv')


label
1                                                                                                               2206
0                                                                                                               2001
 anyone who thinks the Feminists want ‚Äúequality‚Äù only has to look at this decision. If Hillary was a man       1
 and not one shred of gratitude nor remorse from those who took credit they were unable to pay. Oz Steamer         1
 if current demographic trends continue                                                                            1
                                                                                                                ... 
 and whatever else I might decide to indulge in                                                                    1
 you‚Äôll help the Repubs steal what‚Äôs left of the country. Michael                                              1
 the safety net is going to break.‚Äù Yes                 

In [4]:

# Define function to get GPT embeddings with additional checks
def get_gpt_embedding(text):
    if not text.strip():  # Skip if text is empty or only whitespace
        return np.zeros((model.config.hidden_size,))
    
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    
    # Check if tokenizer produced any input IDs
    if inputs['input_ids'].size(1) == 0:
        return np.zeros((model.config.hidden_size,))  # Return zero vector if no tokens are generated
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Pooling to get a single vector
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()


In [5]:
model = SentenceTransformer('all-MiniLM-L6-v2')

# embeddings = [get_gpt_embedding(text) for text in text_data]
embeddings = [model.encode(text) for text in text_data]


In [6]:
# Split data into training and test sets

X_temp, X_test, y_temp, y_test = train_test_split(embeddings, labels, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)  # 20% of total for validation

# Check for NaN values in y_train and y_test after splitting
if any(pd.isna(y_train)) or any(pd.isna(y_test)):
    print("Warning: NaN values detected in target variable after train-test split.")

# Scale embeddings
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Increase max_iter in LogisticRegression / Train a classifier
#clf = LogisticRegression(max_iter=500)  # Increase iterations

#clf = RandomForestClassifier(n_estimators=100, random_state=42)

ensemble = VotingClassifier(estimators=[
    ('lr', LogisticRegression(max_iter=500)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('svc', SVC(kernel='rbf', probability=True))
], voting='soft')

ensemble.fit(X_train_scaled, y_train)



In [8]:

# Validate the model
y_val_pred = ensemble.predict(X_val_scaled)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")

# Test the model
y_test_pred = ensemble.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Print individual classifier votes for the test set
print("Classifier Votes for Test Samples:")
for i, (lr_vote, rf_vote, svc_vote) in enumerate(zip(
    ensemble.named_estimators_['lr'].predict(X_test_scaled),
    ensemble.named_estimators_['rf'].predict(X_test_scaled),
    ensemble.named_estimators_['svc'].predict(X_test_scaled)
)):
    print(f"Sample {i+1}: LogisticRegression={lr_vote}, RandomForest={rf_vote}, SVC={svc_vote}, Final Vote={y_test_pred[i]}")

# Print detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))


Validation Accuracy: 84.66%
Test Accuracy: 83.30%
Classifier Votes for Test Samples:
Sample 1: LogisticRegression=114, RandomForest=114, SVC=114, Final Vote=0
Sample 2: LogisticRegression=114, RandomForest=118, SVC=114, Final Vote=0
Sample 3: LogisticRegression=118, RandomForest=118, SVC=118, Final Vote=1
Sample 4: LogisticRegression=118, RandomForest=118, SVC=118, Final Vote=1
Sample 5: LogisticRegression=118, RandomForest=118, SVC=118, Final Vote=1
Sample 6: LogisticRegression=114, RandomForest=114, SVC=114, Final Vote=0
Sample 7: LogisticRegression=118, RandomForest=118, SVC=118, Final Vote=1
Sample 8: LogisticRegression=118, RandomForest=118, SVC=118, Final Vote=1
Sample 9: LogisticRegression=118, RandomForest=118, SVC=118, Final Vote=1
Sample 10: LogisticRegression=114, RandomForest=118, SVC=118, Final Vote=1
Sample 11: LogisticRegression=118, RandomForest=118, SVC=118, Final Vote=1
Sample 12: LogisticRegression=114, RandomForest=114, SVC=114, Final Vote=0
Sample 13: LogisticRegre

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
# Drop any unnamed columns (columns with no header) if they exist
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]


# Add predictions to original dataframe
df['prediction'] = ensemble.predict(embeddings)
#df['prediction'] = df['prediction'].apply(lambda x: 'real' if x == 1 else 'fake')

# Save results
df.to_csv('gpt_analyzed_dataset.csv', index=False)
print("Analysis complete. Results saved to 'gpt_analyzed_dataset.csv'.")

Analysis complete. Results saved to 'gpt_analyzed_dataset.csv'.
