Imort Library

In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Z\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Z\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
data_path = r'C:\Users\Z\Documents\Farhan\DS bootcamp\news.csv'

In [3]:
try:
    df = pd.read_csv(data_path)
except pd.errors.ParserError as e:
    print(f"ParserError: {e}")

In [4]:
# Data Cleaning
df['news'] = df['title'] + df['text']
df['label'] = df['label'].replace({'FAKE': 0, 'REAL': 1})
df = df.drop(columns=['Unnamed: 0'])

  df['label'] = df['label'].replace({'FAKE': 0, 'REAL': 1})


In [5]:
df.head()

Unnamed: 0,title,text,label,news
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0,"You Can Smell Hillary’s FearDaniel Greenfield,..."
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0,Watch The Exact Moment Paul Ryan Committed Pol...
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1,Kerry to go to Paris in gesture of sympathyU.S...
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0,Bernie supporters on Twitter erupt in anger ag...
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1,The Battle of New York: Why This Primary Matte...


Steming & Stopwords

In [6]:
# Initialize PorterStemmer, Lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stopwords_set = set(stopwords.words('english'))

In [7]:
def clean(text):
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = ' '.join([word for word in text.split() if word not in stopwords_set])
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    return text

In [8]:
df['cleaned_text'] = df['text'].apply(clean)
df['cleaned_title'] = df['title'].apply(clean)

# Display the first few rows of the DataFrame to check the results
print(df.head())

                                               title  \
0                       You Can Smell Hillary’s Fear   
1  Watch The Exact Moment Paul Ryan Committed Pol...   
2        Kerry to go to Paris in gesture of sympathy   
3  Bernie supporters on Twitter erupt in anger ag...   
4   The Battle of New York: Why This Primary Matters   

                                                text  label  \
0  Daniel Greenfield, a Shillman Journalism Fello...      0   
1  Google Pinterest Digg Linkedin Reddit Stumbleu...      0   
2  U.S. Secretary of State John F. Kerry said Mon...      1   
3  — Kaydee King (@KaydeeKing) November 9, 2016 T...      0   
4  It's primary day in New York and front-runners...      1   

                                                news  \
0  You Can Smell Hillary’s FearDaniel Greenfield,...   
1  Watch The Exact Moment Paul Ryan Committed Pol...   
2  Kerry to go to Paris in gesture of sympathyU.S...   
3  Bernie supporters on Twitter erupt in anger ag...   
4  T

In [9]:
# Reset index tanpa menyimpan index lama
df_reset = df.reset_index(drop=True)

Cleaning Data

In [10]:
df['filered_text']=df['news'].apply(clean)

In [11]:
df.head()

Unnamed: 0,title,text,label,news,cleaned_text,cleaned_title,filered_text
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0,"You Can Smell Hillary’s FearDaniel Greenfield,...",daniel greenfield shillman journalism fellow f...,smell hillary’s fear,smell hillary’s feardaniel greenfield shillman...
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0,Watch The Exact Moment Paul Ryan Committed Pol...,google pinterest digg linkedin reddit stumbleu...,watch exact moment paul ryan committed politic...,watch exact moment paul ryan committed politic...
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1,Kerry to go to Paris in gesture of sympathyU.S...,u secretary state john f kerry said monday sto...,kerry go paris gesture sympathy,kerry go paris gesture sympathyus secretary st...
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0,Bernie supporters on Twitter erupt in anger ag...,— kaydee king kaydeeking november lesson tonig...,bernie supporter twitter erupt anger dnc tried...,bernie supporter twitter erupt anger dnc tried...
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1,The Battle of New York: Why This Primary Matte...,primary day new york frontrunners hillary clin...,battle new york primary matter,battle new york primary mattersits primary day...


Tokenization

In [12]:
# Text Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(df['cleaned_text'])
y = df['label']

In [13]:
X.shape

(6335, 5000)

In [14]:
y.shape

(6335,)

In [15]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Model Training and Evaluation
def evaluate_model(model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"Classification Report:\n {classification_report(y_test, y_pred)}")
    cm = confusion_matrix(y_test, y_pred)

Models dan Metrics

In [17]:
# Naive Bayes
print("Naive Bayes:")
nb_model = MultinomialNB()
evaluate_model(nb_model)

Naive Bayes:
Accuracy: 0.8918705603788477
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.90      0.89       628
           1       0.90      0.88      0.89       639

    accuracy                           0.89      1267
   macro avg       0.89      0.89      0.89      1267
weighted avg       0.89      0.89      0.89      1267



In [18]:
# Logistic Regression
print("Logistic Regression:")
lr_model = LogisticRegression(max_iter=1000)
evaluate_model(lr_model)

Logistic Regression:
Accuracy: 0.914759273875296
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.92      0.91       628
           1       0.92      0.91      0.91       639

    accuracy                           0.91      1267
   macro avg       0.91      0.91      0.91      1267
weighted avg       0.91      0.91      0.91      1267



In [19]:
# Support Vector Machine
print("Support Vector Machine:")
svm_model = SVC(kernel='linear')
evaluate_model(svm_model)

Support Vector Machine:
Accuracy: 0.9234411996842936
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.93      0.92       628
           1       0.93      0.92      0.92       639

    accuracy                           0.92      1267
   macro avg       0.92      0.92      0.92      1267
weighted avg       0.92      0.92      0.92      1267



In [20]:
# Gradient Boosting Classifier
print("Gradient Boosting Classifier:")
gbc_model = GradientBoostingClassifier()
evaluate_model(gbc_model)

Gradient Boosting Classifier:
Accuracy: 0.8966061562746646
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.91      0.90       628
           1       0.91      0.88      0.90       639

    accuracy                           0.90      1267
   macro avg       0.90      0.90      0.90      1267
weighted avg       0.90      0.90      0.90      1267



# Hyperparameter Tuning for the best performing model (example: SVM)
param_grid = {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf']}
grid_search = GridSearchCV(SVC(), param_grid, refit=True, verbose=2, cv=5)
grid_search.fit(X_train, y_train)
print("Best parameters found: ", grid_search.best_params_)
evaluate_model(grid_search.best_estimator_)

In [21]:
import joblib

In [22]:
# Save the trained logistic regression model
joblib.dump(svm_model, 'svm.pkl')

# Save the TF-IDF vectorizer
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']