# Pre-Cleaning

### Load Kaggle Dataset

In [202]:
import pandas as pd
# fake_csv = "../online-fake-news-detection/data/DS1_Fake.csv"
# true_csv = "../online-fake-news-detection/data/DS1_True.csv"
# fake_df = pd.read_csv(fake_csv)
# true_df = pd.read_csv(true_csv)
# true_df.head()

### Module

In [203]:
import trafilatura
downloaded = trafilatura.fetch_url('https://edition.cnn.com/europe/live-news/russia-ukraine-war-news-11-29-22/index.html')
extract = trafilatura.extract(downloaded)

## Load GH Dataset

### Dataset 3

In [228]:
DS3_submit_csv = "../ofnd/data/DS3_submit.csv"
DS3_submit_df = pd.read_csv(DS3_submit_csv)
DS3_test_csv = "../ofnd/data/DS3_test.csv"
DS3_test_df = pd.read_csv(DS3_test_csv)
DS3_submit_test_df = pd.merge(DS3_submit_df,DS3_test_df,'inner')
DS3_train_csv = "../ofnd/data/DS3_train.csv"
DS3_train_df = pd.read_csv(DS3_train_csv)
DS3_df = pd.merge(DS3_submit_test_df,DS3_train_df, 'outer')
DS3_df['news'] = DS3_df['title'] + DS3_df['text']
DS3_df = DS3_df.drop(columns = ['id','author','title','text'])
DS3_df['label'] = DS3_df['label'].apply(lambda x: True if x == 1 else False)

In [266]:
DS3_df.shape

(26000, 2)

### Dataset 2

In [229]:
import pandas as pd
DS2_csv = "../ofnd/data/DS2_fakenews.csv"
DS2_df = pd.read_csv(DS2_csv)

In [267]:
DS2_df.shape

(10240, 2)

### Dataset 4

In [230]:
DS4_csv = "../ofnd/data/DS4.csv"
DS4_df = pd.read_csv(DS4_csv)
DS4_df['news'] = DS4_df['title'] + DS4_df['text']
DS4_df = DS4_df.drop(columns = ['Unnamed: 0','title','text'])
DS4_df['label'] = DS4_df['label'].apply(lambda x: True if x == 1 else False)

In [268]:
DS4_df.shape

(72134, 2)

## Merge dataset

In [269]:
articles_df = pd.merge(DS2_df,DS3_df, 'outer')
articles_df = pd.merge(articles_df, DS4_df,'outer')
#articles_df = DS4_df

# Clean & Preprocessing

### Removing NAN rows

In [270]:
articles_df = articles_df[articles_df['news'].notna()]

### Drop duplicates

In [271]:
articles_df = articles_df.drop_duplicates()

In [272]:
import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import unidecode

In [273]:
def preprocessing(sentence):

    
    # Basic cleaning
    
    sentence = sentence.strip() ## remove whitespaces
    sentence = sentence.lower() ## lowercase 
    sentence = ''.join(char for char in sentence if not char.isdigit()) ## remove numbers
    
    # Advanced cleaning
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') ## remove punctuation

    unaccented_string = unidecode.unidecode(sentence) # remove accents

    tokenized_sentence = word_tokenize(unaccented_string) ## tokenize 
    stop_words = set(stopwords.words('english')) ## define stopwords

    tokenized_sentence_cleaned = [ ## remove stopwords
    w for w in tokenized_sentence if not w in stop_words
            ]

    lemmatized = [
    WordNetLemmatizer().lemmatize(word, pos = "v") 
    for word in tokenized_sentence_cleaned
    ]
    
    cleaned_sentence = ' '.join(word for word in lemmatized)

    
    return cleaned_sentence

articles_df['cleaned_news'] = articles_df['news'].apply(preprocessing)

## X_y split

In [274]:
from sklearn.model_selection import train_test_split

target = 'label'
feature = articles_df['cleaned_news']

def X_y(df, TARGET_COLUMN):
    X = df.drop([TARGET_COLUMN], axis=1)
    y = df[TARGET_COLUMN]
    return X, y


def split_data(X, y):

    X_train, X_test, y_train, y_test = train_test_split(X, y,\
        test_size=0.3, random_state=42, shuffle=True)

    return X_train, X_test, y_train, y_test

X, y = X_y(articles_df, target)
X_train, X_test, y_train, y_test = split_data(X,y)

## Vectorizer

### Fit & Transform

In [275]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import numpy as np
X_train_cleaned_title_text = X_train['cleaned_news']
X_test_cleaned_title_text = X_test['cleaned_news']

def train_vect(X: np.ndarray):
    tfidf_vectorizer = CountVectorizer(binary=False, decode_error='strict', encoding='utf-8',
                                input='content', lowercase=True, max_df=1.0,
                                max_features=None, min_df=1,
                                ngram_range=(1, 1), preprocessor=None,
                                stop_words=None, strip_accents=None,
                                token_pattern='(?u)\\b\\w\\w+\\b',
                                tokenizer=None, vocabulary=None)
                                
    tfidf_fitted = tfidf_vectorizer.fit(X)
    
    return tfidf_fitted


def transform_vect(X: np.ndarray, tfdidf_fitted):
    tfidf_transformed = tfdidf_fitted.transform(X)

    return tfidf_transformed


tfdidf_fitted = train_vect(X_train_cleaned_title_text)
X_train_vectorized = transform_vect(X_train_cleaned_title_text, tfdidf_fitted)
X_test_vectorized = transform_vect(X_test_cleaned_title_text, tfdidf_fitted)

### Transform

In [276]:
X_train_vectorized
X_test_vectorized

<23390x302285 sparse matrix of type '<class 'numpy.int64'>'
	with 4075714 stored elements in Compressed Sparse Row format>

# Evaluate : Cross-Validation

In [277]:
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB


# Cross-validation
cv_results = cross_validate(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
                             X_train_vectorized, y_train, cv=5, scoring=["accuracy"])
average_accuracy = cv_results["test_accuracy"].mean()
np.round(average_accuracy,2)

0.82

In [278]:
cv_results

{'fit_time': array([0.18787217, 0.12364697, 0.11828303, 0.12178707, 0.11660886]),
 'score_time': array([0.04387593, 0.03363299, 0.03432083, 0.03082991, 0.03074694]),
 'test_accuracy': array([0.82052222, 0.81768209, 0.81658268, 0.81410902, 0.80987722])}