# Pre-Cleaning

### Load Kaggle Dataset

In [32]:
import pandas as pd
# fake_csv = "../online-fake-news-detection/data/Fake.csv"
# true_csv = "../online-fake-news-detection/data/True.csv"
# fake_df = pd.read_csv(fake_csv)
# true_df = pd.read_csv(true_csv)
# true_df.head()

### Article Extraction Module (trafilatura)

In [None]:
import trafilatura
# Download the page at a CNN live-news URL, then display the text that
# trafilatura extracts from the downloaded content.
downloaded = trafilatura.fetch_url('https://edition.cnn.com/europe/live-news/russia-ukraine-war-news-11-29-22/index.html')
trafilatura.extract(downloaded)

In [41]:
from trafilatura import extract, fetch_url
# Download the article and wrap its extracted text in a one-row DataFrame.
# NOTE(review): the column is named 'title_texte', while the training data in
# this notebook uses 'title_text' — confirm which spelling is intended.
downloaded = fetch_url('https://edition.cnn.com/europe/live-news/russia-ukraine-war-news-11-29-22/index.html')
provided_article = pd.DataFrame({'title_texte': [extract(downloaded)]})
provided_article

Unnamed: 0,title_texte
0,NATO foreign ministers said Tuesday in a joint...


### Load GH Dataset

In [324]:
import pandas as pd
# Read the fake-news CSV (the path is relative to the notebook's working
# directory) into a DataFrame and display it.
new_csv = "../online-fake-news-detection/data/fakenews.csv"
news_df = pd.read_csv(new_csv)
news_df

Unnamed: 0,label,news
0,False,Says the Annies List political group supports ...
1,True,When did the decline of coal start? It started...
2,True,"Hillary Clinton agrees with John McCain ""by vo..."
3,False,Health care reform legislation is likely to ma...
4,True,The economic turnaround started at the end of ...
...,...,...
10235,True,There are a larger number of shark attacks in ...
10236,True,Democrats have now become the party of the [At...
10237,True,Says an alternative to Social Security that op...
10238,False,On lifting the U.S. Cuban embargo and allowing...


### True column

In [325]:
# true_df['True'] = True
# fake_df['True'] = False

In [326]:
# Rename the columns: 'label' becomes 'True' (the target column used later)
# and 'news' becomes 'title_text'. The original news_df is left unchanged.
articles_df = news_df.rename(columns={'label':'True', 'news' : 'title_text'})

### Merge

In [328]:
#articles_df = pd.merge(true_df,fake_df, 'outer')
#articles_df['title_text'] = articles_df['title'] + articles_df['text']


# Clean & Preprocessing

In [329]:
# Preview the first rows of the renamed frame.
articles_df.head()

Unnamed: 0,True,title_text
0,False,Says the Annies List political group supports ...
1,True,When did the decline of coal start? It started...
2,True,"Hillary Clinton agrees with John McCain ""by vo..."
3,False,Health care reform legislation is likely to ma...
4,True,The economic turnaround started at the end of ...


In [330]:
# Summarize the columns: dtypes and non-null counts.
articles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10240 entries, 0 to 10239
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   True        10240 non-null  bool  
 1   title_text  10240 non-null  object
dtypes: bool(1), object(1)
memory usage: 90.1+ KB


In [331]:
import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [332]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import unidecode

articles_df

def _preprocessing_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use only."""
    cached = getattr(_preprocessing_resources, '_cached', None)
    if cached is None:
        cached = (frozenset(stopwords.words('english')), WordNetLemmatizer())
        _preprocessing_resources._cached = cached
    return cached


def preprocessing(sentence):
    """Normalize one raw sentence into a cleaned, space-joined string of lemmas.

    Steps, in order: transliterate to ASCII, strip and lowercase, drop digit
    characters, drop ASCII punctuation, tokenize, remove English stop words,
    and lemmatize each remaining token as a verb.

    Args:
        sentence: The raw text to clean (a ``str``).

    Returns:
        The cleaned tokens joined by single spaces; an empty string when no
        token survives the cleaning.

    Note:
        Presumably requires the NLTK 'stopwords', 'punkt' and 'wordnet' data
        to be available locally — verify against the environment setup.
    """
    stop_words, lemmatizer = _preprocessing_resources()

    # Transliterate FIRST: unidecode can emit plain ASCII punctuation (for
    # example curly quotes become ' or "), so it has to run before the
    # digit/punctuation filters or those characters would survive cleaning.
    sentence = unidecode.unidecode(sentence)

    # Basic cleaning
    sentence = sentence.strip().lower()  ## remove whitespaces, lowercase
    sentence = ''.join(char for char in sentence if not char.isdigit())  ## remove numbers

    # Advanced cleaning: delete every ASCII punctuation character in one pass
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))

    tokens = word_tokenize(sentence)  ## tokenize
    kept_tokens = [w for w in tokens if w not in stop_words]  ## remove stopwords

    # A single shared lemmatizer is reused for every token instead of
    # constructing a new WordNetLemmatizer per word.
    return ' '.join(lemmatizer.lemmatize(w, pos="v") for w in kept_tokens)

# Run preprocessing on every 'title_text' entry and store the result in a new
# 'cleaned_title_text' column (the column is added to articles_df itself).
articles_df['cleaned_title_text'] = articles_df['title_text'].apply(preprocessing)


## X_y split

In [333]:
from sklearn.model_selection import train_test_split

# Name of the label column used as the prediction target.
target = 'True'
# The cleaned text column.
# NOTE(review): `feature` is not referenced again in the visible cells —
# confirm whether it is still needed.
feature = articles_df['cleaned_title_text']

def X_y(df, TARGET_COLUMN):
    """Separate a DataFrame into its feature columns and its target column.

    Args:
        df: The source DataFrame; it is not modified.
        TARGET_COLUMN: Name of the column holding the target values.

    Returns:
        A ``(X, y)`` tuple where ``X`` is ``df`` without the target column
        and ``y`` is the target column as a Series.
    """
    features = df.drop(columns=[TARGET_COLUMN])
    return features, df[TARGET_COLUMN]


def split_data(X, y, test_size=0.3, random_state=42):
    """Shuffle and split features and target into train and test subsets.

    Args:
        X: Feature data accepted by ``train_test_split``.
        y: Target data aligned with ``X``.
        test_size: Share of the data held out for testing; defaults to 0.3.
        random_state: Seed that makes the shuffle reproducible; defaults to 42.

    Returns:
        A ``(X_train, X_test, y_train, y_test)`` tuple.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=True
    )

    return X_train, X_test, y_train, y_test

# Separate the label column from the remaining columns, then split both into
# train and test subsets.
X, y = X_y(articles_df, target)
X_train, X_test, y_train, y_test = split_data(X,y)

In [334]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,title_text,cleaned_title_text
4919,Wall Street megabanks that received bailouts i...,wall street megabanks receive bailouts get tax...
1332,Weve got more revenue than we ever have.,weve get revenue ever
8207,North Dakotas economy is reeling.,north dakotas economy reel
3485,Bill OBriens Tea Party legislature tried to re...,bill obriens tea party legislature try repeal ...
3688,Says Connie Macks Penny Plan would cut over $2...,say connie macks penny plan would cut billion ...


## Vectorizer

### Fit & Transform

In [335]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
# Keep only the cleaned text column of each split as the vectorizer input.
X_train_cleaned_title_text = X_train['cleaned_title_text']
X_test_cleaned_title_text = X_test['cleaned_title_text']

def train_vect(X, max_df=0.7):
    """Fit a TF-IDF vectorizer on a collection of text documents.

    Args:
        X: Iterable of text documents (strings). This notebook passes a
            pandas Series of cleaned sentences.
        max_df: Passed through to ``TfidfVectorizer(max_df=...)``; defaults
            to 0.7.

    Returns:
        The value returned by ``TfidfVectorizer.fit``, used downstream as
        the fitted vectorizer whose ``transform`` method is called.
    """
    tfidf_vectorizer = TfidfVectorizer(max_df=max_df)

    return tfidf_vectorizer.fit(X)


def transform_vect(X: np.ndarray, tfdidf_fitted):
    """Vectorize documents with an already-fitted vectorizer.

    Args:
        X: The documents to transform.
        tfdidf_fitted: A fitted object exposing ``transform``.

    Returns:
        Whatever ``tfdidf_fitted.transform(X)`` returns.
    """
    return tfdidf_fitted.transform(X)


# Fit the vectorizer on the training text only, then transform both the
# training and test text with that same fitted vectorizer.
tfdidf_fitted = train_vect(X_train_cleaned_title_text)
X_train_vectorized = transform_vect(X_train_cleaned_title_text, tfdidf_fitted)
X_test_vectorized = transform_vect(X_test_cleaned_title_text, tfdidf_fitted)

# tfidf_vectorizer = TfidfVectorizer(max_df=0.7)
# X_train_vectorized = tfidf_vectorizer.fit_transform(X_train_cleaned_title_text)
# X_test_vectorized = tfidf_vectorizer.transform(X_test_cleaned_title_text)

### Inspect Vectorized Matrices

In [336]:
# Only the last expression of a cell is displayed, so this cell shows
# X_test_vectorized; the bare X_train_vectorized line produces no output.
X_train_vectorized
X_test_vectorized

<3072x8518 sparse matrix of type '<class 'numpy.float64'>'
	with 29415 stored elements in Compressed Sparse Row format>

# Evaluate : Cross-Validation

In [337]:
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.linear_model import PassiveAggressiveClassifier

# Cross-validation: score a PassiveAggressiveClassifier on the vectorized
# training data using 5 folds (cv=5), recording accuracy for each fold.
cv_results = cross_validate(PassiveAggressiveClassifier(max_iter=50, 
                                                        n_jobs=-1, 
                                                        random_state=42, 
                                                        fit_intercept=False, 
                                                        early_stopping=True,
                                                        validation_fraction=0.2, 
                                                        n_iter_no_change=5),
                             X_train_vectorized, y_train, cv=5, scoring=["accuracy"])
# Mean accuracy over the folds, rounded to two decimals for display.
average_accuracy = cv_results["test_accuracy"].mean()
np.round(average_accuracy,2)

0.56

In [338]:
# Show the raw cross-validation results (per-fold timings and accuracies).
cv_results

{'fit_time': array([0.01806879, 0.01921201, 0.01510382, 0.01245284, 0.0155642 ]),
 'score_time': array([0.00065207, 0.00085473, 0.00082111, 0.00055408, 0.00058007]),
 'test_accuracy': array([0.58647141, 0.5460251 , 0.57810321, 0.57013259, 0.54291696])}

# If Validated

### Classifier

In [320]:
# from sklearn.linear_model import PassiveAggressiveClassifier

# def pa_classifier_fit(tfidf_train, y_train):

#     pac=PassiveAggressiveClassifier(max_iter=50, n_jobs=-1, random_state=0, fit_intercept=False, early_stopping=True,
#                                 validation_fraction=0.2, n_iter_no_change=5)
#     pac.fit(tfidf_train,y_train)

#     return pac

# pac = pa_classifier_fit(X_train_vectorized, y_train)
# pac

### Prediction

In [321]:
# def y_pred(pac, tfidf_test):

#     y_pred = pac.predict(tfidf_test)

#     return y_pred
# y_predicted = y_pred(pac,X_test_vectorized)
# y_predicted 