# Dataset

### DS2

In [298]:
import pandas as pd

# Location of the DS2 fake-news CSV, relative to the notebook's working directory.
DS2_csv = "../ofnd/data/DS2_fakenews.csv"
# Read with pandas defaults; later cells use this frame's 'news' and 'label' columns.
DS2_df = pd.read_csv(DS2_csv)

### DS3

In [299]:
# The three CSV files that make up DS3.
DS3_submit_csv = "../ofnd/data/DS3_submit.csv"
DS3_test_csv = "../ofnd/data/DS3_test.csv"
DS3_train_csv = "../ofnd/data/DS3_train.csv"

DS3_submit_df = pd.read_csv(DS3_submit_csv)
DS3_test_df = pd.read_csv(DS3_test_csv)
DS3_train_df = pd.read_csv(DS3_train_csv)

# No `on=` is given, so pandas joins on whatever columns the two frames share.
# Inner join: keep only rows present in both the submit and the test file.
DS3_submit_test_df = DS3_submit_df.merge(DS3_test_df, how='inner')
# Outer join: keep every row from the combined submit/test frame and from train.
DS3_df = DS3_submit_test_df.merge(DS3_train_df, how='outer')
# Build the article text from title and body.
# Missing values are replaced by '' first: plain `title + text` gives NaN as
# soon as either side is missing, which would discard the whole article.
# The ' ' separator keeps the last title word and the first body word apart.
DS3_news = (DS3_df['title'].fillna('') + ' ' + DS3_df['text'].fillna('')).str.strip()
# Rows with neither title nor text stay missing so the later notna() filter
# still removes them.
DS3_df['news'] = DS3_news.where(DS3_news.ne(''))
DS3_df = DS3_df.drop(columns=['id', 'author', 'title', 'text'])
# Boolean label: True where the raw label equals 1, False otherwise
# (a missing label compares unequal and therefore becomes False).
DS3_df['label'] = DS3_df['label'].eq(1)

### DS4

In [300]:
DS4_csv = "../ofnd/data/DS4.csv"
DS4_df = pd.read_csv(DS4_csv)

# Build the article text from title and body.
# Missing values are replaced by '' first: plain `title + text` gives NaN as
# soon as either side is missing, which would discard the whole article.
# The ' ' separator keeps the last title word and the first body word apart.
DS4_news = (DS4_df['title'].fillna('') + ' ' + DS4_df['text'].fillna('')).str.strip()
# Rows with neither title nor text stay missing so the later notna() filter
# still removes them.
DS4_df['news'] = DS4_news.where(DS4_news.ne(''))
DS4_df = DS4_df.drop(columns=['Unnamed: 0', 'title', 'text'])
# Boolean label: True where the raw label equals 1, False otherwise
# (a missing label compares unequal and therefore becomes False).
DS4_df['label'] = DS4_df['label'].eq(1)

### Merge

In [301]:
# Only DS2 feeds the rest of the notebook for now. The two disabled lines
# below would fold DS3 and DS4 in with outer merges:
#articles_df = pd.merge(DS2_df,DS3_df, 'outer')
#articles_df = pd.merge(articles_df, DS4_df,'outer')
# This binds a second name to the same frame; it does not copy DS2_df.
articles_df = DS2_df

### Pre Cleaning

In [302]:
# Discard rows without article text, then collapse fully identical rows.
articles_df = articles_df.dropna(subset=['news']).drop_duplicates()

### Save as CSV

In [303]:
# Persist the filtered, de-duplicated articles next to the notebook.
# NOTE(review): to_csv also writes the index as an unnamed first column by
# default; pass index=False if readers of this file should not see it —
# confirm what downstream consumers expect before changing.
articles_df.to_csv('articles.csv')


### Cleaning

In [278]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import unidecode

_DS_STOP_WORDS = None  # English stop words, filled in on first use


def _ds_stop_words():
    """Return the English stop-word set, reading the NLTK corpus only once."""
    global _DS_STOP_WORDS
    if _DS_STOP_WORDS is None:
        _DS_STOP_WORDS = frozenset(stopwords.words('english'))
    return _DS_STOP_WORDS


def preprocessing_DS(sentence):
    """Normalise one raw article into a space-joined string of verb lemmas.

    Steps, in order: strip leading/trailing whitespace, lowercase, delete
    digit characters, delete the characters in ``string.punctuation``,
    transliterate with ``unidecode``, tokenize with NLTK, drop English stop
    words, and lemmatize each remaining token as a verb.

    The produced text is the same as before this rewrite, so it stays in step
    with ``preprocessing``, which cleans the article at prediction time.

    Parameters
    ----------
    sentence : str
        Raw article text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    text = sentence.strip().lower()
    text = ''.join(char for char in text if not char.isdigit())
    # A single translate() pass deletes every punctuation character, instead
    # of one full-string replace() per character.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # NOTE(review): transliteration runs after the digit/punctuation removal,
    # so any ASCII punctuation or digits it produces would stay in the text.
    # The order is kept on purpose; changing it here alone would make training
    # text differ from prediction text.
    text = unidecode.unidecode(text)

    stop_words = _ds_stop_words()  # cached: this function runs once per row
    lemmatizer = WordNetLemmatizer()  # one instance per call, not per token
    lemmas = [
        lemmatizer.lemmatize(token, pos="v")
        for token in word_tokenize(text)
        if token not in stop_words
    ]
    return ' '.join(lemmas)

# assign() returns a new frame instead of writing a column into a frame that
# was produced by boolean filtering, which avoids pandas' SettingWithCopy
# ambiguity and makes the cell safe to re-run.
articles_df = articles_df.assign(
    cleaned_news=articles_df['news'].apply(preprocessing_DS)
)

### Splitting

In [279]:
from sklearn.model_selection import train_test_split

# Name of the column the classifier learns to predict.
target = 'label'
# The cleaned text as a Series; the fit cell reads the same column through X.
feature = articles_df['cleaned_news']

def X_y(df, TARGET_COLUMN):
    """Separate a frame into predictor columns and the target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the predictors and the target.
    TARGET_COLUMN : str
        Label of the column to split off.

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` is ``df`` without ``TARGET_COLUMN`` and ``y`` is
        that column as a Series. ``df`` itself is left unchanged.
    """
    predictors = df.drop(columns=[TARGET_COLUMN])
    labels = df[TARGET_COLUMN]
    return predictors, labels

# X keeps every column except the label; y is the label column.
# NOTE(review): train_test_split is imported in this cell but never called in
# the visible cells, so the model below is fit on all rows — confirm intended.
X, y = X_y(articles_df, target)

## Vectorizer & Modelling

In [280]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Text model in three stages: token counts -> TF-IDF weighting -> naive Bayes.
# Every argument is spelled out with the value the notebook has always used.
count_vectorizer = CountVectorizer(
    input='content', encoding='utf-8', decode_error='strict',
    strip_accents=None, lowercase=True,
    preprocessor=None, tokenizer=None, stop_words=None,
    token_pattern='(?u)\\b\\w\\w+\\b',  # tokens of two or more word characters
    ngram_range=(1, 1),
    max_df=1.0, min_df=1, max_features=None,
    vocabulary=None, binary=False,
)
tfidf_transformer = TfidfTransformer()
naive_bayes = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)

pipe = Pipeline(steps=[
    ('vectorizer', count_vectorizer),
    ("transformer", tfidf_transformer),
    ('classifier', naive_bayes),
])

pipe

In [281]:
# Train on the cleaned text only; the other columns of X are not passed in.
# NOTE(review): the prediction cells call `pipe`, not `pipe_fitted` —
# presumably fit() hands back the same pipeline object; confirm.
pipe_fitted = pipe.fit(X['cleaned_news'], y)

# Predict

In [282]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import unidecode

def preprocessing(sentence):
    """Clean one article for prediction, step for step like ``preprocessing_DS``.

    Steps, in order: strip leading/trailing whitespace, lowercase, delete
    digit characters, delete the characters in ``string.punctuation``,
    transliterate with ``unidecode``, tokenize with NLTK, drop English stop
    words, and lemmatize each remaining token as a verb.

    Parameters
    ----------
    sentence : str
        Raw article text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    # Basic cleaning
    text = sentence.strip().lower()
    text = ''.join(char for char in text if not char.isdigit())

    # Advanced cleaning: a single translate() pass deletes all punctuation.
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = unidecode.unidecode(text)

    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))

    lemmatizer = WordNetLemmatizer()  # built once, not once per token
    lemmas = [
        lemmatizer.lemmatize(token, pos="v")
        for token in tokens
        if token not in stop_words
    ]

    # The previous version also built a throw-away DataFrame here; it was
    # never returned or used, so it has been removed.
    return ' '.join(lemmas)

In [3]:

# Article to classify. Enable the commented-out assignment below instead to
# try a different page.
url = 'https://edition.cnn.com/europe/live-news/russia-ukraine-war-news-11-29-22/index.html'
#url = 'https://www.bloomberg.com/news/articles/2022-12-01/kim-kardashian-s-investment-firm-hires-brisske-from-permira'

In [4]:
from trafilatura import extract, fetch_url
import pandas as pd
def df_provided_url(url):
    """Download a web page and return its extracted text as a one-row frame.

    Parameters
    ----------
    url : str
        Address of the article to fetch.

    Returns
    -------
    pandas.DataFrame
        One row and one column, 'news', holding the text that trafilatura
        extracted from the page.

    Raises
    ------
    ValueError
        If nothing was downloaded or no text was extracted. Without this
        check the frame would hold ``None`` and the failure would only show
        up later, when the text is cleaned.
    """
    downloaded = fetch_url(url)
    if downloaded is None:
        raise ValueError(f"Could not download {url!r}")

    article_text = extract(downloaded)
    if not article_text:
        raise ValueError(f"No article text could be extracted from {url!r}")

    return pd.DataFrame({'news': [article_text]})


df_provided_url(url)

Unnamed: 0,news
0,Ukraine's prime minister says winter season wi...


In [5]:
sentence_df = df_provided_url(url)
# First row, first column, addressed purely by position. The earlier
# `.iloc[0][0]` indexed a label-indexed Series with an integer, which newer
# pandas versions deprecate.
sentence = sentence_df.iloc[0, 0]
# preprocessing() must already be defined in the running kernel; execute the
# cell that defines it first, otherwise this line raises NameError.
cleaned_sentence = preprocessing(sentence)


NameError: name 'preprocessing' is not defined

In [None]:
# Wrap the cleaned text in a one-row, one-column frame; the prediction cells
# take `.iloc[0]` of it as the model input.
df_cleaned_sentence = pd.DataFrame([cleaned_sentence])

In [None]:
# Both calls score the same single article, so name the input once.
article_features = df_cleaned_sentence.iloc[0]
prediction = pipe.predict(article_features)
# Last expression of the cell: display the per-class probabilities.
pipe.predict_proba(article_features)

array([[0.34125957, 0.65874043]])

In [None]:
def result(model=None, features=None, confident=0.70):
    """Turn the classifier's verdict on one article into a short message.

    Called with no arguments it behaves as before and scores the notebook's
    current article with the notebook's pipeline.

    Parameters
    ----------
    model : object, optional
        Classifier exposing ``predict`` and ``predict_proba``. Defaults to
        the notebook-level ``pipe``.
    features : optional
        The single article to score, in the form ``model`` accepts. Defaults
        to ``df_cleaned_sentence.iloc[0]``.
    confident : float, optional
        Probability above which the verdict is stated without hedging.

    Returns
    -------
    str
        One of "It's true!", 'Probably true', 'possibly true dude',
        "It's a fake news", 'Probably fake' or 'Possibly fake'.
    """
    if model is None:
        model = pipe
    if features is None:
        features = df_cleaned_sentence.iloc[0]

    predicted_label = model.predict(features)[0]
    probabilities = model.predict_proba(features)[0]
    # As in the original code, column 0 is read as the probability of "fake"
    # and column 1 as the probability of "true".
    p_fake, p_true = probabilities[0], probabilities[1]

    if bool(predicted_label):
        support = p_true
        sure, likely, unsure = "It's true!", 'Probably true', 'possibly true dude'
    else:
        # The fake branch is graded on p_fake throughout; the last check
        # used to look at p_true by mistake.
        support = p_fake
        sure, likely, unsure = "It's a fake news", 'Probably fake', 'Possibly fake'

    if support > confident:
        return sure
    if support > 0.5:
        return likely
    # Explicit final case, so the function can no longer fall through and
    # return None.
    return unsure
# Display the verdict for the article prepared in the cells above.
result()

'Probably true'