# Fake News Detection

## Load the data

In [1]:
import numpy as np
import pandas as pd

train_data = pd.read_csv("train.csv", encoding='ISO-8859-1',delimiter="\t")

## Dropout dirty data

In [2]:
print(train_data.loc[1615])
train_data = train_data.drop([1615])

text     content
label      label
Name: 1615, dtype: object


## Visualize the news

In [3]:
rand_indexs = np.random.randint(1,len(train_data),50).tolist()
train_data["text"][rand_indexs]

606     This year, Colorado produced the most Winter O...
3040    Ted Cruz' Full 'Meet the Press' InterviewRepub...
3911    Shay Mitchell, who frequently documents her va...
1367    Cheryl Burke from Dance Moms came out in suppo...
4399    Jaime Pressly is opening up about how expectin...
2269    One of the most contentious issues in the Stat...
4684    45th President of the United States  Donald Jo...
4896    Who is Prince Harry's father? That's what many...
2475    Erika Jayne has been keeping a secret.  Get pu...
2763    (CNN) The Tony Awards celebrated the best of B...
635     (Excerpt) Read more at: E! Online  Wake Up To ...
2171    In light of the Timeâs Up movement, Bella Th...
4621    Itâs complicated for Teen Mom 2âs Kailyn L...
3616    Selena Gomez & Marshmello Deliver Intense 'Wol...
693     When members of the royal family tie the knot,...
3408    Asa Soltan Rahmati welcomed her first child in...
477     Inside BeyoncÃ©'s Weight Loss Journey 4 Months...
3415    Juggli

## Emoticons

In [4]:
import re
news = train_data.text.str.cat()
a = r" ([xX:;][-']?.) "
emos = set(re.findall(a,news))  

emos

{': ',
 ':(',
 ':)',
 '::',
 ':D',
 '; ',
 'X ',
 'X)',
 'X,',
 'X.',
 'X2',
 'X3',
 'X:',
 'XD',
 'XL',
 'XM',
 'XO',
 'XQ',
 'XS',
 'XT',
 'XX',
 'Xi',
 'Xo',
 "x'.",
 'x2',
 'xo',
 'xx'}

## Most used words

In [5]:
import nltk
from nltk.tokenize import word_tokenize

def most_used_words(text):
    tokens = word_tokenize(text)
    frequency_dist = nltk.FreqDist(tokens)  
    print("There is %d different words" % len(set(tokens)))
    
    return sorted(frequency_dist,key=frequency_dist.__getitem__, reverse=True) 

In [6]:
most_used_words(train_data.text.str.cat())[:100]

There is 122575 different words


[',',
 'the',
 '.',
 'and',
 'to',
 'a',
 'of',
 'in',
 'that',
 'on',
 'for',
 'was',
 'her',
 'with',
 'is',
 '[',
 ']',
 "'s",
 'I',
 'The',
 '``',
 ':',
 'she',
 'it',
 "''",
 'as',
 'at',
 ')',
 '(',
 'his',
 'he',
 'have',
 'be',
 'by',
 'you',
 'from',
 'has',
 'an',
 'not',
 'this',
 'their',
 'are',
 'about',
 'who',
 'they',
 'but',
 'had',
 'said',
 'we',
 'â\x80\x9d',
 '!',
 'In',
 'been',
 '?',
 'out',
 'were',
 'up',
 'all',
 'one',
 'after',
 'will',
 'also',
 'which',
 'â\x80\x94',
 'when',
 'more',
 "'",
 'so',
 "n't",
 'time',
 'just',
 'or',
 'my',
 'like',
 'first',
 'do',
 'would',
 'She',
 'what',
 'him',
 'people',
 'It',
 'me',
 'And',
 'two',
 'show',
 'can',
 'He',
 '2017',
 'years',
 'know',
 'there',
 'our',
 'get',
 'over',
 'into',
 'told',
 'new',
 'now',
 'other']

## Stop words

In [7]:
from nltk.corpus import stopwords
mw = most_used_words(train_data.text.str.cat())
most_words = []
for w in mw:
    if len(most_words) == 1000:
        break
    if w in stopwords.words("english"):
        continue
    else:
        most_words.append(w)

There is 122575 different words


In [8]:
sorted(most_words)   

['!',
 '#',
 '$',
 '&',
 "'",
 "''",
 "'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 '(',
 ')',
 ',',
 '-',
 '--',
 '.',
 '...',
 '/',
 '1',
 '10',
 '100',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '2',
 '20',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '3',
 '30',
 '31',
 '4',
 '40',
 '5',
 '50',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '?',
 '@',
 'A',
 'ABC',
 'Academy',
 'According',
 'Actress',
 'Advertisement',
 'After',
 'All',
 'Although',
 'America',
 'American',
 'Americans',
 'An',
 'And',
 'Angeles',
 'Angelina',
 'Aniston',
 'April',
 'Are',
 'As',
 'At',
 'August',
 'Award',
 'Awards',
 'B',
 'Beckham',
 'Ben',
 'Best',
 'BeyoncÃ©',
 'Bieber',
 'Big',
 'Billboard',
 'Black',
 'Blake',
 'Brad',
 'British',
 'Brown',
 'Bush',
 'But',
 'By',
 'CBS',
 'California',
 'Chicago',
 'Chris',
 'Christmas'

##### most_words觀察到:
需要的前處理為 做詞性還原(Aw,Awww)、去除雜字雜符號、數字($&...)、將emotion用空格替代(XD)

## Stemming

In [9]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
#取字根
def stem_tokenize(text):
    stemmer = SnowballStemmer("english")
    stemmer = WordNetLemmatizer()
    return [stemmer.lemmatize(token) for token in word_tokenize(text)]  
#還原詞性
def lemmatize_tokenize(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in word_tokenize(text)]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Prepare the data

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

#### Building the pipeline

In [11]:
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline

In [12]:
class TextPreProc(BaseEstimator,TransformerMixin):       
    def __init__(self, use_mention=False):
        self.use_mention = use_mention
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        # We can choose between keeping the mentions
        # or deleting them
        if self.use_mention:
            X = X.str.replace(r"@[a-zA-Z0-9_]* ", " @tags ")  
        else:
            X = X.str.replace(r"@[a-zA-Z0-9_]* ", "")
            
        # Keeping only the word after the #
        X = X.str.replace("#", "")
        X = X.str.replace(r"[-\.\n]", "")
        
        # !#$&
        X = X.str.replace(r"[!#$&]", "")
        # ' or 'xxx
        X = X.str.replace(r"'[\w'\s]*", "")
        # ) ( ...
        X = X.str.replace(r"'[\D]", "")
        # number
        X = X.str.replace(r"[\d]*", "")
        
        # Removing HTML garbage
        X = X.str.replace(r"&\w+;", "")
        # Removing links
        X = X.str.replace(r"https?://\S*", "")
        # replace repeated letters with only two occurences
        # heeeelllloooo => heelloo
        X = X.str.replace(r"(.)\1+", r"\1\1")
        # mark emoticons as happy or sad
        X = X.str.replace(a, "")
        X = X.str.lower()   
        return X

In [13]:
from sklearn.model_selection import train_test_split

label = train_data['label']
text = train_data['text']


vectorizer = TfidfVectorizer(tokenizer=lemmatize_tokenize, ngram_range=(1,2),max_features=5000) 
pipeline = Pipeline([
    ('text_pre_processing', TextPreProc(use_mention=False)),  
    ('vectorizer', vectorizer),                               
])  


learn_data, test_data, label_learning, label_test = train_test_split(text, label, test_size=0.3)
#x_train,x_test,y_train,y_test


learning_data = pipeline.fit_transform(learn_data)   

# Select a model

In [14]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

import lightgbm
from lightgbm import LGBMClassifier



lr = LogisticRegression()
bnb = BernoulliNB()
mnb = MultinomialNB()
lgbm = LGBMClassifier()

models = {
    'logitic regression': lr,
    'bernoulliNB': bnb,
    'multinomialNB': mnb,           
    'lightGBM' : lgbm,
}
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
f1_scorer = make_scorer(f1_score, pos_label="1")
for model in models.keys():
    scores = cross_val_score(models[model], learning_data, label_learning, scoring=f1_scorer, cv=10)
    print("===", model, "===")
    print("scores = ", scores)
    print("mean = ", scores.mean())
    print("variance = ", scores.var())
    models[model].fit(learning_data, label_learning)   
    print("score on the learning data (accuracy) = ", accuracy_score(models[model].predict(learning_data), label_learning))
    print("")   

=== logitic regression ===
scores =  [0.63865546 0.7219917  0.69672131 0.57798165 0.66956522 0.66666667
 0.69076305 0.63793103 0.63755459 0.62393162]
mean =  0.6561762306115272
variance =  0.0015601879270126344
score on the learning data (accuracy) =  0.8426934097421204

=== bernoulliNB ===
scores =  [0.536      0.60557769 0.65546218 0.59607843 0.62948207 0.63779528
 0.64367816 0.66129032 0.63934426 0.62745098]
mean =  0.6232159378980648
variance =  0.0012092233636768158
score on the learning data (accuracy) =  0.767621776504298

=== multinomialNB ===
scores =  [0.62831858 0.72727273 0.62337662 0.52132701 0.61111111 0.63063063
 0.64935065 0.60444444 0.64035088 0.59821429]
mean =  0.6234396947382261
variance =  0.002339031569580384
score on the learning data (accuracy) =  0.805730659025788

=== lightGBM ===
scores =  [0.71713147 0.74909091 0.68656716 0.6097561  0.688      0.7007874
 0.72992701 0.704      0.65338645 0.664     ]
mean =  0.6902646507991915
variance =  0.0014605105946071447

# Fine tune the model

In [15]:
# from sklearn.model_selection import GridSearchCV

# grid_search_pipeline = Pipeline([
#     ('text_pre_processing', TextPreProc()),
#     ('vectorizer', TfidfVectorizer()),
#     ('model', LGBMClassifier()),      #想調這三個東西裡面的參數  (比較完三個model後，我們最後只想用multinomialNB當預測model)
# ])

# params = [
#     {
#         'text_pre_processing__use_mention': [True, False],
#         'vectorizer__max_features': [1000, 2000, 5000, 10000, 20000, None],
#         'vectorizer__ngram_range': [(1,1), (1,2)],      #這三個東西裡面分別參數的可能值
#     },
# ]
# from sklearn.metrics import f1_score
# from sklearn.metrics import make_scorer
# f1_scorer = make_scorer(f1_score, pos_label="1")
# grid_search = GridSearchCV(grid_search_pipeline, params, cv=5, scoring=f1_scorer)
# grid_search.fit(learn_data, label_learning )   #用x_train,y_train去調
# print(grid_search.best_params_)    #得到這些結果 再拉上去改這些參數的值

# Test

In [16]:
import lightgbm
from lightgbm import LGBMClassifier
lgbm = LGBMClassifier()
lgbm.fit(learning_data, label_learning)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [17]:
testing_data = pipeline.transform(test_data)   #tfidf轉向量:fit_transform()=fit()+transform()
lgbm.score(testing_data, label_test)

0.7747326203208557

In [18]:
# Predecting on the test.csv
sub_data = pd.read_csv("test.csv", encoding='ISO-8859-1',delimiter="\t")
sub_learning = pipeline.transform(sub_data.text)        #test.csv沒告訴你真正label所以不能measure accuracy，且test.csv只能做將文字轉向量
sub = pd.DataFrame(sub_data.id, columns=("id", "label"))   
sub["label"] = lgbm.predict(sub_learning)
print(sub)

        id label
0        2     0
1        3     0
2        4     1
3        5     1
4        6     0
...    ...   ...
1242  1244     0
1243  1245     0
1244  1246     1
1245  1247     0
1246  1248     0

[1247 rows x 2 columns]
