# Clasificador de noticias
### Comenzamos importando las librerias y los conjuntos de datos que usaremos

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
dff1 = pd.read_csv('Datasets/fake-news/Fake.csv', usecols = ['title'])
dft1 = pd.read_csv('Datasets/fake-news/True.csv', usecols = ['title'])
dff2 = pd.read_csv('Datasets/fake-news/fake2.csv', usecols = ['title'])
dft2 = pd.read_csv('Datasets/fake-news/real2.csv', usecols = ['title'])
dff1 = dff1.dropna()
dft1 = dft1.dropna()
dff2 = dff2.dropna()
dft2 = dft2.dropna()
dff1['class'] = 0
dft1['class'] = 1
dff2['class'] = 0
dft2['class'] = 1
dff2.head()

Unnamed: 0,title,class
0,BREAKING: First NFL Team Declares Bankruptcy O...,0
1,Court Orders Obama To Pay $400 Million In Rest...,0
2,UPDATE: Second Roy Moore Accuser Works For Mic...,0
3,Oscar Pistorius Attempts To Commit Suicide,0
4,Trump Votes For Death Penalty For Being Gay,0


### Unimos los dos conjuntos de datos en un solo dataframe

In [2]:
frames = [dff1,dft1,dff2,dft2]
dfn = pd.concat(frames)
dfn.info

<bound method DataFrame.info of                                                  title  class
0     Donald Trump Sends Out Embarrassing New Year’...      0
1     Drunk Bragging Trump Staffer Started Russian ...      0
2     Sheriff David Clarke Becomes An Internet Joke...      0
3     Trump Is So Obsessed He Even Has Obama’s Name...      0
4     Pope Francis Just Called Out Donald Trump Dur...      0
..                                                 ...    ...
619  Flake: “Religious tests should have no place i...      1
620                           Change We Can Believe In      1
621  deputy director of national health statistics ...      1
622  Romneys ProLife Conversion Myth or Reality Jun...      1
623                             Interest Group Ratings      1

[45954 rows x 2 columns]>

### Removemos los signos de puntuación, las mayúsculas y definimos las stopwords

In [3]:
stop_words = set(stopwords.words('english')) 
def preprocessor(text):
    text = (re.sub('[\W]+', ' ', text.lower()))
    return text

In [4]:
dfn['title'] = dfn['title'].apply(preprocessor)

In [5]:
dfn.head()

Unnamed: 0,title,class
0,donald trump sends out embarrassing new year ...,0
1,drunk bragging trump staffer started russian ...,0
2,sheriff david clarke becomes an internet joke...,0
3,trump is so obsessed he even has obama s name...,0
4,pope francis just called out donald trump dur...,0


### Tokenizamos el texto

In [6]:
tokenizer = RegexpTokenizer(r'\w+')
dfn['title'] = dfn['title'].apply(lambda x: tokenizer.tokenize(x.lower()))

In [7]:
dfn.head()

Unnamed: 0,title,class
0,"[donald, trump, sends, out, embarrassing, new,...",0
1,"[drunk, bragging, trump, staffer, started, rus...",0
2,"[sheriff, david, clarke, becomes, an, internet...",0
3,"[trump, is, so, obsessed, he, even, has, obama...",0
4,"[pope, francis, just, called, out, donald, tru...",0


### Lematizamos y removemos las stopwords

In [8]:
lemmatizer = WordNetLemmatizer()
def lemmat(text):
    lem_text = [lemmatizer.lemmatize(i) for i in text]
    return lem_text
def remove_stopwords(text):
    words = [word for word in text if word not in stop_words]
    return words
def untokenize(list):
    return " ".join(list)


In [9]:
dfn['title'] = dfn['title'].apply(remove_stopwords)
dfn['title'] = dfn['title'].apply(lemmat)


In [10]:
dfn.head()

Unnamed: 0,title,class
0,"[donald, trump, sends, embarrassing, new, year...",0
1,"[drunk, bragging, trump, staffer, started, rus...",0
2,"[sheriff, david, clarke, becomes, internet, jo...",0
3,"[trump, obsessed, even, obama, name, coded, we...",0
4,"[pope, francis, called, donald, trump, christm...",0


### Unimos de nuevo el texto

In [11]:
dfn['title'] = dfn['title'].apply(untokenize)

In [12]:
dfn.head()

Unnamed: 0,title,class
0,donald trump sends embarrassing new year eve m...,0
1,drunk bragging trump staffer started russian c...,0
2,sheriff david clarke becomes internet joke thr...,0
3,trump obsessed even obama name coded website i...,0
4,pope francis called donald trump christmas speech,0


### Dividimos los datos en un set de entrenamiento y testeo, generamos la matriz tf-idf e insertamos el headline que vamos a clasificar

In [13]:
tfidf = TfidfVectorizer()
label = {0:'fake', 1:'real'}
X_train, X_test, y_train, y_test = train_test_split(dfn['title'],dfn['class'], test_size=0.3, random_state=50)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


headline= "China punishes Australia for promoting an inquiry into covid-19"
test_headline = [" ".join(lemmat(remove_stopwords(tokenizer.tokenize(headline.lower()))))]
#test_headline  = [" ".join(tokenizer.tokenize(headline.lower()))]


test_headline_tfidf = tfidf.transform(test_headline )
print("processed headline:", test_headline )

processed headline: ['china punishes australia promoting inquiry covid 19']


### Entrenamos el modelo de regresión logística e imprimimos los resultados de testeo

In [14]:
clf = LogisticRegression(random_state=0).fit(X_train_tfidf, y_train)
clf.score(X_test_tfidf, y_test)

0.9403786175382607

### Finalmente, imprimimos el valor de la predicción 

In [15]:
print('Prediction: %s\nProbability: %.2f%%' %(label[clf.predict(test_headline_tfidf)[0]],np.max(clf.predict_proba(test_headline_tfidf))*100))

Prediction: real
Probability: 92.54%
