In [1]:
import pandas as pd
import dill
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
from sklearn.metrics import f1_score

#working with text
from sklearn.feature_extraction.text import TfidfVectorizer

#normalizing data
from sklearn.preprocessing import StandardScaler

#pipeline
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import precision_score,recall_score

#imputer
from sklearn.impute import SimpleImputer

import sklearn.datasets

In [2]:
# Load the FakeNewsNet CSV from the notebook's working directory into `df`.
df = pd.read_csv("./FakeNewsNet.csv")
# Preview the first three rows to check how the columns were parsed.
df.head(3)

Unnamed: 0,title,news_url,source_domain,tweet_num,real
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1


In [3]:
# Class balance of the `real` label: number of rows per label value.
df['real'].value_counts()

1    17441
0     5755
Name: real, dtype: int64

In [4]:
# Hold out 33% of the rows for testing. The whole frame is passed as X, so the
# X_* splits still carry the `real` column next to the text columns.
X_train, X_test, y_train, y_test = train_test_split(
    df, df['real'], test_size=0.33, random_state=42
)

# Persist every split (test files first, then train) without the index column.
for csv_name, split in (
    ("X_test.csv", X_test),
    ("y_test.csv", y_test),
    ("X_train.csv", X_train),
    ("y_train.csv", y_train),
):
    split.to_csv(csv_name, index=None)

In [5]:
# Column dtypes and non-null counts, to see which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23196 entries, 0 to 23195
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   title          23196 non-null  object
 1   news_url       22866 non-null  object
 2   source_domain  22866 non-null  object
 3   tweet_num      23196 non-null  int64 
 4   real           23196 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 906.2+ KB


In [6]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Transformer that selects a single column from the data frame so that
    additional transformations can be applied to it.

    Parameters
    ----------
    key
        Label of the column to select; the input is indexed as ``X[key]``.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """No-op fit: nothing is learned from the data. Returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X[self.key]``, i.e. only the configured column of ``X``."""
        return X[self.key]
    

class TextImputer(BaseEstimator, TransformerMixin):
    """
    Transformer that replaces missing values in one column with a constant.

    Parameters
    ----------
    key
        Label of the column whose missing values are filled.
    value
        Replacement handed to ``fillna`` for that column.
    """
    def __init__(self, key, value):
        self.key = key
        self.value = value

    def fit(self, X, y=None):
        """No-op fit: nothing is learned from the data. Returns ``self``."""
        return self

    def transform(self, X):
        """
        Return a copy of ``X`` whose ``key`` column has its missing values
        replaced by ``value``. All other columns are passed through unchanged.
        """
        # Impute on a copy so the caller's frame is never modified: the same
        # frame is handed to every branch of the FeatureUnion, and writing into
        # a frame that is itself a slice of another one is what triggers
        # pandas' SettingWithCopyWarning.
        imputed = X.copy()
        imputed[self.key] = imputed[self.key].fillna(self.value)
        return imputed

In [7]:
# Names of the text columns used as model inputs and of the label column.
features = ['title', 'news_url', 'source_domain']
target = 'real'

In [8]:
def _text_branch(column):
    """Build the impute -> select -> TF-IDF sub-pipeline for one text column."""
    return Pipeline([
        ('imputer', TextImputer(column, '')),
        ('selector', ColumnSelector(key=column)),
        ('tfidf', TfidfVectorizer()),
    ])


# One independent TF-IDF branch per text column.
title = _text_branch('title')
news_url = _text_branch('news_url')
source_domain = _text_branch('source_domain')

# Combine: the outputs of the three branches are joined by a FeatureUnion.
feats = FeatureUnion([
    ('title', title),
    ('news_url', news_url),
    ('source_domain', source_domain),
])

In [9]:
%%time

# Full model: the combined text features feed a logistic regression.
model_steps = [
    ('features', feats),
    ('classifier', LogisticRegression()),
]
pipeline = Pipeline(model_steps)

pipeline.fit(X_train, y_train)

CPU times: user 2.46 s, sys: 56.8 ms, total: 2.52 s
Wall time: 1.41 s


In [10]:
# Inspect the (name, estimator) steps of the fitted pipeline.
pipeline.steps

[('features',
  FeatureUnion(transformer_list=[('title',
                                  Pipeline(steps=[('imputer',
                                                   TextImputer(key='title',
                                                               value='')),
                                                  ('selector',
                                                   ColumnSelector(key='title')),
                                                  ('tfidf', TfidfVectorizer())])),
                                 ('news_url',
                                  Pipeline(steps=[('imputer',
                                                   TextImputer(key='news_url',
                                                               value='')),
                                                  ('selector',
                                                   ColumnSelector(key='news_url')),
                                                  ('tfidf', TfidfVectorizer())])),
           

In [11]:
# Serialize the fitted pipeline to disk so later cells can reload it.
with open("logreg_pipeline.dill", mode="wb") as model_file:
    dill.dump(pipeline, model_file)

In [12]:
# Reload the held-out split from the CSV files, replacing the in-memory
# X_test / y_test. After read_csv, y_test is a DataFrame rather than the
# Series returned by train_test_split.
X_test = pd.read_csv("X_test.csv")
y_test = pd.read_csv("y_test.csv")

In [13]:
# Preview the reloaded test rows.
X_test.head(3)

Unnamed: 0,title,news_url,source_domain,tweet_num,real
0,Will Stabler return to 'Law and Order: SVU' fo...,http://www.today.com/popculture/will-stabler-r...,www.today.com,62,1
1,Travis Barker Survives ‘Really Bad’ Crash with...,http://extratv.com/2018/07/16/travis-barker-su...,extratv.com,46,1
2,Iggy Azalea's Revenge on Nick Young: I Burned ...,https://www.nbclosangeles.com/entertainment/en...,www.nbclosangeles.com,0,1


In [14]:
# Restore the pipeline from the dill file, replacing the in-memory object.
# NOTE: dill.load can execute arbitrary code; only load files you trust.
with open('logreg_pipeline.dill', mode='rb') as model_file:
    pipeline = dill.load(model_file)

In [15]:
# Display the reloaded pipeline to confirm it deserialized.
pipeline

In [16]:
# Keep the second predict_proba column: the probability of label 1,
# since the labels in this data set are 0 and 1.
class_proba = pipeline.predict_proba(X_test)
preds = class_proba[:, 1]

# Store the scores as a one-column table and write it out without the index.
pred_df = pd.Series(preds, name='preds').to_frame()
pred_df.to_csv("test_predictions.csv", index=None)

In [17]:
# Show the first ten predicted probabilities.
preds[:10]

array([0.92735126, 0.88915367, 0.97027552, 0.97112008, 0.85508477,
       0.92168461, 0.38245843, 0.92049061, 0.59429108, 0.99547534])

In [18]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

# F-score at every operating point. Where precision + recall == 0 the ratio is
# 0/0; define the score as 0 there so no NaN is produced (np.argmax would
# otherwise return the position of the NaN).
pr_sum = precision + recall
fscore = np.divide(2 * precision * recall, pr_sum,
                   out=np.zeros_like(pr_sum), where=pr_sum > 0)
# locate the index of the largest f score; the final precision/recall pair has
# no matching entry in `thresholds`, so it is left out of the search
ix = np.argmax(fscore[:-1])
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.6542651994945508, F-Score=0.976, Precision=0.967, Recall=0.985


In [19]:
from flask import Flask, request, jsonify


In [20]:
# Load the trained model that the Flask handlers use for scoring.
# NOTE: dill.load can execute arbitrary code; only load files you trust.
with open('logreg_pipeline.dill', mode='rb') as model_file:
    model = dill.load(model_file)

In [21]:
# Reload the held-out split from the CSV files.
X_test = pd.read_csv("X_test.csv")
y_test = pd.read_csv("y_test.csv")

# Handlers and Flask startup
app = Flask(__name__)
# run_with_ngrok(app)  # Start ngrok when app is run


@app.route("/", methods=["GET"])
def general():
    """Landing route: answers ``GET /`` with a fixed greeting string."""
    return "Welcome to prediction process"

@app.route('/predict', methods=['POST'])
def predict():
    """
    Score one news item posted as JSON and return the model's probability.

    The JSON body may carry ``title``, ``news_url`` and ``source_domain`` --
    the columns the loaded pipeline selects. A key that is absent, null or
    empty falls back to an empty string.

    Returns a JSON object with ``success`` and, on success, ``predictions``
    (second column of ``model.predict_proba``) plus the echoed ``title``.
    A body that is not a JSON object yields ``success: false`` with HTTP
    status 400.
    """
    data = {"success": False}

    # silent=True returns None instead of raising on a missing/invalid body,
    # so a malformed request gets the same JSON envelope as every other reply.
    request_json = request.get_json(silent=True)
    if not isinstance(request_json, dict):
        return jsonify(data), 400

    # .get(...) or "" covers both a missing key and an explicit null/empty value.
    title = request_json.get("title") or ""
    news_url = request_json.get("news_url") or ""
    source_domain = request_json.get("source_domain") or ""

    print(title)
    # Column names must match the ones the pipeline's selectors look up.
    preds = model.predict_proba(pd.DataFrame({"title": [title],
                                              "news_url": [news_url],
                                              "source_domain": [source_domain]}))
    # Cast the NumPy scalar to a plain float before it is serialized.
    data["predictions"] = float(preds[:, 1][0])
    data["title"] = title
    # indicate that the request was a success
    data["success"] = True
    print('OK')

    # return the data dictionary as a JSON response
    return jsonify(data)


# Start Flask's development server; this call blocks the cell until the
# server is stopped.
if __name__ == '__main__':
    app.run()

In [None]:
# Handlers and Flask startup
app = Flask(__name__)
# run_with_ngrok(app)  # Start ngrok when app is run


@app.route("/", methods=["GET"])
def general():
    """Landing route: answers ``GET /`` with a fixed greeting string."""
    return "Welcome to prediction process"

@app.route('/predict', methods=['POST'])
def predict():
    """
    Score one news item posted as JSON and return the model's probability.

    The JSON body may carry ``title``, ``news_url`` and ``source_domain``.
    A key that is absent, null or empty falls back to an empty string.

    Returns a JSON object with ``success`` and, on success, ``predictions``
    (second column of ``model.predict_proba``) plus the echoed ``title``.
    A body that is not a JSON object yields ``success: false`` with HTTP
    status 400.
    """
    data = {"success": False}

    # silent=True returns None instead of raising on a missing/invalid body,
    # so a malformed request gets the same JSON envelope as every other reply.
    request_json = request.get_json(silent=True)
    if not isinstance(request_json, dict):
        return jsonify(data), 400

    # .get(...) or "" covers both a missing key and an explicit null/empty value.
    title = request_json.get("title") or ""
    news_url = request_json.get("news_url") or ""
    # The posted source domain must land in `source_domain`, the variable that
    # is passed to the model below.
    source_domain = request_json.get("source_domain") or ""

    print(title)
    preds = model.predict_proba(pd.DataFrame({"title": [title],
                                              "news_url": [news_url],
                                              "source_domain": [source_domain]}))
    # Cast the NumPy scalar to a plain float before it is serialized.
    data["predictions"] = float(preds[:, 1][0])
    data["title"] = title
    # indicate that the request was a success
    data["success"] = True
    print('OK')

    # return the data dictionary as a JSON response
    return jsonify(data)


# Start Flask's development server; this call blocks the cell until the
# server is stopped (the log below says "Press CTRL+C to quit").
if __name__ == '__main__':
    app.run()

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [28/Nov/2022 22:45:05] "POST /predict HTTP/1.1" 200 -


Will Stabler return to 'Law and Order: SVU' for battered Benson?
OK


127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -


Will Stabler return to 'Law and Order: SVU' for battered Benson?
OK
Travis Barker Survives ‘Really Bad’ Crash with School Bus
OK
Iggy Azalea's Revenge on Nick Young: I Burned All His Clothes
OK
We have reason to believe a new Justin Timberlake album could be on its way
OK
23 Times Blake Lively And Ryan Reynolds Trolled Each Other On Social Media
OK
Is Chris Hemsworth Done Playing Thor After 'Avengers 4'? He's Had Such Mixed Feelings About The Role
OK
Blac Chyna Suing Kardashian And Family In AllEncompassing Lawsuit
OK
Rachel Bilson Steps Out After Split From Hayden Christensen
OK
Spice Girls 'reunion is cancelled again'
OK
What is your go-to Karaoke song? : AskReddit
OK
BREAKING: Roy Moore’s Accuser Arrested And Charged With Falsification
OK
Kylie Jenner and Travis Scott spending quality time together 'before birth of first baby'
OK


127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /pred

B Strong Disaster Relief – Delivering Good
OK
Is Anthony Scaramucci Dating Katrina Pierson?: Spotted Out Together After He’s Fired By Trump
OK
Josephine Skriver’s Morning Routine Involves Sweat and a Spoon
OK
Best Dressed Stars on Cannes Red Carpet 2017
OK
Angelina Jolie Buys $25 Million L.A. Mansion: Prepping For Wedding? — See Pics Inside
OK
Michael Bennet for US Senate
OK
'Black Panther' is the most tweeted about movie ever
OK
Teresa Giudice is dating businessman, report claims
OK
Stars Shine at the Palm Springs International Film Festival
OK
21 things you may not know about Ashley Graham
OK
Nick Carter's Rape Accuser Melissa Schuman Files Police Report, Investigation Underway
OK
Gisele and Tom Brady Are Twinning in Matching Suits
OK
A Complete Timeline of Selena Gomez and Justin Bieber's Relationship
OK
Mother of Bullied Child Speaks Out Amid Confederate Flag Controversy
OK
Rob Kardashian’s Daughter Dream Says ”Hi, Dad!” in Sweet Video
OK


127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:09] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:10] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:10] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:10] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:10] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:10] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:10] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:10] "POST /predict HTTP/1.1" 200 -


Alison Brie Addresses James Franco Allegations at SAG Awards
OK
Clare Crawley Says She’s in Love After Leaving Bachelor Winter Games — But Who’s the Lucky Guy?
OK
Keeping Up with the Kardashians
OK
Meghan Markle ‘Overwhelmed’ By Royal Etiquette — How Harry & The Queen Are Helping
OK
Will Mama June Maintain Her 300-Pound Weight Loss? Wendy Williams Delivers Her Honest Prediction
OK
Meghan Markle’s dad Thomas causing royal family problems
OK
Kim Kardashian & Kanye West Named Their Daughter Elle V. West & Here’s Why
OK
Khloe Kardashian’s ‘Only Days Away’ From Giving Birth — Will Her Sisters Be In Delivery Room?
OK
Tom Daley and husband Dustin Lance Black release first photos of their baby boy
OK
Channing Tatum and Jenna Dewan Silence Rumors About Their Split
OK
Tori Spelling Owes A $220,000 Default Judgement
OK
Madonna Considering a Move to Rome (Report)
OK
Women Sharing Stories Of Sexual Assault Helped Laura Dern Identify Her Own


127.0.0.1 - - [28/Nov/2022 22:45:10] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:10] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:10] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:10] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:10] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:10] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:10] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:10] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:10] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:10] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [28/Nov/2022 22:45:10] "POST /predict HTTP/1.1" 200 -


OK
Comic-Con 2017: The Winners and Losers (Photos)
OK
Palin-Huey
OK
Did Scott Disick send flowers to both Bella Thorne and Kourtney Kardashian on the same day?
OK
Have We Noticed That Every Time Drake Dates a Woman He's Not Really Dating Her?
OK
Channing Tatum  Jenna Dewan Fighting Over Nude Photo  Claim
OK
S.1591 - 104th Congress (1995-1996): A bill to prohibit campaign expenditures for services of lobbyists, and for other purposes.
OK
Justin Bieber says religion ‘set me free from bondage and shame’
OK
‘Once Upon a Time’ Series Finale to Bring Back Jennifer Morrison, Ginnifer Goodwin
OK
The first trailer for the documentary "Whitney" reminds us that we will ALWAYS love the iconic singer
OK
Jennifer Lopez And Alex Rodriguez Headed For A Split? Inside Her Demands For A Ring
OK


In [None]:
# Number of leading test rows to score in the next cell.
N=50

In [None]:
%%time
# Score the first N test rows through the `/predict` route and collect the
# JSON replies. Flask's test client calls the route in-process, so no running
# server is needed, and its responses provide `.get_json()`.
client = app.test_client()
predictions = X_test[
                     ['title', 'news_url', 'source_domain']
                     ].iloc[:N].apply(
    # Missing values become '' so each row is posted as plain strings.
    lambda row: client.post('/predict', json=row.fillna('').to_dict()).get_json(),
    axis=1,
)