!pip install dill

# Итоговый проект

# Step 1 - TRAIN

In [1]:
import pandas as pd
import dill
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
from sklearn.metrics import f1_score

#working with text
from sklearn.feature_extraction.text import TfidfVectorizer

#normalizing data
from sklearn.preprocessing import StandardScaler

#pipeline
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import precision_score,recall_score

#imputer
from sklearn.impute import SimpleImputer

import sklearn.datasets

Используем данные

https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset

In [2]:
# Read the fake-news and real-news tables directly from their zip archives.
df_fake = pd.read_csv("Fake.csv.zip",compression='zip')
df_true = pd.read_csv("True.csv.zip",compression='zip')

In [3]:
df_fake.describe()

Unnamed: 0,title,text,subject,date
count,23481,23481.0,23481,23481
unique,17903,17455.0,6,1681
top,MEDIA IGNORES Time That Bill Clinton FIRED His...,,News,"May 10, 2017"
freq,6,626.0,9050,46


In [4]:
df_true.describe()

Unnamed: 0,title,text,subject,date
count,21417,21417,21417,21417
unique,20826,21192,2,716
top,Factbox: Trump fills top jobs for his administ...,(Reuters) - Highlights for U.S. President Dona...,politicsNews,"December 20, 2017"
freq,14,8,11272,182


In [5]:
# Binary target: 0 for every row of the real-news frame, 1 for every row of the fake-news frame.
df_true['label'] = 0
df_fake['label'] = 1

In [6]:
# Stack both sources into one frame with a fresh 0..n-1 index,
# then print the column dtypes and non-null counts.
df = pd.concat([df_true, df_fake], ignore_index=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   label    44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.7+ MB


In [7]:
df.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",0


In [8]:
pd.isnull(df)

Unnamed: 0,title,text,subject,date,label
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
...,...,...,...,...,...
44893,False,False,False,False,False
44894,False,False,False,False,False
44895,False,False,False,False,False
44896,False,False,False,False,False


Разделим данные на train/test и сохраним тестовую выборку на диск

In [9]:
# Hold out 33% of the rows; the seed is fixed so the split is repeatable.
# Note: the full frame is passed as X, so X_train / X_test still carry the 'label' column.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    df['label'],
    test_size=0.33,
    random_state=42,
)

# Write every part to "<name>.csv" without the index column (test parts first, then train parts).
for _split_name, _split in (("X_test", X_test),
                            ("y_test", y_test),
                            ("X_train", X_train),
                            ("y_train", y_train)):
    _split.to_csv(f"{_split_name}.csv", index=None)

In [10]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that pulls a single column out of a data frame so
    that the following pipeline steps operate on that column only.
    """

    def __init__(self, key):
        # Label of the column to extract in transform().
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[self.key]``."""
        selected = X[self.key]
        return selected


class TextImputer(BaseEstimator, TransformerMixin):
    """
    Replace missing values in one column of a data frame with a constant.

    Parameters
    ----------
    key : column label
        Column whose missing values are filled.
    value : object
        Replacement passed to ``Series.fillna``.
    """

    def __init__(self, key, value):
        self.key = key
        self.value = value

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` in which column ``key`` has its missing values filled.

        The input frame is left untouched: writing into it would silently
        modify the caller's data (e.g. ``X_train`` / ``X_test``) every time
        the pipeline is fitted or used for prediction.
        """
        X = X.copy()
        X[self.key] = X[self.key].fillna(self.value)
        return X


class OHEEncoder(BaseEstimator, TransformerMixin):
    """
    One-hot encoder built on ``pd.get_dummies`` that always emits the dummy
    columns seen during ``fit``, in the same order.
    """

    def __init__(self, key):
        # Prefix passed to pd.get_dummies for the generated column names.
        self.key = key
        # Dummy column names recorded by fit(); empty until then.
        self.columns = []

    def fit(self, X, y=None):
        """Record the dummy column names produced for the fitting data."""
        self.columns = list(pd.get_dummies(X, prefix=self.key).columns)
        return self

    def transform(self, X):
        """Encode ``X`` and align the result with the columns recorded by fit().

        Columns recorded at fit time but absent from this batch are added and
        filled with 0; columns that were not seen at fit time are dropped by
        the final selection.
        """
        encoded = pd.get_dummies(X, prefix=self.key)
        present = list(encoded.columns)
        absent = [name for name in self.columns if name not in present]
        for name in absent:
            encoded[name] = 0
        return encoded[self.columns]


In [11]:
features = ['title', 'text', 'subject']
target = 'label'

Соберем кусок, ответственный за feature engineering

In [12]:
# combine
def _text_branch(column):
    """Build the impute -> select -> TF-IDF sub-pipeline for one free-text column."""
    return Pipeline([("imput", TextImputer(column, '')),
                     ("col_sec", ColumnSelector(key=column)),
                     # ngram_range is documented as a tuple; a list is rejected by
                     # scikit-learn releases that validate constructor parameters.
                     ('tfidif', TfidfVectorizer(stop_words='english', ngram_range=(1, 3), max_df=0.9, min_df=10))
                     ])


# Both free-text columns go through the same processing chain.
title = _text_branch('title')
text = _text_branch('text')

# The categorical column is one-hot encoded.
subject = Pipeline([("col_sec", ColumnSelector(key='subject')),
                    ('onhe', OHEEncoder(key='subject'))
                    ])

# Concatenate the three feature blocks side by side.
feats = FeatureUnion([("title", title),
                      ('text', text),
                      ('subject', subject)
                      ])


Добавим простейший классификатор

In [13]:
%%time

pipeline = Pipeline([('features', feats),
                     ('classifier', GradientBoostingClassifier())
                     ])

pipeline.fit(X_train, y_train)

CPU times: user 3min 33s, sys: 5.19 s, total: 3min 38s
Wall time: 3min 38s


Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('title',
                                                 Pipeline(steps=[('imput',
                                                                  TextImputer(key='title',
                                                                              value='')),
                                                                 ('col_sec',
                                                                  ColumnSelector(key='title')),
                                                                 ('tfidif',
                                                                  TfidfVectorizer(max_df=0.9,
                                                                                  min_df=10,
                                                                                  ngram_range=[1,
                                                                                               3],
                        

Посмотрим, как выглядит наш pipeline

In [14]:
pipeline.steps

[('features',
  FeatureUnion(transformer_list=[('title',
                                  Pipeline(steps=[('imput',
                                                   TextImputer(key='title',
                                                               value='')),
                                                  ('col_sec',
                                                   ColumnSelector(key='title')),
                                                  ('tfidif',
                                                   TfidfVectorizer(max_df=0.9,
                                                                   min_df=10,
                                                                   ngram_range=[1,
                                                                                3],
                                                                   stop_words='english'))])),
                                 ('text',
                                  Pipeline(steps=[('imput',
        

Сохраним модель (пайплайн)

In [15]:
# Serialize the fitted pipeline to disk with dill.
with open("gradboost_pipeline.dill", mode="wb") as out_strm:
    dill.dump(pipeline, out_strm)

# Step 2 - PREDICT

### Проверка работоспособности и качества пайплайна

Загружаем модель (pipeline) напрямую и проверяем на отложенной (тестовой) выборке

In [16]:
# Re-read the held-out split written to disk in the training step.
# y_test comes back as a DataFrame (read_csv), not as a Series.
X_test = pd.read_csv("X_test.csv")
y_test = pd.read_csv("y_test.csv")

In [17]:
X_test.head(3)

Unnamed: 0,title,text,subject,date,label
0,BREAKING: GOP Chairman Grassley Has Had Enoug...,"Donald Trump s White House is in chaos, and th...",News,"July 21, 2017",1
1,Failed GOP Candidates Remembered In Hilarious...,Now that Donald Trump is the presumptive GOP n...,News,"May 7, 2016",1
2,Mike Pence’s New DC Neighbors Are HILARIOUSLY...,Mike Pence is a huge homophobe. He supports ex...,News,"December 3, 2016",1


In [18]:
# Restore the fitted pipeline saved in the training step.
# NOTE: like pickle, dill can execute arbitrary code while loading —
# only load files that were produced by a trusted source.
with open('gradboost_pipeline.dill', 'rb') as in_strm:
    pipeline = dill.load(in_strm)

In [19]:
pipeline

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('title',
                                                 Pipeline(steps=[('imput',
                                                                  TextImputer(key='title',
                                                                              value='')),
                                                                 ('col_sec',
                                                                  ColumnSelector(key='title')),
                                                                 ('tfidif',
                                                                  TfidfVectorizer(max_df=0.9,
                                                                                  min_df=10,
                                                                                  ngram_range=[1,
                                                                                               3],
                        

In [20]:
# predict_proba returns one column per class; keep column 1
# (presumably the probability of label 1 — confirm against pipeline.classes_).
class_proba = pipeline.predict_proba(X_test)
preds = class_proba[:, 1]

# Store the scores in a one-column CSV without the index.
pred_df = pd.DataFrame({'preds': preds})
pred_df.to_csv("test_predictions.csv", index=None)

In [21]:
preds[:10]

array([9.99978578e-01, 9.99978578e-01, 9.99978578e-01, 3.82697712e-05,
       9.99978578e-01, 5.27423015e-05, 3.82673410e-05, 3.82673410e-05,
       9.99978578e-01, 9.99978578e-01])

In [22]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

# F-score at every point of the curve. Where precision + recall == 0 the
# score is set to 0 instead of NaN (0/0), because a NaN would win np.argmax.
pr_sum = precision + recall
fscore = np.divide(2 * precision * recall, pr_sum,
                   out=np.zeros_like(pr_sum, dtype=float), where=pr_sum > 0)
# precision/recall hold one more element than thresholds (the final point has
# no threshold), so search only the points that can index into thresholds.
ix = np.argmax(fscore[:-1])
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.9982706473891718, F-Score=1.000, Precision=1.000, Recall=1.000


Фиксируем в маркдаун лучшие значения

Best Threshold=0.9982706473891718, F-Score=1.000, Precision=1.000, Recall=1.000

# Step 3 - FLASK

## При внедрении

**При внедрении необходимо сделать:**
*   Определить формат json'а, в котором данные будут приниматься сервисом и отправляться обратно.
*   Определить ip-адрес и порт, на который будут поступать данные.
*   Создать во Flask необходимые роуты:<br/>
    `@app.route('/predict_example', methods=['POST'])`<br/>
    `def predict_example():`
*   Перенести во Flask все функции преобразования данных,
    *   формат данных, приходящих от фронт-системы, может отличаться от формата исторических данных, использовавшихся при построении модели; в результате преобразований данные на вход модели должны поступить ровно в том виде, в каком была обучена модель.
*   Загрузить обученные модели.
*   Настроить логирование.

## Flask


!pip install flask

In [23]:
from flask import Flask, request, jsonify
import pandas as pd

### **Создаем сервис для обработки запросов к модели**

In [24]:
# Load the trained model (the fitted pipeline saved with dill).
# NOTE: like pickle, dill can execute arbitrary code while loading —
# only load files that were produced by a trusted source.
with open('gradboost_pipeline.dill', 'rb') as in_strm:
    model = dill.load(in_strm)

In [25]:
# Read the held-out split from disk (the same files that are loaded in the PREDICT step).
X_test = pd.read_csv("X_test.csv")
y_test = pd.read_csv("y_test.csv")

Запустить сервис и не глушить его, пока работаем 

In [None]:
# Flask application object and request handlers
app = Flask(__name__)

@app.route("/", methods=["GET"])
def general():
    """Landing route: answer GET / with a fixed greeting string."""
    greeting = "Welcome to prediction process"
    return greeting


@app.route('/predict', methods=['POST'])
def predict():
    """Score one news item sent as JSON and return the model output as JSON.

    Expected request body: a JSON object with optional string fields
    ``title``, ``text`` and ``subject``; a missing or empty field is treated
    as an empty string.

    Response: a JSON object with ``success`` (bool), ``predictions`` (the
    value from column 1 of ``model.predict_proba``) and ``description`` (the
    title that was scored). A body that is not a JSON object yields
    ``success: false`` with HTTP status 400.
    """
    data = {"success": False}

    # silent=True returns None for a missing or malformed JSON body instead of raising.
    request_json = request.get_json(silent=True)
    if not isinstance(request_json, dict):
        data["error"] = "request body must be a JSON object"
        return jsonify(data), 400

    # Read the fields into the same names that are passed to the model below.
    # (Previously they were stored under unrelated names, so the model was
    # always scored on empty strings.)
    title = request_json.get("title") or ""
    text = request_json.get("text") or ""
    subject = request_json.get("subject") or ""

    print(title)
    preds = model.predict_proba(pd.DataFrame({"title": [title],
                                              "text": [text],
                                              "subject": [subject]}))
    # Plain float so the value serializes identically regardless of the numpy scalar type.
    data["predictions"] = float(preds[:, 1][0])
    data["description"] = title
    # indicate that the request was a success
    data["success"] = True
    print('OK')

    # return the data dictionary as a JSON response
    return jsonify(data)


# Start Flask's built-in server with its default settings; the call keeps
# running (and keeps the cell busy) until the server is stopped.
if __name__ == '__main__':
    app.run()

 * Serving Flask app '__main__' (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000 (Press CTRL+C to quit)
127.0.0.1 - - [04/Apr/2022 18:05:05] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:05] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:05] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:05] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:05] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:05] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:05] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:05] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:05] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:05] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:05] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:05] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:05] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:05] "POST /predict HTTP/

 BREAKING: GOP Chairman Grassley Has Had Enough, DEMANDS Trump Jr. Testimony
OK
 Failed GOP Candidates Remembered In Hilarious Mocking Eulogies (VIDEO)
OK
 Mike Pence’s New DC Neighbors Are HILARIOUSLY Trolling Him For Being A Homophobic Bigot
OK
California AG pledges to defend birth control insurance coverage
OK
AZ RANCHERS Living On US-Mexico Border Destroy Nancy Pelosi’s Claim About Trump Being “Weak” For Wanting Border Wall [VIDEO]
OK
As private lawyer, Trump high court pick was friend to business
OK
Yemeni Salafist imam killed in Aden: sources
OK
FBI says witnesses in U.S. probe into Malaysia's 1MDB fear for safety
OK
An Easy To Read Chart Shows How Bernie Sanders’ Socialism Is Just A Stepping Stone To Communism
OK
MMA FIGHTER JAKE SHIELDS Embarrasses Cowards In Masks For Violent 20-on-1 Beating of Trump Supporter [VIDEO]: “I was in Berkeley and watched a man getting beat by a mob with no police help…I was the only person to jump in and help”
OK
 Tired Of Things Going Well, Marco 

127.0.0.1 - - [04/Apr/2022 18:05:05] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:05] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:05] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:06] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:06] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:06] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:06] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:06] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:06] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:06] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:06] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:06] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:06] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:06] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:06] "POST /pred

OK
'Lips and teeth' no more as China's ties with North Korea fray
OK
Factbox: Trump on Twitter (Sept 25) - NASCAR, NFL, John Kelly, Puerto Rico, John McCain, Alabama
OK
Judge declines to dismiss bribery charges against U.S. Senator Menendez
OK
FATHER OF BENGHAZI VICTIM TY WOODS SPEAKS UP AGAINST HILLARY: “My son would still be alive…” [Video]
OK
Before debate, protesters build 'wall of taco trucks' outside Trump hotel
OK
Obama to call for increases in budget for SEC, CFTC: White House
OK
 Ammon Bundy Just Got Sweet Justice Handed To Him By A Federal Judge (VIDEO)
OK
CHARITY GETS MILLIONS From US To Train “Vulnerable Afghans”
OK
LIVE FEED: INAUGURATION 2017!
OK
Senate repeals Labor Dept. municipal retirement plan rule
OK
Trump explained U.S. position on THAAD to Xi: South Korea
OK
Kosovo war crimes court ready for first indictments: chief judge
OK
WATCH: 40,000 BOY SCOUTS BOO Barack Obama…GO CRAZY…Chant “We Love Trump” During President Trump’s Visit To National Scout Jamboree
OK
BREAKIN

127.0.0.1 - - [04/Apr/2022 18:05:06] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:06] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:06] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:06] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:06] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:06] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:06] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:06] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:06] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:06] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:06] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:06] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:06] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:06] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [04/Apr/2022 18:05:06] "POST /pred

OK
Cambodia's Hun Sen urges arrests of opposition 'rebels in the city'
OK
 WATCH: Wolf Blitzer Makes Republican Throw Temper Tantrum Over Trump’s Nazi Problem
OK
RIP PROUD PATRIOT: MARINE Dies After Delivering Powerful Message To President Trump, Mike Pence, General Mattis: “Give ‘Em Hell…Semper Fi…God Bless” [VIDEO]
OK
 The Shocking Detail Trump Doesn’t Want You To Know About HIS MOTHER (VIDEO)
OK
Irish border row thwarts May bid to clinch Brexit trade deal
OK
Senate passes budget blueprint, key to Trump tax effort
OK
 White Nationalist Radio: Trump Gave Us A Press Pass And We Interviewed His Son
OK
‘Vaccine Choice’ Mom Gets Jail Time for Not Jabbing Her Kid
OK
Factbox: British business organizations react to Brexit talk progress
OK
Iraqi Kurds face more sanctions after calling elections
OK
Cambodia marks independence from France with doves, balloons
OK
 Trump Staff Defends Shoving Breitbart Reporter By Saying They Thought She Was Liberal
OK
 AG Sessions Persuaded Trump To End DACA, T

127.0.0.1 - - [04/Apr/2022 18:05:06] "POST /predict HTTP/1.1" 200 -


OK
