In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [23]:
# Load the fake-news tweets, replace missing tweet text with an empty string
# and tag every row with label='fake'.
# The fill is done through `.assign` instead of
# `fake_news['tweet'].fillna('', inplace=True)`: an in-place call on a column
# selection is chained assignment, which pandas 2.x warns about and which
# leaves the frame unchanged once Copy-on-Write is enabled.
fake_news = (
    pd.read_csv('fake.csv')
    .assign(tweet=lambda frame: frame['tweet'].fillna(''), label='fake')
)

In [24]:
# Load the real-news tweets, replace missing tweet text with an empty string
# and tag every row with label='real'.
# The fill is done through `.assign` instead of
# `real_news['tweet'].fillna('', inplace=True)`: an in-place call on a column
# selection is chained assignment, which pandas 2.x warns about and which
# leaves the frame unchanged once Copy-on-Write is enabled.
real_news = (
    pd.read_csv('real.csv')
    .assign(tweet=lambda frame: frame['tweet'].fillna(''), label='real')
)

In [25]:
# Preview the first rows of the labelled real-news frame.
real_news.head()

Unnamed: 0,tweet,label
0,sun downs technical director: al-ahly respecte...,real
1,shawky gharib after the tie with enppi: our go...,real
2,"egyptian sports news today, wednesday 1/25/202...",real
3,the main referees committee of the egyptian fo...,real
4,"haji bari, the striker of the future team, is ...",real


In [26]:
# Stack the fake and real frames into a single dataset; ignore_index=True
# renumbers the rows 0..n-1 instead of keeping each frame's own index.
news = pd.concat((fake_news, real_news), ignore_index=True)

# Display the rows from position 25000 to the end of the combined frame.
news.iloc[25000:]

Unnamed: 0,tweet,label
25000,the return of al-souliya to group training thu...,real
25001,the egyptian men's national handball team has ...,real
25002,"""a coach with experience and achievements."" mo...",real
25003,al-ahly is close to settling the deal for the ...,real
25004,"ibrahim hassan, director of football at the al...",real
...,...,...
41863,shawky gharib: we excluded 4 players from the ...,real
41864,dina musharraf resumes her career in the singa...,real
41865,"indeed, al-ahly ended its agreement with ricar...",real
41866,"hazem imam, the captain of zamalek, decided to...",real


In [28]:
# Split the data into training and test sets: 20% of the rows are held out
# for testing, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    news['tweet'],
    news['label'],
    test_size=0.2,
    random_state=42,
)

In [29]:
# Turn the raw tweet text into TF-IDF feature matrices.
# The vectorizer is fitted on the training tweets only; the test tweets are
# encoded with that already-fitted vectorizer via transform().
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [30]:
# Train the logistic regression model (default hyperparameters) on the
# TF-IDF features of the training tweets.
model = LogisticRegression()
model.fit(X_train_vec, y_train)

In [31]:
# Evaluate model quality on the test set.
y_pred = model.predict(X_test_vec)
print('Accuracy:', accuracy_score(y_test, y_pred))

# Precision, recall and F1 are all computed with 'fake' as the positive class.
for metric_label, metric_fn in (
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1-score:', f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred, pos_label='fake'))

Accuracy: 0.9509195127776451
Precision: 0.9451581027667985
Recall: 0.9529265255292653
F1-score: 0.9490264169663897


In [34]:
# Apply the model to classify new headlines: encode them with the fitted
# vectorizer, then predict a label for each one.
new_news = [
    "Russia has won the World Cup",
    "Mo Salah is the best football player in Egypt history",
]
new_news_vec = vectorizer.transform(new_news)
new_news_pred = model.predict(new_news_vec)
print('Predictions:', new_news_pred)

Predictions: ['fake' 'real']
