<a href="https://colab.research.google.com/github/dkurbatovv/Python/blob/main/News_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm, trange

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Both files are tab-separated; the test file is read without a header row.
df_train = pd.read_csv('train.tsv', sep='\t')
df_test = pd.read_csv('test.tsv', sep='\t', header=None)

In [None]:
# Show the loaded training frame.
df_train

In [None]:
# (rows, columns) of the training frame.
df_train.shape

In [None]:
# Bar chart of how many rows fall into each `is_fake` class.
# The column is passed via the `x` keyword: seaborn >= 0.12 makes every argument
# after `data` keyword-only, so a positional Series would be taken as `data`
# instead of the variable to count.
plt.figure(figsize=(12, 10))
sns.countplot(x='is_fake', data=df_train)

# ***Предобработка***

In [None]:
import string

def remove_punctuation(text):
  """Return `text` with every character listed in `string.punctuation` replaced by a space.

  Characters that are not in `string.punctuation` are kept unchanged, so the
  result has the same length as the input.
  """
  pieces = []
  for symbol in text:
    pieces.append(' ' if symbol in string.punctuation else symbol)
  return "".join(pieces)


def remove_numbers(text):
  """Return `text` with each character for which `str.isdigit()` is true replaced by a space."""
  def blank_if_digit(symbol):
    # Non-digit characters pass through untouched.
    return ' ' if symbol.isdigit() else symbol

  return "".join(map(blank_if_digit, text))


import re
def remove_multiple_spaces(text):
  """Collapse every run of whitespace in `text` into a single space.

  Any whitespace matched by `\\s` (spaces, tabs, newlines, ...) counts, so a
  lone tab or newline also becomes a space. Leading and trailing whitespace is
  reduced to one space, not stripped.
  """
  # No case-insensitive flag: the pattern contains no letters, so it would do nothing.
  return re.sub(r'\s+', ' ', text)

import nltk
from nltk.stem import *
# Download only the NLTK resources this notebook reads, rather than the entire
# 'all' collection: the stop-word lists and the Punkt tokenizer models that
# word_tokenize relies on.
nltk.download('stopwords')
nltk.download('punkt')
# NOTE(review): recent NLTK releases look the tokenizer data up under 'punkt_tab';
# older releases may just report that this package is unknown — confirm against
# the installed NLTK version.
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation

# Single MyStem instance, reused by lemmatize_text.
mystem = Mystem()

# NLTK's Russian stop words plus two punctuation tokens to filter out.
russian_stopwords = stopwords.words("russian")
russian_stopwords.extend(['...', '"'])



def lemmatize_text(text):
  """Lowercase `text`, lemmatize it with MyStem and drop stop words.

  Tokens found in the module-level `russian_stopwords` and single-space tokens
  are discarded; the remaining lemmas are joined with single spaces.
  """
  kept = []
  for lemma in mystem.lemmatize(text.lower()):
    if lemma in russian_stopwords or lemma == " ":
      continue
    kept.append(lemma)
  return " ".join(kept)

In [None]:
# Clean every title: lowercase -> punctuation to spaces -> digits to spaces -> collapse whitespace.
prep_text = [
    remove_multiple_spaces(
        remove_numbers(
            remove_punctuation(title.lower())
        )
    )
    for title in tqdm(df_train['title'])
]

In [None]:
# Number of cleaned titles produced by the preprocessing step.
len(prep_text)

In [None]:
# Inspect the first cleaned title.
prep_text[0]

In [None]:
# Store the cleaned titles next to the originals; later cells read this column.
df_train['text_pred'] = prep_text

In [None]:
# Preview the frame with the new `text_pred` column.
df_train.head()

# ***Стемминг***

In [None]:
from nltk.stem.snowball import SnowballStemmer
# Snowball stemmer configured for Russian; used by the stemming cell.
stemmer = SnowballStemmer(language="russian")

In [None]:
# Rebuild the stop-word list from NLTK's Russian set and add the pieces of the
# abbreviation "т.д.". This replaces the list defined in the preprocessing cell.
russian_stopwords = stopwords.words("russian") + ['т.д.', 'т', 'д']

In [None]:
from nltk import word_tokenize

# Tokenize each cleaned title, drop stop words (checked on the unstemmed token),
# stem what remains and join the stems back into one string per title.
stemmed_texts_list = [
    " ".join(
        stemmer.stem(word)
        for word in word_tokenize(doc)
        if word not in russian_stopwords
    )
    for doc in tqdm(df_train['text_pred'])
]

df_train['text_stem'] = stemmed_texts_list

In [None]:
# Preview the frame with the new `text_stem` column.
df_train.head()

In [None]:
def remove_stop_words(text):
    """Tokenize `text` and return it without stop words.

    Tokens present in the module-level `russian_stopwords`, as well as tokens
    equal to a single space, are dropped; the rest are joined with single spaces.
    """
    kept = []
    for word in word_tokenize(text):
        if word in russian_stopwords or word == ' ':
            continue
        kept.append(word)
    return " ".join(kept)

In [None]:
from nltk import word_tokenize

# Stop-word-filtered version of every cleaned title. The filtering itself is
# delegated to remove_stop_words so the rule lives in one place instead of
# being repeated inline here.
sw_texts_list = [remove_stop_words(text) for text in tqdm(df_train['text_pred'])]

df_train['text_sw'] = sw_texts_list

In [None]:
# Preview the frame with the new `text_sw` column.
df_train.head()

# ***Лемматизация***

In [None]:
# Disabled experiment: the MyStem lemmatisation pass below sits inside a string
# literal, so none of it runs and `text_lemm` is never created.
# NOTE(review): if this is re-enabled, a failing lemmatize() call only prints the
# error and appends nothing, so lemm_texts_list could end up shorter than
# df_train and no longer line up with its rows.
'''
lemm_texts_list = []
for text in tqdm(df_train['text_sw']):
    #print(text)
    try:
        text_lem = mystem.lemmatize(text)
        tokens = [token for token in text_lem if token != ' ' and token not in russian_stopwords]
        text = " ".join(tokens)
        lemm_texts_list.append(text)
    except Exception as e:
        print(e)
    
df_train['text_lemm'] = lemm_texts_list

'''

In [None]:
# Features are the stop-word-filtered titles (the stemmed `text_stem` column is
# not used here); the target is the `is_fake` label.
X, y = df_train['text_sw'], df_train['is_fake']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Distinct label values present in the target column.
# NOTE(review): `my_tags` is not referenced by any later cell visible here.
my_tags = df_train['is_fake'].unique()
my_tags

# ***Naive Bayes Classifier***

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Token counts -> TF-IDF weighting -> multinomial Naive Bayes classifier.
nb = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [None]:
%%time
# Fit the Naive Bayes pipeline on the training split (the cell magic must stay on the first line).
nb.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report
# Predict labels for the held-out split; `y_pred` is overwritten again by each later model.
y_pred = nb.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# Hold-out accuracy and per-class report for the predictions currently in `y_pred`
# (here, the Naive Bayes pipeline). Both metrics take (y_true, y_pred) in that
# order, so the two calls now agree.
print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# ***Logistic Regression***

In [None]:
from sklearn.linear_model import LogisticRegression

# Token counts -> TF-IDF weighting -> logistic regression.
# C=1e5 is a very large inverse regularisation strength, i.e. almost no penalty.
logreg = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(n_jobs=1, C=1e5)),
])

In [None]:
%%time
# Fit the logistic-regression pipeline on the training split.
logreg.fit(X_train, y_train)

In [None]:
%%time
# Predict labels for the held-out split; replaces the previous model's `y_pred`.
y_pred = logreg.predict(X_test)

In [None]:
# Hold-out accuracy and per-class report for the predictions currently in `y_pred`
# (here, the logistic-regression pipeline). Both metrics take (y_true, y_pred)
# in that order, so the two calls now agree.
print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# ***Linear Support Vector Machine***

In [None]:
from sklearn.linear_model import SGDClassifier

# Token counts -> TF-IDF weighting -> linear SVM trained with SGD (hinge loss, L2 penalty).
# tol=None disables the tolerance-based stopping check, so training runs exactly max_iter=5 epochs.
sgd = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    (
        'clf',
        SGDClassifier(
            loss='hinge',
            penalty='l2',
            alpha=1e-3,
            random_state=42,
            max_iter=5,
            tol=None,
        ),
    ),
])

In [None]:
%%time
# Fit the SGD-based linear SVM pipeline on the training split.
sgd.fit(X_train, y_train)

In [None]:
%%time
# Predict labels for the held-out split; replaces the previous model's `y_pred`.
y_pred = sgd.predict(X_test)

In [None]:
# Hold-out accuracy and per-class report for the predictions currently in `y_pred`
# (here, the SGD linear SVM pipeline). Both metrics take (y_true, y_pred) in
# that order, so the two calls now agree.
print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))