In [None]:
!pip install numpy
import numpy as np

!pip install pandas
import pandas as pd

!pip install nltk                                                # хорошая библиотека для nlp
from nltk.corpus import wordnet
import nltk

!pip install sklearn  
from sklearn.feature_extraction.text import CountVectorizer
from string import punctuation                                # разные знаки пунктуации
from sklearn.feature_extraction.text import TfidfVectorizer   # объединение tf-idf
from sklearn.pipeline import Pipeline                         # пайплайн
from sklearn.model_selection import GridSearchCV              # сетка для подбора гиперпараметров
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack
from sklearn.linear_model import SGDClassifier

!pip install lightgbm
from lightgbm import LGBMClassifier

!pip install pymorphy2
import pymorphy2

!pip install os
import os

In [None]:
def _load_stopwords(languages):
    """Return the NLTK stop words of all `languages` as one frozenset.

    If the `stopwords` corpus is not available locally it is downloaded once
    and the lookup is retried.
    """
    def _collect():
        words = []
        for language in languages:
            words.extend(nltk.corpus.stopwords.words(language))
        return frozenset(words)

    try:
        return _collect()
    except LookupError:
        # NLTK signals a missing corpus with LookupError; any other error
        # (e.g. a typo in a language name) is allowed to propagate.
        nltk.download('stopwords')
        return _collect()


def text_preproccesing(df, train=False):
    """Prepare the `title` and `url` columns for vectorization.

    For every title: split on single spaces, drop stop words (Russian,
    English, Arabic, Turkish), punctuation-only tokens and tokens shorter
    than 3 characters, replace punctuation and other unwanted symbols with
    spaces, and lemmatize the remaining words with pymorphy2.
    For every url: replace dots with spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns `title` and `url`. It is NOT modified;
        the function works on a copy, so it can be called repeatedly on the
        same frame.
    train : bool, default False
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` where `title` holds a list of lemmas per row and `url`
        holds the dot-separated parts joined by spaces.
    """
    stopwords = _load_stopwords(('russian', 'english', 'arabic', 'turkish'))

    # Work on a copy: mutating the caller's frame made a second call fail,
    # because `title` would already contain lists instead of strings.
    df = df.copy()

    bad_symbols = punctuation + "«'»•—²�®❗️"
    # Build the translation table once instead of once per row.
    bad_symbols_table = str.maketrans(bad_symbols, ' ' * len(bad_symbols))

    lemmatizer = pymorphy2.MorphAnalyzer()

    def keep_token(token):
        # `punctuation` is a str, so the third condition is a substring test;
        # this is kept exactly as in the original filter.
        return (token not in stopwords
                and token != ' '
                and token.strip() not in punctuation
                and len(token) >= 3)

    def clean_title(title):
        # drop stop words, punctuation tokens and tokens shorter than 3 chars
        kept = ' '.join(token for token in title.split(' ') if keep_token(token))
        # replace punctuation and other unwanted symbols with spaces
        kept = kept.translate(bad_symbols_table)
        # lemmatization: take the normal form of the first parse of each word
        return [lemmatizer.parse(word)[0].normal_form for word in kept.split()]

    df['title'] = df['title'].map(clean_title)

    # split the url on dots
    df['url'] = df['url'].map(lambda url: ' '.join(url.split('.')))

    return df

In [None]:
# Load the labelled training set from the GitHub repository.
train_df = pd.read_csv("https://raw.githubusercontent.com/d3vyatk4ru/DZ3ML/main/train.csv")

# Load the test set from the same repository (its `id` column is used for the submission).
test_df = pd.read_csv("https://raw.githubusercontent.com/d3vyatk4ru/DZ3ML/main/test.csv")

# Preview the first rows of the training data.
train_df.head()

Unnamed: 0,id,url,title,target
0,0,m.kp.md,"Экс-министр экономики Молдовы - главе МИДЭИ, ц...",False
1,1,www.kp.by,Эта песня стала известна многим телезрителям б...,False
2,2,fanserials.tv,Банши 4 сезон 2 серия Бремя красоты смотреть о...,False
3,3,colorbox.spb.ru,Не Беси Меня Картинки,False
4,4,tula-sport.ru,В Новомосковске сыграют следж-хоккеисты алекси...,False


In [None]:
# Run the text preprocessing on the training data; `title` becomes a list of lemmas per row.
X_train_preprocessing = text_preproccesing(train_df, train=True)

In [None]:
# Join the lemma lists back into space-separated strings for the vectorizer.
# Values that are already strings are left untouched, so re-running this cell
# does not break the text up into single characters.
X_train_preprocessing['title'] = X_train_preprocessing['title'].apply(
    lambda x: x if isinstance(x, str) else ' '.join(x)
)

In [None]:
# Pull the title and url texts and the labels out of the frame as NumPy arrays.
X_title = X_train_preprocessing['title'].to_numpy()
X_url = X_train_preprocessing['url'].to_numpy()
# Labels are converted to integers.
y = X_train_preprocessing['target'].astype('int64').to_numpy()

In [None]:
# Split the data into a training part and a hold-out part (4%).
# One call splits all arrays with the same shuffled indices, so titles, urls
# and labels stay aligned by construction instead of relying on two separate
# calls that happen to share the same `random_state`.
(X_train_title, X_test_title,
 X_train_url, X_test_url,
 y_train, y_test) = train_test_split(X_title, X_url, y, test_size=0.04, random_state=42)

In [None]:
# Feature extractors: TF-IDF over word 1- to 3-grams for titles and
# raw counts of character 1- to 3-grams for urls.
tfidf_title = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    min_df=1,
    max_features=90_350,
    smooth_idf=False,
)
vectorizer_url = CountVectorizer(
    analyzer='char',
    ngram_range=(1, 3),
    min_df=1,
    binary=False,
)

In [None]:
# Title features: the vectorizer is fitted on the training part only and then
# applied to the hold-out part.
X_train_title_vectorized = tfidf_title.fit_transform(X_train_title)
X_test_title_vectorized = tfidf_title.transform(X_test_title)

# Url features, handled the same way.
X_train_url_vectorized = vectorizer_url.fit_transform(X_train_url)
X_test_url_vectorized = vectorizer_url.transform(X_test_url)

# Concatenate title and url features column-wise into one sparse matrix.
X_train_vectorized = hstack((X_train_title_vectorized, X_train_url_vectorized))
X_test_vectorized = hstack((X_test_title_vectorized, X_test_url_vectorized))

In [None]:
# Baseline: logistic regression on the combined title + url features.
log_reg = LogisticRegression(C=100, solver='liblinear', max_iter=1000)
log_reg.fit(X_train_vectorized, y_train)

# F1 score on the hold-out part.
y_pred = log_reg.predict(X_test_vectorized)
f1_score(y_test, y_pred)

0.9756795422031475

In [None]:

# Random forest.
# NOTE(review): `rf` is not referenced by any later cell visible here — confirm it is still needed.
rf = RandomForestClassifier(n_estimators=50,
                            criterion='entropy',
                            bootstrap=True,
                            min_samples_split=2,
                            min_samples_leaf=1,
                            n_jobs=-1,
                            random_state=42)

# Gradient boosting; seeded like the random forest so that runs are repeatable.
LGBM = LGBMClassifier(learning_rate=0.1, n_estimators=100, random_state=42)

# Linear model trained with SGD. It shuffles the training data between epochs,
# so without a fixed seed the score changes from run to run.
SGD = SGDClassifier(alpha=0.08, random_state=42)

In [None]:
# Try hard voting over several classifiers; the resulting score is comparable
# to the plain logistic regression.
voting_clf = VotingClassifier(
    estimators=[('LGBM', LGBM), ('log_reg', log_reg), ('SGD', SGD)],
    voting='hard',
)

# The ensemble is trained on the vectorized features extended with the class
# probabilities predicted by the already fitted `log_reg`.
# NOTE(review): these probabilities come from a model fitted on the same
# training rows — consider out-of-fold predictions instead.
X_train_stacked = hstack(
    [X_train_vectorized, log_reg.predict_proba(X_train_vectorized)]
).astype(np.float32)
voting_clf.fit(X_train_stacked, y_train)

# The hold-out features are extended in exactly the same way.
X_test_stacked = hstack(
    [X_test_vectorized, log_reg.predict_proba(X_test_vectorized)]
).astype(np.float32)
y_pred = voting_clf.predict(X_test_stacked)

f1_score(y_test, y_pred)



0.9756446991404012

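Значительного прироста F1 при использовании голосования не наблюдается: на отложенной выборке оба варианта дают ≈0.9756, поэтому ансамбль не даёт преимущества перед одной логистической регрессией.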

In [None]:
# Apply the same text preprocessing to the test data; `title` becomes a list of lemmas per row.
X_test = text_preproccesing(test_df)

In [None]:
# Join the lemma lists back into space-separated strings for the vectorizer.
# Values that are already strings are left untouched, so re-running this cell
# does not break the text up into single characters.
X_test['title'] = X_test['title'].apply(
    lambda x: x if isinstance(x, str) else ' '.join(x)
)

In [None]:
# Dedicated names are used for the test-set texts so that the training arrays
# `X_title` / `X_url` are not overwritten; otherwise re-running the
# train/hold-out split cell after this one would pair test texts with
# training labels.
X_valid_title = X_test['title'].values
X_valid_url = X_test['url'].values

# Reuse the vectorizers fitted on the training part (transform only, no refit).
X_valid_title_vectorized = tfidf_title.transform(X_valid_title)
X_valid_url_vectorized = vectorizer_url.transform(X_valid_url)

# Same column layout as the training matrix: title features, then url features.
X_valid_vectorized = hstack([X_valid_title_vectorized, X_valid_url_vectorized])

In [None]:
# Predict the test set with the voting ensemble. The features are extended with
# the `log_reg` probabilities, as was done when the ensemble was fitted.
# Note: this overwrites `y_pred`, which previously held the hold-out predictions.
y_pred = voting_clf.predict(hstack([X_valid_vectorized, log_reg.predict_proba(X_valid_vectorized)]).astype(np.float32))



In [None]:
# Predict the test set with the plain logistic regression.
y_pred_log = log_reg.predict(X_valid_vectorized)

In [None]:
# Mount Google Drive (works only inside Google Colab); the predictions are
# written below to a path under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Build the submission: one row per test id with a boolean target.
# The file is named `test_pred_log.csv`, so it is filled with the logistic
# regression predictions (`y_pred_log`). The original cell wrote the voting
# ensemble's `y_pred` into this file and left `y_pred_log` unused.
df = pd.DataFrame({
    'id': test_df['id'].values,
    'target': y_pred_log.astype(bool),
})
df.to_csv("/content/drive/MyDrive/test_pred_log.csv", index=False)

In [None]:
# Preview only the first rows of the submission instead of dumping the whole frame.
df.head()

NameError: ignored