In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [8]:
# Load the labelled training posts and one-hot encode the target: every
# distinct value of `category` becomes its own indicator column.
data = pd.read_csv(r"vk-cup/train.csv")
cats = pd.get_dummies(data['category'])
# The id and the raw label are removed; the indicator columns are attached
# by index in their place.
data = data.drop(columns=["oid", "category"]).join(cats)
data.head()

Unnamed: 0,text,athletics,autosport,basketball,boardgames,esport,extreme,football,hockey,martial_arts,motosport,tennis,volleyball,winter_sport
0,Волшебные фото Виктория Поплавская ЕвгенияМедв...,0,0,0,0,0,0,0,0,0,0,0,0,1
1,Возвращение в подземелье Треша 33 Эйфория тупо...,0,0,0,0,0,1,0,0,0,0,0,0,0
2,Лучшие чешские вратари – Доминик Доминатор Гаш...,0,0,0,0,0,0,1,0,0,0,0,0,0
3,Rtokenoid Warhammer40k валрак решил нас подкор...,0,0,0,1,0,0,0,0,0,0,0,0,0
4,Шестеркин затаскивает Рейнджерс в финал Восточ...,0,0,0,0,0,0,0,1,0,0,0,0,0


In [14]:
import os
import requests
from pathlib import Path
import nltk
from nltk import sent_tokenize, word_tokenize, regexp_tokenize
from nltk.corpus import stopwords
import pymorphy2
from collections import Counter

In [15]:
# Location of the Russian stop-word list (stopwords-iso repository on GitHub);
# it is downloaded with get_text() and split into lines further down.
url_stopwords_ru = "https://raw.githubusercontent.com/stopwords-iso/stopwords-ru/master/stopwords-ru.txt"

In [17]:
def get_text(url, encoding='utf-8', to_lower=True):
    """Return the text found at ``url``, which may be an HTTP(S) URL or a local file.

    Args:
        url: An ``http://`` / ``https://`` address or the path of an existing
            file. Anything ``str()`` can convert is accepted (e.g. ``Path``).
        encoding: Encoding used to decode a local file; not used for URLs.
        to_lower: When true (the default) the text is lower-cased.

    Returns:
        The whole content as one string.

    Raises:
        requests.RequestException: The download failed, timed out, or the
            server answered with an error status.
        ValueError: ``url`` is neither an HTTP(S) URL nor an existing path.
    """
    url = str(url)
    # Match the scheme exactly so that a local file whose name merely starts
    # with "http" (e.g. "http_dump.txt") is still read from disk.
    if url.lower().startswith(('http://', 'https://')):
        # Without a timeout a stalled connection would block the cell forever.
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        text = r.text
    elif os.path.exists(url):
        with open(url, encoding=encoding) as f:
            text = f.read()
    else:
        raise ValueError('parameter [url] can be either URL or a filename')
    return text.lower() if to_lower else text

In [18]:
def normalize_tokens(tokens):
    """Replace every token with its normal form (lemma) using pymorphy2.

    Args:
        tokens: Iterable of word strings.

    Returns:
        A list with, for each token, the ``normal_form`` of the first parse
        returned by ``MorphAnalyzer.parse``.
    """
    # This function is called once per document. The pymorphy2 documentation
    # recommends creating a single MorphAnalyzer and reusing it because
    # construction is expensive, so the instance is built on first use and
    # kept on the function object.
    morph = getattr(normalize_tokens, '_morph', None)
    if morph is None:
        morph = normalize_tokens._morph = pymorphy2.MorphAnalyzer()
    return [morph.parse(tok)[0].normal_form for tok in tokens]

In [19]:
def remove_stopwords(tokens, stopwords=None, min_length=4):
    """Drop stop words and tokens shorter than ``min_length``.

    Args:
        tokens: Sequence of token strings.
        stopwords: Collection of words to remove. When it is empty or
            ``None`` the input is returned untouched, which means that
            ``min_length`` is not applied either.
        min_length: Minimum number of characters a token needs to be kept.

    Returns:
        ``tokens`` itself when no stop words are given, otherwise a new list
        with the surviving tokens in their original order.
    """
    if stopwords:
        banned = set(stopwords)
        kept = []
        for tok in tokens:
            if tok in banned or len(tok) < min_length:
                continue
            kept.append(tok)
        return kept
    return tokens

In [31]:
counter = 0
def tokenize_n_lemmatize(text, stopwords=None, normalize=True, regexp=r'(?u)\b\w{4,}\b'):
    """Turn a raw text into a list of (optionally lemmatized and filtered) words.

    The text is split into sentences, each sentence is tokenized with
    ``regexp`` (the default keeps runs of at least four word characters),
    the words are lemmatized when ``normalize`` is true and passed through
    ``remove_stopwords`` when ``stopwords`` is non-empty.

    Side effect: increments the module-level ``counter`` on every call and
    prints its value after every 500th call as a crude progress indicator.

    Args:
        text: The document to process.
        stopwords: Words to remove after normalization, or ``None``.
        normalize: Whether to run ``normalize_tokens`` on the words.
        regexp: Pattern handed to ``regexp_tokenize``.

    Returns:
        List of resulting tokens.
    """
    global counter

    words = []
    for sent in sent_tokenize(text):
        words.extend(regexp_tokenize(sent, regexp))

    if normalize:
        words = normalize_tokens(words)
    if stopwords:
        words = remove_stopwords(words, stopwords)

    # The counter is shared by all calls and is never reset here.
    counter += 1
    if not counter % 500:
        print(counter, end=" ")

    return words

In [32]:
# Download the stop-word file (lower-cased by get_text's default) and split it
# into a list with one entry per line.
stopwords_ru = get_text(url_stopwords_ru).splitlines()

In [33]:
# Replace every raw post with its list of lemmas, stop words removed.
# The column is overwritten in place, so `data` has to be reloaded before
# this cell can be run a second time.
data['text'] = data['text'].apply(tokenize_n_lemmatize, stopwords=stopwords_ru)

500 1000 1500 2000 2500 3000 3500 4000 4500 5000 5500 6000 6500 7000 7500 8000 8500 9000 9500 10000 10500 11000 11500 12000 12500 13000 13500 14000 14500 15000 15500 16000 16500 17000 17500 18000 18500 19000 19500 20000 20500 21000 21500 22000 22500 23000 23500 24000 24500 25000 25500 26000 26500 27000 27500 28000 28500 29000 29500 30000 30500 31000 31500 32000 32500 33000 33500 34000 34500 35000 35500 36000 36500 37000 37500 38000 38500 

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [36]:
def dummy_fun(doc):
    """Return ``doc`` unchanged.

    Identity function handed to ``TfidfVectorizer`` as both tokenizer and
    preprocessor, because the documents are already lists of tokens.
    """
    return doc

In [37]:
# The documents are already token lists, so the tokenizer and preprocessor
# are the identity function and no token_pattern is needed.
tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
    ngram_range=(1, 3),
    max_features=10000,
    sublinear_tf=True,
)

In [38]:
# Fit the vectorizer on the tokenized training posts.
tfidf.fit(data['text'])

In [56]:
# Inspect the frame after tokenization: `text` now holds lists of lemmas.
data.head()

Unnamed: 0,text,athletics,autosport,basketball,boardgames,esport,extreme,football,hockey,martial_arts,motosport,tennis,volleyball,winter_sport
0,"[волшебный, фото, виктория, поплавский, евгени...",0,0,0,0,0,0,0,0,0,0,0,0,1
1,"[возвращение, подземелье, треш, эйфория, тупос...",0,0,0,0,0,1,0,0,0,0,0,0,0
2,"[чешский, вратарь, доминик, доминатор, гашек, ...",0,0,0,0,0,0,1,0,0,0,0,0,0
3,"[rtokenoid, warhammer40k, валрак, подкормить, ...",0,0,0,1,0,0,0,0,0,0,0,0,0
4,"[шестёркин, затаскивать, рейнджерс, финал, вос...",0,0,0,0,0,0,0,1,0,0,0,0,0


In [59]:
# TF-IDF feature matrix for the training posts.
vectors_train = tfidf.transform(data['text'])

In [60]:
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

In [64]:
# Base estimator for the per-category hyper-parameter search. It is seeded
# with the same random_state as the final models so that repeated runs of the
# search give the same scores.
svc_model = LinearSVC(random_state=42)
folds_generator = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# Candidate values for the regularization parameter C.
parameters = {'C': [1, 10, 100, 1000]}

# Each target is a binary indicator column, hence the plain 'f1' scorer.
grid_search = GridSearchCV(svc_model,
                           parameters,
                           cv=folds_generator, verbose=10, n_jobs=4,
                           scoring='f1')

In [66]:
# Run the search once per indicator column (everything after the first
# column, `text`) and report the best cross-validated score and C.
# The same GridSearchCV object is refitted on every iteration, so only the
# printed summary survives.
for column in data.columns[1:]:
    grid_search.fit(vectors_train, data[column])
    bs, bp = grid_search.best_score_, grid_search.best_params_
    print('{}: score={:.3f}, C={}'.format(column, bs, bp['C']))

Fitting 5 folds for each of 4 candidates, totalling 20 fits
athletics: score=0.890, C=1
Fitting 5 folds for each of 4 candidates, totalling 20 fits
autosport: score=0.839, C=1
Fitting 5 folds for each of 4 candidates, totalling 20 fits
basketball: score=0.872, C=1
Fitting 5 folds for each of 4 candidates, totalling 20 fits
boardgames: score=0.952, C=1
Fitting 5 folds for each of 4 candidates, totalling 20 fits
esport: score=0.748, C=1
Fitting 5 folds for each of 4 candidates, totalling 20 fits
extreme: score=0.712, C=1
Fitting 5 folds for each of 4 candidates, totalling 20 fits
football: score=0.731, C=1
Fitting 5 folds for each of 4 candidates, totalling 20 fits
hockey: score=0.801, C=1
Fitting 5 folds for each of 4 candidates, totalling 20 fits
martial_arts: score=0.728, C=1
Fitting 5 folds for each of 4 candidates, totalling 20 fits
motosport: score=0.884, C=1
Fitting 5 folds for each of 4 candidates, totalling 20 fits
tennis: score=0.940, C=1
Fitting 5 folds for each of 4 candidate

In [67]:
# Train one binary LinearSVC per indicator column, keyed by column name.
models = {}
for column in data.columns[1:]:
    # fit() returns the estimator itself, so the fitted model is stored directly.
    model = models[column] = LinearSVC(C=1, random_state=42).fit(vectors_train, data[column])

models

{'athletics': LinearSVC(C=1, random_state=42),
 'autosport': LinearSVC(C=1, random_state=42),
 'basketball': LinearSVC(C=1, random_state=42),
 'boardgames': LinearSVC(C=1, random_state=42),
 'esport': LinearSVC(C=1, random_state=42),
 'extreme': LinearSVC(C=1, random_state=42),
 'football': LinearSVC(C=1, random_state=42),
 'hockey': LinearSVC(C=1, random_state=42),
 'martial_arts': LinearSVC(C=1, random_state=42),
 'motosport': LinearSVC(C=1, random_state=42),
 'tennis': LinearSVC(C=1, random_state=42),
 'volleyball': LinearSVC(C=1, random_state=42),
 'winter_sport': LinearSVC(C=1, random_state=42)}

In [68]:
# Load the unlabelled posts to predict; the preview shows `oid` and `text` columns.
test = pd.read_csv(r"vk-cup/test.csv")
test.head()

Unnamed: 0,oid,text
0,749208109,СПОЧНО СООБЩЕСТВО ПРОДАЕТСЯ ЗА 1300Р ЗА ПОКУПК...
1,452466036,Естественное восстановление после тяжелой трен...
2,161038103,Тема нарядов продолжается Одна из британских ж...
3,663621910,Привет Избранный. Ты спрашиваешь себя ЧТО здес...
4,566255305,КОРОЛЬ ПЯТИСОТНИКОВ В ДЕЛЕ Андрей Рублев успеш...


In [69]:
# Apply the same tokenization / lemmatization as for the training posts.
# The column is overwritten in place, so `test` has to be reloaded before
# this cell can be run a second time.
test['text'] = test['text'].apply(tokenize_n_lemmatize, stopwords=stopwords_ru)
test.head()

39000 39500 40000 40500 41000 41500 42000 42500 43000 43500 44000 44500 45000 45500 46000 46500 47000 47500 48000 48500 49000 49500 50000 50500 51000 51500 52000 52500 53000 53500 54000 54500 55000 55500 56000 56500 57000 57500 58000 58500 59000 59500 60000 60500 61000 61500 62000 62500 63000 63500 64000 64500 65000 

Unnamed: 0,oid,text
0,749208109,"[спочно, сообщество, продаваться, 1300р, покуп..."
1,452466036,"[естественный, восстановление, тяжёлый, тренир..."
2,161038103,"[тема, наряд, продолжаться, британский, журнал..."
3,663621910,"[привет, избранный, спрашивать, происходить, о..."
4,566255305,"[король, пятисотник, андрей, рублёв, успешно, ..."


In [70]:
# Vectorize the test posts with the vectorizer fitted on the training posts.
vectors_test = tfidf.transform(test['text'])

In [71]:
# Show the shape and sparsity of the test feature matrix.
vectors_test

<26260x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 650699 stored elements in Compressed Sparse Row format>

In [75]:
# Per-post 0/1 prediction of every per-category model, keyed by column name.
predictions = {
    column: models[column].predict(vectors_test)
    for column in data.columns[1:]
}

predictions

{'athletics': array([0, 0, 0, ..., 0, 0, 1], dtype=uint8),
 'autosport': array([0, 0, 0, ..., 0, 0, 0], dtype=uint8),
 'basketball': array([0, 0, 0, ..., 0, 0, 0], dtype=uint8),
 'boardgames': array([0, 0, 0, ..., 1, 0, 0], dtype=uint8),
 'esport': array([1, 0, 0, ..., 0, 0, 0], dtype=uint8),
 'extreme': array([0, 0, 0, ..., 0, 1, 0], dtype=uint8),
 'football': array([0, 0, 0, ..., 0, 0, 0], dtype=uint8),
 'hockey': array([0, 0, 0, ..., 0, 0, 0], dtype=uint8),
 'martial_arts': array([0, 0, 0, ..., 0, 0, 0], dtype=uint8),
 'motosport': array([0, 0, 0, ..., 0, 0, 0], dtype=uint8),
 'tennis': array([0, 0, 1, ..., 0, 0, 0], dtype=uint8),
 'volleyball': array([0, 0, 0, ..., 0, 0, 0], dtype=uint8),
 'winter_sport': array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)}

In [117]:
def greater_than_else_None(row, threshold=0.6):
    """Return the position of the first value in ``row`` that reaches ``threshold``.

    Args:
        row: Object exposing a one-dimensional ``.values`` array, such as a
            pandas Series.
        threshold: Smallest value that counts as a hit. Defaults to 0.6, the
            cut-off that used to be hard-coded.

    Returns:
        Zero-based position (a NumPy integer) of the first element that is
        ``>= threshold``, or ``None`` when no element qualifies. NaN values
        never qualify because the comparison is false for them.
    """
    hits = np.where(row.values >= threshold)[0]
    return hits[0] if len(hits) else None

In [152]:
# Combine the per-post predictions into one category per `oid`.
predDF = pd.concat([test, pd.DataFrame(predictions)], axis=1)
predDF = predDF.drop(columns=['text'])
# For every oid: how many of its posts each category model flagged as positive.
predDF = predDF.groupby(['oid']).sum()
vote_totals = predDF.sum(axis=1)
# An oid without a single positive prediction would become an all-NaN row
# (0 / 0) with no meaningful maximum, so such rows are filtered out before
# normalizing instead of being removed afterwards with dropna().
# NOTE(review): these oids are therefore missing from the result, exactly as
# before; confirm that the submission is allowed to omit them.
has_votes = vote_totals > 0
predDF = predDF.loc[has_votes].div(vote_totals.loc[has_votes], axis=0)
# idxmax returns the label of the first maximal column, so ties go to the
# category that comes first in column order (same as np.argmax + lookup).
predDF['category'] = predDF.idxmax(axis=1)
predDF = predDF.reset_index()
predDF

Unnamed: 0,oid,athletics,autosport,basketball,boardgames,esport,extreme,football,hockey,martial_arts,motosport,tennis,volleyball,winter_sport,category
0,1622114,0.571429,0.0,0.000000,0.000000,0.428571,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,athletics
1,1663817,0.000000,1.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,autosport
2,3174332,0.000000,0.0,0.888889,0.000000,0.000000,0.000000,0.0,0.111111,0.000000,0.0,0.0,0.0,0.0,basketball
3,3469228,0.000000,0.0,0.000000,0.000000,0.333333,0.333333,0.0,0.000000,0.333333,0.0,0.0,0.0,0.0,esport
4,3905302,0.000000,0.0,0.000000,1.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,boardgames
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2615,998309713,0.000000,0.0,0.000000,0.000000,1.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,esport
2616,998565619,0.000000,0.0,0.000000,0.000000,1.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,esport
2617,999112505,0.000000,0.0,0.000000,0.909091,0.000000,0.090909,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,boardgames
2618,999361308,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,1.0,0.0,0.0,tennis


In [151]:
# Write only the id and the predicted label; index=False keeps the row index
# out of the CSV.
predDF[['oid', 'category']].to_csv('vk-cup/submission1.csv', index=False)