# Download libraries

In [1]:
import pandas as pd
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.0.6-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 13.8 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.6


In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
from nltk.corpus import stopwords
BADWORDS = stopwords.words('russian')

In [4]:
!pip install pymorphy2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 2.5 MB/s 
[?25hCollecting dawg-python>=0.7.1
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[K     |████████████████████████████████| 8.2 MB 10.7 MB/s 
[?25hInstalling collected packages: pymorphy2-dicts-ru, dawg-python, pymorphy2
Successfully installed dawg-python-0.7.2 pymorphy2-0.9.1 pymorphy2-dicts-ru-2.4.417127.4579844


In [5]:
 import pymorphy2
 morph = pymorphy2.MorphAnalyzer()

# Clear text

In [13]:
def cleartext(s):
  s = s.lower()
  alpabet = 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя '
  newstr = ""
  for symb in s:
    if(symb in alpabet):
      newstr = newstr + symb
  while('  ' in newstr):
    newstr = newstr.replace('  ', ' ')
  words = newstr.split(' ')
  newwords = []
  for word in words:
    if(word not in BADWORDS):
      newwords.append(morph.parse(word)[0].normal_form)
  answer = ' '.join(newwords)
  return answer.strip()

# Work with data

In [14]:
df_train = pd.read_csv("/content/drive/MyDrive/PredictorAuthor/train_texts.csv")
df_test = pd.read_csv("/content/drive/MyDrive/PredictorAuthor/test_texts.csv")

In [15]:
authors = set(df_train['author'])
replace_params = {}
replace_index_author = {}
for index, author in enumerate(authors):
    replace_params.update({author : index})
    replace_index_author.update({index : author})

In [16]:
df_train['author'] = df_train['author'].map(replace_params)

In [34]:
df_train_label = df_train['author']
df_train_data = df_train
df_train_data['text'] = df_train['text'].apply(cleartext)

In [35]:
df_train_data.head(5)

Unnamed: 0,id,text,author
0,0,бабушка вскричать малютка взять знать уйти пог...,6
1,1,знать скрудж разумеется знать мочь иначе скруд...,6
2,2,праздник дядя радость дать бог благо земной ра...,6
3,3,высказать главный передовой мысль наш журнал н...,6
4,4,отдел литературный повесть роман рассказ мемуа...,6


In [18]:
df_train_data = df_train_data.drop(columns=['author', 'id'])

In [19]:
from sklearn.model_selection import train_test_split

x_train, x_eval, y_train, y_eval = train_test_split(df_train_data, df_train_label.values, test_size=0.2)

# Train data

In [20]:
import random
rands = []

In [21]:
from catboost import Pool, CatBoostClassifier
#
text_features = ['text']

train_pool = Pool(
    x_train, 
    y_train, 
    text_features=text_features
)
valid_pool = Pool(
    x_eval, 
    y_eval,
    text_features=text_features
)
randin = random.randint(1, 10000000)
rands.append(randin)
catboost_params = {
    'iterations': 6000,
    'learning_rate': 0.01,
    'eval_metric': 'Accuracy',
    'task_type': 'GPU',
    'early_stopping_rounds': 2000,
    'verbose': 500,
    'random_seed': randin,
    'use_best_model': True
}
model = CatBoostClassifier(**catboost_params)
model.fit(train_pool, eval_set=valid_pool)

0:	learn: 0.7490988	total: 24.9ms	remaining: 2m 29s
500:	learn: 0.8175919	total: 6.7s	remaining: 1m 13s
1000:	learn: 0.8327325	total: 12.8s	remaining: 1m 3s
1500:	learn: 0.8500360	total: 18.8s	remaining: 56.5s
2000:	learn: 0.8637347	total: 24.9s	remaining: 49.7s
2500:	learn: 0.8716655	total: 31s	remaining: 43.3s
3000:	learn: 0.8824802	total: 37.5s	remaining: 37.5s
3500:	learn: 0.8932949	total: 43.5s	remaining: 31s
4000:	learn: 0.9055516	total: 55s	remaining: 27.5s
4500:	learn: 0.9163663	total: 1m	remaining: 20.3s
5000:	learn: 0.9250180	total: 1m 6s	remaining: 13.4s
5500:	learn: 0.9343908	total: 1m 12s	remaining: 6.61s
5999:	learn: 0.9444845	total: 1m 18s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7ff22065d3d0>

In [22]:
rands

[1821215]

# Predict data

In [23]:
df_test['text'] = df_test['text'].apply(cleartext)

In [24]:
df_test = df_test.drop(columns=['id'])

In [25]:
df_test.head(5)

Unnamed: 0,text
0,идти немного знать сколько шаг ворота дом ровн...
1,твой дедушка немножко пират уверовать струхнут...
2,немецкий паспорт годный целый четыре месяц под...
3,знать сказать сделать вид собираться поставить...
4,отмахнулсян шестнадцать пятьдесят одновременно...


In [26]:
pred = model.predict(df_test)

In [31]:
import csv
i = 0
with open('answer.csv', 'w', newline='') as csvfile2:
  spamwriter = csv.writer(csvfile2, delimiter=',')
  with open('test_texts.csv', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',')
    spamwriter.writerow(['id', 'author'])
    for row in spamreader:
      id, text = row
      if(id=="id"):
        continue
      answer = pred[i][0]
      spamwriter.writerow([id, replace_index_author[answer]])
      i+=1