<a href="https://colab.research.google.com/github/diwert-ai/First-step-in-NLP-HSE-bootcamp-/blob/main/kaggle%20competition/experiment_SGDClassifier_%5BLB_0_77896%5D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mount Google Drive and copy dataset to Colab Linux instance

In [None]:
# Mount Google Drive at /content/drive; the INPUT_DIR / OUTPUT_DIR paths used by
# this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
INPUT_DIR ='/content/drive/MyDrive/kaggle/fsnlp_hse/input/'
OUTPUT_DIR = '/content/drive/MyDrive/kaggle/fsnlp_hse/output/'

WORK_DIR='/content/'
EXPERIMENT='sgd'
EXPERIMENT_DIR = OUTPUT_DIR + EXPERIMENT+'/'

DATASET_DIR = INPUT_DIR
METADATA_DIR = INPUT_DIR

!mkdir {EXPERIMENT_DIR}

# Install and set up Kaggle API

In [None]:
# Open Colab's upload widget; the following cell copies a file named kaggle.json
# from the working directory, so that is the file to upload here.
from google.colab import files
uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [None]:
!pip install -q kaggle --upgrade
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json

kaggle.json


In [None]:
# Download the competition archive into the current working directory
# (needs the Kaggle API token to be in place).
!kaggle competitions download -c hse-nlp-bootcamp

Downloading hse-nlp-bootcamp.zip to /content
 97% 58.0M/59.8M [00:00<00:00, 105MB/s] 
100% 59.8M/59.8M [00:00<00:00, 90.6MB/s]


In [None]:
!unzip hse-nlp-bootcamp.zip

Archive:  hse-nlp-bootcamp.zip
  inflating: new_test_ml.csv         
  inflating: sample_submission.csv   
  inflating: train_ml.csv            


# Imports

In [None]:
import pandas as pd
import numpy as np
import random
import pprint as pp
import os


# import re
# from pymorphy2 import MorphAnalyzer
# from functools import lru_cache
# from nltk.corpus import stopwords

# from multiprocessing import Pool
# from tqdm import tqdm

import nltk
# nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, f1_score
from joblib import dump, load

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Configuration

In [None]:
class Config:
    """Experiment-wide settings read by the rest of the notebook."""

    # Seed used for RNG seeding and for the `random_state` arguments.
    seed: int = 12536271
    # Number of training rows to sample; in the recorded run this equals the
    # size of the cleaned training frame.
    samples_num: int = 51476
    # Experiment tag, taken from the EXPERIMENT constant.
    experiment: str = EXPERIMENT

# Main pipeline starts here

In [None]:
def set_seed(seed=42):
    """Seed the global random number generators used by the notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    stores the seed in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to every generator (default 42).
    """
    # Only written to the environment here; it is visible to code and
    # processes that read the variable after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)


# Seed the global RNGs once, up front, with the experiment-wide seed.
set_seed(Config.seed)

## Load and process train data

In [None]:
# Load the raw training table.
df = pd.read_csv('/content/train_ml.csv')

# Encode every bank name as its first-appearance position. `banks_list` is
# reused at inference time, so its order is fixed here, before rows are dropped.
banks_list = list(df.bank.unique())
bank_to_id = {bank: idx for idx, bank in enumerate(banks_list)}
# One hash lookup per row instead of a linear `banks_list.index(...)` scan.
df['bank_id'] = df['bank'].map(bank_to_id)

df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y %H:%M')
# Drop rows with any missing value, then make the label and the bank code integers.
df = df.dropna()
df = df.astype({'grades': 'int32', 'bank_id': 'int64'})
df.head()

Unnamed: 0,bank,feeds,grades,date,bank_id
0,ubrr,"Много лет являюсь клиентом этого банка, но пос...",1,2017-02-16 16:10:00,0
1,fk_otkritie,"Г. Ростов-на-Дону, ул. Ленина, 48. Были 10.12....",2,2016-12-13 01:05:00,1
4,promsvyazbank,"Имею потребительский кредит, взятый в Связь-ба...",2,2020-08-04 06:38:00,4
7,homecreditbank,Хочу выразить глубокую благодарность всем сотр...,5,2015-04-19 20:11:00,7
8,vtb,"6 марта 2019г. В 10:33 при проходе в метро ""Ку...",2,2019-03-28 18:36:00,3


In [None]:
len(df)

51476

In [None]:
# Draw a reproducible sample of the cleaned frame and renumber its rows from 0.
subset = (
    df.sample(n=Config.samples_num, random_state=Config.seed)
      .reset_index(drop=True)
)
# Lower-cased review text; this is the column the text branch of the model reads.
subset['text'] = subset['feeds'].map(str.lower)

## Split data into train and test sets

In [None]:
# Hold out 10% of the sampled rows for a local test. No `stratify=` is passed,
# so the class proportions of the hold-out are left to the random split.
model_inputs = subset[['bank_id', 'text']]
model_targets = subset['grades']
x_train, x_test, y_train, y_test = train_test_split(
    model_inputs,
    model_targets,
    test_size=0.1,
    random_state=Config.seed,
)

In [None]:
len(x_train)

46328

## Set up train pipeline

In [None]:
def get_numeric_data(x):
    """Select the input of the numeric feature branch.

    Args:
        x: DataFrame that has a ``bank_id`` column.

    Returns:
        A one-column DataFrame (2-D, not a Series) holding ``bank_id``.
    """
    numeric_columns = ['bank_id']
    return x[numeric_columns]

def get_text_data(x):
    """Select the input of the text feature branch.

    Args:
        x: DataFrame that has a ``text`` column.

    Returns:
        The ``text`` column as a 1-D Series.
    """
    text_column = x['text']
    return text_column

# Column selectors wrapped as transformers so they can be used as pipeline steps.
# (`transfomer_numeric` keeps its original spelling: it is a notebook-level name.)
transfomer_numeric = FunctionTransformer(get_numeric_data)
transformer_text = FunctionTransformer(get_text_data)

# Numeric branch: the selected bank_id column goes through with no further step.
numeric_branch = Pipeline([
    ('selector', transfomer_numeric),
])

# Text branch: TF-IDF over uni- and bigrams, tokenised with NLTK's word_tokenize.
text_branch = Pipeline([
    ('selector', transformer_text),
    ('vec', TfidfVectorizer(ngram_range=(1, 2), tokenizer=word_tokenize)),
])

# Both branches are concatenated side by side into one feature matrix.
combined_features = FeatureUnion([
    ('numeric_features', numeric_branch),
    ('text_features', text_branch),
])

# Step names matter: the parameter grid addresses the classifier as `clf__...`.
pipeline = Pipeline([
    ('features', combined_features),
    ('clf', SGDClassifier(random_state=Config.seed)),
])

In [None]:
# Print every parameter name the pipeline exposes, sorted; these are the keys
# that can be used in a parameter grid.
pp.pprint(sorted(pipeline.get_params().keys()))

In [None]:
# Classifier settings for the search. Every option holds exactly one value, so
# the grid consists of a single candidate configuration.
_sgd_settings = {
    'loss': 'modified_huber',
    'penalty': 'l2',
    'alpha': 0.0001,
    'l1_ratio': 0.15,
    'max_iter': 1000,
    'epsilon': 0.1,
    'learning_rate': 'optimal',
    'class_weight': None,
    'average': False,
}
# Prefix with the pipeline step name and wrap each value in a one-element list.
param_grid = {f'clf__{name}': [value] for name, value in _sgd_settings.items()}

# 5-fold stratified cross-validation; only n_splits is set, everything else is default.
kfold = StratifiedKFold(n_splits=5)
# Micro-averaged F1 is the only metric; `refit` names the scorer key that the
# grid search uses to choose and refit the best model.
scoring = {'F1': 'f1_micro'}
refit = 'F1'

## Perform GridSearch

In [None]:
print(f'samples: {Config.samples_num}')

# Despite the `lr_` prefix, the estimator being searched is the SGDClassifier pipeline.
search_options = dict(
    param_grid=param_grid,
    cv=kfold,
    scoring=scoring,
    refit=refit,
    n_jobs=-1,
    return_train_score=True,
    verbose=10,
)
lr_model = GridSearchCV(pipeline, **search_options)
lr_model.fit(x_train, y_train)

# Pipeline refit on the whole training split with the best parameters.
lr_best = lr_model.best_estimator_

samples: 51476
Fitting 5 folds for each of 1 candidates, totalling 5 fits


## Test best model

In [None]:
# Evaluate the refit best pipeline on the hold-out split.
pred = lr_best.predict(x_test)
# classification_report takes (y_true, y_pred). With the arguments swapped, the
# precision and recall columns trade places and `support` counts predictions
# instead of true labels.
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           1       0.99      0.73      0.84      3773
           2       0.01      0.19      0.02        31
           3       0.01      0.17      0.02        12
           4       0.06      0.33      0.10        24
           5       0.87      0.92      0.89      1308

    accuracy                           0.77      5148
   macro avg       0.39      0.47      0.37      5148
weighted avg       0.94      0.77      0.84      5148



In [None]:
f1_score(y_test, pred, average='micro')

0.7711732711732712

In [None]:
lr_model.best_params_

{'clf__alpha': 0.0001,
 'clf__average': False,
 'clf__class_weight': None,
 'clf__epsilon': 0.1,
 'clf__l1_ratio': 0.15,
 'clf__learning_rate': 'optimal',
 'clf__loss': 'modified_huber',
 'clf__max_iter': 1000,
 'clf__penalty': 'l2'}

In [None]:
lr_model.cv_results_

{'mean_fit_time': array([305.20934887]),
 'std_fit_time': array([55.46696947]),
 'mean_score_time': array([26.51159673]),
 'std_score_time': array([4.72627389]),
 'param_clf__alpha': masked_array(data=[0.0001],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_clf__average': masked_array(data=[False],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_clf__class_weight': masked_array(data=[None],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_clf__epsilon': masked_array(data=[0.1],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_clf__l1_ratio': masked_array(data=[0.15],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_clf__learning_rate': masked_array(data=['optimal'],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_clf__loss': masked_array(data=['modifie

In [None]:
lr_model.best_score_

0.7532367122973007

In [None]:
# Tag carrying the best mean cross-validation score (4 decimals); it is appended
# to the artifact file names.
cv_tag = f'[cv_{lr_model.best_score_:0.4f}]'

In [None]:
cv_tag

'[cv_0.7532]'

## Inference (calculating submission) 

In [None]:
test = pd.read_csv('/content/new_test_ml.csv', index_col=0)

# Fail early, naming the offenders, when the test set has banks that were not
# present in the training data (still a ValueError, as with list.index()).
unseen_banks = set(test['bank'].unique()) - set(banks_list)
if unseen_banks:
    raise ValueError(
        f'banks missing from the training data: {sorted(map(str, unseen_banks))}'
    )

# Same first-appearance encoding as in training, via one hash lookup per row.
bank_to_id = {bank: idx for idx, bank in enumerate(banks_list)}
test['bank_id'] = test['bank'].map(bank_to_id)

# Test rows cannot be dropped (every row needs a prediction), so a missing
# review is treated as empty text instead of crashing on `.lower()`.
test['text'] = test['feeds'].fillna('').str.lower()

test = test[['bank_id', 'text']]
test_pred = lr_best.predict(test)
# Per-class scores, kept for later use; requires a loss for which the
# classifier offers predict_proba (the grid fixes 'modified_huber').
probs = lr_best.predict_proba(test)
sub = pd.DataFrame({'inds': test.index,
                    'grades': test_pred})
sub

Unnamed: 0,inds,grades
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1
...,...,...
17215,17215,1
17216,17216,1
17217,17217,1
17218,17218,1


# Save results to files

In [None]:
# Common suffix made of experiment name, sample count, seed and CV score, so
# each artifact name records the run that produced it.
suffix = f'{Config.experiment}_[sn_{Config.samples_num}]_[sd_{Config.seed}]{cv_tag}'
sub_filename = f'sub_{suffix}.csv'
probs_filename = f'probs_{suffix}.npy'
model_filename = f'model_{suffix}.joblib'

In [None]:
# Write the submission CSV (without the DataFrame index), the predict_proba
# output and the fitted pipeline under their relative file names.
sub.to_csv(sub_filename, index=False)
np.save(probs_filename, probs)
dump(lr_best, model_filename)

['model_sgd_[sn_51476]_[sd_12536271][cv_0.7532].joblib']

In [None]:
!cp {sub_filename} {EXPERIMENT_DIR}
!cp {probs_filename} {EXPERIMENT_DIR}
!cp {model_filename} {EXPERIMENT_DIR}

# Submit predictions

In [None]:
!kaggle competitions submit -c hse-nlp-bootcamp  -f {'/content/'+sub_filename} -m "Submission via API"

100% 124k/124k [00:02<00:00, 46.2kB/s]
Successfully submitted to HSE | NLP Bootcamp | Sentiment Analysis

# Local test 0.7711 CV 0.7532 - public LB 0.77886