<a href="https://colab.research.google.com/github/diwert-ai/First-step-in-NLP-HSE-bootcamp-/blob/main/experiment_5folds_Logistic_Regression_%5BLB_0_79581_mean%5D_%5Bbest_LB_0_79698_fold_1%5D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mount Google Drive and configure experiment paths on the Colab instance

In [None]:
# Mount Google Drive at /content/drive. The INPUT_DIR / OUTPUT_DIR paths used by
# later cells point below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Locations on the mounted Google Drive.
INPUT_DIR = '/content/drive/MyDrive/kaggle/fsnlp_hse/input/'
OUTPUT_DIR = '/content/drive/MyDrive/kaggle/fsnlp_hse/output/'

# Scratch space on the Colab VM itself.
WORK_DIR = '/content/'

# Every experiment gets its own sub-folder of OUTPUT_DIR, named after its tag.
EXPERIMENT = 'logreg_5f'
EXPERIMENT_DIR = f'{OUTPUT_DIR}{EXPERIMENT}/'

# Dataset and metadata both live directly in the input folder.
DATASET_DIR = METADATA_DIR = INPUT_DIR

In [None]:
!mkdir {EXPERIMENT_DIR}

# Install and set up Kaggle API

In [None]:
# Open Colab's upload dialog. The next cell expects a file named kaggle.json
# (the Kaggle API token) to be present in the working directory afterwards.
from google.colab import files
uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [None]:
!pip install -q kaggle --upgrade
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json

kaggle.json


In [None]:
!kaggle competitions download -c hse-nlp-bootcamp
!unzip hse-nlp-bootcamp.zip

Downloading hse-nlp-bootcamp.zip to /content
 95% 57.0M/59.8M [00:00<00:00, 145MB/s] 
100% 59.8M/59.8M [00:00<00:00, 132MB/s]
Archive:  hse-nlp-bootcamp.zip
  inflating: new_test_ml.csv         
  inflating: sample_submission.csv   
  inflating: train_ml.csv            


# Imports

In [None]:
import pandas as pd
import numpy as np
import random
import pprint as pp
import os


# import re
# from pymorphy2 import MorphAnalyzer
# from functools import lru_cache
# from nltk.corpus import stopwords

# from multiprocessing import Pool
# from tqdm import tqdm

import nltk
# nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, f1_score
from joblib import dump, load

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Configuration

In [None]:
class Config:
    """Experiment-wide settings read by the cells below."""

    # Tag embedded in the names of all saved artifacts.
    experiment = EXPERIMENT
    # Seed shared by the global RNGs, the sampler, the fold splitter and the classifier.
    seed = 12536271
    # Number of rows drawn from the cleaned training frame.
    samples_num = 51476
    # Hold-out fraction; not referenced by the visible cells.
    test_size = 0.1
    # Number of outer cross-validation folds.
    n_folds = 5
    # Folds that are actually trained and evaluated (all of them by default).
    folds = list(range(n_folds))

# Main pipeline starts here

In [None]:
def set_seed(seed=42):
    """Seed NumPy's and Python's global RNGs and export PYTHONHASHSEED.

    Args:
        seed: integer seed applied to every generator (default 42).

    Note:
        PYTHONHASHSEED is only read at interpreter start-up, so the assignment
        does not change hashing in the current process; it is visible to
        processes started afterwards that inherit this environment.
    """
    # Same order as before: NumPy first, then the stdlib generator.
    for seeder in (np.random.seed, random.seed):
        seeder(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)


# Apply the configured seed to the global RNGs before any data processing runs.
set_seed(Config.seed)

## Load and process train data

In [None]:
df = pd.read_csv('/content/train_ml.csv')

# A bank's id is its position in order of first appearance. The list is built
# BEFORE dropna(), so banks first seen in rows that are later dropped still
# occupy an id. The test cell reuses banks_list to encode banks the same way.
banks_list = list(df['bank'].unique())
df['bank_id'] = df['bank'].apply(banks_list.index)

# Timestamps are stored as "day.month.year hour:minute".
df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y %H:%M')

# Drop incomplete rows first; only then can grades be cast to an integer dtype.
df = df.dropna().astype({'grades': 'int32'})
df.head()

Unnamed: 0,bank,feeds,grades,date,bank_id
0,ubrr,"Много лет являюсь клиентом этого банка, но пос...",1,2017-02-16 16:10:00,0
1,fk_otkritie,"Г. Ростов-на-Дону, ул. Ленина, 48. Были 10.12....",2,2016-12-13 01:05:00,1
4,promsvyazbank,"Имею потребительский кредит, взятый в Связь-ба...",2,2020-08-04 06:38:00,4
7,homecreditbank,Хочу выразить глубокую благодарность всем сотр...,5,2015-04-19 20:11:00,7
8,vtb,"6 марта 2019г. В 10:33 при проходе в метро ""Ку...",2,2019-03-28 18:36:00,3


In [None]:
# Row count after cleaning; df.sample() below cannot draw more rows than this.
df.shape[0]

51476

In [None]:
# Draw the (shuffled) working sample, add a lower-cased copy of the review text
# and renumber rows 0..n-1, all in one chain without in-place mutation.
subset = (
    df.sample(n=Config.samples_num, random_state=Config.seed)
      .assign(text=lambda frame: frame['feeds'].map(str.lower))
      .reset_index(drop=True)
)

## Split data into 5 folds

In [None]:
# Stratify on the grade so every fold keeps the same class balance.
skf = StratifiedKFold(n_splits=Config.n_folds, shuffle=True, random_state=Config.seed)

# split() yields POSITIONAL row indices. Collect them in a plain array instead of
# writing through .loc (which is label-based and only worked because the index
# had just been reset). This also keeps the fold column an integer dtype rather
# than float64.
fold_of_row = np.full(len(subset), -1, dtype='int64')
for fold, (train_idx, val_idx) in enumerate(skf.split(subset, subset['grades'])):
    fold_of_row[val_idx] = fold

# Every row must land in exactly one validation fold.
assert (fold_of_row >= 0).all(), 'some rows were not assigned to a fold'
subset['fold'] = fold_of_row

## Set up the training pipeline

In [None]:
def get_numeric_data(x):
    """Return the `bank_id` column of `x` as a one-column DataFrame."""
    return x.loc[:, ['bank_id']]


def get_text_data(x):
    """Return the `text` column of `x` as a Series."""
    return x.loc[:, 'text']

# Column selectors wrapped as transformers so they can be used as pipeline steps.
transfomer_numeric = FunctionTransformer(get_numeric_data)
transformer_text = FunctionTransformer(get_text_data)

# Branch 1: the bank id is passed through as a single raw numeric column.
# NOTE(review): bank_id is an arbitrary category index, yet it reaches the linear
# model as an unscaled number. This may contribute to the lbfgs
# "ITERATIONS REACHED LIMIT" warnings in the run output; consider one-hot
# encoding it in a follow-up experiment.
numeric_branch = Pipeline([
    ('selector', transfomer_numeric),
])

# Branch 2: TF-IDF over word unigrams and bigrams, tokenised with NLTK.
text_branch = Pipeline([
    ('selector', transformer_text),
    ('vec', TfidfVectorizer(ngram_range=(1, 2), tokenizer=word_tokenize)),
])

# Both branches are concatenated column-wise and fed to logistic regression.
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('numeric_features', numeric_branch),
        ('text_features', text_branch),
    ])),
    ('clf', LogisticRegression(random_state=Config.seed, warm_start=False, max_iter=1000)),
])

# Single-point grid: GridSearchCV is used here only to obtain an inner-CV score.
param_grid = dict(
    features__text_features__vec__max_df=[1.0],
    features__text_features__vec__min_df=[0.001],
)

# Inner cross-validation run on the training part of each outer fold.
kfold = StratifiedKFold(n_splits=5)
scoring = dict(F1='f1_micro')
refit = 'F1'

In [None]:
# pp.pprint(sorted(pipeline.get_params().keys()))

## Load and prepare test data

In [None]:
test = pd.read_csv('/content/new_test_ml.csv', index_col=0)

# Test rows cannot be dropped (each one needs a prediction), so a missing review
# is treated as empty text instead of crashing on a float NaN.
test['text'] = test['feeds'].fillna('').str.lower()

# Encode banks with the ids assigned when the training data was loaded.
bank_to_id = {bank: idx for idx, bank in enumerate(banks_list)}
bank_ids = test['bank'].map(bank_to_id)
if bank_ids.isna().any():
    # Fail with the offending names rather than an opaque "x is not in list".
    unseen_banks = sorted(test.loc[bank_ids.isna(), 'bank'].astype(str).unique())
    raise ValueError(f'test data contains banks absent from the training data: {unseen_banks}')
test['bank_id'] = bank_ids.astype('int64')

# Keep only the columns the pipeline consumes.
test = test[['bank_id', 'text']]

## Perform CV

In [None]:
submit, final_probs, mean_f1_micro = True, np.zeros((len(test), 5)), 0
for step, fold in enumerate(Config.folds):
    train_df = subset.query("fold!=@fold").reset_index(drop=True)
    valid_df = subset.query("fold==@fold").reset_index(drop=True)
    x_train, y_train = train_df[['bank_id','text']], train_df['grades']
    x_test, y_test = valid_df[['bank_id','text']], valid_df['grades']

    print(f'fold: {fold} | samples: {Config.samples_num} | train: {len(x_train)}')
    
    lr_model = GridSearchCV(pipeline, param_grid=param_grid, cv=kfold, scoring=scoring, 
                            refit=refit, n_jobs=-1, return_train_score=True, verbose=0)
    lr_model.fit(x_train, y_train)
    lr_best = lr_model.best_estimator_

    pred = lr_best.predict(x_test)
    print(f'{classification_report(y_test, pred)}')
    display(pd.DataFrame(lr_model.cv_results_))
    print(f'best f1-score on local cv: {lr_model.best_score_}')
    f1_micro = f1_score(y_test, pred, average="micro")
    print(f'f1-micro on x_test: {f1_micro}')
    mean_f1_micro += f1_micro

    test_pred = lr_best.predict(test)
    probs = lr_best.predict_proba(test)
    final_probs += probs
    sub = pd.DataFrame({'inds': test.index, 'grades': test_pred})

    suffix = f'{Config.experiment}_[fold_{fold}]_[sn_{Config.samples_num}]_[sd_{Config.seed}]_[cv_{lr_model.best_score_:0.4f}]'
    sub_filename = 'sub_'+suffix+'.csv'
    probs_filename = 'probs_'+suffix+'.npy'
    model_filename = 'model_'+suffix+'.joblib'

    sub.to_csv(sub_filename, index=False)
    np.save(probs_filename, probs)
    dump(lr_best, model_filename)

    ! cp {sub_filename} {EXPERIMENT_DIR}
    ! cp {probs_filename} {EXPERIMENT_DIR}
    ! cp {model_filename} {EXPERIMENT_DIR}

    if submit:
      ! kaggle competitions submit -c hse-nlp-bootcamp  -f {'/content/'+sub_filename} -m "Submission via API"
    print()
    print('-------------------------------------------------------------------------------------------------')
    print('-------------------------------------------------------------------------------------------------')
    print('-------------------------------------------------------------------------------------------------')

fold: 0 | samples: 51476 | train: 41180


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           1       0.76      0.98      0.85      5548
           2       0.28      0.03      0.05      1126
           3       0.25      0.01      0.01       472
           4       0.53      0.03      0.06       304
           5       0.90      0.93      0.92      2846

    accuracy                           0.79     10296
   macro avg       0.54      0.40      0.38     10296
weighted avg       0.71      0.79      0.72     10296



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_features__text_features__vec__max_df,param_features__text_features__vec__min_df,params,split0_test_F1,split1_test_F1,split2_test_F1,...,mean_test_F1,std_test_F1,rank_test_F1,split0_train_F1,split1_train_F1,split2_train_F1,split3_train_F1,split4_train_F1,mean_train_F1,std_train_F1
0,414.530282,82.116597,28.491667,5.463618,1.0,0.001,"{'features__text_features__vec__max_df': 1.0, ...",0.787397,0.791039,0.785454,...,0.788417,0.001879,1,0.815141,0.817873,0.821819,0.817266,0.822881,0.818996,0.002905


best f1-score on local cv: 0.788416707139388
f1-micro on x_test: 0.7912781662781663
100% 124k/124k [00:00<00:00, 560kB/s]
Successfully submitted to HSE | NLP Bootcamp | Sentiment Analysis-------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------
fold: 1 | samples: 51476 | train: 41181


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           1       0.75      0.98      0.85      5547
           2       0.21      0.02      0.04      1127
           3       0.25      0.01      0.02       471
           4       0.61      0.04      0.07       304
           5       0.90      0.93      0.92      2846

    accuracy                           0.79     10295
   macro avg       0.55      0.40      0.38     10295
weighted avg       0.71      0.79      0.72     10295



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_features__text_features__vec__max_df,param_features__text_features__vec__min_df,params,split0_test_F1,split1_test_F1,split2_test_F1,...,mean_test_F1,std_test_F1,rank_test_F1,split0_train_F1,split1_train_F1,split2_train_F1,split3_train_F1,split4_train_F1,mean_train_F1,std_train_F1
0,417.435602,86.514909,29.3405,6.037468,1.0,0.001,"{'features__text_features__vec__max_df': 1.0, ...",0.786937,0.791646,0.785454,...,0.788276,0.002351,1,0.822578,0.814054,0.821581,0.819153,0.822674,0.820008,0.003236


best f1-score on local cv: 0.7882761792862646
f1-micro on x_test: 0.7897037396794561
100% 124k/124k [00:00<00:00, 569kB/s]
Successfully submitted to HSE | NLP Bootcamp | Sentiment Analysis-------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------
fold: 2 | samples: 51476 | train: 41181


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           1       0.75      0.99      0.85      5548
           2       0.22      0.01      0.02      1127
           3       0.00      0.00      0.00       471
           4       0.33      0.00      0.01       304
           5       0.90      0.92      0.91      2845

    accuracy                           0.79     10295
   macro avg       0.44      0.38      0.36     10295
weighted avg       0.68      0.79      0.71     10295



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_features__text_features__vec__max_df,param_features__text_features__vec__min_df,params,split0_test_F1,split1_test_F1,split2_test_F1,...,mean_test_F1,std_test_F1,rank_test_F1,split0_train_F1,split1_train_F1,split2_train_F1,split3_train_F1,split4_train_F1,mean_train_F1,std_train_F1
0,416.368621,81.90063,28.538699,5.468479,1.0,0.001,"{'features__text_features__vec__max_df': 1.0, ...",0.786816,0.790918,0.789582,...,0.789418,0.001924,1,0.824278,0.816057,0.8146,0.819366,0.819548,0.81877,0.003348


best f1-score on local cv: 0.7894175129774599
f1-micro on x_test: 0.7873725109276348
100% 124k/124k [00:00<00:00, 601kB/s]
Successfully submitted to HSE | NLP Bootcamp | Sentiment Analysis-------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------
fold: 3 | samples: 51476 | train: 41181


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           1       0.75      0.99      0.85      5548
           2       0.25      0.02      0.03      1127
           3       0.08      0.00      0.00       471
           4       0.12      0.00      0.01       304
           5       0.90      0.92      0.91      2845

    accuracy                           0.79     10295
   macro avg       0.42      0.39      0.36     10295
weighted avg       0.69      0.79      0.72     10295



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_features__text_features__vec__max_df,param_features__text_features__vec__min_df,params,split0_test_F1,split1_test_F1,split2_test_F1,...,mean_test_F1,std_test_F1,rank_test_F1,split0_train_F1,split1_train_F1,split2_train_F1,split3_train_F1,split4_train_F1,mean_train_F1,std_train_F1
0,418.132293,83.223245,29.343767,5.731142,1.0,0.001,"{'features__text_features__vec__max_df': 1.0, ...",0.78718,0.792254,0.789097,...,0.789733,0.001754,1,0.814594,0.810624,0.819578,0.821885,0.818759,0.817088,0.004


best f1-score on local cv: 0.7897331913599206
f1-micro on x_test: 0.7878581835842642
100% 124k/124k [00:00<00:00, 634kB/s]
Successfully submitted to HSE | NLP Bootcamp | Sentiment Analysis-------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------
fold: 4 | samples: 51476 | train: 41181


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           1       0.75      0.98      0.85      5548
           2       0.21      0.02      0.04      1127
           3       0.57      0.01      0.02       471
           4       0.53      0.03      0.06       304
           5       0.90      0.93      0.92      2845

    accuracy                           0.79     10295
   macro avg       0.59      0.39      0.38     10295
weighted avg       0.72      0.79      0.72     10295



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_features__text_features__vec__max_df,param_features__text_features__vec__min_df,params,split0_test_F1,split1_test_F1,split2_test_F1,...,mean_test_F1,std_test_F1,rank_test_F1,split0_train_F1,split1_train_F1,split2_train_F1,split3_train_F1,split4_train_F1,mean_train_F1,std_train_F1
0,401.508497,77.020521,29.202695,5.518395,1.0,0.001,"{'features__text_features__vec__max_df': 1.0, ...",0.787787,0.792618,0.78594,...,0.788325,0.002355,1,0.815657,0.816118,0.818698,0.819548,0.822158,0.818436,0.002377


best f1-score on local cv: 0.7883247259151144
f1-micro on x_test: 0.7905779504613891
100% 124k/124k [00:00<00:00, 600kB/s]
Successfully submitted to HSE | NLP Bootcamp | Sentiment Analysis-------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


In [None]:
final_grades = np.argmax(final_probs, axis=1) + 1
final_sub = pd.DataFrame({'inds': test.index, 'grades': final_grades})
suffix = f'{Config.experiment}_[sn_{Config.samples_num}]_[sd_{Config.seed}]__[f1_{mean_f1_micro/len(Config.folds):.4f}]'
final_sub_filename = 'final_sub_'+suffix+'.csv'
final_probs_filename = 'final_probs_'+suffix+'.npy'
final_sub.to_csv(final_sub_filename, index=False)
np.save(final_probs_filename, final_probs)
! cp {final_sub_filename} {EXPERIMENT_DIR}
! cp {final_probs_filename} {EXPERIMENT_DIR}

## Submit the final (fold-averaged) submission

In [None]:
!kaggle competitions submit -c hse-nlp-bootcamp  -f {'/content/'+final_sub_filename} -m "Submission via API"

  0% 0.00/124k [00:00<?, ?B/s]100% 124k/124k [00:00<00:00, 529kB/s]
Successfully submitted to HSE | NLP Bootcamp | Sentiment Analysis

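# Results

| Fold | Local CV | OOF | Public LB |
| --- | --- | --- | --- |
| 0 | 0.7884 | 0.7912 | 0.79419 |
| 1 | 0.7883 | 0.7897 | 0.79698 |
| 2 | 0.7894 | 0.7874 | 0.79326 |
| 3 | 0.7897 | 0.7879 | 0.79396 |
| 4 | 0.7883 | 0.7906 | 0.79303 |
| mean | 0.7888 | 0.7894 | 0.79428 |
| final | - | - | 0.79581 |

*Local CV* is the mean micro-F1 of the inner 5-fold `GridSearchCV` on the training part of each fold (`best_score_`). *OOF* is the micro-F1 on the held-out fold. *final* is the submission built from the summed per-fold test probabilities.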