<a href="https://colab.research.google.com/github/diwert-ai/First-step-in-NLP-HSE-bootcamp-/blob/main/kaggle%20competition/experiment_Logistic_Regression_PL_%5BLB_0_79419%5D_%2B_PostProc_thresholding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive so the Drive-based input/output paths
# defined in the next cell resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
INPUT_DIR ='/content/drive/MyDrive/kaggle/fsnlp_hse/input/'
OUTPUT_DIR = '/content/drive/MyDrive/kaggle/fsnlp_hse/output/'

WORK_DIR='/content/'
EXPERIMENT='logreg_pl'
EXPERIMENT_DIR = OUTPUT_DIR + EXPERIMENT+'/'

DATASET_DIR = INPUT_DIR
METADATA_DIR = INPUT_DIR

!mkdir {EXPERIMENT_DIR}

mkdir: cannot create directory ‘/content/drive/MyDrive/kaggle/fsnlp_hse/output/logreg_pl/’: File exists


# Install, set up Kaggle API and download dataset

In [None]:
# Interactive upload from the local machine into the Colab working directory.
# The Kaggle setup cell below expects a file named kaggle.json to be uploaded here.
from google.colab import files
uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [None]:
# Install/upgrade the Kaggle CLI and place the uploaded kaggle.json in ~/.kaggle.
!pip install -q kaggle --upgrade
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
# Restrict the credentials file to owner read/write only.
!chmod 600 /root/.kaggle/kaggle.json

kaggle.json


In [None]:
!kaggle competitions download -c hse-nlp-bootcamp
!unzip hse-nlp-bootcamp.zip

Downloading hse-nlp-bootcamp.zip to /content
 95% 57.0M/59.8M [00:00<00:00, 81.6MB/s]
100% 59.8M/59.8M [00:00<00:00, 72.6MB/s]
Archive:  hse-nlp-bootcamp.zip
  inflating: new_test_ml.csv         
  inflating: sample_submission.csv   
  inflating: train_ml.csv            


In [None]:
# List the working directory to confirm the archive contents were extracted.
! ls

drive		      kaggle.json      sample_data	      train_ml.csv
hse-nlp-bootcamp.zip  new_test_ml.csv  sample_submission.csv


# Imports

In [None]:
import pandas as pd
import numpy as np
import random
import pprint as pp
import os


# import re
# from pymorphy2 import MorphAnalyzer
# from functools import lru_cache
# from nltk.corpus import stopwords

# from multiprocessing import Pool
# from tqdm import tqdm

import nltk
# nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, f1_score
from joblib import dump, load

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Configuration

In [None]:
# Experiment-wide settings, read as class attributes (Config.seed, ...).
class Config:
  # Seed passed to set_seed() and used as random_state for sampling,
  # the train/test split and the classifier.
  seed = 12536271
  # Number of labeled rows sampled from the training data.
  samples_num = 51476
  # Experiment name, taken from the module-level EXPERIMENT constant;
  # used when building output file names.
  experiment = EXPERIMENT
  # Pseudo-labeling threshold: an unlabeled row is kept only if its highest
  # predicted class probability is strictly greater than this value.
  pl_thr = 0.90

# Main pipeline starts here

In [None]:
def set_seed(seed=42):
    """Seed the random number generators used in this notebook.

    Stores ``seed`` (as a string) in the ``PYTHONHASHSEED`` environment
    variable and seeds both Python's ``random`` module and NumPy's global
    generator with it.

    Args:
        seed: Integer seed value; defaults to 42.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)


# Seed the RNGs once, up front, with the experiment seed from Config.
set_seed(Config.seed)

## Load and process train data

In [None]:
# Read the raw training table from the Colab working directory.
df = pd.read_csv('/content/train_ml.csv')

# Encode every bank name as its position in the list of unique bank names;
# the same list is reused at inference time to encode the test set.
banks_list = list(df['bank'].unique())
df['bank_id'] = df['bank'].apply(banks_list.index)

# Parse timestamps given as "day.month.year hour:minute".
df['date'] = pd.to_datetime(df['date'], format='%d.%m.%Y %H:%M')

## Get NaN samples for labeling

In [None]:
# Rows without a grade form the unlabeled pool used for pseudo-labeling.
# The review text is lower-cased into a new 'text' column.
df_nan = (
    df.loc[df['grades'].isna()]
      .reset_index(drop=True)
      .assign(text=lambda frame: frame['feeds'].apply(lambda review: review.lower()))
)
# Only the two model inputs are kept.
x_nan = df_nan[['bank_id', 'text']]

In [None]:
# Number of unlabeled rows available for pseudo-labeling.
len(x_nan)

23524

## Set up pipeline to load logreg model for labeling unlabeled data and further training on a bigger training set

In [None]:
def get_numeric_data(x):
    """Select the numeric feature column.

    Args:
        x: DataFrame containing a 'bank_id' column.

    Returns:
        A single-column DataFrame holding 'bank_id'.
    """
    numeric_columns = ['bank_id']
    return x.loc[:, numeric_columns]

def get_text_data(x):
    """Select the text feature column.

    Args:
        x: DataFrame containing a 'text' column.

    Returns:
        The 'text' column as a Series.
    """
    text_column = 'text'
    return x.loc[:, text_column]

# Column selectors wrapped as transformers so they can be used as pipeline steps.
transfomer_numeric = FunctionTransformer(get_numeric_data)
transformer_text = FunctionTransformer(get_text_data)

# Features are the union of the raw bank id and TF-IDF over word uni-/bi-grams
# (tokenized with NLTK's word_tokenize); a logistic regression sits on top.
pipeline = Pipeline(steps=[
    (
        'features',
        FeatureUnion(transformer_list=[
            (
                'numeric_features',
                Pipeline(steps=[('selector', transfomer_numeric)]),
            ),
            (
                'text_features',
                Pipeline(steps=[
                    ('selector', transformer_text),
                    ('vec', TfidfVectorizer(ngram_range=(1, 2), tokenizer=word_tokenize)),
                ]),
            ),
        ]),
    ),
    (
        'clf',
        LogisticRegression(random_state=Config.seed, warm_start=True, max_iter=1000),
    ),
])

## Load the model to label unlabeled data

In [None]:
# Load a previously saved model from the 'logreg' output directory; it is used
# below to pseudo-label the unlabeled rows.
# NOTE(review): joblib.load unpickles the file, so only load artefacts you trust.
lr_model = load(OUTPUT_DIR+'logreg/'+'model_logreg_[sn_51476]_[sd_12536271][cv_0.7899].joblib')

## Label unlabeled data

In [None]:
# Class probabilities for every unlabeled row (one column per class).
pseudo_labels_prob = lr_model.predict_proba(x_nan)
# Confidence of a row = probability of its most likely class.
label_probability = np.max(pseudo_labels_prob, axis = 1)
# Positions of rows whose confidence is strictly above the threshold.
labels_within_threshold = np.where((label_probability > Config.pl_thr))[0]
# Translate argmax column positions into actual labels through classes_,
# rather than assuming the labels are exactly 1..K in column order.
pseudo_labels = lr_model.classes_[np.argmax(pseudo_labels_prob[labels_within_threshold], axis = 1)]
# Copy so that adding the 'grades' column does not touch x_nan.
chosen_unlabelled_rows = x_nan.iloc[labels_within_threshold].copy()
chosen_unlabelled_rows['grades'] = pseudo_labels
chosen_unlabelled_rows.head()

Unnamed: 0,bank_id,text,grades
1,3,обращаюсь к вам с жалобой на незаконное списан...,1
2,5,брала кредит на стиральную машину. все платила...,1
4,1,"уважаемый бинбанк, если у вас имеются какие-ли...",1
5,8,"я проживаю в г. уфа, респ. башкортостан, с бан...",1
7,6,добрый день! мне поступают звонки из вашего ба...,1


## Construct new dataset

In [None]:
# Keep only rows with no missing values and store grades as 32-bit integers.
# Note: this rebinds `df`, so the unlabeled rows are no longer in it afterwards.
df = df.dropna().astype({'grades': 'int32'})

In [None]:
# Draw a reproducible sample of labeled rows, add the lower-cased review text
# and renumber the rows from zero.
subset = (
    df.sample(Config.samples_num, random_state=Config.seed)
      .assign(text=lambda frame: frame['feeds'].apply(lambda review: review.lower()))
      .reset_index(drop=True)
)

## Split new data into train and test

In [None]:
# Hold out 10% of the labeled sample for evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    subset[['bank_id', 'text']],
    subset['grades'],
    test_size=0.1,
    random_state=Config.seed,
)

# Pseudo-labeled rows are appended to the training part only; the hold-out
# part contains no pseudo-labeled rows.
pseudo_features = chosen_unlabelled_rows[['bank_id', 'text']]
pseudo_targets = chosen_unlabelled_rows['grades']
x_train = pd.concat([x_train, pseudo_features], ignore_index=True)
y_train = pd.concat([y_train, pseudo_targets], ignore_index=True)
len(x_train)

50429

In [None]:
# Print the tunable parameter names of the pipeline; these are the valid keys
# for the GridSearchCV parameter grid.
pp.pprint(sorted(pipeline.get_params().keys()))

## Set params for GridSearchCV

In [None]:
# Single-point grid over the TF-IDF document-frequency cut-offs.
param_grid = dict(
    features__text_features__vec__max_df=[1.0],
    features__text_features__vec__min_df=[0.001],
)

# 5-fold stratified cross-validation (no shuffle argument is passed).
kfold = StratifiedKFold(n_splits=5)
# Micro-averaged F1 is the only metric and the one used to pick the refit model.
scoring = dict(F1='f1_micro')
refit = 'F1'

## Perform GridSearch

In [None]:
print(f'samples: {Config.samples_num} + pl {len(chosen_unlabelled_rows)}')

# Cross-validated search over param_grid. Note that `lr_model` is rebound here:
# from this cell on it refers to the search object, not the loaded model.
lr_model = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=scoring,
    refit=refit,
    cv=kfold,
    n_jobs=-1,
    verbose=10,
    return_train_score=True,
)
lr_model.fit(x_train, y_train)

# Pipeline refitted on the full training set with the best parameters.
lr_best = lr_model.best_estimator_

samples: 51476 + pl 4101
Fitting 5 folds for each of 1 candidates, totalling 5 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Test best model

In [None]:
# Evaluate the refitted pipeline on the hold-out split: class probabilities
# (reused by the post-processing cells below) and hard predictions.
probs_test = lr_best.predict_proba(x_test)
pred = lr_best.predict(x_test)
holdout_report = classification_report(y_test, pred)
print(holdout_report)

              precision    recall  f1-score   support

           1       0.75      0.98      0.85      2788
           2       0.23      0.02      0.04       582
           3       0.43      0.01      0.02       253
           4       0.43      0.02      0.04       135
           5       0.90      0.92      0.91      1390

    accuracy                           0.79      5148
   macro avg       0.55      0.39      0.37      5148
weighted avg       0.71      0.79      0.71      5148



In [None]:
# Size of the hold-out split.
len(x_test)

5148

In [None]:
# Disable column-width truncation so full review texts are shown in the
# error table displayed in the next cell.
pd.set_option('display.max_colwidth', None)

In [None]:
# Positions (within the hold-out split) of misclassified rows.
errors_idx = np.where(y_test != pred)[0]

# Table of the misclassified rows: inputs, true grade, predicted grade
# and the predicted probability of each of the five classes.
errors_info = x_test.iloc[errors_idx].copy()
errors_info['gt'] = y_test.iloc[errors_idx]
errors_info['pred'] = pred[errors_idx]
errors_info[['c1', 'c2', 'c3', 'c4', 'c5']] = probs_test[errors_idx]
errors_info.head(30)

In [None]:
# Micro-averaged F1 of the uncorrected predictions on the hold-out split.
f1_score(y_test, pred, average='micro')

0.7861305361305362

In [None]:
# Post-processing: rescale each class's probability by a fixed per-class factor
# before taking the argmax. The same factors are applied at inference time.
probs_test_corrected = np.multiply(probs_test, (1, 1.01, 1.01, 1.1, 0.97))
# Map argmax column positions to labels through classes_ instead of assuming
# the labels are exactly 1..K in column order.
pred_corrected = lr_best.classes_[np.argmax(probs_test_corrected, axis=1)]
print(classification_report(y_test, pred_corrected))
f1_score(y_test, pred_corrected, average='micro')

              precision    recall  f1-score   support

           1       0.75      0.98      0.85      2788
           2       0.25      0.02      0.04       582
           3       0.43      0.01      0.02       253
           4       0.45      0.04      0.07       135
           5       0.90      0.92      0.91      1390

    accuracy                           0.79      5148
   macro avg       0.56      0.40      0.38      5148
weighted avg       0.71      0.79      0.72      5148



0.7865190365190365

In [None]:
# Parameter combination selected by the grid search.
lr_model.best_params_

{'features__text_features__vec__max_df': 1.0,
 'features__text_features__vec__min_df': 0.001}

In [None]:
# Full cross-validation log: fit/score times and per-split train/test F1.
lr_model.cv_results_

{'mean_fit_time': array([421.08608727]),
 'std_fit_time': array([81.49718932]),
 'mean_score_time': array([32.69089179]),
 'std_score_time': array([6.314987]),
 'param_features__text_features__vec__max_df': masked_array(data=[1.0],
              mask=[False],
        fill_value='?',
             dtype=object),
 'param_features__text_features__vec__min_df': masked_array(data=[0.001],
              mask=[False],
        fill_value='?',
             dtype=object),
 'params': [{'features__text_features__vec__max_df': 1.0,
   'features__text_features__vec__min_df': 0.001}],
 'split0_test_F1': array([0.80805076]),
 'split1_test_F1': array([0.80557208]),
 'split2_test_F1': array([0.80279595]),
 'split3_test_F1': array([0.80428316]),
 'split4_test_F1': array([0.81566683]),
 'mean_test_F1': array([0.80727376]),
 'std_test_F1': array([0.00453735]),
 'rank_test_F1': array([1], dtype=int32),
 'split0_train_F1': array([0.83422155]),
 'split1_train_F1': array([0.83295739]),
 'split2_train_F1': array

In [None]:
# Mean cross-validated F1 (micro) of the selected parameter combination.
lr_model.best_score_

0.8072737590091597

In [None]:
# File-name tag carrying the cross-validation score rounded to four decimals.
cv_tag = '[cv_{:0.4f}]'.format(lr_model.best_score_)

In [None]:
# Show the tag that will be embedded in the output file names.
cv_tag

'[cv_0.8073]'

## Inference (calculating submission) 

In [None]:
# Load the test set (its first column is used as the index) and build the same
# two model inputs as for training: lower-cased text and the bank id.
test = pd.read_csv('/content/new_test_ml.csv', index_col=0)
test['text'] = test.feeds.apply(lambda x: x.lower())
# Uses the bank list built from the training data; list.index raises
# ValueError for a bank name that did not occur in training.
test['bank_id'] = test['bank'].apply(lambda x: banks_list.index(x))
test = test[['bank_id', 'text']]

# Raw class probabilities are kept in `probs` (saved to disk later).
probs = lr_best.predict_proba(test)
# Same per-class rescaling as was applied to the hold-out predictions.
probs_corrected = np.multiply(probs, (1, 1.01, 1.01, 1.1, 0.97))
# Map argmax column positions to labels through classes_ instead of assuming
# the labels are exactly 1..K in column order.
test_pred_corrected = lr_best.classes_[np.argmax(probs_corrected, axis=1)]
sub = pd.DataFrame({'inds': test.index,
                    'grades': test_pred_corrected})
sub

Unnamed: 0,inds,grades
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1
...,...,...
17215,17215,1
17216,17216,1
17217,17217,1
17218,17218,1


# Save results to files

In [None]:
# Common suffix that encodes the experiment name, pseudo-label threshold,
# sample count, seed and cross-validation score.
suffix = (
    f'{Config.experiment}_postpr'
    f'_[thr_{Config.pl_thr}]'
    f'_[sn_{Config.samples_num}]'
    f'_[sd_{Config.seed}]'
    f'{cv_tag}'
)
sub_filename = f'sub_{suffix}.csv'
probs_filename = f'probs_{suffix}.npy'
model_filename = f'model_{suffix}.joblib'

In [None]:
# Write the submission, the raw (uncorrected) test probabilities and the
# fitted pipeline to files in the current working directory.
sub.to_csv(sub_filename, index=False)
np.save(probs_filename, probs)
dump(lr_best, model_filename)

['model_logreg_pl_postpr_[thr_0.9]_[sn_51476]_[sd_12536271][cv_0.8073].joblib']

In [None]:
!cp {sub_filename} {EXPERIMENT_DIR}
!cp {probs_filename} {EXPERIMENT_DIR}
!cp {model_filename} {EXPERIMENT_DIR}

In [None]:
# List the working directory to check the generated files.
!ls

 drive
 hse-nlp-bootcamp.zip
 kaggle.json
'model_logreg_pl_postpr_[thr_0.9]_[sn_51476]_[sd_12536271][cv_0.8073].joblib'
'model_logreg_pl_[sn_10000]_[sd_12536271][cv_0.8049].joblib'
'model_logreg_pl_[thr_0.95]_[sn_20000]_[sd_12536271][cv_0.7972].joblib'
'model_logreg_pl_[thr_0.95]_[sn_51476]_[sd_12536271][cv_0.7957].joblib'
'model_logreg_pl_[thr_0.9]_[sn_51476]_[sd_12536271][cv_0.8073].joblib'
 new_test_ml.csv
'probs_logreg_pl_postpr_[thr_0.9]_[sn_51476]_[sd_12536271][cv_0.8073].npy'
'probs_logreg_pl_[sn_10000]_[sd_12536271][cv_0.8049].npy'
'probs_logreg_pl_[thr_0.95]_[sn_20000]_[sd_12536271][cv_0.7972].npy'
'probs_logreg_pl_[thr_0.95]_[sn_51476]_[sd_12536271][cv_0.7957].npy'
'probs_logreg_pl_[thr_0.9]_[sn_51476]_[sd_12536271][cv_0.8073].npy'
 sample_data
 sample_submission.csv
'sub_logreg_pl_postpr_[thr_0.9]_[sn_51476]_[sd_12536271][cv_0.8073].csv'
'sub_logreg_pl_[sn_10000]_[sd_12536271][cv_0.8049].csv'
'sub_logreg_pl_[thr_0.95]_[sn_20000]_[sd_12536271][cv_0.7972].csv'
'sub_logreg_pl_[

# Submit predictions

In [None]:
! kaggle competitions submit -c hse-nlp-bootcamp  -f {'/content/'+sub_filename} -m "Submission via API"

100% 124k/124k [00:01<00:00, 72.4kB/s]
Successfully submitted to HSE | NLP Bootcamp | Sentiment Analysis

#  CV:0.8475 x_test:0.7843 LB:0.79419 