# Step 0. Install LAMA

## Download the packages into a directory in the output space

In [None]:
# !pip download lightautoml -d ./lightautoml/

## Compress the downloaded packages into a gzipped tar archive (`.tgz`)

In [None]:
# !tar cvfz lightautoml.tgz ./lightautoml/

## Download the archive to your machine

Use the download button next to the file name.

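## Upload the downloaded archive as input data
Press "+ Add Data" in the top-right corner and upload the `.tgz` archive as a dataset. You can choose any dataset name, but the extraction cell below expects the archive under `../input/lightautoml-tar/`, so adjust that path if you pick a different name.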

## Extract the uploaded archive into a local package directory

In [None]:
# Unpack the uploaded gzipped tarball (x = extract, v = verbose, f = file, z = gzip) into the
# current working directory. Assumes the archive was built with the commented-out `tar cvfz`
# command above, so its contents land in ./lightautoml/.
!tar xvfz ../input/lightautoml-tar/lightautoml.tgz

## Install the offline packages into our kernel

In [None]:
# Offline install: --no-index stops pip from querying PyPI and --find-links points it at the
# locally extracted package directory instead.
!pip install lightautoml --no-index --find-links=file:./lightautoml/ 

# Step 0.1. Import necessary libraries 

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
from sklearn.metrics import mean_squared_error
from lightautoml.automl.presets.text_presets import TabularNLPAutoML
from lightautoml.tasks import Task

# Step 0.2. Parameters 

In [None]:
N_THREADS = 8 # CPU thread budget: given to torch.set_num_threads and to the preset's cpu_limit
RANDOM_STATE = 42 # seed value used when seeding the random number generators below
TEST_SIZE = 0.2 # hold-out fraction for a metric check; NOTE(review): not referenced in the visible cells
TIMEOUT = 60 * 170 # AutoML time budget in seconds (170 minutes)
TARGET_NAME = 'target' # column registered under the 'target' role in Step 2

# Step 0.3. Fix torch number of threads and numpy seed 

In [None]:
# Seed every random number generator this notebook uses directly: NumPy's legacy global RNG
# and PyTorch's default generator. Seeding only NumPy leaves torch-side randomness unseeded.
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

# Cap the number of intra-op CPU threads torch may use.
torch.set_num_threads(N_THREADS)




# Step 0.4. Example data load 

In [None]:
%%time

# Read the competition's training CSV from the attached input dataset and preview the first rows.
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
train_data.head()

In [None]:
# Read the test CSV that predictions will be produced for and preview it.
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
test_data.head()

In [None]:
# Read the sample submission; its 'target' column is overwritten with model predictions later on.
submission = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')
submission.head()

In [None]:
# Show (rows, columns) for the three loaded frames side by side.
train_data.shape, test_data.shape, submission.shape

# Step 0.5. Some EDA

In [None]:
# Column dtypes and non-null counts of the training frame.
train_data.info()

In [None]:
# Frequency of each license value; dropna=False keeps missing licenses in the tally.
train_data['license'].value_counts(dropna = False)

# Step 0.6. Data cleaning and preparation

In [None]:
def clean_text(text):
    """Normalise a single excerpt by converting it to lower case.

    Args:
        text: The string to normalise.

    Returns:
        The lower-cased copy of ``text``.
    """
    lowered = text.lower()
    return lowered

In [None]:
def preprocess_text(df, rare_min_count=5):
    """Add simple text statistics to ``df`` and normalise its metadata columns, in place.

    New columns, all computed from the whitespace-stripped, original-case ``excerpt``:

    * ``len_tokens`` - number of pieces when splitting on a single space.
    * ``len``        - number of characters.
    * ``len_sent``   - number of pieces when splitting on ``'.'``.
    * ``n_comm``     - number of pieces when splitting on ``','``.
    * ``d_mean``     - count of space-separated tokens made only of digits
                       (a count, despite the name).
    * ``u_mean``     - count of space-separated tokens that are upper case
                       (a count, despite the name).

    Rewritten columns:

    * ``license``   - values seen fewer than ``rare_min_count`` times in ``df`` become
                      ``"RARE_VALUE"``; missing values stay missing.
    * ``url_legal`` - 1 where a value is present, 0 where it is missing.
    * ``excerpt``   - passed through ``clean_text``.

    Not idempotent: the statistics depend on the original casing and ``url_legal`` on the
    original missing values, so call this exactly once per freshly loaded frame.

    Args:
        df: Frame with ``excerpt``, ``license`` and ``url_legal`` columns; mutated in place.
        rare_min_count: Minimum number of occurrences for a license value to be kept as is.

    Returns:
        None.
    """
    # Strip and tokenise once instead of repeating the same string work for every feature.
    stripped = df['excerpt'].str.strip()
    tokens = stripped.str.split(' ')

    df['len_tokens'] = tokens.apply(len)
    df['len'] = stripped.apply(len)
    df['len_sent'] = stripped.str.split('.').apply(len)
    df['n_comm'] = stripped.str.split(',').apply(len)
    df['d_mean'] = [np.sum([tok.isdigit() for tok in toks]) for toks in tokens]
    df['u_mean'] = [np.sum([tok.isupper() for tok in toks]) for toks in tokens]

    # Record which licenses are missing *before* the rare-value replacement. Otherwise a
    # frame with only a few missing licenses (e.g. a small test set) would have them turned
    # into "RARE_VALUE", while a frame with many missing licenses keeps them as NaN.
    license_missing = df['license'].isna()
    license_text = df['license'].astype(str)
    license_counts = license_text[~license_missing].value_counts()
    # Values absent from license_counts map to NaN, and NaN < n is False.
    license_rare = license_text.map(license_counts).lt(rare_min_count) & ~license_missing
    df['license'] = license_text.mask(license_rare, "RARE_VALUE").mask(license_missing, np.nan)

    # Presence flag taken straight from the missing-value mask rather than via the string
    # 'nan'; kept as an object column holding 1/0, matching the previous output.
    df['url_legal'] = df['url_legal'].notna().astype(int).astype(object)

    # Lower-casing goes last so the case-sensitive statistics above see the original text.
    df['excerpt'] = df['excerpt'].map(clean_text)

In [None]:
# Add the engineered features to the training frame in place.
# Run this cell once per freshly loaded train_data: preprocess_text reads the original casing
# and the original missing values, so re-running it on an already processed frame overwrites
# 'url_legal' with all ones and recomputes 'u_mean' on lower-cased text.
preprocess_text(train_data)
train_data

#  ==== AutoML preset usage ====


## Step 1. Create Task

In [None]:
%%time
def rmse(x, y):
    """Root mean squared error between two collections of values.

    NOTE(review): lower is better for this metric - confirm that ``Task`` treats a callable
    metric accordingly, or pass the direction explicitly.
    """
    mse = mean_squared_error(x, y)
    return np.sqrt(mse)


# Regression task scored with the custom RMSE metric defined above.
task = Task('reg', metric=rmse)

## Step 2. Setup columns roles

In [None]:
%%time

# Column roles handed to the preset: what to predict, which column holds raw text,
# and which columns are excluded from training.
roles = {
    'target': TARGET_NAME,
    'text': ['excerpt'],
    'drop': ['id', 'standard_error'],
}

## Step 3. Create AutoML from preset

To create the AutoML model we use the `TabularNLPAutoML` preset.


All the parameters we set above can be passed to the preset to change its configuration:

In [None]:
# Display the preprocessed training frame again right before fitting.
train_data

In [None]:
#!pip install torch-scatter -q
#import torch_scatter

In [None]:
%%time 

# Build the TabularNLPAutoML preset and fit it on the preprocessed training frame.
# NOTE(review): the meanings given for individual options below are read off their names;
# confirm them against the LightAutoML documentation for the installed version.
automl = TabularNLPAutoML(task = task, 
                       timeout = TIMEOUT,  # overall time budget in seconds (see Step 0.2)
                       cpu_limit = N_THREADS,
                       gpu_ids='all',
                       reader_params = {'cv': 5},  # presumably the number of cross-validation folds
                       # presumably restricts the pipeline to a single level containing only the 'nn' model
                       general_params = {'nested_cv': False, 'use_algos': [['nn']]},
                       text_params = {'lang': 'en'},
                       tuning_params = {'max_tuning_iter': 20, 'max_tuning_time': 60},
                       nn_params = {'lang': 'en', 
                                    # model weights are read from an attached input dataset, not downloaded
                                    'bert_name': '../input/clrp-roberta-large/clrp_roberta_large', 
                                    'opt_params': { 'lr': 1e-5},
                                    'max_length': 250, 'bs': 13,  # 'bs' is presumably the batch size
                                    'n_epoch': 5
                                    },
                       )

# Fit on the training frame using the roles from Step 2. The result is presumably the
# out-of-fold prediction for the training rows (hence the name) - TODO confirm.
oof_pred = automl.fit_predict(train_data, roles = roles)
print('oof_pred:\n{}\nShape = {}'.format(oof_pred, oof_pred.shape))

## Step 4. Predict to test data

In [None]:
# Apply the same in-place feature engineering to the test frame before scoring it
# (run once per freshly loaded test_data).
preprocess_text(test_data)
# Score the test rows with the fitted AutoML model.
test_pred = automl.predict(test_data)
print('Prediction for test data:\n{}\nShape = {}'.format(test_pred, test_pred.shape))

## Step 5. Generate submission file

In [None]:
# Take the first column of the prediction array as the submitted target.
# The values are assigned by position - assumes test_pred rows are in the same order as the
# rows of sample_submission.csv; TODO confirm.
submission['target'] = test_pred.data[:, 0]
submission

In [None]:
# Sanity check: tally the distinct predicted values (e.g. to spot a constant prediction).
submission['target'].value_counts()

In [None]:
# Write the submission to the working directory; index=False keeps the row index out of the file.
submission.to_csv("submission.csv",index=False)