In [1]:
import logging
import torch
import numpy as np
import pandas as pd

from loading import load_train
from preprocessing import preprocess
from evaluation import evaluate, evaluate_model
from submission import prepare_submission, prepare_model_submission

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to /Users/jonas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/jonas/nltk_data...
[nltk_data] Downloading package wordnet to /Users/jonas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Configure the root logger at INFO level so INFO records (such as the
# "INFO:root:" metric reports shown in later outputs) are emitted.
logging.basicConfig(level=logging.INFO)

## Components

### Loading training data

In [7]:
# Load the training data through the project loader. `full=False` is passed
# as-is; presumably it selects a reduced subset -- confirm in loading.py.
df = load_train(full=False)
# Bare expression: display the loaded frame as the cell output.
df

Unnamed: 0,index,x,y
0,27662,"<user> just got the morning off school , gotta...",1
1,136569,the old-house journal compendium ( paperback w...,-1
2,188597,""" <user> i want a crab cake "" that sounds bomb\n",-1
3,37041,would u follback then ? ; * wink-wink rt <user...,1
4,51782,i kinda feel sorry for <user> everyone is bull...,1
...,...,...,...
199995,26197,<user> you fo it and let me kno\n,1
199996,57647,everyone go follow <user> ! ! ! ( she's actual...,1
199997,11611,<user> agreed me and him do have the odd bit o...,1
199998,21655,country music and gossip girl\n,1


### Preprocessing

In [8]:
# Preprocessing switches, one per line so single steps are easy to toggle.
preprocess_flags = {
    'tokenize': True,
    'remove_tags': False,
    'remove_tag_tokens': True,
    'remove_stopwords': True,
    'lemmatize': True,
    'remove_single_symbols': True,
    'spelling_correction': False,
}

# The return value is discarded and `df` is displayed right after, so
# `preprocess` presumably edits `df` in place -- confirm in preprocessing.py.
preprocess(df, flags=preprocess_flags)
df

### Testing model

In [5]:
# Ground-truth labels as a tensor. Convert through `.to_numpy()` so torch
# receives a plain positional array instead of a pandas Series; handing the
# Series itself to `torch.tensor` makes the conversion depend on the Series'
# index labels (it can fail once the index is no longer 0..n-1).
y = torch.tensor(df['y'].to_numpy())
# Constant baseline prediction: a tensor of ones with the shape and dtype of `y`.
y_ones = torch.ones_like(y)

In [6]:
# Score the two constant baselines against the labels `y`:
# first the all-ones prediction, then its negation (all minus-ones).
# Only the second call is the cell's last expression, so only its returned
# value is displayed; the first call's return value is discarded.
evaluate(y, y_ones)
evaluate(y, -y_ones)

INFO:root:---
* accuracy: 0.5
* precision: 0.5
* recall: 1.0
* f1: 0.6666666666666666
* bce: 17.269787996170436
* auc: 0.5
---
  _warn_prf(average, modifier, msg_start, len(result))
INFO:root:---
* accuracy: 0.5
* precision: 0.0
* recall: 0.0
* f1: 0.0
* bce: 17.269388197455328
* auc: 0.5
---


(0.5, 0.0, 0.0, 0.0, 17.269388197455328, 0.5)

### Submitting model

Generates the submission file `test_submission.csv` from the constant all-ones predictions.

In [7]:
# Hand the all-ones predictions to the project helper, targeting
# 'test_submission.csv'.
# NOTE(review): `y_ones` has one entry per training label (it is built with
# ones_like(y)); confirm that this is the length prepare_submission expects.
prepare_submission(y_ones, file='test_submission.csv')

## Pulling it all together

In [8]:
def model(df: pd.DataFrame) -> np.ndarray:
  """Constant baseline: predict 1 for every row of `df`.

  Args:
    df: frame with a column `x`. By this notebook's convention the call is a
      training call if `df` also has a column `y`, and an evaluation call
      otherwise; this baseline behaves the same in both cases.

  Returns:
    An int32 array of ones with the same shape as `df['x']`.
  """
  # `remove_tags` is not imported or defined anywhere in this notebook, so
  # calling it directly raises NameError on a fresh kernel. Run the step
  # through the imported `preprocess` entry point with only that flag enabled.
  # NOTE(review): presumably equivalent to the former `remove_tags(df)` helper
  # and applied to `df` in place -- confirm against preprocessing.py.
  preprocess(df, flags={
    'tokenize': False,
    'remove_tags': True,
    'remove_tag_tokens': False,
    'remove_stopwords': False,
    'lemmatize': False,
    'remove_single_symbols': False,
    'spelling_correction': False,
  })
  return np.ones(df['x'].shape, dtype=np.int32)

In [9]:
# Load a fresh copy of the training data (same loader arguments as the
# earlier load cell) and pass it with the `model` callable to the project's
# evaluation helper; its returned value is displayed as the cell output.
df_train = load_train(full=False)
evaluate_model(model, df_train)

INFO:root:---
* accuracy: 0.5
* precision: 0.5
* recall: 1.0
* f1: 0.6666666666666666
* bce: 17.269787996170436
* auc: 0.5
---


(0.5, 0.5, 1.0, 0.6666666666666666, 17.269787996170436, 0.5)

In [10]:
# Pass the `model` callable to the project's submission helper.
# NOTE(review): no output path is given here; presumably the helper uses a
# default file name -- confirm in submission.py.
prepare_model_submission(model)