In [None]:
import os 
os.chdir('drive/MyDrive/entrepreneur-helper')

In [None]:
ls

config.py         main.py     [0m[01;34m__pycache__[0m/      text_handling.py
data_1.json       nn.py       requests.db       users.db
db_management.py  parsers.py  requirements.txt


In [None]:
!pip install -r requirements.txt



In [None]:
!pip install sentence-transformers



In [None]:
!pip install wikipedia



In [None]:
!python main.py

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Bot started!
Added thread: search_for_relevant_part_in_json
Added thread: parse_google
Added thread: wiki_parser
  tensor = as_tensor(value)
  for span_id in range(num_spans)
Added thread: search_for_relevant_part_in_json
Added thread: parse_google
Added thread: wiki_parser
Added thread: search_for_relevant_part_in_json
Added thread: parse_google
Added thread: wiki_parser
Added thread: search_for_relevant_part_in_json
Added thread: parse_google
Added thread: wiki_parser


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.0.4-cp37-none-manylinux1_x86_64.whl (76.1 MB)
[K     |████████████████████████████████| 76.1 MB 1.3 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.4


# Building NN for text relevancy estimation

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
import numpy as np
import warnings
warnings.simplefilter("ignore")
import torch.nn as nn
from scipy.special import softmax

In [None]:
class RelevanceEstimator(nn.Module):
    '''
    NN module used for prediction of relevancy of corpus and given texts

    model_type: gradient boosting classifier / logistic reqression etc.
    X: train data
    y: train data labels
    batch_size: batch_size for train data

    '''
    def __init__(self,device:str, X:np.ndarray, y:np.ndarray, batch_size:int, 
                 model_type:str, model_kwargs:dict={}) -> None:
        super(RelevanceEstimator, self).__init__()

        if model_type == 'catboost':
            self.model = CatBoostClassifier(**model_kwargs)
        elif model_type == 'logreg':
            self.model = LogisticRegression(**model_kwargs)
        else:
            raise ValueError("Invalid model type")

        self.X = np.array(X)
        self.y = np.array(y)
        self.batch_size = batch_size
        self.reset_data = True

        self.pipeline = nn.Sequential(
            nn.BatchNorm1d(num_features=len(self.X[0])),
            nn.Softmax(),
        )

    @staticmethod
    def batch_normalization(vals:np.ndarray, eps=1e-5) -> np.ndarray:
        '''
        A function normalizing batch with respect to mean and varience

        vals: list of floats/ints meant to be normalized
        eps: constant meant to prevent division by zero
        '''
        mean_val = np.mean(vals)
        varience = np.var(vals)
        vals = (vals - mean_val) / np.sqrt(varience + eps)

        return vals  

    def normalize_and_softmax(self, vals) -> None:
        '''
        Batch preparation method
        '''
        for i in range(len(vals)):
            vals[i] = self.batch_normalization(vals[i])
            vals[i] = softmax(vals[i])


    def split_and_preprocess_data(self, random_state=42, test_size=0.3, shuffle=True) -> None:
        '''
        Method used for splitting X into train and test datasets
        '''
        self.train_data, self.test_data, self.train_labels, self.test_labels = \
        train_test_split(test_size=test_size, 
                         random_state=random_state,
                         shuffle=shuffle)
        
        print('Data successfully splitted!')


    def __batch_generator(self, dataset_type) -> tuple:
        '''
        Method used for data generation
        dataset_type: train or test
        '''
        if self.reset_data:
            try:
                self.train_generator = (self.train[i * self.batch_size : (i+1) * self.batch_size] for i in range(len(self.train) // self.batch_size - 1))
                self.test_generator = (self.test[i * self.batch_size : (i+1) * self.batch_size] for i in range(len(self.test) // self.batch_size - 1))
                self.train_labels_generator = (self.train_labels[i * self.batch_size : (i+1) * self.batch_size] for i in range(len(self.train_labels) // self.batch_size - 1))
                self.test_labels_generator = (self.test_labels[i * self.batch_size : (i+1) * self.batch_size] for i in range(len(self.test_labels) // self.batch_size - 1))

                self.reset_data = False

            except AttributeError:
                print("Seems like you haven't ran split_data method just yet")
                print("Aborting...")

        
        if dataset_type == 'train':
            return next(self.train_generator), next(self.train_labels_generator)
        elif dataset_type == 'test':
            return next(self.test_generator), next(self.test_labels_generator)
        else:
            raise ValueError("Invalid dataset type") 

    def fit(self, num_iters=1000):
        for _ in range(num_iters):
            for i in range(len(self.train) // self.batch_size - 1):
                X_batch, y_batch = self.__batch_generator("train")
                self.model.fit(X_batch, y_batch)

            self.reset_data = True

    def predict(self, X):
        apply_pipeline = self.pipeline(X)
        return self.model.predict(apply_pipeline.detach().numpy())


In [20]:
!git status
!git add entrepreneur-helper.ipynb

On branch master

No commits yet

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	[31m__pycache__/[m
	[31mconfig.py[m
	[31mdata_1.json[m
	[31mdb_management.py[m
	[31mentrepreneur-helper.ipynb[m
	[31mmain.py[m
	[31mnn.py[m
	[31mparsers.py[m
	[31mrequests.db[m
	[31mrequirements.txt[m
	[31mtext_handling.py[m
	[31musers.db[m

nothing added to commit but untracked files present (use "git add" to track)


In [21]:
!git status

On branch master

No commits yet

Changes to be committed:
  (use "git rm --cached <file>..." to unstage)

	[32mnew file:   entrepreneur-helper.ipynb[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	[31m__pycache__/[m
	[31mconfig.py[m
	[31mdata_1.json[m
	[31mdb_management.py[m
	[31mmain.py[m
	[31mnn.py[m
	[31mparsers.py[m
	[31mrequests.db[m
	[31mrequirements.txt[m
	[31mtext_handling.py[m
	[31musers.db[m



In [25]:
!git config --global user.email "daniildiveev@mail.ru"
!git config --global user.name "sexozavr"

In [23]:
!git remote add origin https://github.com/sexozavr/entrepreneur-helper.git

In [26]:
!git commit -m '[ADD] added entrepreneur-helper.ipynb'

[master (root-commit) 7819c42] [ADD] added entrepreneur-helper.ipynb
 1 file changed, 1 insertion(+)
 create mode 100644 entrepreneur-helper.ipynb


In [31]:
!git log

[33mcommit 7819c426fabb0f4d4f79833efa3cbc9fb4825325[m[33m ([m[1;36mHEAD -> [m[1;32mmaster[m[33m)[m
Author: sexozavr <daniildiveev@mail.ru>
Date:   Thu Apr 14 08:08:00 2022 +0000

    [ADD] added entrepreneur-helper.ipynb


In [29]:
!git push origin main

error: src refspec main does not match any.
error: failed to push some refs to 'https://github.com/sexozavr/entrepreneur-helper.git'


In [18]:
os.chdir('entrepreneur-helper')

In [30]:
!ls

config.py	  entrepreneur-helper.ipynb  parsers.py   requirements.txt
data_1.json	  main.py		     __pycache__  text_handling.py
db_management.py  nn.py			     requests.db  users.db
