In [2]:
from beir import util, LoggingHandler
from beir.retrieval import models
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES

import logging
import pathlib, os

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

#### Download scifact.zip dataset and unzip the dataset
dataset = "scifact"
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
out_dir = os.path.join(pathlib.Path("/tmp").absolute(), "datasets")
data_path = util.download_and_unzip(url, out_dir)

#### Provide the data_path where scifact has been downloaded and unzipped
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

2023-06-02 22:26:52 - Downloading scifact.zip ...


/tmp/datasets/scifact.zip:   0%|          | 0.00/2.69M [00:00<?, ?iB/s]

2023-06-02 22:26:55 - Unzipping scifact.zip ...
2023-06-02 22:26:55 - Loading Corpus...


  0%|          | 0/5183 [00:00<?, ?it/s]

2023-06-02 22:26:55 - Loaded 5183 TEST Documents.
2023-06-02 22:26:55 - Doc Example: {'text': 'Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion coefficient at 28 wk was high, 1.8 microm2/ms, and decreased toward term to 1.2 microm2/ms. In the posterior limb of the internal capsule, the mean apparent diffusion coefficients at both times were similar (1.2 vers

In [4]:
len(corpus)

5183

In [5]:
queries

{'1': '0-dimensional biomaterials show inductive properties.',
 '3': '1,000 genomes project enables mapping of genetic sequence variation consisting of rare variants with larger penetrance effects than common variants.',
 '5': '1/2000 in UK have abnormal PrP positivity.',
 '13': '5% of perinatal mortality is due to low birth weight.',
 '36': 'A deficiency of vitamin B12 increases blood levels of homocysteine.',
 '42': 'A high microerythrocyte count raises vulnerability to severe anemia in homozygous alpha (+)- thalassemia trait subjects.',
 '48': 'A total of 1,000 people in the UK are asymptomatic carriers of vCJD infection.',
 '49': 'ADAR1 binds to Dicer to cleave pre-miRNA.',
 '50': 'AIRE is expressed in some skin tumors.',
 '51': 'ALDH1 expression is associated with better breast cancer outcomes.',
 '53': 'ALDH1 expression is associated with poorer prognosis in breast cancer.',
 '54': 'AMP-activated protein kinase (AMPK) activation increases inflammation-related fibrosis in the lung

# FiQA 

In [6]:
!mv /home/jjmachan/Downloads/FiQA_train_task2.zip /tmp/datasets/

In [7]:
d_dir = "/tmp/datasets/fiqa/"

In [9]:
import pandas as pd
import os

In [15]:
qs_df = pd.read_csv(
    os.path.join(d_dir, "FiQA_train_question_final.tsv"),
    sep="\t",
    header=0, 
    index_col=0
)

qs_df.head()

Unnamed: 0,qid,question,timestamp
0,0,What is considered a business expense on a bus...,Nov 8 '11 at 15:14
1,1,Claiming business expenses for a business with...,May 13 '14 at 13:17
2,2,Transferring money from One business checking ...,Jan 20 '16 at 20:31
3,3,Having a separate bank account for business/in...,Mar 1 at 0:24
4,4,Business Expense - Car Insurance Deductible Fo...,Mar 4 at 0:26


In [50]:
ans_df = pd.read_csv(
    os.path.join(d_dir, "FiQA_train_question_doc_final.tsv"),
    sep="\t",
    header=0, 
    index_col=0
)

ans_df.head()

Unnamed: 0,qid,docid
0,0,18850
1,1,14255
2,2,308938
3,3,296717
4,3,100764


In [20]:
doc_df = pd.read_csv(
    os.path.join(d_dir, "FiQA_train_doc_final.tsv"),
    sep="\t",
    header=0, 
    index_col=0
)

doc_df.head()

Unnamed: 0,docid,doc,timestamp
0,3,I'm not saying I don't like the idea of on-the...,Oct 03 '12 at 14:56
1,31,So nothing preventing false ratings besides ad...,Sep 01 '17 at 13:36
2,56,You can never use a health FSA for individual ...,Jun 9 '14 at 17:37
3,59,Samsung created the LCD and other flat screen ...,Dec 27 at 01:37
4,63,Here are the SEC requirements: The federal sec...,Jul 14 '14 at 8:10


In [21]:
qs_df.shape, ans_df.shape, doc_df.shape

((6648, 3), (57638, 3), (57638, 3))

In [41]:
doc_df.iloc[10]['doc']

'In the US, I would say the risk is exactly the same.  If your accounts are withing the FDIC amount (currently $250,000) your balance is 100% covered in case of a failure. You are giving up a larger network of ATMs in some cases.  You are also perhaps giving up the number of branches you can visit, the hours the bank is open and maybe how well the website works.  The features might be less, but the protection for your deposits is the same.'

In [103]:
from IPython.display import Markdown, display

t = """\
*{ques}*

**answer**
{ans}
"""

i = 3
q_id, q = qs_df.iloc[i]["qid"], qs_df.iloc[i]["question"]

ans_doc_ids = list(ans_df.loc[ans_df["qid"] == q_id]['docid'].values)

anss = []
for ans in ans_doc_ids:
    anss.append(doc_df.loc[doc_df['docid']==ans]["doc"].values[0])
ans_str = "- " + '\n- '.join(anss)

display(Markdown(t.format(ques=q, ans=ans_str)))

*Having a separate bank account for business/investing, but not a “business account?”*

**answer**
- Having a separate checking account for the business makes sense. It simplifies documenting your income/expenses. You can "explain" every dollar entering and exiting the account without having to remember that some of them were for non-business items. My credit union allowed me to have a 2nd checking account and allowed me to put whatever I wanted as the name on the check. I think this looked a little better than having my name on the check. I don't see the need for a separate checking account for investing. The money can be kept in a separate savings account that has no fees, and can even earn a little interest. Unless you are doing a lot of investment transactions a month this has worked for me. I fund IRAs and 529 plans this way. We get paychecks 4-5 times a month, but send money to each of the funds once a month. You will need a business account if the number of transactions becomes large. If you deposit dozens of checks every time you go to the bank, the bank will want to move you to a business account.
- You don't specify which country you are in, so my answers are more from a best practice view than a legal view.. I don't intend on using it for personal use, but I mean it's just as possible. This is a dangerous proposition.. You shouldn't co-mingle business expenses with personal expenses.  If there is a chance this will happen, then stop, make it so that it won't happen. The big danger is in being able to have traceability between what you are doing for the business, and what you are doing for yourself.  If you are using this as a "staging" account for investments, etc., are those investments for yourself?  Or for the business?  Is tax treatment on capital gains and/or dividends the same for personal and business in your jurisdiction?  If you buy a widget, is the widget an expense against business income?  Or is it an out of pocket expense for personal consumption?  The former reduces your taxable income, the latter does not. I don't see the benefit of a real business account because those have features specific to maybe corporations, LLC, and etc. -- nothing beneficial to a sole proprietor who has no reports/employees. The real benefit is that there is a clear delineation between business income/expenses and personal income/expenses. This account can also accept money and hold it from business transactions/sales, and possibly transfer some to the personal account if there's no need for reinvesting said amount/percentage. What you are looking for is a commonly called a current account, because it is used for current expenses.  If you are moving money out of the account to your personal account, that speaks to paying yourself, which has other implications as well. The safest/cleanest way to do this is to: While this may sound like overkill, it is the only way to guarantee that income/expenses are allocated to the correct entity (i.e. you, or your business). From a Canadian standpoint:
- If it makes your finances easier, why not? My wife and I had his/hers/our since before we were married. I also have an account to handle transactions for my rental property, and one extra for PayPal use. I was paranoid to give out a checking account number with authorization for a third party to debit it, so that account has a couple hundred dollars, maximum. All this is just to explain that your finances should be arranged to simplify your life and make you comfortable.
- When I was younger I had a problem with Washington Mutual.  Someone had deposited a check in to my account then ran my account negative with a "dupe" of my debit card.  WaMu tied up my account for three months while they investigated because it wasn't simply a debit card fraud issue, this was check fraud (so they claimed).  At the time all the money I had in the world was in that account and the ordeal was extremely disruptive to my life.  Since the, I never spend on my debit card(s) and I keep more than one checking account to disperse the risk and avoid disruption in the event anything ever happens again. Now one of the accounts contains just enough money (plus a small buffer) to pay my general monthly expenses and the other is my actual checking account.   There's no harm in having more than one checking account and if you think it will enhance your finances, do it. Though, there's no reason to get a business account unless you've actually formed a business.


In [104]:
from llms import llm


prompt = """\
answer the question to the best of your ability

Question: {ques}
Answer:
"""

llm(prompt.format(ques=q))['choices'][0]['text']

'It is possible to have a separate bank account for business or investing purposes without having a business account. This type of account is often referred to as a personal investment account. These accounts are typically used to manage investments, such as stocks, bonds, mutual funds, and other investments. They are not typically used for business transactions, such as payroll or accounts receivable.'

In [107]:
from beir import util
from beir.datasets.data_loader import GenericDataLoader

dataset = "fiqa"
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
data_path = util.download_and_unzip(url, "datasets")
corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")

2023-06-03 20:18:42 - Downloading fiqa.zip ...


datasets/fiqa.zip:   0%|          | 0.00/17.1M [00:00<?, ?iB/s]

2023-06-03 20:18:52 - Unzipping fiqa.zip ...
2023-06-03 20:18:52 - Loading Corpus...


  0%|          | 0/57638 [00:00<?, ?it/s]

2023-06-03 20:18:52 - Loaded 57638 TEST Documents.
2023-06-03 20:18:52 - Doc Example: {'text': "I'm not saying I don't like the idea of on-the-job training too, but you can't expect the company to do that. Training workers is not their job - they're building software. Perhaps educational systems in the U.S. (or their students) should worry a little about getting marketable skills in exchange for their massive investment in education, rather than getting out with thousands in student debt and then complaining that they aren't qualified to do anything.", 'title': ''}
2023-06-03 20:18:52 - Loading Queries...
2023-06-03 20:18:52 - Loaded 648 TEST Queries.
2023-06-03 20:18:52 - Query Example: How to deposit a cheque issued to an associate in my business into my business account?


In [110]:
len(queries)

648

In [111]:
#### Load the SBERT model and retrieve using cosine-similarity
model = DRES(models.SentenceBERT("msmarco-distilbert-base-tas-b"), batch_size=16)
retriever = EvaluateRetrieval(model, score_function="dot") # or "cos_sim" for cosine similarity
results = retriever.retrieve(corpus, queries)

#### Evaluate your model with NDCG@k, MAP@K, Recall@K and Precision@K  where k = [1,3,5,10,100,1000] 
ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)

2023-06-03 20:20:14 - Load pretrained SentenceTransformer: msmarco-distilbert-base-tas-b


Downloading (…)0cfdf/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)5bdce0cfdf/README.md:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading (…)dce0cfdf/config.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)0cfdf/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

Downloading (…)5bdce0cfdf/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)ce0cfdf/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

2023-06-03 20:21:37 - Use pytorch device: cpu
2023-06-03 20:21:37 - Encoding Queries...


Batches:   0%|          | 0/41 [00:00<?, ?it/s]

2023-06-03 20:21:40 - Sorting Corpus by document length (Longest first)...
2023-06-03 20:21:40 - Scoring Function: Dot Product (dot)
2023-06-03 20:21:40 - Encoding Batch 1/2...


Batches:   0%|          | 0/3125 [00:00<?, ?it/s]

KeyboardInterrupt: 