In [None]:
!pip install kaggle

In [None]:
!mkdir /root/.kaggle

In [None]:
!touch /root/.kaggle/kaggle.json

In [None]:
!echo "{"username":"bolg4rin","key":"9b329c4fc7812fd645aeef0dc24033e2"}" >> /root/.kaggle/kaggle.json
!chmod 600 /root/.kaggle/kaggle.json

In [None]:
!kaggle datasets download -d sbhatti/financial-sentiment-analysis

In [None]:
!python -m spacy init fill-config base_config.cfg config.cfg

In [30]:
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2023-09-17 03:34:59,593] [INFO] Set up nlp object from config
[2023-09-17 03:34:59,603] [INFO] Pipeline: ['textcat']
[2023-09-17 03:34:59,606] [INFO] Created vocabulary
[2023-09-17 03:34:59,607] [INFO] Finished initializing nlp object
[2023-09-17 03:35:03,186] [INFO] Initialized pipeline components: ['textcat']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat'][0m
[38;5;4mℹ Initial learn rate: 0.0001[0m
E    #       LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ----------  ------
  0       0          0.25       45.15    0.45
  1     200         32.46       45.15    0.45
  2     400         26.54       47.76    0.48
  3     600         24.32       57.08    0.57
  4     800         13.42       59.26    0.59
  6    1000          8.72       66.57    0.67
  7    1200          3.98       70.07    0.70
  8    1400          2.68       69.56    0.70
  9    1600          1.

In [None]:
#imports
import spacy as spc
import pandas as pd
from spacy.tokens import DocBin

In [2]:
#load the model
nlp_model = spc.load("ru_core_news_sm")

In [3]:

def make_docs(nlp_model, data):
    """
    this will take a list of texts and labels
    and transform them in spacy documents
    data: list(tuple(text, label))
    returns: List(spacy.Doc.doc)
    """
    # nlp.pipe([texts]) is way faster than running
    # nlp(text) for each text
    # as_tuples allows us to pass in a tuple,
    # the first one is treated as text
    # the second one will get returned as it is.
    docs = []
    for doc, label in nlp_model.pipe(data, as_tuples=True):
        if label == 'negative':
            doc.cats["positive"] = 0
            doc.cats["negative"] = 1
        else:
            doc.cats["positive"] = 1
            doc.cats["negative"] = 0
        docs.append(doc)
    return docs

In [28]:
#drop an id axis in json
df = pd.read_json("data/train.json")
df = df.drop("id", axis=1)
#take only one tenth of the whole dataset
df = df.sample(frac=0.1)
class_counts = df['sentiment'].value_counts()
#this is the measure against class disbalance
#in balanced_df we will recieve df with equality in quantity between classes
min_count = class_counts.min()
balanced_df = df.groupby('sentiment').apply(lambda x: x.sample(n=min_count)).reset_index(drop=True)
#due to groupby balanced_df will be ordered, so we will shuffle dataframe
balanced_df = df.sample(frac=1).reset_index(drop=True)


train = [tuple((balanced_df.sample(frac=0.8)).iloc[i].values) for i in range((balanced_df.sample(frac=0.8)).shape[0])]
valid = [tuple((balanced_df.sample(frac=0.2)).iloc[i].values) for i in range((balanced_df.sample(frac=0.2)).shape[0])]
print(len(train), len(valid))

661 165


In [29]:
#make docs
train_docs = make_docs(nlp_model, train)
valid_docs = make_docs(nlp_model, valid)
#save it as binary
train_docs_bin = DocBin(docs=train_docs)
valid_docs_bin = DocBin(docs=valid_docs)
#save to the disk
valid_docs_bin.to_disk("train.spacy")
train_docs_bin.to_disk("dev.spacy")

In [None]:
nlp = spc.load("output/model-best")
text = ""
print("type : ‘quit’ to exit")
# predict the sentiment until someone writes quit
while text != "quit":
    text = input("Please enter example input: ")
    doc = nlp(text)
    print(doc.cats)
    if doc.cats['positive'] >.5:
        print(f"the sentiment is positive")
    else:
        print(f"the sentiment is negative")

type : ‘quit’ to exit


Please enter example input:  Русские взяли Харьков


{'positive': 0.49208006262779236, 'negative': 0.5079199075698853}
the sentiment is negative


Please enter example input:  Я вышла сегодня дежурить на работу


{'positive': 0.49378061294555664, 'negative': 0.5062193870544434}
the sentiment is negative
