In [41]:
import pandas as pd

In [42]:
df = pd.read_csv('sentiment_data/Fin_Cleaned.csv')

In [43]:
len(df)

400

In [44]:
df.head()

Unnamed: 0,Date_published,Headline,Synopsis,Full_text,Final Status
0,2022-06-21,"Banks holding on to subsidy share, say payment...",The companies have written to the National Pay...,ReutersPayments companies and banks are at log...,Negative
1,2022-04-19,Digitally ready Bank of Baroda aims to click o...,"At present, 50% of the bank's retail loans are...",AgenciesThe bank presently has 20 million acti...,Positive
2,2022-05-27,Karnataka attracted investment commitment of R...,Karnataka is at the forefront in attracting in...,PTIKarnataka Chief Minister Basavaraj Bommai.K...,Positive
3,2022-04-06,Splitting of provident fund accounts may be de...,The EPFO is likely to split accounts only at t...,Getty ImagesThe budget for FY22 had imposed in...,Negative
4,2022-06-14,Irdai weighs proposal to privatise Insurance I...,"Set up in 2009 as an advisory body, IIB collec...",AgenciesThere is a view in the insurance indus...,Positive


In [45]:
df2 = pd.read_csv('sentiment_data/all-data.csv', encoding="ISO-8859-1", )

In [46]:
df2.rename(columns={'neutral': 'sentiment', 'According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .': 'text'}, inplace=True)

In [47]:
len(df2)

4845

In [48]:
df2.head()

Unnamed: 0,sentiment,text
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...


In [49]:
df2['sentiment'].unique()

array(['neutral', 'negative', 'positive'], dtype=object)

In [50]:
df2["sentiment"] = df2['sentiment'].apply(lambda x: 0 if x == 'negative' else 1)

In [51]:
df2.head()

Unnamed: 0,sentiment,text
0,1,Technopolis plans to develop in stages an area...
1,0,The international electronic industry company ...
2,1,With the new production plant the company woul...
3,1,According to the company 's updated strategy f...
4,1,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...


In [52]:
df2 = df2.dropna()

In [53]:
df['Final Status'].unique()

array(['Negative', 'Positive', 'Positive '], dtype=object)

In [54]:
df["Final Status"] = df['Final Status'].apply(lambda x: 0 if x == 'Negative' else 1)

In [55]:
df['Final Status'].unique()

array([0, 1])

In [56]:
df['text'] = df['Headline']+' \n'+df['Synopsis']+' \n'+df['Full_text']

In [57]:
df['text'].iloc[0]

'Banks holding on to subsidy share, say payments firms \nThe companies have written to the National Payments Corp. of India (NPCI), complaining that â‚¹700 crore of the â‚¹1,500 crore granted in the budget is being retained by banks. \nReutersPayments companies and banks are at loggerheads over the sharing of government-granted subsidies for building payment infrastructure, said three people with knowledge of the matter. \n\nThe companies have written to the National Payments Corp. of India (NPCI), complaining that â‚¹700 crore of the â‚¹1,500 crore granted in the budget is being retained by banks, they said. This has deprived companies connecting up the last mile of state-promised revenues, according to them. The government granted the subsidies in exchange for waiving Merchant Discount Rate (MDR) charges.\n\n"The government has released â‚¹700 crore worth of subsidies to banks but they are not sharing it with any payment aggregators," said the CEO of a payments company on condition o

In [58]:
df = df.dropna()

In [59]:
text1 = list(df['text'])
text2 = list(df2['text'])

text = text1 + text2

In [60]:
len(text)

5244

In [61]:
text[0]

'Banks holding on to subsidy share, say payments firms \nThe companies have written to the National Payments Corp. of India (NPCI), complaining that â‚¹700 crore of the â‚¹1,500 crore granted in the budget is being retained by banks. \nReutersPayments companies and banks are at loggerheads over the sharing of government-granted subsidies for building payment infrastructure, said three people with knowledge of the matter. \n\nThe companies have written to the National Payments Corp. of India (NPCI), complaining that â‚¹700 crore of the â‚¹1,500 crore granted in the budget is being retained by banks, they said. This has deprived companies connecting up the last mile of state-promised revenues, according to them. The government granted the subsidies in exchange for waiving Merchant Discount Rate (MDR) charges.\n\n"The government has released â‚¹700 crore worth of subsidies to banks but they are not sharing it with any payment aggregators," said the CEO of a payments company on condition o

In [62]:
labels1 = list(df['Final Status'])
labels2 = list(df2['sentiment'])
labels = labels1+labels2

In [63]:
len(labels)

5244

In [64]:
from sklearn.model_selection import train_test_split
t_texts, test_texts, t_labels, test_labels = train_test_split(text, labels, test_size=.1)

In [65]:
train_texts, val_texts, train_labels, val_labels = train_test_split(t_texts, t_labels, test_size=.2)

In [66]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [67]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [68]:
import torch

class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)

In [69]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

In [70]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)


In [71]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [72]:
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [73]:
trainer.train()



Step,Training Loss
10,0.7676
20,0.7521
30,0.715
40,0.6461
50,0.5525
60,0.4557
70,0.3996
80,0.3984
90,0.3496
100,0.3233


TrainOutput(global_step=354, training_loss=0.2602356519066008, metrics={'train_runtime': 284.5872, 'train_samples_per_second': 39.794, 'train_steps_per_second': 1.244, 'total_flos': 1500193289779200.0, 'train_loss': 0.2602356519066008, 'epoch': 3.0})

In [74]:
tokenizer.save_pretrained('sentiment-analysis')
trainer.save_model("sentiment-analysis")

In [75]:
from transformers import AutoModelForSequenceClassification,AutoTokenizer


# Load the trained model for inference
model_path = "sentiment-analysis"  # Replace with the directory where your trained model is saved
model = AutoModelForSequenceClassification.from_pretrained(model_path)

tokenizer = AutoTokenizer.from_pretrained('sentiment-analysis')

In [76]:
from transformers import pipeline

# Define the pipeline for text classification
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Input text for classification
input_text = "I dont like this company.The stocks are dropping and market is crashing"

# Perform inference
result = classifier(input_text)

# Print the result
print(result)

[{'label': 'LABEL_0', 'score': 0.970958411693573}]
