In [1]:
import chardet
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import re
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
#detect the encoding of the CSV file
with open('financialdata.csv', 'rb') as file:
    result = chardet.detect(file.read())

try:
    finnews_df = pd.read_csv('financialdata.csv', encoding=result['encoding'],header=None)
except UnicodeDecodeError as e:
    print(f"UnicodeDecodeError: {e}")
    
finnews_df.columns = ['sentiment','text']

finnews_df.info()
news_df1= pd.read_csv("SEN_en_AMT_nooutlier.csv")
news_df2= pd.read_csv("SEN_en_R_nooutlier.csv")

columns_to_keep = ['headline','majority_label']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4846 entries, 0 to 4845
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  4846 non-null   object
 1   text       4846 non-null   object
dtypes: object(2)
memory usage: 75.8+ KB


In [3]:
#keeping the targetted columns -->
news_df1 = news_df1[columns_to_keep]
news_df2 = news_df2[columns_to_keep]


news_df = pd.concat([news_df1, news_df2], axis=0, ignore_index=True)

replace_dict = {'neg': 'Negative', 'neutr': 'Neutral','pos':'Positive'}
news_df['majority_label'] = news_df['majority_label'].replace(replace_dict)


#drop irrelevant value
#define the values to drop
values_to_drop = ['unk', 'UNK']
news_df = news_df[~news_df['majority_label'].isin(values_to_drop)]

#renaming the columns with new names
new_column_names = {'headline': 'text', 'majority_label': 'sentiment'}
news_df = news_df.rename(columns=new_column_names)

news_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2446 entries, 0 to 2464
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       2446 non-null   object
 1   sentiment  2446 non-null   object
dtypes: object(2)
memory usage: 57.3+ KB


In [4]:
final_df = pd.merge(finnews_df, news_df, on=['text','sentiment'], how='outer')
print(final_df['sentiment'].value_counts())
print("\n")
final_df.info()

sentiment
neutral     2879
positive    1363
Neutral     1097
Negative     905
negative     604
Positive     444
Name: count, dtype: int64


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7292 entries, 0 to 7291
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  7292 non-null   object
 1   text       7292 non-null   object
dtypes: object(2)
memory usage: 114.1+ KB


Text Cleaning


In [5]:
final_df

Unnamed: 0,sentiment,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
7287,Negative,Trump’s NATO parade of falsehoods and misstate...
7288,Negative,Lawyer for Democrats calls Trump ‘a clear and ...
7289,Negative,How could 63 million people be wrong? The GOP ...
7290,Neutral,What Trump claimed about the Russia probe — an...


In [6]:

class TextCleaner:
    
    def lowercase(self, text):
        # Lowercase
        text = text.lower()
        return text
    
    def remove_links(self, text):
        # Remove any link
        text = re.sub(r'https*\S+', ' ', text)
        text = re.sub(r'http*\S+', ' ', text)
        return text
    
    def remove_alphanumeric(self, text):
        # Remove alphanumeric characters
        text = re.sub(r'\W+', ' ', text)
        return text

    def remove_word_with_number(self, text):
        # Remove any word containing number
        text = re.sub(r'\b\w*\d+\w*\b', ' ', text)
        return text


    def remove_extra_whitespace(self, text):
        # Remove extra white space
        text = re.sub(r'\s{2,}', ' ', text)
        # Remove leading and trailing whitespaces
        text = text.strip()
        return text

    def clean_text(self, text):
       
        # Use the individual functions to clean the text
        text = self.lowercase(text)
        text = self.remove_links(text)
        #text = self.remove_alphanumeric(text)
        #text = self.remove_word_with_number(text)
        text = self.remove_extra_whitespace(text)

        return text

In [7]:
#create an instance of TextCleaner
cleaner = TextCleaner()


In [8]:
#apply the clean_text method to the 'text_column' in the DataFrame
final_df['cleaned_text'] = final_df['text'].progress_apply(cleaner.clean_text)
final_df2 = final_df[['cleaned_text','sentiment']].copy()

100%|██████████| 7292/7292 [00:00<00:00, 260066.53it/s]


In [9]:
final_df2.to_csv("processed_data.csv",index=False)

In [10]:
X = final_df2.cleaned_text.values
y = final_df2.sentiment.values


In [11]:
# #split for training,testing and validation
X_temp, x_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=40,shuffle=True)

#split for testing and validation purpose
X_train, x_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.1, random_state=40,shuffle=True)
# Verify the shapes of the datasets
print("Training set shape:", X_train.shape, y_train.shape)
print("Test set shape:", x_test.shape, y_test.shape)
print("Validation dataset shape:", x_val.shape, y_val.shape)
print("\n")

unique_labels_train, counts_train = np.unique(y_train, return_counts=True)
print("Training set label counts:")
for label, count in zip(unique_labels_train, counts_train):
    print(f"{label}: {count}")
print("\n")

unique_labels_val, counts_val = np.unique(y_test, return_counts=True)
print("Test set label counts:")
for label, count in zip(unique_labels_val, counts_val):
    print(f"{label}: {count}")
print("\n")
# Print category labels and their counts in y_test
unique_labels_val, counts_val = np.unique(y_val, return_counts=True)
print("\nValidation set label counts:")
for label, count in zip(unique_labels_val, counts_val):
    print(f"{label}: {count}")

Training set shape: (5249,) (5249,)
Test set shape: (1459,) (1459,)
Validation dataset shape: (584,) (584,)


Training set label counts:
Negative: 654
Neutral: 756
Positive: 309
negative: 438
neutral: 2110
positive: 982


Test set label counts:
Negative: 177
Neutral: 244
Positive: 100
negative: 118
neutral: 560
positive: 260



Validation set label counts:
Negative: 74
Neutral: 97
Positive: 35
negative: 48
neutral: 209
positive: 121
