In [2]:
import pandas as pd 
import os
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [3]:
root_dpath='/Users/zeynmehezmacbook/Desktop/AladinDocs/Data-Hub'

In [4]:
df_data=pd.read_csv(os.path.join(root_dpath,'nlp-data','financial-review.csv'),on_bad_lines='skip',encoding='ISO-8859-1')

In [5]:
df_data

Unnamed: 0,neutral,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing ."
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...
...,...,...
4840,negative,LONDON MarketWatch -- Share prices ended lower...
4841,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4842,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4843,negative,Net sales of the Paper segment decreased to EU...


In [6]:
df_data.columns=['sentiment','sentence']

In [7]:
df_data

Unnamed: 0,sentiment,sentence
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...
...,...,...
4840,negative,LONDON MarketWatch -- Share prices ended lower...
4841,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4842,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4843,negative,Net sales of the Paper segment decreased to EU...


In [8]:
def preprocess_text(text):
    data = text.lower()
    data = re.sub(r'\d','',data)
    data = re.sub(r'[^\w\s]','',data)
    tokens = word_tokenize(data)
    words = [word for word in tokens if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return " ".join(words)

In [9]:
df_data['clean_sentence']=df_data['sentence'].apply(preprocess_text)

In [10]:
df_data

Unnamed: 0,sentiment,sentence,clean_sentence
0,neutral,Technopolis plans to develop in stages an area...,technopolis plan develop stage area less squar...
1,negative,The international electronic industry company ...,international electronic industry company elco...
2,positive,With the new production plant the company woul...,new production plant company would increase ca...
3,positive,According to the company 's updated strategy f...,according company updated strategy year baswar...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...,financing aspocomp growth aspocomp aggressivel...
...,...,...,...
4840,negative,LONDON MarketWatch -- Share prices ended lower...,london marketwatch share price ended lower lon...
4841,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...,rinkuskiai beer sale fell per cent million lit...
4842,negative,Operating profit fell to EUR 35.4 mn from EUR ...,operating profit fell eur mn eur mn including ...
4843,negative,Net sales of the Paper segment decreased to EU...,net sale paper segment decreased eur mn second...


In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')

def analyze_sentiment(text):
    sentiment_dict={'neu':'neutral','pos':'positive','neg':'negative','compound':'compound'}
    sid = SentimentIntensityAnalyzer()
    sentiment_scores = sid.polarity_scores(text)
    max_val=max(sentiment_scores.values())
    res = [key for key in sentiment_scores if sentiment_scores[key] == max_val] 
    return sentiment_dict[res[0]]


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/zeynmehezmacbook/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [43]:
df_data['sentiment_score']=df_data['clean_sentence2'].apply(analyze_sentiment)

In [19]:
df_data

Unnamed: 0,sentiment,sentence,clean_sentence,clean_sentence2
0,neutral,Technopolis plans to develop in stages an area...,technopolis plans to develop in stages an area...,technopolis plan develop stage area less squar...
1,negative,The international electronic industry company ...,the international electronic industry company ...,international electronic industry company elco...
2,positive,With the new production plant the company woul...,with the new production plant the company woul...,new production plant company would increase ca...
3,positive,According to the company 's updated strategy f...,according to the company s updated strategy fo...,according company updated strategy year baswar...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...,financing of aspocomp s growth aspocomp is agg...,financing aspocomp growth aspocomp aggressivel...
...,...,...,...,...
4840,negative,LONDON MarketWatch -- Share prices ended lower...,london marketwatch share prices ended lower i...,london marketwatch share price ended lower lon...
4841,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...,rinkuskiai s beer sales fell by per cent to ...,rinkuskiai beer sale fell per cent million lit...
4842,negative,Operating profit fell to EUR 35.4 mn from EUR ...,operating profit fell to eur mn from eur mn ...,operating profit fell eur mn eur mn including ...
4843,negative,Net sales of the Paper segment decreased to EU...,net sales of the paper segment decreased to eu...,net sale paper segment decreased eur mn second...


In [11]:
df_data['sentiment'].value_counts()

sentiment
neutral     2878
positive    1363
negative     604
Name: count, dtype: int64

In [12]:
df_data['sentiment_label']=df_data['sentiment'].map({'neutral':0,'positive':1,'negative':-1})

In [13]:
df_data

Unnamed: 0,sentiment,sentence,clean_sentence,sentiment_label
0,neutral,Technopolis plans to develop in stages an area...,technopolis plan develop stage area less squar...,0
1,negative,The international electronic industry company ...,international electronic industry company elco...,-1
2,positive,With the new production plant the company woul...,new production plant company would increase ca...,1
3,positive,According to the company 's updated strategy f...,according company updated strategy year baswar...,1
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...,financing aspocomp growth aspocomp aggressivel...,1
...,...,...,...,...
4840,negative,LONDON MarketWatch -- Share prices ended lower...,london marketwatch share price ended lower lon...,-1
4841,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...,rinkuskiai beer sale fell per cent million lit...,0
4842,negative,Operating profit fell to EUR 35.4 mn from EUR ...,operating profit fell eur mn eur mn including ...,-1
4843,negative,Net sales of the Paper segment decreased to EU...,net sale paper segment decreased eur mn second...,-1


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_data["clean_sentence"])
y = df_data["sentiment_label"]

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2, random_state=42)

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
model = LogisticRegression()
model.fit(X_train, y_train)


In [19]:
from sklearn.naive_bayes import GaussianNB

In [21]:
X_train=X_train.toarray()

In [22]:
model_gnb = GaussianNB()
model_gnb.fit(X_train, y_train)

In [23]:
from sklearn.metrics import accuracy_score, classification_report

In [24]:
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(report)

Accuracy: 0.73
              precision    recall  f1-score   support

          -1       0.76      0.38      0.51       115
           0       0.71      0.95      0.81       567
           1       0.77      0.43      0.55       287

    accuracy                           0.73       969
   macro avg       0.75      0.58      0.62       969
weighted avg       0.74      0.73      0.70       969



In [26]:
X_test=X_test.toarray()

In [27]:
y_pred = model_gnb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(report)

Accuracy: 0.53
              precision    recall  f1-score   support

          -1       0.32      0.51      0.40       115
           0       0.71      0.59      0.64       567
           1       0.37      0.40      0.38       287

    accuracy                           0.53       969
   macro avg       0.47      0.50      0.47       969
weighted avg       0.56      0.53      0.54       969



In [29]:
from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE().fit_resample(X_train, y_train)

In [30]:
from sklearn.ensemble import RandomForestClassifier

model_rand = RandomForestClassifier(class_weight='balanced')

In [31]:
model_rand.fit(X_resampled, y_resampled)

In [32]:
y_pred = model_rand.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(report)

Accuracy: 0.73
              precision    recall  f1-score   support

          -1       0.72      0.46      0.56       115
           0       0.73      0.92      0.81       567
           1       0.74      0.45      0.56       287

    accuracy                           0.73       969
   macro avg       0.73      0.61      0.64       969
weighted avg       0.73      0.73      0.71       969

