In [4]:
import pandas as pd
import numpy as np

# NLP
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Visualization
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# ML
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [5]:
df = pd.read_csv("/content/Sentiment_Stock_data.csv")
df.head()


Unnamed: 0.1,Unnamed: 0,Sentiment,Sentence
0,0,0,"According to Gran , the company has no plans t..."
1,1,1,"For the last quarter of 2010 , Componenta 's n..."
2,2,1,"In the third quarter of 2010 , net sales incre..."
3,3,1,Operating profit rose to EUR 13.1 mn from EUR ...
4,4,1,"Operating profit totalled EUR 21.1 mn , up fro..."


In [6]:
df.info()
df.isnull().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108751 entries, 0 to 108750
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  108751 non-null  int64 
 1   Sentiment   108751 non-null  int64 
 2   Sentence    108750 non-null  object
dtypes: int64(2), object(1)
memory usage: 2.5+ MB


Unnamed: 0,0
Unnamed: 0,0
Sentiment,0
Sentence,1


In [7]:
df = df.dropna()


In [8]:
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return " ".join(words)

df['clean_text'] = df['Sentence'].apply(clean_text)
df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Unnamed: 0.1,Unnamed: 0,Sentiment,Sentence,clean_text
0,0,0,"According to Gran , the company has no plans t...",according gran company plan move production ru...
1,1,1,"For the last quarter of 2010 , Componenta 's n...",last quarter componenta net sale doubled eur e...
2,2,1,"In the third quarter of 2010 , net sales incre...",third quarter net sale increased eur mn operat...
3,3,1,Operating profit rose to EUR 13.1 mn from EUR ...,operating profit rose eur mn eur mn correspond...
4,4,1,"Operating profit totalled EUR 21.1 mn , up fro...",operating profit totalled eur mn eur mn repres...


In [9]:
le = LabelEncoder()
df['label'] = le.fit_transform(df['Sentiment'])

df[['Sentiment', 'label']].drop_duplicates()


Unnamed: 0,Sentiment,label
0,0,0
1,1,1


In [None]:
all_text = " ".join(df['clean_text'])

wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white'
).generate(all_text)

plt.figure(figsize=(12,5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()


In [None]:
X = df['clean_text']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [None]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=5000,
        ngram_range=(1,2)
    )),
    ('clf', LogisticRegression(max_iter=1000))
])


In [None]:
pipeline.fit(X_train, y_train)


In [None]:
y_pred = pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred, target_names=[str(c) for c in le.classes_]))

In [None]:
def predict_sentiment(text):
    cleaned = clean_text(text)
    pred = pipeline.predict([cleaned])[0]
    return le.inverse_transform([pred])[0]

sample_text = "The company's quarterly profits exceeded expectations"
print("Predicted Sentiment:", predict_sentiment(sample_text))
