In [15]:
import pandas as pd 
import os
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [14]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zeynmehezmacbook/nltk_data...


True

In [2]:
nltk.data.path.append(os.path.expanduser("~/nltk_data"))
def download_nltk_resources():
    resources = ["punkt", "stopwords", "wordnet"]
    for resource in resources:
        try:
            nltk.data.find(f"tokenizers/{resource}" if resource == "punkt" else f"corpora/{resource}")
        except LookupError:
            nltk.download(resource, download_dir=os.path.expanduser("~/nltk_data"))


download_nltk_resources()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zeynmehezmacbook/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
root_dpath='/Users/zeynmehezmacbook/Desktop/AladinDocs/Data-Hub'

In [20]:
df_data=pd.read_csv(os.path.join(root_dpath,'nlp-data','financial-review.csv'),on_bad_lines='skip',encoding='ISO-8859-1')

In [21]:
df_data

Unnamed: 0,neutral,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing ."
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...
...,...,...
4840,negative,LONDON MarketWatch -- Share prices ended lower...
4841,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4842,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4843,negative,Net sales of the Paper segment decreased to EU...


In [22]:
df_data.columns=['sentiment','sentence']

In [23]:
df_data

Unnamed: 0,sentiment,sentence
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...
...,...,...
4840,negative,LONDON MarketWatch -- Share prices ended lower...
4841,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4842,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4843,negative,Net sales of the Paper segment decreased to EU...


In [27]:
def data_cleaner(data):
    data = data.lower()
    data = re.sub(r'\d','',data)
    data = re.sub(r'[^\w\s]','',data)
    return data
    

In [28]:
df_data['clean_sentence']=df_data['sentence'].apply(data_cleaner)

In [29]:
df_data

Unnamed: 0,sentiment,sentence,clean_sentence
0,neutral,Technopolis plans to develop in stages an area...,technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...,the international electronic industry company ...
2,positive,With the new production plant the company woul...,with the new production plant the company woul...
3,positive,According to the company 's updated strategy f...,according to the company s updated strategy fo...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...,financing of aspocomp s growth aspocomp is agg...
...,...,...,...
4840,negative,LONDON MarketWatch -- Share prices ended lower...,london marketwatch share prices ended lower i...
4841,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...,rinkuskiai s beer sales fell by per cent to ...
4842,negative,Operating profit fell to EUR 35.4 mn from EUR ...,operating profit fell to eur mn from eur mn ...
4843,negative,Net sales of the Paper segment decreased to EU...,net sales of the paper segment decreased to eu...


In [30]:
def preprocess_text(text):

    tokens = word_tokenize(text.lower())
    words = [word for word in tokens if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return words

In [31]:
df_data['clean_sentence2']=df_data['clean_sentence'].apply(preprocess_text)

In [32]:
df_data

Unnamed: 0,sentiment,sentence,clean_sentence,clean_sentence2
0,neutral,Technopolis plans to develop in stages an area...,technopolis plans to develop in stages an area...,"[technopolis, plan, develop, stage, area, less..."
1,negative,The international electronic industry company ...,the international electronic industry company ...,"[international, electronic, industry, company,..."
2,positive,With the new production plant the company woul...,with the new production plant the company woul...,"[new, production, plant, company, would, incre..."
3,positive,According to the company 's updated strategy f...,according to the company s updated strategy fo...,"[according, company, updated, strategy, year, ..."
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...,financing of aspocomp s growth aspocomp is agg...,"[financing, aspocomp, growth, aspocomp, aggres..."
...,...,...,...,...
4840,negative,LONDON MarketWatch -- Share prices ended lower...,london marketwatch share prices ended lower i...,"[london, marketwatch, share, price, ended, low..."
4841,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...,rinkuskiai s beer sales fell by per cent to ...,"[rinkuskiai, beer, sale, fell, per, cent, mill..."
4842,negative,Operating profit fell to EUR 35.4 mn from EUR ...,operating profit fell to eur mn from eur mn ...,"[operating, profit, fell, eur, mn, eur, mn, in..."
4843,negative,Net sales of the Paper segment decreased to EU...,net sales of the paper segment decreased to eu...,"[net, sale, paper, segment, decreased, eur, mn..."


In [42]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')

def analyze_sentiment(text):
    sentiment_dict={'neu':'neutral','pos':'positive','neg':'negative','compound':'compound'}
    text=" ".join(text)
    sid = SentimentIntensityAnalyzer()
    sentiment_scores = sid.polarity_scores(text)
    max_val=max(sentiment_scores.values())
    res = [key for key in sentiment_scores if sentiment_scores[key] == max_val] 
    return sentiment_dict[res[0]]


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/zeynmehezmacbook/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [43]:
df_data['sentiment_score']=df_data['clean_sentence2'].apply(analyze_sentiment)

In [44]:
df_data

Unnamed: 0,sentiment,sentence,clean_sentence,clean_sentence2,sentiment_score
0,neutral,Technopolis plans to develop in stages an area...,technopolis plans to develop in stages an area...,"[technopolis, plan, develop, stage, area, less...",neutral
1,negative,The international electronic industry company ...,the international electronic industry company ...,"[international, electronic, industry, company,...",neutral
2,positive,With the new production plant the company woul...,with the new production plant the company woul...,"[new, production, plant, company, would, incre...",compound
3,positive,According to the company 's updated strategy f...,according to the company s updated strategy fo...,"[according, company, updated, strategy, year, ...",neutral
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...,financing of aspocomp s growth aspocomp is agg...,"[financing, aspocomp, growth, aspocomp, aggres...",neutral
...,...,...,...,...,...
4840,negative,LONDON MarketWatch -- Share prices ended lower...,london marketwatch share prices ended lower i...,"[london, marketwatch, share, price, ended, low...",neutral
4841,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...,rinkuskiai s beer sales fell by per cent to ...,"[rinkuskiai, beer, sale, fell, per, cent, mill...",neutral
4842,negative,Operating profit fell to EUR 35.4 mn from EUR ...,operating profit fell to eur mn from eur mn ...,"[operating, profit, fell, eur, mn, eur, mn, in...",compound
4843,negative,Net sales of the Paper segment decreased to EU...,net sales of the paper segment decreased to eu...,"[net, sale, paper, segment, decreased, eur, mn...",neutral
