In [10]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize , sent_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import matplotlib.pyplot as plt

In [2]:
# Preview NLTK's English stop-word list as a set. This cell is for inspection only:
# clean_and_tokenize builds its own copy of the set each time it is called.
stop_words = {*stopwords.words('english')}
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [3]:
def tokenize_and_count(file_path):
    """
    Loads a UTF-8 text file, splits its contents on whitespace, lowercases every piece, and reports
    the pieces together with how many there are.

    Parameters:
    - file_path: str, the path to the text file

    Returns:
    - tuple: (list of lowercased whitespace-separated words, int number of words)
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()

    # split() with no argument breaks on any run of whitespace, so punctuation stays attached
    lowered = list(map(str.lower, raw_text.split()))

    return lowered, len(lowered)


In [4]:
def clean_and_tokenize(words):
    """
    Removes English stop words and numbers from a list of words, stems what is left, and splits the
    result into sentences.

    Parameters:
    - words: list of str, the words to clean (matched against the stop-word list case-insensitively)

    Returns:
    - list of str: sentences, each a space-joined sequence of Porter-stemmed tokens
    """
    # Define stop words
    stop_words = set(stopwords.words('english'))

    # Remove stop words; the comparison is lowercased but the original token is kept
    kept_words = [word for word in words if word.lower() not in stop_words]

    # Join list into a single string
    cleaned_txt = " ".join(kept_words)

    # Remove bracketed numbers such as "[12]" first, then any remaining digits.
    # The order matters: stripping bare digits first would turn "[12]" into "[]",
    # which the bracket pattern can no longer match, leaving empty brackets behind.
    cleaned_txt = re.sub(r'\[\d+\]', "", cleaned_txt)
    cleaned_txt = re.sub(r'[0-9]', "", cleaned_txt)

    # Tokenize words
    word_token = word_tokenize(cleaned_txt)

    # Apply stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in word_token]

    # Join stemmed words into a single string
    cleaned_txt = " ".join(stemmed_tokens)

    # Tokenize sentences
    return sent_tokenize(cleaned_txt)


In [5]:
from textblob import TextBlob

def analyze_sentiments(sentences):
    """
    Scores every sentence with TextBlob, prints a short report per sentence, and collects the results.

    Parameters:
    - sentences: list of str, the sentences to score

    Returns:
    - tuple: (sentences in input order, polarity score per sentence, label per sentence where
      1 means the score is above zero, 0 means it is exactly zero, and -1 covers everything else)
    """
    texts, scores, labels = [], [], []

    for sentence in sentences:
        score = TextBlob(sentence).sentiment.polarity
        texts.append(sentence)
        scores.append(score)

        # Pick the numeric label and the matching report line from the sign of the score
        if score > 0:
            label, verdict = 1, "Sentiment: Positive"
        elif score == 0:
            label, verdict = 0, "Sentiment: Neutral"
        else:
            label, verdict = -1, "Sentiment: Negative"

        print(f"Sentence: {sentence}")
        print(f"Polarity: {score}")
        print(verdict)
        labels.append(label)

        # A line holding a single tab separates consecutive reports
        print("\t")

    return texts, scores, labels


In [6]:
# Run the whole pipeline on the sample text: read and lowercase the words, clean and stem
# them into sentences, then score each sentence (analyze_sentiments also prints a report
# for every sentence, which is the output shown below this cell).
file_path = 'test_sample.txt'  # relative path, resolved against the current working directory
words, count = tokenize_and_count(file_path)  # count is not read again in the cells shown here
sent_tokens = clean_and_tokenize(words)
sentence_list, polarity_act, polarity_list = analyze_sentiments(sent_tokens)

Sentence: life often present complex tapestri experi , fill triumph challeng .
Polarity: -0.15
Sentiment: Negative
	
Sentence: one hand , sunni day joy moment testament beauti life offer .
Polarity: 0.8
Sentiment: Positive
	
Sentence: instanc , receiv unexpect promot work bring immens satisfact sens achiev .
Polarity: 0.0
Sentiment: Neutral
	
Sentence: similarli , enjoy peac walk park crisp autumn day provid refresh escap daili stress .
Polarity: 0.325
Sentiment: Positive
	
Sentence: moment joy often surround support love one , make even memor .
Polarity: 0.65
Sentiment: Positive
	
Sentence: celebr , birthday anniversari , occas reinforc bond share closest us .
Polarity: 0.0
Sentiment: Neutral
	
Sentence: howev , life alway smooth sail .
Polarity: 0.4
Sentiment: Positive
	
Sentence: sometim , encount setback feel overwhelm .
Polarity: 0.0
Sentiment: Neutral
	
Sentence: exampl , sudden health issu unforeseen financi problem creat signific amount stress uncertainti .
Polarity: 0.0
Sentim

In [7]:
# Collect the per-sentence results into one table: the sentence text, its discrete
# sentiment label, and the raw polarity score the label was derived from.
test_sample = pd.DataFrame(
    data={"Sentence": sentence_list, "Polarity_val": polarity_list, "Polarity_score": polarity_act},
    columns=["Sentence", "Polarity_val", "Polarity_score"],
)

In [8]:
# Save the scored sentences as CSV in the current working directory. No index argument is
# passed, so pandas' default applies and the row index is written as an unnamed first column.
test_sample.to_csv("test_sample.csv")