In [10]:
#Packages

import pandas as pd
import re
import string
from collections import defaultdict
from nltk.corpus import stopwords
import nltk
import io

In [18]:
# Load the English stopword list from the NLTK corpus, downloading the
# corpus first if it is not installed locally.

try:
    stops = set(stopwords.words('english'))
except LookupError:
    # stopwords.words() raises LookupError when the 'stopwords' corpus has
    # not been downloaded yet; fetch it once and retry.
    nltk.download('stopwords')
    stops = set(stopwords.words('english'))
print(stops)
# https://pythonspot.com/nltk-stop-words/

# Input and output filenames
input_filename = "sample_data.csv"
final_filename = "final.csv"
word_freq_output = "word_freq.csv"

# Function to clean and tokenize sentences

def clean_and_tokenize(sentence, stopword_set=None):
    """Lower-case a sentence, drop stopwords and strip ASCII punctuation.

    Args:
        sentence: The text to tokenize (split on whitespace).
        stopword_set: Optional collection of lower-case stopwords to drop.
            Defaults to the module-level ``stops`` set.

    Returns:
        A list of punctuation-free, lower-case tokens that are not stopwords.
    """
    if stopword_set is None:
        stopword_set = stops
    strip_table = str.maketrans('', '', string.punctuation)
    tokens = []
    for raw in sentence.lower().split():
        # Only trim the edges here so contractions such as "don't" keep their
        # apostrophe and can match the stopword list; stripping all
        # punctuation first would turn them into "dont", which never matches.
        trimmed = raw.strip(string.punctuation)
        word = raw.translate(strip_table)
        if not word or trimmed in stopword_set or word in stopword_set:
            continue
        tokens.append(word)
    return tokens

# Step 1: remove header from the input file and create a CSV file without it

def remove_header(input_file, output_file):
    """Copy a CSV file to ``output_file`` without its first (header) row.

    Args:
        input_file: Path of the CSV file whose first row is a header.
        output_file: Path the header-less CSV is written to.
    """
    data = pd.read_csv(input_file, header=None, skiprows=1)
    # header=False is required: by default to_csv writes the integer column
    # labels (0, 1, 2, ...) as a new first row, which would then be read back
    # as data by anyone loading the file with header=None.
    data.to_csv(output_file, index=False, header=False)


# Step 2: extract and clean sentences from file

def extract_sentences(file):
    """Split the posts in column 2 into cleaned, tokenized sentence fragments.

    Each post is split on '?', '.' and ',' and every fragment is passed
    through ``clean_and_tokenize``; fragments with no remaining tokens are
    dropped.

    Args:
        file: Table-like object (e.g. a DataFrame read with ``header=None``)
            whose ``file[2]`` yields the post texts.

    Returns:
        A list of token lists, one per non-empty fragment.
    """
    sentences_clean = []
    for post in file[2]:
        # Empty CSV cells are loaded by pandas as NaN (a float), which
        # re.split cannot handle; skip them instead of crashing.
        if not isinstance(post, str):
            if pd.isna(post):
                continue
            post = str(post)
        for sentence in re.split(r'[?.,]', post):
            cleaned_tokens = clean_and_tokenize(sentence)
            if cleaned_tokens:
                sentences_clean.append(cleaned_tokens)
    return sentences_clean

# Step 3: calculate word frequencies:

def calculate_word_frequencies(sentences):
    """Return the relative frequency of every word across all sentences.

    Args:
        sentences: Iterable of token lists.

    Returns:
        A dict mapping each word (in order of first appearance) to its count
        divided by the total number of tokens; empty if there are no tokens.
    """
    counts = defaultdict(int)
    for tokens in sentences:
        for token in tokens:
            counts[token] += 1
    # The grand total equals the sum of the per-word tallies.
    total = sum(counts.values())
    return {token: count / total for token, count in counts.items()}

# Step 4: write word frequencies to CSV

def write_word_frequencies(word_freq, output_file):
    """Write word frequencies to a CSV file, most frequent word first.

    Args:
        word_freq: Mapping of word -> frequency.
        output_file: Path of the CSV file to write; it gets the columns
            "Word" and "Frequency".
    """
    word_freq_df = pd.DataFrame(list(word_freq.items()), columns=["Word", "Frequency"])
    word_freq_df = word_freq_df.sort_values(by="Frequency", ascending=False)
    word_freq_df.to_csv(output_file, index=False)
    # Report the destination path, not the frequency dict itself.
    print(f"Word frequencies written to {output_file}")

# Script entry point: run the four pipeline steps in order.
if __name__ == "__main__":
    # Step 1: write a copy of the input CSV without its header row.
    remove_header(input_filename, final_filename)
    # Re-load the intermediate file; header=None gives integer column labels,
    # which extract_sentences relies on when it reads column 2.
    file = pd.read_csv(final_filename, header = None)
    # Step 2: split the posts into cleaned, tokenized sentence fragments.
    sentences = extract_sentences(file)
    # Step 3: compute each word's share of all remaining tokens.
    word_freq = calculate_word_frequencies(sentences)
    # Step 4: save the frequencies to word_freq_output.
    write_word_frequencies(word_freq, word_freq_output)



{'ourselves', 'their', 'own', 'again', 'further', 'or', 'which', 'so', "mustn't", 'while', 'very', 'them', 'shouldn', "isn't", 'down', 'won', 'our', 'any', 'here', 'until', 'no', 'didn', 'd', 'be', 're', 'into', 'your', 'o', "hadn't", 'him', "couldn't", 'wouldn', 'myself', "don't", 'before', 'an', 't', 'when', 'they', 'where', 'is', 'herself', "aren't", 'theirs', "shouldn't", "you've", 'you', 'its', 'doesn', 'on', 'up', 'about', 'to', 'hers', "won't", "weren't", "doesn't", "shan't", 'during', 'from', 'll', 'with', 'for', 'this', 'what', 'did', 'most', "you're", 've', 'my', 'himself', 'more', 'mustn', 'he', 'because', 's', "it's", 'have', 'were', "should've", 'there', 'yourself', 'hadn', 'than', 'below', 'doing', 'by', 'been', 'whom', 'nor', 'ma', 'same', 'hasn', 'itself', 'was', "that'll", 'ours', 'mightn', 'ain', 'but', 'only', 'we', "mightn't", 'yourselves', 'such', 'yours', 'these', 'those', 'who', 'just', 'a', 'wasn', 'it', 'out', 'don', 'off', 'that', 'then', 'other', 'under', 'we