In [1]:
# Time utilities
import time

# Data manipulation and file handling
import pandas as pd
import numpy as np
import csv
import shutil
from tempfile import NamedTemporaryFile
import io

# Natural language processing (NLP)
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer
from collections import Counter, defaultdict
import re
import string

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning and text processing
from sklearn import manifold
from sklearn.feature_extraction.text import CountVectorizer

# Additional utilities
import itertools
from itertools import combinations
import decimal
import operator


  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


**Part B**:

I took the below directly from our first assignment, removed stopwords. 

In [2]:
# raw word frequencies
# Download stopwords from the NLTK package
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Input and output filenames
input_filename = 'data/beer_reviews.csv'  # Input file
final_filename = 'data/femke_final.csv'  # Intermediate file without the column header
word_freq_output = 'data/femke_word_freq.csv'  # Output file for word frequencies

# Function to clean and tokenize sentences
def clean_and_tokenize(sentence):
    """
    Cleans a given sentence by removing punctuation, converting text to lowercase,
    and tokenizing the remaining words.
    """
    # Remove punctuation and convert text to lowercase
    sentence = re.sub(f'[{re.escape(string.punctuation)}]', '', sentence.lower())

    # Tokenize and remove stopwords
    return [word for word in sentence.split()]

# Step 1: Remove header from the input CSV and create a new file without it
def remove_header(input_file, output_file):
    """
    Reads the input CSV file, removes the header, and writes the remaining rows
    into a new output file.
    """
    file = pd.read_csv(input_file, header = None, skiprows = 1)
    file.to_csv(output_file, index = False)

# Step 2: Extract and clean sentences from the text
def extract_sentences(file):
    """
    Extracts text data from the third column of the CSV file, splits it into sentences,
    and cleans each sentence by removing punctuation and stopwords.
    """
    df = pd.read_csv(file, header = None)
    posts = df.iloc[:,2]
    sentences = []
    sentences_clean = []
    for post in posts:
        sentences.extend(re.split('[?.!]', post))
    for sentence in sentences:
        cleaned_tokens = clean_and_tokenize(sentence)
        if cleaned_tokens:
            sentences_clean.append(cleaned_tokens)
    return sentences_clean


# Step 3: Calculate word frequencies
def calculate_word_frequencies(sentences):
    """
    Calculates the frequency of each word in the given list of cleaned sentences.
    """
    freqs = {}
    total_words = 0
    for sentence in sentences:
        for word in sentence:
            if word not in stop_words:
                freqs[word] = freqs.get(word, 0) + 1
                total_words += 1
    return freqs

# Step 4: Write word frequencies to CSV
def write_word_frequencies(word_freq, output_file):
    """
    Writes the word frequencies to the specified CSV file.
    """
    word_freq_df = pd.DataFrame(word_freq.items(), columns = ["Word", "Frequency"])
    word_freq_df = word_freq_df.sort_values(by = "Frequency", ascending = False)
    word_freq_df.to_csv(output_file, index=False)
    print(f"Word frequencies written to {output_file}")

# Main function to run all steps
def main():
    # Step 1: Remove header
    remove_header(input_filename, final_filename)
    
    # Step 2: Extract and clean sentences
    sentences = extract_sentences(final_filename)
    
    # Step 3: Calculate word frequencies
    word_freq = calculate_word_frequencies(sentences)
    
    # Step 4: Write word frequencies to CSV
    write_word_frequencies(word_freq, word_freq_output)

if __name__ == "__main__":
    main()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/fmunting/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Word frequencies written to data/femke_word_freq.csv


In [4]:
# Lift Calculation

# Initialize global variables and data structures
df_lift = pd.DataFrame(columns=['word1', 'word2', 'lift_value'])  # To store lift values
word_frequency = {}  # Dictionary to store word frequency in posts
word_pair_frequency = defaultdict(dict)  # Dictionary to store word pair co-occurrence frequency
results_dict = {}  # Dictionary to store results with lift values for word pairs
file_length = 0  # Number of rows in the input file
itr = 0  # Row iterator for the lift DataFrame

# File paths
input_file = 'data/femke_final.csv'  # Input data file
pair_keys_file = 'femke_beer_attributes.txt'  # File containing the words to calculate lift
output_lift_values = 'data/femke_Lift_Values.csv'  # Output file for lift values
output_lift_matrix = 'data/femke_Lift_Matrix.csv'  # Output file for lift matrix

# Load NLTK stopwords
stop_words = set(stopwords.words('english'))

# Function to clean and tokenize text (removes punctuation and stopwords)
def clean_text(text):
    """
    Cleans a given text by removing punctuation, converting it to lowercase,
    and tokenizing it, ignoring any stopwords.
    """
    # Convert to lowercase
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize text
    tokens = word_tokenize(text)

    # Remove stopwords
    cleaned_tokens = [word for word in tokens if word not in stop_words]

    return cleaned_tokens

# Step 1: Load the words from the edmunds_pair_keys.txt file and generate all pairs
def load_word_pairs(filename):
    """
    Loads words from a file where words are comma-separated in each row.
    Returns a list of all possible word pairs for each row.
    """
    word_pairs = []
    with open(filename, 'r') as file:
        reader = csv.reader(file)
        for row in reader:
            # Generate all possible word pairs from each row
            pairs = list(combinations(row, 2))
            word_pairs.extend(pairs)

    return word_pairs

# Step 2: Process the input CSV file to extract posts and clean the text
def process_input_file(input_filename):
    """
    Processes the input CSV file to extract and clean posts. Each post is tokenized,
    cleaned of punctuation and stopwords, and stored in a list.
    """
    posts = []
    global file_length
    df = pd.read_csv(input_filename)  # Load the CSV file into a DataFrame

    # Assuming 'comments' is the column that contains the text
    for index, row in df.iterrows():
        cleaned_post = clean_text(row[2])  # Clean and tokenize the post
        posts.append(cleaned_post)

    file_length = len(df)  # Get the total number of rows
    return posts

# Step 3: Calculate word frequencies and word pair co-occurrences (distance <= 7 words)
# Step 3: Calculate word frequencies and word pair co-occurrences (distance <= 7 words)
def calculate_frequencies(posts):
    """
    Calculates the frequency of individual words and word pairs within the posts.
    Updates the global word_frequency and word_pair_frequency dictionaries.
    Only considers word pairs that are 7 or more words apart.
    """
    global word_frequency, word_pair_frequency

    for post in posts:
        word_positions = {}  # Dictionary to track positions of each word

        # Track word positions
        for idx, word in enumerate(post):
            if word not in word_positions:
                word_positions[word] = []
            word_positions[word].append(idx)

        # Count word frequencies
        unique_words = set(post)  # Track unique words in the post to avoid double counting
        for word in unique_words:
            word_frequency[word] = word_frequency.get(word, 0) + 1

        # Track word pairs that have already been marked as co-occurring in this post
        seen_pairs = set()
        # Count word pair co-occurrences with distance check
        for word1 in word_positions:
            for word2 in word_positions:
                if word1 != word2 and (word1, word2) not in seen_pairs:
                    # Check if the words are 7 or fewer positions apart
                    found_pair = False
                    for pos1 in word_positions[word1]:
                        for pos2 in word_positions[word2]:
                            if abs(pos1 - pos2) <= 7:
                                word_pair_frequency[word1][word2] = word_pair_frequency.get(word1, {}).get(word2, 0) + 1
                                seen_pairs.add((word1, word2))  # Mark this pair as seen
                                found_pair = True
                                break  # No need to check more positions; move to the next pair
                        if found_pair:
                            break  # Stop after finding one valid pair in this post
                            
# Step 4: Calculate the lift between word pairs
def calculate_lift(word_pairs):
    """
    Calculates the lift between word pairs using the formula:
    Lift(word1, word2) = P(word1 AND word2) / (P(word1) * P(word2))
    Lift is written to the lift values CSV and stored in a DataFrame for further processing.
    """
    global itr
    
    for word1, word2 in word_pairs:
        # Get the frequency of word1, word2, and their co-occurrence
        freq_word1 = word_frequency.get(word1, 0)
        freq_word2 = word_frequency.get(word2, 0)
        co_occurrence = word_pair_frequency.get(word1, {}).get(word2, 0)

        # Calculate probabilities
        p_word1 = freq_word1 / file_length if freq_word1 else 0
        p_word2 = freq_word2 / file_length if freq_word2 else 0
        p_word1_and_word2 = co_occurrence / file_length if co_occurrence else 0

        # Avoid division by zero
        if p_word1 > 0 and p_word2 > 0:
            lift_value = p_word1_and_word2 / (p_word1 * p_word2) if (p_word1 * p_word2) > 0 else 0
        else:
            lift_value = 0
        # Store lift value in DataFrame
        df_lift.loc[itr] = [word1, word2, lift_value]
        itr += 1
    return df_lift

# Step 5: Write lift values and matrix to CSV
def save_results(df_lift):
    """
    Writes the calculated lift values to a CSV file and also generates a lift matrix,
    saving it to another CSV.
    """
    # Save lift values DataFrame to CSV
    # must create duplicate word pairs to create 10x10 matrix
    df_lift2 = pd.DataFrame({'word1':df_lift.word2, 'word2':df_lift.word1, 'lift_value':df_lift.lift_value})
    df_lift = pd.concat([df_lift, df_lift2], ignore_index=True)
    df_lift.to_csv(output_lift_values, index=False)

    # Generate lift matrix
    lift_matrix = pd.pivot_table(df_lift, values='lift_value', index='word1', columns='word2', fill_value=0)
    lift_matrix.index.name = ''
    lift_matrix.to_csv(output_lift_matrix)

# Main function to run all steps
def main():
    # Step 1: Load word pairs
    word_pairs = load_word_pairs(pair_keys_file)

    # Step 2: Process the input file to extract posts
    posts = process_input_file(input_file)

    # Step 3: Calculate frequencies
    calculate_frequencies(posts)

    # Step 4: Calculate lift values
    df_lift = calculate_lift(word_pairs)

    # Step 5: Save results
    save_results(df_lift)

# Run the script
if __name__ == "__main__":
    main()

  cleaned_post = clean_text(row[2])  # Clean and tokenize the post
