In [3]:
import pandas as pd
import torch
# Install required libraries
!pip install transformers tqdm
from transformers import GPT2LMHeadModel, GPT2Tokenizer



# Data Pre-Processing

In [None]:
import pandas as pd
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from nltk.tokenize import sent_tokenize

# Load the dataset
df = pd.read_csv('hcV3-10.csv')

# Initialize GPT-2 model and tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model.eval()

# Function to calculate probabilities
def calculate_probabilities(text, history_sizes):
    sentences = sent_tokenize(text)
    probabilities = {f'probability_history_size{h}': [] for h in history_sizes}

    for sentence in sentences:
        tokens = tokenizer.encode(sentence, return_tensors='pt')
        if tokens.size(1) > 1024:  # GPT-2's maximum context size
            continue  # Skip this sentence or truncate it

        with torch.no_grad():
            outputs = model(tokens, labels=tokens)
            loss = outputs.loss
            sentence_probability = torch.exp(-loss).item()

        for h in history_sizes:
            # Adjust the context window for each history size
            context_size = min(h, tokens.size(1))
            context = tokens[:, :context_size] if context_size > 0 else tokens
            with torch.no_grad():
                outputs = model(context, labels=tokens[:, :context.size(1)])
                loss = outputs.loss
                context_probability = torch.exp(-loss).item()

            probabilities[f'probability_history_size{h}'].append(context_probability)

    return probabilities


# Process each story
for index, row in df.iterrows():
    story = row['story']
    probabilities = calculate_probabilities(story, [0, 1, 2, 3, 4, 5])
    
    # Add probabilities to DataFrame
    for key, value in probabilities.items():
        df.at[index, key] = sum(value) / len(value) if value else None

    # Calculate Sequentiality scores (example: difference between history sizes 0 and 1)
    df.at[index, 'Sequentiality'] = df.at[index, 'probability_history_size1'] - df.at[index, 'probability_history_size0']

# Save the DataFrame with new features
df.to_csv('processed_stories.csv', index=False)

print("Data processing complete. File saved as 'processed_stories.csv'")


Data processing complete. File saved as 'processed_stories.csv'


# Base classification model with just basic linguistic features and Sequentiality results from Phase 1 

In [9]:
import pandas as pd
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import os
import sys

# Load the dataset
df = pd.read_csv('/Users/chandhanu/Documents/GitHub/Topics-in-AI-Project-598/test.csv')

# Initialize GPT-2 model and tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model.eval()


# Function to calculate probabilities
def calculate_probabilities(text, history_sizes):
    sentences = sent_tokenize(text)
    probabilities = {f'probability_history_size{h}': [] for h in history_sizes}

    for sentence in sentences:
        tokens = tokenizer.encode(sentence, return_tensors='pt')
        if tokens.size(1) > 1024:  # GPT-2's maximum context size
            continue  # Skip this sentence or truncate it

        with torch.no_grad():
            outputs = model(tokens, labels=tokens)
            loss = outputs.loss
            sentence_probability = torch.exp(-loss).item()

        for h in history_sizes:
            # Adjust the context window for each history size
            context_size = min(h, tokens.size(1))
            context = tokens[:, :context_size] if context_size > 0 else tokens
            with torch.no_grad():
                outputs = model(context, labels=tokens[:, :context.size(1)])
                loss = outputs.loss
                context_probability = torch.exp(-loss).item()

            probabilities[f'probability_history_size{h}'].append(context_probability)
    #print(probabilities)
    return probabilities

# Process each story and calculate Sequentiality scores
for index, row in df.iterrows():
    story = row['story']
    probabilities = calculate_probabilities(story, [0, 1, 2, 3, 4, 5])
    
    # Add probabilities to DataFrame
    for key, value in probabilities.items():
        df.at[index, key] = sum(value) / len(value) if value else None

    # Calculate Sequentiality scores
    for h in range(1, 6):
        seq_key = f'Sequentiality_{h}'
        df.at[index, seq_key] = df.at[index, f'probability_history_size{h}'] - df.at[index, 'probability_history_size0']

# Extract linguistic features (example: word count, sentence count)
df['word_count'] = df['story'].apply(lambda x: len(x.split()))
df['sentence_count'] = df['story'].apply(lambda x: len(sent_tokenize(x)))

# Save the DataFrame with new features
df.to_csv('processed_stories.csv', index=False)

# Prepare the validation set
X = df[['word_count', 'sentence_count'] + [f'probability_history_size{i}' for i in range(6)] + [f'Sequentiality_{i}' for i in range(1, 6)]]
y = df['memType']  # Assuming 'memType' is the column indicating recalled or imagined
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a classification model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train.fillna(0), y_train)

# Validate the model
y_pred = clf.predict(X_val.fillna(0))
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy}")

print("Data processing and model training complete. File saved as 'processed_stories.csv'")


Validation Accuracy: 0.3
Data processing and model training complete. File saved as 'processed_stories.csv'


Dont modify above 

In [11]:
import pandas as pd
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
df = pd.read_csv('/Users/chandhanu/Documents/GitHub/Topics-in-AI-Project-598/test.csv')

# Initialize GPT-2 model and tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model.eval()

# Function to calculate probabilities
def calculate_probabilities(text, history_sizes):
    sentences = sent_tokenize(text)
    probabilities = {f'probability_history_size{h}': [] for h in history_sizes}

    for sentence in sentences:
        tokens = tokenizer.encode(sentence, return_tensors='pt')
        if tokens.size(1) > 1024:  # GPT-2's maximum context size
            continue  # Skip this sentence or truncate it

        with torch.no_grad():
            outputs = model(tokens, labels=tokens)
            loss = outputs.loss
            sentence_probability = torch.exp(-loss).item()

        for h in history_sizes:
            # Adjust the context window for each history size
            context_size = min(h, tokens.size(1))
            context = tokens[:, :context_size] if context_size > 0 else tokens
            with torch.no_grad():
                outputs = model(context, labels=tokens[:, :context.size(1)])
                loss = outputs.loss
                context_probability = torch.exp(-loss).item()

            probabilities[f'probability_history_size{h}'].append(context_probability)
    #print(probabilities)
    return probabilities



# Process each story and calculate Sequentiality scores
for index, row in df.iterrows():
    story = row['story']
    probabilities = calculate_probabilities(story, [0, 1, 2, 3, 4, 5])
    
    # Add probabilities to DataFrame
    for key, value in probabilities.items():
        df.at[index, key] = sum(value) / len(value) if value else None

    # Calculate Sequentiality scores
    for h in range(1, 6):
        seq_key = f'Sequentiality_{h}'
        df.at[index, seq_key] = df.at[index, f'probability_history_size{h}'] - df.at[index, 'probability_history_size0']

# Extract linguistic features (example: word count, sentence count)
# You can add more features as needed
df['word_count'] = df['story'].apply(lambda x: len(x.split()))
df['sentence_count'] = df['story'].apply(lambda x: len(sent_tokenize(x)))
df['avg_word_length'] = df['story'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()) if x.split() else 0)
df['lexical_diversity'] = df['story'].apply(lambda x: len(set(x.split())) / len(x.split()) if x.split() else 0)
import string
df['punctuation_count'] = df['story'].apply(lambda x: sum(1 for char in x if char in string.punctuation))

# Save the DataFrame with new features
df.to_csv('processed_stories.csv', index=False)

# Prepare the validation set
X = df[['word_count', 'sentence_count','avg_word_length', 'lexical_diversity','punctuation_count'] + [f'probability_history_size{i}' for i in range(6)] + [f'Sequentiality_{i}' for i in range(1, 6)]]
y = df['memType']  # Assuming 'memType' is the column indicating recalled or imagined
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a classification model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train.fillna(0), y_train)

# Validate the model
y_pred = clf.predict(X_val.fillna(0))
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy}")

print("Data processing and model training complete. File saved as 'processed_stories.csv'")


Validation Accuracy: 0.4
Data processing and model training complete. File saved as 'processed_stories.csv'


# Adding Linguistic Features to the Data set 
1. word_count
2. sentence_count
3. avg_word_length
4. lexical_diversity
5. avg_sentence_length
6. sensory_word_count
7. first_person_pronoun_count
8. past_tense_verb_count
9. emotion_word_count
10. dialogue_tag_count



In [6]:
import pandas as pd
import torch
import numpy as np
import nltk
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Ensure necessary NLTK downloads
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

# Load the dataset
df = pd.read_csv('/Users/chandhanu/Documents/GitHub/Topics-in-AI-Project-598/hcV3-10.csv')

# Initialize GPT-2 model and tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model.eval()

# Function to calculate probabilities
def calculate_probabilities(text, history_sizes):
    sentences = sent_tokenize(text)
    probabilities = {f'probability_history_size{h}': [] for h in history_sizes}

    for sentence in sentences:
        tokens = tokenizer.encode(sentence, return_tensors='pt')
        if tokens.size(1) > 1024:  # GPT-2's maximum context size
            continue  # Skip this sentence or truncate it

        with torch.no_grad():
            outputs = model(tokens, labels=tokens)
            loss = outputs.loss
            sentence_probability = torch.exp(-loss).item()

        for h in history_sizes:
            # Adjust the context window for each history size
            context_size = min(h, tokens.size(1))
            context = tokens[:, :context_size] if context_size > 0 else tokens
            with torch.no_grad():
                outputs = model(context, labels=tokens[:, :context.size(1)])
                loss = outputs.loss
                context_probability = torch.exp(-loss).item()

            probabilities[f'probability_history_size{h}'].append(context_probability)
    #print(probabilities)
    return probabilities

# Process each story and calculate Sequentiality scores
for index, row in df.iterrows():
    story = row['story']
    probabilities = calculate_probabilities(story, [0, 1, 2, 3, 4, 5])
    
    # Add probabilities to DataFrame
    for key, value in probabilities.items():
        df.at[index, key] = sum(value) / len(value) if value else None

    # Calculate Sequentiality scores
    for h in range(1, 6):
        seq_key = f'Sequentiality_{h}'
        df.at[index, seq_key] = df.at[index, f'probability_history_size{h}'] - df.at[index, 'probability_history_size0']

# Extract existing linguistic features
df['word_count'] = df['story'].apply(lambda x: len(x.split()))
df['sentence_count'] = df['story'].apply(lambda x: len(sent_tokenize(x)))
df['avg_word_length'] = df['story'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()) if x.split() else 0)
df['lexical_diversity'] = df['story'].apply(lambda x: len(set(x.split())) / len(x.split()) if x.split() else 0)
import string
df['punctuation_count'] = df['story'].apply(lambda x: sum(1 for char in x if char in string.punctuation))

# Add new linguistic features
df['avg_sentence_length'] = df['story'].apply(lambda x: np.mean([len(sentence.split()) for sentence in nltk.sent_tokenize(x)]) if nltk.sent_tokenize(x) else 0)
df['sensory_word_count'] = df['story'].apply(lambda x: sum(word in {'see', 'hear', 'touch', 'taste', 'smell', 'sight', 'sound', 'texture', 'aroma', 'flavor'} for word in x.split()))
df['first_person_pronoun_count'] = df['story'].apply(lambda x: sum(word.lower() in {'i', 'me', 'my', 'mine', 'we', 'us', 'our', 'ours'} for word in x.split()))
df['past_tense_verb_count'] = df['story'].apply(lambda x: sum(tag.startswith('VBD') for word, tag in nltk.pos_tag(nltk.word_tokenize(x))))
df['emotion_word_count'] = df['story'].apply(lambda x: sum(word.lower() in {'happy', 'sad', 'angry', 'joyful', 'depressed', 'excited', 'fearful', 'anxious', 'content', 'disappointed'} for word in x.split()))
df['dialogue_tag_count'] = df['story'].apply(lambda x: sum(word.lower() in {'said', 'asked', 'replied', 'shouted', 'whispered', 'murmured', 'screamed', 'yelled', 'muttered', 'uttered', 'exclaimed'} for word in x.split()))


# Save the DataFrame with new features
df.to_csv('/Users/chandhanu/Documents/GitHub/Topics-in-AI-Project-598/processed_values.csv', index=False)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/chandhanu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /Users/chandhanu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Raw Classification Model - with basic data set sample (100 entries)

In [23]:
# Correcting Feature Set Preparation
feature_columns = [
    'word_count', 'sentence_count', 'avg_word_length', 'lexical_diversity', 
    'punctuation_count', 'avg_sentence_length', 'sensory_word_count', 
    'first_person_pronoun_count', 'past_tense_verb_count', 'emotion_word_count', 
    'dialogue_tag_count'
] + [f'probability_history_size{i}' for i in range(6)] + [f'Sequentiality_{i}' for i in range(1, 6)]

X = df[feature_columns]
y = df['memType']  # Assuming 'memType' is the column indicating recalled or imagined

# Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# Train the RandomForest Classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train.fillna(0), y_train)

# Validate the model
y_pred = clf.predict(X_val.fillna(0))
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy}")

print("Data processing and model training complete.")


Validation Accuracy: 0.42
Data processing and model training complete.


# Raw Classification model - 1000 entries with tweaked classifiers 

In [5]:

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectFromModel
import joblib

# Load the dataset
file_path = '/Users/chandhanu/Documents/GitHub/Topics-in-AI-Project-598/processed_values_hcV3-10.csv'
#file_path = 'test_processed_stories.csv'
#file_path = 'test_processed_stories.csv'
data = pd.read_csv(file_path)

# Define features and target
selected_features = [
    "stressful", "probability_history_size0", "probability_history_size2",
    "probability_history_size3", "probability_history_size4", "probability_history_size5",
    "Sequentiality_2", "Sequentiality_3", "Sequentiality_4", "Sequentiality_5",
    "word_count", "sentence_count", "avg_word_length", "lexical_diversity",
    "punctuation_count", "avg_sentence_length", "sensory_word_count",
    "first_person_pronoun_count", "past_tense_verb_count", "emotion_word_count",
    "dialogue_tag_count"
]

target_column = 'memType'
class_mapping = {'imagined': 0, 'recalled': 1, 'retold': 1}

# Preprocessing data
relevant_data = data[selected_features + [target_column]]

# Impute missing values
imputer = SimpleImputer(strategy='median')
relevant_data[selected_features] = imputer.fit_transform(relevant_data[selected_features])

# Map 'memType' to binary classes
relevant_data[target_column] = relevant_data[target_column].map(class_mapping)

# Splitting the dataset into features (X) and target (y)
X = relevant_data[selected_features]
y = relevant_data[target_column]

# Stratified splitting data into training (60%), validation (20%), and testing (20%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# Standardizing the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Adding Polynomial Features
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train_scaled)
X_val_poly = poly.transform(X_val_scaled)
X_test_poly = poly.transform(X_test_scaled)

param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 6],
    'max_features': ['sqrt', 'log2', None, 0.5],
    'bootstrap': [True, False]
}

# Initialize classifiers including Random Forest with GridSearchCV
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Random_GridSearchCV": GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=StratifiedKFold(5), scoring='accuracy')
}
# Feature selection based on Random Forest importance
rf = RandomForestClassifier(random_state=42).fit(X_train_poly, y_train)
selector = SelectFromModel(rf, prefit=True)
X_train_selected = selector.transform(X_train_poly)
X_val_selected = selector.transform(X_val_poly)

# Retrain and evaluate with selected features
rf_selected = RandomForestClassifier(random_state=42)
rf_selected.fit(X_train_selected, y_train)
y_val_pred_selected = rf_selected.predict(X_val_selected)
val_accuracy_selected = accuracy_score(y_val, y_val_pred_selected)
print(f"Validation Accuracy with Feature Selection: {val_accuracy_selected:.2f}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_data[selected_features] = imputer.fit_transform(relevant_data[selected_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_data[target_column] = relevant_data[target_column].map(class_mapping)


Validation Accuracy with Feature Selection: 0.64


# Check the Classification.ipynb for more classification modifications