# Deriving more Linguistic features and adding to the pre-processed data set

In [None]:
import pandas as pd
import torch
import numpy as np
import nltk
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Fetch the NLTK resources used further down: the perceptron POS tagger
# (for nltk.pos_tag) and the Punkt models (for sentence tokenization).
for _nltk_resource in ('averaged_perceptron_tagger', 'punkt'):
    nltk.download(_nltk_resource)

# Read the raw story dataset into a DataFrame
df = pd.read_csv(
    '/Users/chandhanu/Documents/GitHub/Topics-in-AI-Project-598/hcV3-10.csv'
)

# Load the pretrained GPT-2 tokenizer and language model, then switch the
# model to evaluation mode since it is only used for scoring here.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()

# Function to calculate probabilities
def calculate_probabilities(text, history_sizes):
    """Score every sentence of *text* with GPT-2 at several context sizes.

    The text is split into sentences; each sentence is tokenized and, for
    every ``h`` in *history_sizes*, the first ``min(h, n_tokens)`` tokens are
    fed to the model with themselves as labels. ``h == 0`` means "use the
    whole sentence". The recorded score is ``exp(-loss)`` of the loss the
    model returns for that input.

    Args:
        text: The story text to score.
        history_sizes: Iterable of non-negative token counts.

    Returns:
        A dict mapping ``'probability_history_size{h}'`` to a list with one
        score per scored sentence. Sentences that tokenize to nothing or to
        more than 1024 tokens are skipped, so lists may be shorter than the
        number of sentences (or empty).
    """
    sentences = sent_tokenize(text)
    probabilities = {f'probability_history_size{h}': [] for h in history_sizes}

    for sentence in sentences:
        tokens = tokenizer.encode(sentence, return_tensors='pt')
        n_tokens = tokens.size(1)
        # Nothing to score for an empty encoding; anything above 1024 tokens
        # exceeds GPT-2's maximum context size.
        if n_tokens == 0 or n_tokens > 1024:
            continue

        # Inference only, so no gradients are needed for any of the passes.
        with torch.no_grad():
            for h in history_sizes:
                # Adjust the context window for each history size
                context_size = min(h, n_tokens)
                context = tokens[:, :context_size] if context_size > 0 else tokens
                # NOTE(review): with a single-token context there is no
                # next-token target left once labels are shifted, so the loss
                # (and this score) presumably comes out as NaN - confirm.
                loss = model(context, labels=context).loss
                probabilities[f'probability_history_size{h}'].append(
                    torch.exp(-loss).item()
                )
    return probabilities

# Score every story with GPT-2, store the per-story mean score for each
# history size, then derive the Sequentiality columns from those means.
for index, row in df.iterrows():
    story = row['story']
    probabilities = calculate_probabilities(story, list(range(6)))

    # Mean over the scored sentences; None when no sentence was scored
    for column, scores in probabilities.items():
        if scores:
            df.at[index, column] = sum(scores) / len(scores)
        else:
            df.at[index, column] = None

    # Sequentiality_h = mean score at history size h minus the mean score
    # at history size 0
    baseline = df.at[index, 'probability_history_size0']
    for h in range(1, 6):
        with_history = df.at[index, f'probability_history_size{h}']
        df.at[index, f'Sequentiality_{h}'] = with_history - baseline

# Extract linguistic features from each story and save the enriched DataFrame
import string

# Word lists, built once. Tokens are whitespace-split and lowercased before
# lookup, so matching is case-insensitive for every list (including the
# sensory words, which keeps all four counts consistent with each other).
_SENSORY_WORDS = frozenset({'see', 'hear', 'touch', 'taste', 'smell', 'sight', 'sound', 'texture', 'aroma', 'flavor'})
_FIRST_PERSON_PRONOUNS = frozenset({'i', 'me', 'my', 'mine', 'we', 'us', 'our', 'ours'})
_EMOTION_WORDS = frozenset({'happy', 'sad', 'angry', 'joyful', 'depressed', 'excited', 'fearful', 'anxious', 'content', 'disappointed'})
_DIALOGUE_TAGS = frozenset({'said', 'asked', 'replied', 'shouted', 'whispered', 'murmured', 'screamed', 'yelled', 'muttered', 'uttered', 'exclaimed'})
_PUNCTUATION = frozenset(string.punctuation)


def _count_listed_words(text, vocabulary):
    """Count whitespace-separated tokens whose lowercase form is in *vocabulary*."""
    return sum(word.lower() in vocabulary for word in text.split())


def _avg_word_length(text):
    """Mean character length of the whitespace-separated tokens (0 if none)."""
    words = text.split()
    return sum(len(word) for word in words) / len(words) if words else 0


def _lexical_diversity(text):
    """Ratio of distinct tokens to total tokens (0 if there are no tokens)."""
    words = text.split()
    return len(set(words)) / len(words) if words else 0


def _avg_sentence_length(text):
    """Mean number of whitespace-separated tokens per sentence (0 if none)."""
    sentences = nltk.sent_tokenize(text)  # tokenize once and reuse
    return np.mean([len(sentence.split()) for sentence in sentences]) if sentences else 0


def _past_tense_verb_count(text):
    """Number of tokens whose POS tag starts with 'VBD'."""
    return sum(tag.startswith('VBD') for _, tag in nltk.pos_tag(nltk.word_tokenize(text)))


stories = df['story']

# Basic surface statistics
df['word_count'] = stories.apply(lambda x: len(x.split()))
df['sentence_count'] = stories.apply(lambda x: len(sent_tokenize(x)))
df['avg_word_length'] = stories.apply(_avg_word_length)
df['lexical_diversity'] = stories.apply(_lexical_diversity)
df['punctuation_count'] = stories.apply(lambda x: sum(1 for char in x if char in _PUNCTUATION))

# Additional linguistic features
df['avg_sentence_length'] = stories.apply(_avg_sentence_length)
df['sensory_word_count'] = stories.apply(lambda x: _count_listed_words(x, _SENSORY_WORDS))
df['first_person_pronoun_count'] = stories.apply(lambda x: _count_listed_words(x, _FIRST_PERSON_PRONOUNS))
df['past_tense_verb_count'] = stories.apply(_past_tense_verb_count)
df['emotion_word_count'] = stories.apply(lambda x: _count_listed_words(x, _EMOTION_WORDS))
df['dialogue_tag_count'] = stories.apply(lambda x: _count_listed_words(x, _DIALOGUE_TAGS))

# Save the DataFrame with new features
df.to_csv('processed_values.csv', index=False)

# Assemble the model inputs: the linguistic features, the six mean
# probability columns and the five Sequentiality columns.
feature_columns = (
    [
        'word_count', 'sentence_count', 'avg_word_length', 'lexical_diversity',
        'punctuation_count', 'avg_sentence_length', 'sensory_word_count',
        'first_person_pronoun_count', 'past_tense_verb_count',
        'emotion_word_count', 'dialogue_tag_count',
    ]
    + [f'probability_history_size{i}' for i in range(6)]
    + [f'Sequentiality_{i}' for i in range(1, 6)]
)
X = df[feature_columns]
y = df['memType']  # Assuming 'memType' is the column indicating recalled or imagined

# 60% train; the remaining 40% is halved into validation and test (20% each)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Fit a random forest; missing feature values are replaced by 0
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train.fillna(0), y_train)

# Score the forest on the validation split
y_pred = clf.predict(X_val.fillna(0))
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy}")

print("Data processing and model training complete. File saved as 'processed_values.csv'")


# Constructing Pre-Determined Validation set 

The validation set is pre-determined and is not randomly sampled, but the use of a stratified split ensures the dataset's overall comprehensiveness.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

# Load the dataset
file_path = 'test_processed_stories.csv'
data = pd.read_csv(file_path)

# Feature columns fed to the model and the label column to predict
selected_features = [
    "stressful", "probability_history_size0", "probability_history_size2",
    "probability_history_size3", "probability_history_size4", "probability_history_size5",
    "Sequentiality_2", "Sequentiality_3", "Sequentiality_4", "Sequentiality_5",
    "word_count", "sentence_count", "avg_word_length", "lexical_diversity",
    "punctuation_count", "avg_sentence_length", "sensory_word_count",
    "first_person_pronoun_count", "past_tense_verb_count", "emotion_word_count",
    "dialogue_tag_count"
]
target_column = 'memType'
# 'recalled' and 'retold' are merged into a single positive class
class_mapping = {'imagined': 0, 'recalled': 1, 'retold': 1}

# Take an explicit copy of the needed columns: assigning into a bare column
# selection of `data` triggers pandas' SettingWithCopyWarning.
relevant_data = data[selected_features + [target_column]].copy()

# Fill missing feature values with each column's median.
# NOTE(review): the imputer is fit on every row before the split, so
# validation/test rows contribute to the medians.
imputer = SimpleImputer(strategy='median')
relevant_data[selected_features] = imputer.fit_transform(relevant_data[selected_features])

# Map 'memType' to binary classes
relevant_data[target_column] = relevant_data[target_column].map(class_mapping)

# Splitting the dataset into features (X) and target (y)
X = relevant_data[selected_features]
y = relevant_data[target_column]

# Stratified split into training (60%), validation (20%) and testing (20%)
# sets; stratifying on the label keeps the class ratio equal across splits
# and matches the split used by the later cells of this notebook.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# Standardize using statistics learned from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Expand to degree-2 polynomial features (no bias column)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train_scaled)
X_val_poly = poly.transform(X_val_scaled)
X_test_poly = poly.transform(X_test_scaled)

# Fit a random forest (fixed seed, otherwise default settings) on the
# polynomial training features
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_poly, y_train)

# Measure accuracy on the held-out validation split
y_val_pred = rf_model.predict(X_val_poly)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy:.2f}")

# To persist the fitted model, uncomment:
# joblib.dump(rf_model, 'random_forest_classifier.pkl')


# Testing the best classifier model and optimization

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

# Load the dataset
file_path = 'test_processed_stories.csv'
data = pd.read_csv(file_path)

# Feature columns fed to the model and the label column to predict
selected_features = [
    "stressful", "probability_history_size0", "probability_history_size2",
    "probability_history_size3", "probability_history_size4", "probability_history_size5",
    "Sequentiality_2", "Sequentiality_3", "Sequentiality_4", "Sequentiality_5",
    "word_count", "sentence_count", "avg_word_length", "lexical_diversity",
    "punctuation_count", "avg_sentence_length", "sensory_word_count",
    "first_person_pronoun_count", "past_tense_verb_count", "emotion_word_count",
    "dialogue_tag_count"
]
target_column = 'memType'
# 'recalled' and 'retold' are merged into a single positive class
class_mapping = {'imagined': 0, 'recalled': 1, 'retold': 1}

# Take an explicit copy of the needed columns: assigning into a bare column
# selection of `data` triggers pandas' SettingWithCopyWarning.
relevant_data = data[selected_features + [target_column]].copy()

# Fill missing feature values with each column's median.
# NOTE(review): the imputer is fit on every row before the split, so
# validation/test rows contribute to the medians.
imputer = SimpleImputer(strategy='median')
relevant_data[selected_features] = imputer.fit_transform(relevant_data[selected_features])

# Map 'memType' to binary classes
relevant_data[target_column] = relevant_data[target_column].map(class_mapping)

# Splitting the dataset into features (X) and target (y)
X = relevant_data[selected_features]
y = relevant_data[target_column]

# Stratified splitting data into training (60%), validation (20%), and testing (20%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# Standardize using statistics learned from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Expand to degree-2 polynomial features (no bias column)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train_scaled)
X_val_poly = poly.transform(X_val_scaled)
X_test_poly = poly.transform(X_test_scaled)

# Hyperparameter candidates explored by the grid search
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)

# Exhaustive search over the grid, scored by accuracy with 5-fold
# stratified cross-validation on the training split
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring='accuracy',
    cv=StratifiedKFold(5),
)
grid_search.fit(X_train_poly, y_train)

# The estimator with the best cross-validated score
best_rf_model = grid_search.best_estimator_

# Check the selected model against the validation split
y_val_pred = best_rf_model.predict(X_val_poly)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {val_accuracy:.2f}")

# To persist the tuned model, uncomment:
# joblib.dump(best_rf_model, 'random_forest_classifier_optimized.pkl')


# Comparing all classification models 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import joblib

# Load the dataset
#file_path = 'fake.csv'
file_path = 'test_processed_stories.csv'
#file_path = 'processed_values.csv'
data = pd.read_csv(file_path)

# Feature columns fed to the models and the label column to predict
selected_features = [
    "stressful", "probability_history_size0", "probability_history_size2",
    "probability_history_size3", "probability_history_size4", "probability_history_size5",
    "Sequentiality_2", "Sequentiality_3", "Sequentiality_4", "Sequentiality_5",
    "word_count", "sentence_count", "avg_word_length", "lexical_diversity",
    "punctuation_count", "avg_sentence_length", "sensory_word_count",
    "first_person_pronoun_count", "past_tense_verb_count", "emotion_word_count",
    "dialogue_tag_count"
]
target_column = 'memType'
# 'recalled' and 'retold' are merged into a single positive class
class_mapping = {'imagined': 0, 'recalled': 1, 'retold': 1}

# Take an explicit copy of the needed columns: assigning into a bare column
# selection of `data` triggers pandas' SettingWithCopyWarning.
relevant_data = data[selected_features + [target_column]].copy()

# Fill missing feature values with each column's median.
# NOTE(review): the imputer is fit on every row before the split, so
# validation/test rows contribute to the medians.
imputer = SimpleImputer(strategy='median')
relevant_data[selected_features] = imputer.fit_transform(relevant_data[selected_features])

# Map 'memType' to binary classes
relevant_data[target_column] = relevant_data[target_column].map(class_mapping)

# Splitting the dataset into features (X) and target (y)
X = relevant_data[selected_features]
y = relevant_data[target_column]

# Stratified splitting data into training (60%), validation (20%), and testing (20%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# Standardize using statistics learned from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Expand to degree-2 polynomial features (no bias column)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train_scaled)
X_val_poly = poly.transform(X_val_scaled)
X_test_poly = poly.transform(X_test_scaled)

# Random-forest hyperparameter grid. It is only referenced by the
# commented-out GridSearchCV entries below.
param_grid = dict(
    n_estimators=[100, 200, 300, 400, 500],
    max_depth=[None, 10, 20, 30, 40, 50],
    min_samples_split=[2, 5, 10, 15],
    min_samples_leaf=[1, 2, 4, 6],
    max_features=['sqrt', 'log2', None, 0.5],
    bootstrap=[True, False],
)

# Candidate models, keyed by the display name used in the printed report
# and (lowercased, spaces -> underscores) in the saved file name.
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Random Forest": RandomForestClassifier(random_state=42),
    # Disabled grid-search variants:
    #"Random_GridSearchCV": GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=StratifiedKFold(5), scoring='accuracy')
    #"Random_GridSearchCV": GridSearchCV(RandomForestClassifier(random_state=42), param_grid, scoring='accuracy')
}

# Fit each candidate on the polynomial training features, report its
# validation accuracy and pickle the fitted estimator.
for name, clf in classifiers.items():
    clf.fit(X_train_poly, y_train)

    y_val_pred = clf.predict(X_val_poly)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    print(f"{name} Validation Accuracy: {val_accuracy:.2f}")

    model_path = f'{name.lower().replace(" ", "_")}_classifier.pkl'
    joblib.dump(clf, model_path)

# Feature selection based on Random Forest importance.
# SelectFromModel is imported here because this cell's import list does not
# bring it into scope; without it the cell fails with a NameError when run
# in a fresh session.
from sklearn.feature_selection import SelectFromModel

# Fit a reference forest and let SelectFromModel pick features from it
# (prefit=True: the selector uses the already-fitted forest as-is).
rf = RandomForestClassifier(random_state=42).fit(X_train_poly, y_train)
selector = SelectFromModel(rf, prefit=True)
X_train_selected = selector.transform(X_train_poly)
X_val_selected = selector.transform(X_val_poly)

# Retrain on the reduced feature set and evaluate on the validation split
rf_selected = RandomForestClassifier(random_state=42)
rf_selected.fit(X_train_selected, y_train)
y_val_pred_selected = rf_selected.predict(X_val_selected)
val_accuracy_selected = accuracy_score(y_val, y_val_pred_selected)
print(f"Validation Accuracy with Feature Selection: {val_accuracy_selected:.2f}")


# Optimizing using ensemble and voting classification

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectFromModel
import joblib

# Load the dataset
file_path = 'test_processed_stories.csv'
data = pd.read_csv(file_path)

# Feature columns fed to the models and the label column to predict
selected_features = [
    "stressful", "probability_history_size0", "probability_history_size2",
    "probability_history_size3", "probability_history_size4", "probability_history_size5",
    "Sequentiality_2", "Sequentiality_3", "Sequentiality_4", "Sequentiality_5",
    "word_count", "sentence_count", "avg_word_length", "lexical_diversity",
    "punctuation_count", "avg_sentence_length", "sensory_word_count",
    "first_person_pronoun_count", "past_tense_verb_count", "emotion_word_count",
    "dialogue_tag_count"
]

target_column = 'memType'
# 'recalled' and 'retold' are merged into a single positive class
class_mapping = {'imagined': 0, 'recalled': 1, 'retold': 1}

# Take an explicit copy of the needed columns: assigning into a bare column
# selection of `data` triggers pandas' SettingWithCopyWarning.
relevant_data = data[selected_features + [target_column]].copy()

# Fill missing feature values with each column's median.
# NOTE(review): the imputer is fit on every row before the split, so
# validation/test rows contribute to the medians.
imputer = SimpleImputer(strategy='median')
relevant_data[selected_features] = imputer.fit_transform(relevant_data[selected_features])

# Map 'memType' to binary classes
relevant_data[target_column] = relevant_data[target_column].map(class_mapping)

# Splitting the dataset into features (X) and target (y)
X = relevant_data[selected_features]
y = relevant_data[target_column]

# Stratified splitting data into training (60%), validation (20%), and testing (20%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# Standardize using statistics learned from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Expand to degree-2 polynomial features (no bias column)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train_scaled)
X_val_poly = poly.transform(X_val_scaled)
X_test_poly = poly.transform(X_test_scaled)

# Random-forest hyperparameter grid searched by the "Random_GridSearchCV" entry
param_grid = dict(
    n_estimators=[100, 200, 300, 400, 500],
    max_depth=[None, 10, 20, 30, 40, 50],
    min_samples_split=[2, 5, 10, 15],
    min_samples_leaf=[1, 2, 4, 6],
    max_features=['sqrt', 'log2', None, 0.5],
    bootstrap=[True, False],
)

# Candidate models keyed by display name; the last entry wraps a random
# forest in an accuracy-scored grid search with 5-fold stratified CV.
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Random_GridSearchCV": GridSearchCV(
        RandomForestClassifier(random_state=42),
        param_grid,
        scoring='accuracy',
        cv=StratifiedKFold(5),
    ),
}

# Fit each candidate on the polynomial training features, print its
# validation accuracy plus a per-class report, and pickle the fitted model.
for name, clf in classifiers.items():
    clf.fit(X_train_poly, y_train)

    y_val_pred = clf.predict(X_val_poly)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    print(f"{name} Validation Accuracy: {val_accuracy:.2f}")

    # Per-class precision/recall breakdown for error analysis
    print(f"\nClassification Report for {name}:")
    print(classification_report(y_val, y_val_pred))

    model_path = f'{name.lower().replace(" ", "_")}_classifier.pkl'
    joblib.dump(clf, model_path)

# Fit a reference random forest and hand it, already fitted, to
# SelectFromModel to choose which polynomial features to keep.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_poly, y_train)
selector = SelectFromModel(rf, prefit=True)
X_train_selected, X_val_selected = (
    selector.transform(features) for features in (X_train_poly, X_val_poly)
)

# Train a fresh forest on the reduced feature set and score it on validation
rf_selected = RandomForestClassifier(random_state=42).fit(X_train_selected, y_train)
y_val_pred_selected = rf_selected.predict(X_val_selected)
val_accuracy_selected = accuracy_score(y_val, y_val_pred_selected)
print(f"Validation Accuracy with Feature Selection: {val_accuracy_selected:.2f}")


# Feature Selection increases the base accuracy by 25%

The Random Forest classifier provided a base accuracy of 70% and was used as the baseline for increasing classification accuracy,
by tuning the parameters of the Random Forest classifier using GridSearchCV and StratifiedKFold.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectFromModel
import joblib
import matplotlib.pyplot as plt

# Load the dataset
file_path = 'test_processed_stories.csv'
#file_path = 'processed_values_hcV3-10.csv'
data = pd.read_csv(file_path)

# Feature columns fed to the models and the label column to predict
selected_features = [
    "stressful", "probability_history_size0", "probability_history_size2",
    "probability_history_size3", "probability_history_size4", "probability_history_size5",
    "Sequentiality_2", "Sequentiality_3", "Sequentiality_4", "Sequentiality_5",
    "word_count", "sentence_count", "avg_word_length", "lexical_diversity",
    "punctuation_count", "avg_sentence_length", "sensory_word_count",
    "first_person_pronoun_count", "past_tense_verb_count", "emotion_word_count",
    "dialogue_tag_count"
]

target_column = 'memType'
# 'recalled' and 'retold' are merged into a single positive class
class_mapping = {'imagined': 0, 'recalled': 1, 'retold': 1}

# Take an explicit copy of the needed columns: assigning into a bare column
# selection of `data` triggers pandas' SettingWithCopyWarning.
relevant_data = data[selected_features + [target_column]].copy()

# Fill missing feature values with each column's median, then map the label
# to binary classes.
# NOTE(review): the imputer is fit on every row before the split, so
# validation/test rows contribute to the medians.
imputer = SimpleImputer(strategy='median')
relevant_data[selected_features] = imputer.fit_transform(relevant_data[selected_features])
relevant_data[target_column] = relevant_data[target_column].map(class_mapping)

# Splitting the dataset into features (X) and target (y), then a stratified
# 60% / 20% / 20% train / validation / test split
X = relevant_data[selected_features]
y = relevant_data[target_column]
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# Standardize using statistics learned from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Expand to degree-2 polynomial features (no bias column)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train_scaled)
X_val_poly = poly.transform(X_val_scaled)
X_test_poly = poly.transform(X_test_scaled)

# Candidate models keyed by the display name used in the report, the saved
# file name and the plot's x-axis.
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Random Forest": RandomForestClassifier(random_state=42)
}

# Validation accuracy per model name, filled in by the loop below
model_accuracies = {}

# Fit each candidate, record and print its validation accuracy, and pickle it
for name, clf in classifiers.items():
    clf.fit(X_train_poly, y_train)

    y_val_pred = clf.predict(X_val_poly)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    print(f"{name} Validation Accuracy: {val_accuracy:.2f}")
    model_accuracies[name] = val_accuracy

    model_path = f'{name.lower().replace(" ", "_")}_classifier.pkl'
    joblib.dump(clf, model_path)

# From here on `classifiers` is the list of model names (it replaces the
# dict above); `accuracy_values` holds the matching accuracies.
classifiers = list(model_accuracies)
accuracy_values = list(model_accuracies.values())

# Line plot of validation accuracy per model
plt.figure(figsize=(10, 6))
plt.plot(classifiers, accuracy_values, color='b', marker='o')
plt.title('Model Accuracies')
plt.xlabel('Classifiers')
plt.ylabel('Accuracy')
plt.xticks(rotation=45)
plt.grid(True)
plt.show()



# Optimised Random Forest 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import joblib

# Load the dataset
file_path = 'test_processed_stories.csv'
data = pd.read_csv(file_path)

# Feature columns fed to the models and the label column to predict
selected_features = [
    "stressful", "probability_history_size0", "probability_history_size2",
    "probability_history_size3", "probability_history_size4", "probability_history_size5",
    "Sequentiality_2", "Sequentiality_3", "Sequentiality_4", "Sequentiality_5",
    "word_count", "sentence_count", "avg_word_length", "lexical_diversity",
    "punctuation_count", "avg_sentence_length", "sensory_word_count",
    "first_person_pronoun_count", "past_tense_verb_count", "emotion_word_count",
    "dialogue_tag_count"
]
target_column = 'memType'
# 'recalled' and 'retold' are merged into a single positive class
class_mapping = {'imagined': 0, 'recalled': 1, 'retold': 1}

# Take an explicit copy of the needed columns: assigning into a bare column
# selection of `data` triggers pandas' SettingWithCopyWarning.
relevant_data = data[selected_features + [target_column]].copy()

# Fill missing feature values with each column's median.
# NOTE(review): the imputer is fit on every row before the split, so
# validation/test rows contribute to the medians.
imputer = SimpleImputer(strategy='median')
relevant_data[selected_features] = imputer.fit_transform(relevant_data[selected_features])

# Map 'memType' to binary classes
relevant_data[target_column] = relevant_data[target_column].map(class_mapping)

# Splitting the dataset into features (X) and target (y)
X = relevant_data[selected_features]
y = relevant_data[target_column]

# Stratified splitting data into training (60%), validation (20%), and testing (20%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# Standardize using statistics learned from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Expand to degree-2 polynomial features (no bias column)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train_scaled)
X_val_poly = poly.transform(X_val_scaled)
X_test_poly = poly.transform(X_test_scaled)

# Candidate models keyed by the display name used for the saved file name
# and the plot's x-axis.
classifiers = {
    "Logistic Regression": LogisticRegression(),
    #"Gradient Boosting": GradientBoostingClassifier(),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(random_state=42),
    #"Random Forest-191": RandomForestClassifier(random_state=191),
    # NOTE: despite its key, this entry is a plain random forest with seed
    # 154, not a grid search. The key is kept because it determines the
    # pickle file name and the plot label. A dict literal keeps only the last
    # value for a repeated key, so the grid-search variant is left commented
    # out; enabling it instead also requires importing GridSearchCV and
    # defining param_grid in this cell.
    #"Random_GridSearchCV": GridSearchCV(RandomForestClassifier(random_state=154), param_grid, cv=StratifiedKFold(5), scoring='accuracy'),
    "Random_GridSearchCV": RandomForestClassifier(random_state=154),
}

# Validation accuracy per classifier name, filled in by the loop below
accuracy_dict = {}

# Fit each candidate on the polynomial training features, record its
# validation accuracy and pickle the fitted estimator.
for name, clf in classifiers.items():
    clf.fit(X_train_poly, y_train)

    y_val_pred = clf.predict(X_val_poly)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    accuracy_dict[name] = val_accuracy

    joblib.dump(clf, f'{name.lower().replace(" ", "_")}_classifier.pkl')


# Line plot of validation accuracy per classifier
plt.figure(figsize=(10, 6))
plt.plot(list(accuracy_dict.keys()), list(accuracy_dict.values()), marker='o', color='b')
plt.title('Model Accuracies')
plt.xlabel('Classifiers')
plt.ylabel('Accuracy')
plt.xticks(rotation=45)
plt.grid(True)
plt.show()



In [None]:
import matplotlib.pyplot as plt

# Hard-coded summary figures for the four pipeline stages
classifications = [
    '(1)Basic Classification',
    '(2)With Sequentiality (1-5)',
    '(3)RF with Linguistic Features on (2)',
    '(4) Optimized Random Forest',
]
accuracies = [0.3, 0.42, 0.75, 0.87] # Please check the individual components from other blocks 
colors = ['blue'] * 4

# Draw one line segment per pair of consecutive stages so that each
# transition gets its own legend entry.
plt.figure(figsize=(10, 6))
segments = zip(classifications, classifications[1:], accuracies, accuracies[1:], colors)
for start_label, end_label, start_acc, end_acc, segment_color in segments:
    plt.plot(
        [start_label, end_label],
        [start_acc, end_acc],
        marker='o',
        color=segment_color,
        linestyle='-',
        label=f'{start_label} to {end_label}',
    )

# Titles, axis labels and tick layout
plt.title('Classification Accuracies on the Hippocorpus Dataset')
plt.xlabel('Classification Type')
plt.ylabel('Accuracy')
plt.xticks(classifications, rotation=45)
plt.ylim(0, 1)  # accuracy axis spans 0 to 1

# Dashed grid and legend
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.legend()

# Render
plt.tight_layout()
plt.show()
