In [19]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

from sklearn.model_selection import learning_curve, train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# File path (adjust the path as needed)
file_path = r"D:\WorkSpace\curaJOY\concerns_classification_algorithms\data samples-02-28-2024\Concerns-2-28-2024-V0.2.csv"

# Load data into a pandas DataFrame
data_concerns = pd.read_csv(file_path, encoding="ISO-8859-1")  # Specify encoding to avoid potential issues

# Define a mapping of original column names to new names
column_mapping = {
    'Concerns': 'concerns',
    'More detailed descriptions': 'detailed_descriptions',
    'User Inputs': 'user_inputs',
    'Status': 'status',
    'Intents': 'intents',
    'Quests (Goals)': 'quests_goals',
    'Scoring Groups': 'scoring_groups',
    'Intent Groups': 'intent_groups',
    'Utterances': 'utterances',
    'Coaches Responses': 'coaches_responses',
    'Linked Behaviors': 'linked_behaviors',
    'Linked Activities': 'linked_activities'
}

# Rename columns
data_concerns.rename(columns=column_mapping, inplace=True)

# Display top 5 rows
data_concerns.head(5)

Unnamed: 0,ï»¿Concerns,detailed_descriptions,user_inputs,intents,quests_goals,status,scoring_groups,intent_groups,coaches_responses,linked_behaviors,linked_activities
0,Better academics,want/need to do better in school. If just not...,Better academics,"MoodFrustrated, MoodStressed, Mood-Journal, Pe...","Plan better (time management), Finish what I s...",,,,,Doing something other than assignment,"apply to 5 colleges, apply to 5 jobs, assemble..."
1,Active Quest,user is a primary or participating quester,Active Quest,,,,,,,,
2,ADHD,diagnosed with ADHD or suspects ADHD,ADHD,"Tardies, Absences, MoodFrustrated, Invite Chec...","Take care of my body, Healthy relationship wit...",,"Special Ed, Access/Tangible Function, Escape/A...",,,Doing something other than assignment,"take my medication(s) as prescribed, Be ready ..."
3,Excess Alcohol,"dealing with stress by excessive drinking, bin...",Excess Alcohol,Absences,"Practice impulse control and moderation, Cut m...",,"Alcohol Suspected, High difficulties, Slightly...",,,Binge drinking,Arrange childcare for a night
4,Anger,"difficulty controlling anger or rage, needs he...",Anger,"MoodAngry, MoodFrustrated","Winning and losing graciously, Practice impuls...",,"High difficulties, Slightly raised difficultie...",,,,"do a body check in, Become more self aware by ..."


In [20]:
print(data_concerns.shape)
print(data_concerns.describe())  # Include descriptions along with summary statistics


(70, 11)
       status  coaches_responses
count     0.0                0.0
mean      NaN                NaN
std       NaN                NaN
min       NaN                NaN
25%       NaN                NaN
50%       NaN                NaN
75%       NaN                NaN
max       NaN                NaN


In [21]:
# Preprocess Text Data
data_concerns['user_inputs'] = data_concerns['user_inputs'].str.lower()
data_concerns['user_inputs'] = data_concerns['user_inputs'].str.replace('[^\w\s]', '', regex=True)  # Use regex for better pattern matching

# Remove rows with NaN values
data_concerns = data_concerns.dropna(subset=['user_inputs'])

# Vectorize Text Data
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data_concerns['user_inputs'])
y = data_concerns['concerns']

# Split Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


KeyError: 'concerns'

In [None]:
print(data_concerns.shape)
print(data_concerns.describe())  # Include descriptions along with summary statistics


In [5]:
classifiers = [
    SVC(random_state=1, kernel='rbf'),
    DecisionTreeClassifier(random_state=1, criterion='gini'),
    RandomForestClassifier(random_state=1, criterion='gini'),
    KNeighborsClassifier(metric='minkowski'),
]

classifier_names = [
    'svc',
    'decisiontreeclassifier',
    'randomforestclassifier',
    'kneighborsclassifier',
]

classifier_param_grid = [
    {'svc__C': [1], 'svc__gamma': [0.01]},
    {'decisiontreeclassifier__max_depth': [6, 9, 11]},
    {'randomforestclassifier__n_estimators': [3, 5, 6]},
    {'kneighborsclassifier__n_neighbors': [4, 6, 8]},
]



In [7]:
# Create a pipeline for cleaner and more maintainable code
pipe = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', classifiers[0])  # Start with SVC, can be replaced with other classifiers in the list
])

# Remove the 'memory' parameter from the parameter grid as it doesn't require tuning
#param_grid = dict(zip(pipe.get_params().keys()[:-1], classifier_param_grid))  # Exclude 'memory' parameter
param_grid = dict(zip(list(pipe.get_params().keys())[:-1], classifier_param_grid))  # Convert keys to list, then slice




The error now stems from an attempt to slice a dict_keys object in the line param_grid = dict(zip(pipe.get_params().keys()[:-1], classifier_param_grid)). This object is not directly subscriptable because it represents the view keys of a dictionary, not a list.

This code now correctly creates a dictionary param_grid by:

1. Converting the dict_keys object to a list using list(pipe.get_params().keys()). This creates a list containing the parameter names.
2. Slicing the list to exclude the last element (which would correspond to the 'memory' parameter).
3. Zipping the modified list with the classifier_param_grid dictionary to create a new dictionary param_grid that only includes the desired parameters and their corresponding values.

In [8]:
grid_search = GridSearchCV(
    pipe, param_grid, cv=5, scoring='accuracy'
)
grid_search.fit(X_train, y_train)



NameError: name 'X_train' is not defined

In [None]:
# Get the best model and its parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Evaluate the best model on the test set
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Best Model Parameters:", best_params)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

# (Optional) Further analysis:
# - Confusion matrix
# - Visualization of results
# - Comparison with other models