In [7]:
# import pandas as pd
# import numpy as np
# from sklearn.utils import resample
# from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import classification_report
# from nltk.corpus import stopwords
# from nltk.stem import PorterStemmer, WordNetLemmatizer
# import nltk


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

# Import the necessary libraries (see the statements above).

In [12]:
# Download the NLTK resources. Only the stopword list is used by this cell;
# the wordnet download is kept so the cell's side effects are unchanged.
nltk.download('stopwords')
nltk.download('wordnet')

# Load the data
df = pd.read_csv('../data/cleaned_data/final_notes.csv')

# Display class distribution
print(df['classification'].value_counts())

# Preprocessing: lowercase -> strip punctuation -> drop English stopwords ->
# Porter-stem each remaining token. Stopword filtering and stemming are done
# in a single pass over the tokens, which yields the same text as doing them
# in two separate passes.
stop = set(stopwords.words('english'))
stemmer = PorterStemmer()
# The pattern is a raw string: '\w' and '\s' are regex character classes, not
# Python string escapes, and a non-raw literal triggers an invalid-escape
# warning on recent Python versions.
notes_normalized = df['notes'].str.lower().str.replace(r'[^\w\s]', '', regex=True)
df['notes'] = notes_normalized.apply(
    lambda text: ' '.join(
        stemmer.stem(token) for token in text.split() if token not in stop
    )
)

# Split the dataset into features (note text) and target (class label)
X = df['notes']
y = df['classification']

# Stratified 80/20 train/test split so each class keeps its proportion
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=40, stratify=y
)

# Vectorize the text: TF-IDF over unigrams and bigrams, capped at 1000 features.
# The vocabulary is fitted on the training split only and reused for the test split.
vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
X_train_vect = vectorizer.fit_transform(X_train).toarray()
X_test_vect = vectorizer.transform(X_test).toarray()

# Tune the regularisation strength C of a logistic regression with 5-fold
# cross-validation, selecting on weighted F1.
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
modelLR = GridSearchCV(
    LogisticRegression(max_iter=200),
    param_grid,
    cv=5,
    scoring='f1_weighted',
)
modelLR.fit(X_train_vect, y_train)

# The fitted model is evaluated on the test split in the next cell.


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


classification
1    847
0    700
3    320
2    141
Name: count, dtype: int64


In [13]:
# Predict labels for the vectorized test split with the grid-searched
# logistic-regression model and print per-class precision/recall/F1.
y_pred = modelLR.predict(X_test_vect)
lr_report = classification_report(y_test, y_pred)
print(lr_report)

# Show the hyperparameters selected by the grid search
best_lr_params = modelLR.best_params_
print(f"Best parameters: {best_lr_params}")

              precision    recall  f1-score   support

           0       0.92      0.94      0.93       140
           1       0.89      0.94      0.91       170
           2       0.95      0.68      0.79        28
           3       0.93      0.86      0.89        64

    accuracy                           0.91       402
   macro avg       0.92      0.85      0.88       402
weighted avg       0.91      0.91      0.91       402

Best parameters: {'C': 100}


In [10]:
# Number of rows per class label, most frequent first
df.value_counts(subset=['classification'])

classification
1                 847
0                 700
3                 320
2                 141
Name: count, dtype: int64

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Download NLTK data: the English stopword list and the WordNet corpus
# needed by WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

# Load the data (re-read from disk so this cell starts from the raw notes)
df = pd.read_csv('../data/cleaned_data/final_notes.csv')

# Display class distribution
print(df['classification'].value_counts())

# Preprocessing: lowercase -> strip punctuation -> drop English stopwords ->
# lemmatize each remaining token. Stopword filtering and lemmatization are
# done in a single pass over the tokens, which yields the same text as doing
# them in two separate passes.
stop = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
# The pattern is a raw string: '\w' and '\s' are regex character classes, not
# Python string escapes, and a non-raw literal triggers an invalid-escape
# warning on recent Python versions.
notes_normalized = df['notes'].str.lower().str.replace(r'[^\w\s]', '', regex=True)
df['notes'] = notes_normalized.apply(
    lambda text: ' '.join(
        lemmatizer.lemmatize(token) for token in text.split() if token not in stop
    )
)

# Split the dataset into features (note text) and target (class label)
X = df['notes']
y = df['classification']

# Stratified 80/20 train/test split so each class keeps its proportion
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=40, stratify=y
)

# Vectorize the text: TF-IDF over unigrams and bigrams, capped at 1000 features.
# The vocabulary is fitted on the training split only and reused for the test split.
vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
X_train_vect = vectorizer.fit_transform(X_train).toarray()
X_test_vect = vectorizer.transform(X_test).toarray()

# Train a Random Forest model with hyperparameter tuning:
# 3 x 4 x 3 = 36 candidates, each scored by 5-fold CV on weighted F1.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
# n_jobs=-1 runs the candidate fits on all available cores. The forest's
# random_state is fixed, so the scores and selected parameters are the same
# as in a serial search.
model_RF = GridSearchCV(
    RandomForestClassifier(random_state=40),
    param_grid,
    cv=5,
    scoring='f1_weighted',
    n_jobs=-1,
)
model_RF.fit(X_train_vect, y_train)




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


classification
1    847
0    700
3    320
2    141
Name: count, dtype: int64


In [16]:
# Predict labels for the vectorized test split with the grid-searched
# random-forest model and print per-class precision/recall/F1.
y_pred = model_RF.predict(X_test_vect)
rf_report = classification_report(y_test, y_pred)
print(rf_report)

# Show the hyperparameters selected by the grid search
best_rf_params = model_RF.best_params_
print(f"Best parameters: {best_rf_params}")

              precision    recall  f1-score   support

           0       0.95      0.96      0.96       140
           1       0.89      0.97      0.93       170
           2       1.00      0.79      0.88        28
           3       0.98      0.81      0.89        64

    accuracy                           0.93       402
   macro avg       0.96      0.88      0.91       402
weighted avg       0.93      0.93      0.93       402

Best parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 300}
