# TAHLR Week 9: Text Classification Algorithms

Code notebook for TAHLR course at ISAW (Fall 2023) based on Albrecht et al. 2022 (Blueprints) Ch. 6

In [None]:
# imports

import pandas as pd
from blueprints import clean

## Introducing the Java Development Tools Bug Dataset

In [None]:
# Read the bug-report CSV, list its columns, and preview two random rows
# restricted to the fields that matter most for this chapter.
df = pd.read_csv('../data/blueprints_6/eclipse_jdt.csv')
print(df.columns)

preview_columns = ['Issue_id', 'Priority', 'Component', 'Title', 'Description']
df.loc[:, preview_columns].sample(n=2)

In [None]:
# Show one random record transposed, so every field is readable as its own row
df.sample(n=1).transpose()

In [None]:
# Bar chart of how many reports carry each priority label, ordered by label
priority_counts = df['Priority'].value_counts().sort_index()
priority_counts.plot.bar()

In [None]:
# Number of reports filed against each component, most frequent first
component_counts = df['Component'].value_counts()
component_counts

### Blueprint: Building a Text Classification System

#### Step 1: Data preparation

In [None]:
# Keep only the fields needed for classification, drop incomplete rows, and
# merge title + description into a single 'text' column.
df = (
    df[['Title', 'Description', 'Priority']]
    .dropna()
    .assign(text=lambda frame: frame['Title'] + ' ' + frame['Description'])
    .drop(columns=['Title', 'Description'])
)
df.columns

In [None]:
# Normalise every report with the blueprints `clean` helper, then keep only
# the reports whose cleaned text is longer than 50 characters.
df = df.assign(text=df['text'].apply(clean))
is_long_enough = df['text'].str.len() > 50
df = df.loc[is_long_enough]
df.sample(n=2)

#### Step 2: Train-test split

In [None]:
# imports

from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the reports for testing; stratifying on Priority keeps the
# class proportions the same in both splits, and the seed fixes the split.
split_options = dict(test_size=0.2, random_state=42, stratify=df['Priority'])
X_train, X_test, Y_train, Y_test = train_test_split(
    df['text'], df['Priority'], **split_options
)

for split_label, split_texts in (('Size of Training Data ', X_train),
                                 ('Size of Test Data ', X_test)):
    print(split_label, split_texts.shape[0])

#### Step 3: Training the machine learning model

In [None]:
# imports

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report

In [None]:
# TF-IDF features over unigrams and bigrams; English stop words are removed and
# terms must occur in at least 10 documents. Fitted on the training split only.
tfidf = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=10,
)
X_train_tf = tfidf.fit_transform(X_train)

In [None]:
# Train model
# NB: "The SVM algorithm is preferred when working with text data because it is more suited to work with sparse data compared to other algorithms like Random Forest."

# Linear SVM fitted on the TF-IDF features of the training split
model1 = LinearSVC(dual=True, tol=1e-5, random_state=0)
model1.fit(X=X_train_tf, y=Y_train)

In [None]:
# Show parameters (explicit and default) of the fitted classifier

model1.get_params(deep=True)

#### Step 4: Model evaluation

In [None]:
# Project the held-out texts into the TF-IDF space fitted on the training split
X_test_tf = tfidf.transform(X_test)

# Predict priorities for the test reports and score them against the labels
Y_pred = model1.predict(X_test_tf)
test_accuracy = accuracy_score(Y_test, Y_pred)
print('Accuracy Score - ', test_accuracy)

In [None]:
# Baseline: always predict the most frequent training label, so the SVM's
# accuracy can be compared against a trivial strategy.
clf = DummyClassifier(strategy='most_frequent').fit(X_train, Y_train)
Y_pred_baseline = clf.predict(X_test)
baseline_accuracy = accuracy_score(Y_test, Y_pred_baseline)
print('Accuracy Score - ', baseline_accuracy)

In [None]:
# Recompute the test predictions and tabulate actual vs. predicted priorities
Y_pred = model1.predict(X_test_tf)
cm = confusion_matrix(y_true=Y_test, y_pred=Y_pred)
cm

In [None]:
# nb: plot_confusion_matrix as shown in *Blueprints* is deprecated; use ConfusionMatrixDisplay instead as shown below [PJB 11.3.2023]

# Label the matrix axes with the classes the model was trained on
CMD = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=model1.classes_,
)
CMD.plot(cmap='Blues');

In [None]:
# Per-class precision / recall / F1; zero_division=0 reports 0 (without a
# warning) for any class that has no predicted samples
report = classification_report(Y_test, Y_pred, zero_division=0)
print(report)

### Class imbalance

In [None]:
# Filter bug reports with priority P3 and sample 4000 rows from it.
# A fixed random_state makes the downsample (and therefore every score
# computed from df_balanced below) reproducible across kernel restarts.
df_sampleP3 = df[df['Priority'] == 'P3'].sample(n=4000, random_state=42)

# Create a separate DataFrame containing all other bug reports
df_sampleRest = df[df['Priority'] != 'P3']

# Concatenate the two DataFrame to create the new balanced bug reports dataset
df_balanced = pd.concat([df_sampleRest, df_sampleP3])

# Check the status of the class imbalance
df_balanced['Priority'].value_counts()

## Final Blueprint for Text Classification

In [None]:
# End-to-end recap of the four blueprint steps, run on the balanced dataset.

# Loading the balanced DataFrame

df = df_balanced[['text', 'Priority']]
df = df.dropna()

# Step 1 - Data Preparation
# NOTE(review): 'text' already went through clean() in the earlier data
# preparation cell; this second pass is presumably a no-op -- confirm.

df['text'] = df['text'].apply(clean)

# Step 2 - Train-Test Split
# 80/20 split, stratified on Priority, with a fixed seed
X_train, X_test, Y_train, Y_test = train_test_split(df['text'],
                                                    df['Priority'],
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=df['Priority'])
print('Size of Training Data ', X_train.shape[0])
print('Size of Test Data ', X_test.shape[0])

# Step 3 - Training the Machine Learning model
# The vectorizer is fitted on the training split only

tfidf = TfidfVectorizer(min_df=10, ngram_range=(1, 2), stop_words="english")
X_train_tf = tfidf.fit_transform(X_train)

model1 = LinearSVC(random_state=0, tol=1e-5, dual=True)
model1.fit(X_train_tf, Y_train)

# Step 4 - Model Evaluation

X_test_tf = tfidf.transform(X_test)
Y_pred = model1.predict(X_test_tf)
print('Accuracy Score - ', accuracy_score(Y_test, Y_pred))
print()
# zero_division=0 matches the earlier report cell: a class with no predicted
# samples is reported as 0 without raising a warning
print(classification_report(Y_test, Y_pred, zero_division=0))

In [None]:
# Baseline for the balanced data: predictions are drawn at random following
# the training label distribution. The draw is stochastic, so random_state is
# fixed to make the reported baseline accuracy reproducible.
clf = DummyClassifier(strategy='stratified', random_state=42)
clf.fit(X_train, Y_train)
Y_pred_baseline = clf.predict(X_test)
print ('Accuracy Score - ', accuracy_score(Y_test, Y_pred_baseline))

In [None]:
# Collect the test texts together with their actual and predicted priorities
# in one DataFrame that can be explored

frame = {'text': X_test, 'actual': Y_test, 'predicted': Y_pred}
result = pd.DataFrame(frame)

# Two random P1/P2 reports that the model classified correctly
is_high_priority = result['actual'].isin(['P1', 'P2'])
is_correct = result['actual'].eq(result['predicted'])
result[is_high_priority & is_correct].sample(2)

In [None]:
# Two random P1/P2 reports that the model got wrong
is_high_priority = result['actual'].isin(['P1', 'P2'])
is_misclassified = result['actual'].ne(result['predicted'])
result[is_high_priority & is_misclassified].sample(2)

## Blueprint: Using Cross-Validation to Estimate Realistic Accuracy Metrics

In [None]:
# Imports

from sklearn.model_selection import cross_val_score

In [None]:
# Vectorization
# NB: the TF-IDF matrix is kept sparse. LinearSVC accepts sparse input, and
# densifying it with .toarray() only inflates memory use.

tfidf = TfidfVectorizer(min_df = 10, ngram_range=(1,2), stop_words="english")
df_tf = tfidf.fit_transform(df['text'])

# Cross Validation with 5 folds
# NB: the vectorizer above is fitted on all rows before the folds are made,
# so only the classifier (not the TF-IDF vocabulary) is refitted per fold.

scores = cross_val_score(estimator=model1,
                         X=df_tf,
                         y=df['Priority'],
                         cv=5)

print ("Validation scores from each iteration of the cross validation ", scores)
print ("Mean value across of validation scores ", scores.mean())
print ("Standard deviation of validation scores ", scores.std())


## Blueprint: Performing Hyperparameter Tuning with Grid Search

In [None]:
# Imports

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [None]:
# Vectorizer and classifier are wrapped in one Pipeline so that the grid
# search can tune parameters of both steps together.
training_pipeline = Pipeline(
    steps=[('tfidf', TfidfVectorizer(stop_words="english")),
            ('model', LinearSVC(random_state=42, tol=1e-5, dual=True))])

# Two separate grids: the first fixes the penalty/loss and raises max_iter,
# the second varies the regularisation strength C and the tolerance.
# Keys follow the '<step name>__<parameter>' convention of Pipeline.
grid_param = [{
    'tfidf__min_df': [5, 10],
    'tfidf__ngram_range': [(1, 3), (1, 6)],
    'model__penalty': ['l2'],
    'model__loss': ['hinge'],
    'model__max_iter': [10000]
}, {
    'tfidf__min_df': [5, 10],
    'tfidf__ngram_range': [(1, 3), (1, 6)],
    'model__C': [1, 10],
    'model__tol': [1e-2, 1e-3]
}]

# Every parameter combination is scored with 5-fold cross-validation
gridSearchProcessor = GridSearchCV(estimator=training_pipeline,
                                   param_grid=grid_param,
                                   cv=5)
gridSearchProcessor.fit(df['text'], df['Priority'])

# best_params_ is the full winning parameter combination (no 'alpha' is
# tuned here), so the label says "parameters"
best_params = gridSearchProcessor.best_params_
print("Best parameters identified by grid search ", best_params)

best_result = gridSearchProcessor.best_score_
print("Best result identified by grid search ", best_result)

In [None]:
# Tabulate every evaluated parameter combination and show the five best ranked
gridsearch_results = pd.DataFrame(gridSearchProcessor.cv_results_)
summary_columns = ['rank_test_score', 'mean_test_score', 'params']
gridsearch_results[summary_columns].sort_values(by='rank_test_score').head(5)