In [None]:
# Mount Google Drive so the dataset paths under /content/drive resolve (Colab-only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Load the annotated review comments; the preview below shows a `description`
# text column and a binary `SID` label column.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/codeReview/5_models/DSP/DSP.csv',
)
df.head()

Unnamed: 0,description,SID
0,But you still have the data() used as null-ter...,1
1,consider rename to current_l3_agents,1
2,Die. Die. Die XML. I'd remove this hideous rem...,1
3,"I really don't like this, we should remove all...",1
4,I'd like to see a lot of these mocks moved to ...,1


In [None]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

def evaluate_model(y_pred, y_test, model_name):
  """Print the accuracy and the classification report for one model.

  Args:
    y_pred: labels predicted by the model.
    y_test: ground-truth labels the predictions are compared against.
    model_name: label used in the printed headings.

  Returns:
    None. Results are only printed.
  """
  # Both sklearn metrics take the ground truth first, then the predictions.
  accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
  print(f'Accuracy {model_name}: {accuracy:.2f}')

  report = classification_report(y_true=y_test, y_pred=y_pred)
  print(f'Classification Report of {model_name}:\n{report}')


In [None]:
def train_model(model, model_name, X_train, X_test, y_train, y_test):
  """Fit ``model`` on the training data and print its test-set metrics.

  Args:
    model: estimator exposing ``fit(X, y)`` and ``predict(X)``.
    model_name: label forwarded to ``evaluate_model`` for the printed headings.
    X_train: training features.
    X_test: test features.
    y_train: training labels.
    y_test: test labels.

  Returns:
    None. The model is fitted in place and metrics are printed.
  """
  model.fit(X_train, y_train)
  predictions = model.predict(X_test)
  evaluate_model(y_pred=predictions, y_test=y_test, model_name=model_name)


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

def implement_models(X_train, X_test, y_train, y_test, random_state=42):
  """Train and evaluate every baseline classifier on one feature representation.

  Each classifier is fitted on ``(X_train, y_train)`` and its accuracy and
  classification report on ``(X_test, y_test)`` are printed by ``train_model``.

  Args:
    X_train: training features.
    X_test: test features.
    y_train: training labels.
    y_test: test labels.
    random_state: seed passed to the classifiers that are stochastic
      (decision tree, XGBoost, MLP) so repeated runs print the same metrics.

  Returns:
    None.
  """
  # SVC() and GaussianNB() are left at their defaults: with default settings
  # they do not draw random numbers, so they need no seed.
  models = [
      (DecisionTreeClassifier(random_state=random_state), 'DecisionTree'),
      (SVC(), 'SVM'),
      (GaussianNB(), 'Naive Bayes'),
      (XGBClassifier(random_state=random_state), 'XGBOOST'),
      (MLPClassifier(random_state=random_state), 'MLP'),
  ]
  for model, model_name in models:
    train_model(model, model_name, X_train, X_test, y_train, y_test)


In [None]:
from imblearn.over_sampling import SMOTE

def overSample(X_train, y_train):
    """Balance the training classes with SMOTE and print the class counts.

    The label counts are printed once before and once after resampling.

    Args:
        X_train: training features passed to ``SMOTE.fit_resample``.
        y_train: training labels.

    Returns:
        Tuple ``(X_resampled, y_resampled)`` as returned by ``fit_resample``.
    """
    print("Class distribution before oversampling:")
    print(pd.Series(y_train).value_counts())

    # Fixed seed so the synthetic samples are identical on every run.
    balanced_X, balanced_y = SMOTE(random_state=42).fit_resample(X_train, y_train)

    print("\nClass distribution after oversampling:")
    print(pd.Series(balanced_y).value_counts())
    return balanced_X, balanced_y


In [None]:
from sklearn.model_selection import train_test_split

# Model Training
# Load BoW representation from the saved file
csv_file_path = '/content/drive/MyDrive/codeReview/5_models/DSP/bow_representation.csv'
loaded_bow_representation = pd.read_csv(csv_file_path)

# Combine the loaded BoW representation with the labels.
# The label column of DSP.csv is 'SID' (see the df.head() preview); df has no
# 'DSP' column, so df['DSP'] raised KeyError on a fresh kernel.
X = loaded_bow_representation
y = df['SID']

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Create a KFold object
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform K-Fold cross-validation.
# The MLP is seeded too: its weight initialisation is random, so without a
# fixed random_state the fold scores change on every run even though the
# folds themselves are fixed.
cross_val_results = cross_val_score(MLPClassifier(random_state=42), X, y, cv=kf, scoring='accuracy')

# Display the cross-validation results
print(f'Cross-Validation Results: {cross_val_results}')
print(f'Mean Accuracy: {cross_val_results.mean()}')



Cross-Validation Results: [0.70175439 0.73684211 0.73684211 0.68421053 0.67857143]
Mean Accuracy: 0.7076441102756892




In [None]:
# Oversample the training split only; X_test / y_test are not resampled.
X_train_resampled, y_train_resampled = overSample(X_train=X_train, y_train=y_train)

Class distribution before oversampling:
0    157
1     70
Name: SID, dtype: int64

Class distribution after oversampling:
1    157
0    157
Name: SID, dtype: int64


In [None]:
# Fit on the oversampled training data, evaluate on the original test split.
print('implement models for BoW representation')
implement_models(
    X_train=X_train_resampled,
    X_test=X_test,
    y_train=y_train_resampled,
    y_test=y_test,
)

implement models for BoW representation
Accuracy DecisionTree: 0.68
Classification Report of DecisionTree:
              precision    recall  f1-score   support

           0       0.80      0.72      0.76        39
           1       0.50      0.61      0.55        18

    accuracy                           0.68        57
   macro avg       0.65      0.66      0.65        57
weighted avg       0.71      0.68      0.69        57

Accuracy SVD: 0.39
Classification Report of SVD:
              precision    recall  f1-score   support

           0       0.75      0.15      0.26        39
           1       0.33      0.89      0.48        18

    accuracy                           0.39        57
   macro avg       0.54      0.52      0.37        57
weighted avg       0.62      0.39      0.33        57

Accuracy Naive Bayes: 0.68
Classification Report of Naive Bayes:
              precision    recall  f1-score   support

           0       0.74      0.82      0.78        39
           1    



In [None]:
# Load TF_IDF representation from the saved file
csv_file_path = '/content/drive/MyDrive/codeReview/5_models/DSP/tfidf_representation.csv'
loaded_tfidf_representation = pd.read_csv(csv_file_path)

# Combine the loaded TF_IDF representation with the labels.
# The label column of DSP.csv is 'SID' (see the df.head() preview); df has no
# 'DSP' column, so df['DSP'] raised KeyError on a fresh kernel.
X = loaded_tfidf_representation
y = df['SID']

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Oversample the training split only; X_test / y_test are not resampled.
X_train_resampled, y_train_resampled = overSample(X_train=X_train, y_train=y_train)

Class distribution before oversampling:
0    157
1     70
Name: SID, dtype: int64

Class distribution after oversampling:
1    157
0    157
Name: SID, dtype: int64


In [None]:
# Fit on the oversampled training data, evaluate on the original test split.
print('implement models for TF_IDF representation\n')
implement_models(
    X_train=X_train_resampled,
    X_test=X_test,
    y_train=y_train_resampled,
    y_test=y_test,
)

implement models for TF_IDF representation

Accuracy DecisionTree: 0.63
Classification Report of DecisionTree:
              precision    recall  f1-score   support

           0       0.72      0.74      0.73        39
           1       0.41      0.39      0.40        18

    accuracy                           0.63        57
   macro avg       0.57      0.57      0.57        57
weighted avg       0.63      0.63      0.63        57

Accuracy SVD: 0.70
Classification Report of SVD:
              precision    recall  f1-score   support

           0       0.70      1.00      0.82        39
           1       1.00      0.06      0.11        18

    accuracy                           0.70        57
   macro avg       0.85      0.53      0.46        57
weighted avg       0.79      0.70      0.60        57

Accuracy Naive Bayes: 0.74
Classification Report of Naive Bayes:
              precision    recall  f1-score   support

           0       0.79      0.85      0.81        39
           1

In [None]:
# Load Word2Vec representation from the saved file
csv_file_path = '/content/drive/MyDrive/codeReview/5_models/DSP/word2vec_representation.csv'
loaded_word2vec_representation = pd.read_csv(csv_file_path)

# Combine the loaded Word2Vec representation with the labels.
# The label column of DSP.csv is 'SID' (see the df.head() preview); df has no
# 'DSP' column, so df['DSP'] raised KeyError on a fresh kernel.
X = loaded_word2vec_representation
y = df['SID']

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Oversample the training split only; X_test / y_test are not resampled.
X_train_resampled, y_train_resampled = overSample(X_train=X_train, y_train=y_train)

Class distribution before oversampling:
0    157
1     70
Name: SID, dtype: int64

Class distribution after oversampling:
1    157
0    157
Name: SID, dtype: int64


In [None]:
# Fit on the oversampled training data, evaluate on the original test split.
print('implement models for Word2Vec representation\n')
implement_models(
    X_train=X_train_resampled,
    X_test=X_test,
    y_train=y_train_resampled,
    y_test=y_test,
)

implement models for Word2Vec representation

Accuracy DecisionTree: 0.63
Classification Report of DecisionTree:
              precision    recall  f1-score   support

           0       0.76      0.67      0.71        39
           1       0.43      0.56      0.49        18

    accuracy                           0.63        57
   macro avg       0.60      0.61      0.60        57
weighted avg       0.66      0.63      0.64        57

Accuracy SVD: 0.68
Classification Report of SVD:
              precision    recall  f1-score   support

           0       0.73      0.85      0.79        39
           1       0.50      0.33      0.40        18

    accuracy                           0.68        57
   macro avg       0.62      0.59      0.59        57
weighted avg       0.66      0.68      0.66        57

Accuracy Naive Bayes: 0.53
Classification Report of Naive Bayes:
              precision    recall  f1-score   support

           0       0.70      0.54      0.61        39
          



In [None]:
# Load Glove representation from the saved file
csv_file_path = '/content/drive/MyDrive/codeReview/5_models/DSP/glove_representation.csv'
loaded_glove_representation = pd.read_csv(csv_file_path)

# Combine the loaded Glove representation with the labels.
# The label column of DSP.csv is 'SID' (see the df.head() preview); df has no
# 'DSP' column, so df['DSP'] raised KeyError on a fresh kernel.
X = loaded_glove_representation
y = df['SID']

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Oversample the training split only; X_test / y_test are not resampled.
X_train_resampled, y_train_resampled = overSample(X_train=X_train, y_train=y_train)

Class distribution before oversampling:
0    157
1     70
Name: SID, dtype: int64

Class distribution after oversampling:
1    157
0    157
Name: SID, dtype: int64


In [None]:
# Fit on the oversampled training data, evaluate on the original test split.
print('implement models for Glove representation\n')
implement_models(
    X_train=X_train_resampled,
    X_test=X_test,
    y_train=y_train_resampled,
    y_test=y_test,
)

implement models for Glove representation

Accuracy DecisionTree: 0.51
Classification Report of DecisionTree:
              precision    recall  f1-score   support

           0       0.69      0.51      0.59        39
           1       0.32      0.50      0.39        18

    accuracy                           0.51        57
   macro avg       0.51      0.51      0.49        57
weighted avg       0.57      0.51      0.53        57

Accuracy SVD: 0.60
Classification Report of SVD:
              precision    recall  f1-score   support

           0       0.74      0.64      0.68        39
           1       0.39      0.50      0.44        18

    accuracy                           0.60        57
   macro avg       0.56      0.57      0.56        57
weighted avg       0.63      0.60      0.61        57

Accuracy Naive Bayes: 0.49
Classification Report of Naive Bayes:
              precision    recall  f1-score   support

           0       0.67      0.51      0.58        39
           1 



In [None]:
# Load FastText representation from the saved file
csv_file_path = '/content/drive/MyDrive/codeReview/5_models/DSP/fasttext_representation.csv'
loaded_fasttext_representation = pd.read_csv(csv_file_path)

# Combine the loaded FastText representation with the labels.
# The label column of DSP.csv is 'SID' (see the df.head() preview); df has no
# 'DSP' column, so df['DSP'] raised KeyError on a fresh kernel.
X = loaded_fasttext_representation
y = df['SID']

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Oversample the training split only; X_test / y_test are not resampled.
X_train_resampled, y_train_resampled = overSample(X_train=X_train, y_train=y_train)

Class distribution before oversampling:
0    157
1     70
Name: SID, dtype: int64

Class distribution after oversampling:
1    157
0    157
Name: SID, dtype: int64


In [None]:
# Fit on the oversampled training data, evaluate on the original test split.
print('implement models for FastText representation\n')
implement_models(
    X_train=X_train_resampled,
    X_test=X_test,
    y_train=y_train_resampled,
    y_test=y_test,
)

implement models for FastText representation

Accuracy DecisionTree: 0.54
Classification Report of DecisionTree:
              precision    recall  f1-score   support

           0       0.67      0.67      0.67        39
           1       0.28      0.28      0.28        18

    accuracy                           0.54        57
   macro avg       0.47      0.47      0.47        57
weighted avg       0.54      0.54      0.54        57

Accuracy SVD: 0.61
Classification Report of SVD:
              precision    recall  f1-score   support

           0       0.81      0.56      0.67        39
           1       0.43      0.72      0.54        18

    accuracy                           0.61        57
   macro avg       0.62      0.64      0.60        57
weighted avg       0.69      0.61      0.63        57

Accuracy Naive Bayes: 0.61
Classification Report of Naive Bayes:
              precision    recall  f1-score   support

           0       0.84      0.54      0.66        39
          

In [None]:
# Load USE representation from the saved file
csv_file_path = '/content/drive/MyDrive/codeReview/5_models/DSP/use_representation.csv'
loaded_use_representation = pd.read_csv(csv_file_path)

# Combine the loaded USE representation with the labels.
# The label column of DSP.csv is 'SID' (see the df.head() preview); df has no
# 'DSP' column, so df['DSP'] raised KeyError on a fresh kernel.
X = loaded_use_representation
y = df['SID']

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:

# Create a KFold object
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform K-Fold cross-validation.
# The MLP is seeded too: its weight initialisation is random, so without a
# fixed random_state the fold scores change on every run even though the
# folds themselves are fixed.
cross_val_results = cross_val_score(MLPClassifier(random_state=42), X, y, cv=kf, scoring='accuracy')

# Display the cross-validation results
print(f'Cross-Validation Results: {cross_val_results}')
print(f'Mean Accuracy: {cross_val_results.mean()}')



Cross-Validation Results: [0.71929825 0.75438596 0.71929825 0.66666667 0.67857143]
Mean Accuracy: 0.7076441102756892




In [None]:
# Oversample the training split only; X_test / y_test are not resampled.
X_train_resampled, y_train_resampled = overSample(X_train=X_train, y_train=y_train)

Class distribution before oversampling:
0    157
1     70
Name: SID, dtype: int64

Class distribution after oversampling:
1    157
0    157
Name: SID, dtype: int64


In [None]:
# Fit on the oversampled training data, evaluate on the original test split.
print('implement models for USE representation\n')
implement_models(
    X_train=X_train_resampled,
    X_test=X_test,
    y_train=y_train_resampled,
    y_test=y_test,
)

implement models for USE representation

Accuracy DecisionTree: 0.68
Classification Report of DecisionTree:
              precision    recall  f1-score   support

           0       0.82      0.69      0.75        39
           1       0.50      0.67      0.57        18

    accuracy                           0.68        57
   macro avg       0.66      0.68      0.66        57
weighted avg       0.72      0.68      0.69        57

Accuracy SVD: 0.74
Classification Report of SVD:
              precision    recall  f1-score   support

           0       0.73      0.97      0.84        39
           1       0.80      0.22      0.35        18

    accuracy                           0.74        57
   macro avg       0.77      0.60      0.59        57
weighted avg       0.75      0.74      0.68        57

Accuracy Naive Bayes: 0.65
Classification Report of Naive Bayes:
              precision    recall  f1-score   support

           0       0.70      0.85      0.77        39
           1   



In [None]:
# Load BERT representation from the saved file
csv_file_path = '/content/drive/MyDrive/codeReview/5_models/DSP/bert_representation.csv'
loaded_bert_representation = pd.read_csv(csv_file_path)

# Combine the loaded BERT representation with the labels.
# The label column of DSP.csv is 'SID' (see the df.head() preview); df has no
# 'DSP' column, so df['DSP'] raised KeyError on a fresh kernel.
X = loaded_bert_representation
y = df['SID']

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Oversample the training split only; X_test / y_test are not resampled.
X_train_resampled, y_train_resampled = overSample(X_train=X_train, y_train=y_train)

Class distribution before oversampling:
0    157
1     70
Name: SID, dtype: int64

Class distribution after oversampling:
1    157
0    157
Name: SID, dtype: int64


In [None]:
# Fit on the oversampled training data, evaluate on the original test split.
print('implement models for BERT representation\n')
implement_models(
    X_train=X_train_resampled,
    X_test=X_test,
    y_train=y_train_resampled,
    y_test=y_test,
)

implement models for BERT representation

Accuracy DecisionTree: 0.58
Classification Report of DecisionTree:
              precision    recall  f1-score   support

           0       0.70      0.67      0.68        39
           1       0.35      0.39      0.37        18

    accuracy                           0.58        57
   macro avg       0.53      0.53      0.53        57
weighted avg       0.59      0.58      0.58        57

Accuracy SVD: 0.60
Classification Report of SVD:
              precision    recall  f1-score   support

           0       0.72      0.67      0.69        39
           1       0.38      0.44      0.41        18

    accuracy                           0.60        57
   macro avg       0.55      0.56      0.55        57
weighted avg       0.61      0.60      0.60        57

Accuracy Naive Bayes: 0.65
Classification Report of Naive Bayes:
              precision    recall  f1-score   support

           0       0.74      0.74      0.74        39
           1  