In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/codeReview/5_models/Mockery/mockery.csv')
df.head()

Unnamed: 0,description,Mockery
0,You must have a black belt in copy-pasting. Br...,1
1,Did you invent a new programming language? Bec...,1
2,Are you a fan of obfuscation? Because this cod...,1
3,"Did you misspell ""bug"" as ""feature"" in the com...",1
4,"I see you're a fan of the ""WTF per minute"" cod...",1


In [5]:
from sklearn.metrics import accuracy_score, classification_report

def evaluate_model(y_pred, y_test, model_name):
  accuracy = accuracy_score(y_test, y_pred)
  print(f'Accuracy {model_name}: {accuracy:.2f}')
  report = classification_report(y_test, y_pred)
  print(f'Classification Report of {model_name}:\n{report}')

In [6]:
def train_model(model, model_name, X_train, X_test, y_train, y_test):
  model.fit(X_train, y_train)
  y_pred=model.predict(X_test)
  evaluate_model(y_pred, y_test, model_name)

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB


def implement_models(X_train, X_test, y_train, y_test):
  train_model(DecisionTreeClassifier(), 'DecisionTree', X_train, X_test, y_train, y_test)
  train_model(SVC(), 'SVD', X_train, X_test, y_train, y_test)
  train_model(GaussianNB(), 'Naive Bayes', X_train, X_test, y_train, y_test)
  train_model(XGBClassifier(), 'XGBOOST', X_train, X_test, y_train, y_test)
  train_model(MLPClassifier(), 'MLP', X_train, X_test, y_train, y_test)


In [9]:
from sklearn.model_selection import train_test_split

# Load FastText representation from the saved file
csv_file_path = '/content/drive/MyDrive/codeReview/5_models/Mockery/fasttext_representation.csv'
loaded_fasttext_representation = pd.read_csv(csv_file_path)

# Combine the loaded FastText representation with the labels
X = loaded_fasttext_representation
y = df['Mockery']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Display the class distribution before oversampling
print("Class distribution before oversampling:")
print(pd.Series(y_train).value_counts())

# Apply SMOTE for oversampling
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Display the class distribution after oversampling
print("\nClass distribution after oversampling:")
print(pd.Series(y_train_resampled).value_counts())



Class distribution before oversampling:
0    163
1     76
Name: Mockery, dtype: int64

Class distribution after oversampling:
1    163
0    163
Name: Mockery, dtype: int64


In [14]:
print('implement models for FastText representation\n')
implement_models(X_train_resampled, X_test, y_train_resampled, y_test)

implement models for FastText representation

Accuracy DecisionTree: 0.75
Classification Report of DecisionTree:
              precision    recall  f1-score   support

           0       0.76      0.86      0.81        37
           1       0.72      0.57      0.63        23

    accuracy                           0.75        60
   macro avg       0.74      0.72      0.72        60
weighted avg       0.75      0.75      0.74        60

Accuracy SVD: 0.85
Classification Report of SVD:
              precision    recall  f1-score   support

           0       0.80      1.00      0.89        37
           1       1.00      0.61      0.76        23

    accuracy                           0.85        60
   macro avg       0.90      0.80      0.82        60
weighted avg       0.88      0.85      0.84        60

Accuracy Naive Bayes: 0.53
Classification Report of Naive Bayes:
              precision    recall  f1-score   support

           0       0.70      0.43      0.53        37
          

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
# Load USE representation from the saved file
csv_file_path = '/content/drive/MyDrive/codeReview/5_models/Mockery/use_representation.csv'
loaded_use_representation = pd.read_csv(csv_file_path)

# Combine the loaded USE representation with the labels
X = loaded_use_representation
y = df['Mockery']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Display the class distribution before oversampling
print("Class distribution before oversampling:")
print(pd.Series(y_train).value_counts())

# Apply SMOTE for oversampling
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Display the class distribution after oversampling
print("\nClass distribution after oversampling:")
print(pd.Series(y_train_resampled).value_counts())

Class distribution before oversampling:
0    163
1     76
Name: Mockery, dtype: int64

Class distribution after oversampling:
1    163
0    163
Name: Mockery, dtype: int64


In [17]:
print('implement models for USE representation\n')
implement_models(X_train, X_test, y_train, y_test)

implement models for USE representation

Accuracy DecisionTree: 0.75
Classification Report of DecisionTree:
              precision    recall  f1-score   support

           0       0.78      0.84      0.81        37
           1       0.70      0.61      0.65        23

    accuracy                           0.75        60
   macro avg       0.74      0.72      0.73        60
weighted avg       0.75      0.75      0.75        60

Accuracy SVD: 0.85
Classification Report of SVD:
              precision    recall  f1-score   support

           0       0.82      0.97      0.89        37
           1       0.94      0.65      0.77        23

    accuracy                           0.85        60
   macro avg       0.88      0.81      0.83        60
weighted avg       0.86      0.85      0.84        60

Accuracy Naive Bayes: 0.87
Classification Report of Naive Bayes:
              precision    recall  f1-score   support

           0       0.84      0.97      0.90        37
           1   

