In [30]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [31]:
# Load the spreadsheet and keep only the label and the two sentence columns.
df = pd.read_excel('../generated_data/similar_sentences_large.xlsx')
df_stage1 = df[['label' , 'sentence1', 'sentence2']]

# Stage 2: one text field per row -- both sentences joined by a literal
# '<SEP>' marker -- alongside the unchanged label column.
df_stage2 = pd.DataFrame({
    'label': df_stage1['label'],
    'sentence_pair': df_stage1['sentence1'] + '<SEP>' + df_stage1['sentence2'],
})

In [32]:
# Display the dimensions of the raw frame as (rows, columns).
df.shape    

(12968, 7)

In [33]:
# Define a function that cleans and normalizes the text of each sentence pair
def text_preprocess(ds: pd.Series) -> pd.Series:
    """
    Apply NLP preprocessing to every text entry of a Series.

    Each entry is processed as follows:
      1. every character that is not an ASCII letter is replaced by a space;
      2. the text is lower-cased and split on whitespace;
      3. English stopwords (NLTK ``stopwords`` corpus) and single-character
         tokens are dropped;
      4. the remaining tokens are lemmatized with ``WordNetLemmatizer`` and
         re-joined with single spaces.

    Parameters
    ----------
    ds : pd.Series
        Series of text entries. Any index is accepted. Missing values
        (None / NaN / pd.NA) are treated as empty text; other non-string
        values are converted with ``str()`` before cleaning.

    Returns
    -------
    pd.Series
        A new Series with the same index as ``ds`` holding the cleaned
        strings. ``ds`` itself is left unmodified.
    """
    # Build these once: rebuilding the stopword set for every token (and the
    # lemmatizer for every row) makes the cleaning needlessly slow.
    stop_words = set(stopwords.words('english'))
    lem = WordNetLemmatizer()
    non_letters = re.compile('[^a-zA-Z]')

    def _clean(text) -> str:
        """Clean a single entry and return it as a space-joined string."""
        if not isinstance(text, str):
            # pd.isna covers None, NaN and pd.NA.
            text = '' if pd.isna(text) else str(text)

        tokens = non_letters.sub(' ', text).lower().split()    # Retain only alphabets
        kept = [w for w in tokens if w not in stop_words and len(w) > 1]
        return ' '.join(lem.lemmatize(w) for w in kept)         # Group different forms of the same word

    # Element-wise map instead of ``ds[m]`` label lookups: works for any index
    # (e.g. after filtering or splitting) and does not write into the caller's
    # Series.
    return ds.map(_clean)

#df_stage2['sentence_pair'] = text_preprocess(df_stage2['sentence_pair'])

In [34]:
# Features are the joined sentence-pair text; targets are the class labels.
# Columns are selected by name rather than by position.
X = df_stage2['sentence_pair'].values
y = df_stage2['label'].values

In [35]:

# Turn the sentence-pair text into a dense matrix of at most 500 TF-IDF features.
# The text is read from df_stage2 rather than from X, so re-running this cell
# never feeds the already-vectorized matrix back into the vectorizer.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so vocabulary/IDF statistics from the test rows leak into
# training -- consider fitting on the training text only.
td = TfidfVectorizer(max_features = 500)
X = td.fit_transform(df_stage2['sentence_pair'].values).toarray()

In [36]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [39]:
# Rebalance the training split with imbalanced-learn's RandomUnderSampler.
# Note: this UNDER-samples; it does not upsample the minority class.
# Only X_train / y_train are resampled -- X_test / y_test are not passed to
# the sampler and keep their original class distribution.
from imblearn.under_sampling import RandomUnderSampler
ros = RandomUnderSampler(random_state=42)  # name kept as-is; the object is an under-sampler
X_train, y_train = ros.fit_resample(X_train, y_train)

In [40]:
# Fit a multinomial Naive Bayes classifier on the training split and predict
# labels for the held-out test split.
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

# Classification metrics
from sklearn.metrics import accuracy_score, classification_report

# Store the rendered report under its own name: binding it to
# `classification_report` would replace the imported function with a string,
# so any later call to the function would fail.
report = classification_report(y_test, y_pred)

print('\n Accuracy: ', accuracy_score(y_test, y_pred))
print('\nClassification Report')
print('======================================================')
print('\n', report)


 Accuracy:  0.7652274479568234

Classification Report

               precision    recall  f1-score   support

  IRRELEVANT       0.66      0.84      0.74       475
    RELEVANT       0.49      0.88      0.63       417
   STYLYSTIC       0.97      0.72      0.83      1702

    accuracy                           0.77      2594
   macro avg       0.71      0.81      0.73      2594
weighted avg       0.84      0.77      0.78      2594

