In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load the generated sentence-pair spreadsheet.
df = pd.read_excel('../generated_data/similar_sentences_large.xlsx')

# Stage 1: keep only the label and the two sentences.
df_stage1 = df[['label', 'sentence1', 'sentence2']]

# Stage 2: one text column per row, the two sentences joined by a '<SEP>' marker.
df_stage2 = pd.DataFrame({
    'label': df_stage1['label'],
    'sentence_pair': df_stage1['sentence1'] + '<SEP>' + df_stage1['sentence2'],
})

In [26]:
# (rows, columns) of the raw spreadsheet
df.shape

(4299, 7)

In [27]:
# Class distribution of the target column
df['label'].value_counts()

STYLYSTIC     2656
IRRELEVANT     838
RELEVANT       805
Name: label, dtype: int64

In [33]:
# Define a function to clean up the sentence text
def text_preprocess(ds: pd.Series) -> pd.Series:
    """
    Apply basic NLP preprocessing to every string in ``ds``.

    For each element: replace every non-ASCII-letter character with a space,
    lower-case, split on whitespace, drop English stop words and
    single-character tokens, lemmatize the remaining tokens, and join them
    back with single spaces.

    Parameters
    ----------
    ds : pd.Series
        Series of strings. Any index is accepted; it is preserved in the
        result.

    Returns
    -------
    pd.Series
        A new Series with the cleaned strings. The input Series is left
        unmodified.

    Raises
    ------
    TypeError
        If an element is not a string (raised by ``re.sub``).
    """
    # Build the stop-word set and the lemmatizer once per call instead of
    # once per word / once per row.
    stop_words = set(stopwords.words('english'))
    lem = WordNetLemmatizer()

    def _clean(text: str) -> str:
        # Retain only alphabetic characters, then tokenize on whitespace.
        tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
        # Remove stop words and 1-letter tokens; group word forms via lemmatization.
        kept = [lem.lemmatize(w) for w in tokens if w not in stop_words and len(w) > 1]
        return ' '.join(kept)

    # Series.map is index-agnostic, so this also works when the index is not
    # 0..n-1 (e.g. after filtering rows), and it does not write into `ds`.
    return ds.map(_clean)

#df_stage2['sentence_pair'] = text_preprocess(df_stage2['sentence_pair'])

In [5]:
# Features: the joined sentence-pair text; target: the class label.
# (Selected by name; these are columns 1 and 0 of df_stage2 respectively.)
X = df_stage2['sentence_pair'].values
y = df_stage2['label'].values

In [6]:

# Encode each sentence pair as a dense TF-IDF vector over at most 100 terms.
# NOTE(review): the vectorizer is fitted on the full corpus here, before the
# train/test split in the next cell, so its statistics also see the test rows.
# Consider splitting the raw text first and fitting on the training part only.
td = TfidfVectorizer(max_features=100)
X = td.fit_transform(X).toarray()

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Randomly under-sample the training split (only X_train / y_train are resampled;
# the test split is left as is).
# NOTE(review): the previous comment said "upsampling the minority class" and the
# variable is named `ros`, but the code uses RandomUnderSampler — confirm that
# under-sampling (not over-sampling) is what was intended.
from imblearn.under_sampling import RandomUnderSampler
ros = RandomUnderSampler(random_state=42)
X_train, y_train = ros.fit_resample(X_train, y_train)

In [12]:
# Baseline model: multinomial Naive Bayes trained on the resampled training split.
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

# Classification metrics
from sklearn.metrics import accuracy_score, classification_report
# Store the report text under its own name. Assigning it to `classification_report`
# would rebind the imported function to a string, and any later call to
# classification_report(...) would fail with "'str' object is not callable".
nb_report = classification_report(y_test, y_pred, digits=3)

print('\n Accuracy: ', accuracy_score(y_test, y_pred))
print('\nClassification Report')
print('======================================================')
print('\n', nb_report)


 Accuracy:  0.5983037779491134

Classification Report

               precision    recall  f1-score   support

  IRRELEVANT      0.587     0.792     0.674       475
    RELEVANT      0.321     0.837     0.464       417
   STYLYSTIC      0.954     0.486     0.644      1702

    accuracy                          0.598      2594
   macro avg      0.621     0.705     0.594      2594
weighted avg      0.785     0.598     0.620      2594



In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [15]:
def simple_logistic_classify(X_tr, y_tr, X_test, y_test, description, _C=1.0):
    """
    Fit a logistic-regression classifier and print its test-set metrics.

    Parameters
    ----------
    X_tr, y_tr
        Training features and labels passed to ``LogisticRegression.fit``.
    X_test, y_test
        Held-out features and labels used for the printed report.
    description
        Text appended to the report header (e.g. the feature set's name).
    _C : float, default 1.0
        Value passed as ``C`` to ``LogisticRegression``.

    Returns
    -------
    The fitted ``LogisticRegression`` model.
    """
    model = LogisticRegression(C=_C).fit(X_tr, y_tr)
    # Predict once and reuse the result for both metrics.
    y_pred = model.predict(X_test)

    # classification report
    print('Classification Report for ', description)
    print('======================================================')
    print(classification_report(y_test, y_pred))
    print('======================================================')
    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print('======================================================')
    return model

# Logistic regression on the same TF-IDF features, for comparison with Naive Bayes.
simple_logistic_classify(
    X_train,
    y_train,
    X_test,
    y_test,
    description='tfidf',
)

Classification Report for  tfidf
              precision    recall  f1-score   support

  IRRELEVANT       0.77      0.86      0.81       475
    RELEVANT       0.55      0.88      0.68       417
   STYLYSTIC       0.97      0.79      0.87      1702

    accuracy                           0.82      2594
   macro avg       0.76      0.84      0.79      2594
weighted avg       0.86      0.82      0.83      2594

Accuracy:  0.818041634541249
