In [1]:
import nltk
import os
import re
import sklearn
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2,SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report

In [6]:
def preprocess(line):
    line = re.sub('[^\w\s]'," ",str(line))
    line = re.sub('[^a-zA-Z]'," ",str(line))
    _line = word_tokenize(line)
    final = " "
    for i in range(len(_line)):
        if(_line[i].lower() not in stopwords.words('english')):
            final+=_line[i].lower()+" "
    return final

In [7]:
## Source Domain: Books
print("Source Domain: Books")

## reading positive reviews
with open('books/positive.review','r',encoding='utf-8') as f:
    source_pos = f.readlines()

## cleaning data
for i in range(0,len(source_pos)):
    source_pos[i] = re.sub(r'\d+', '', source_pos[i])
    source_pos[i] = source_pos[i].replace(' #label#:positive\n', '')
    source_pos[i] = preprocess(source_pos[i])
print("number of positive reviews = ", len(source_pos))

## reading negative reviews
with open('books/negative.review','r',encoding='utf-8') as f:
    source_neg = f.readlines()

## cleaning data
for i in range(0,len(source_neg)):
    source_neg[i] = re.sub(r'\d+', '', source_neg[i])
    source_neg[i] = source_neg[i].replace(' #label#:negative\n', '')
    source_neg[i] = preprocess(source_neg[i])
print("number of negative reviews = ", len(source_neg))

Source Domain: Books
number of positive reviews =  1000
number of negative reviews =  1000


In [8]:
## source domain training set
source_X_train = source_pos + source_neg
source_y_train = np.zeros(2000)
source_y_train[0:1000] = 1

## vectorizing the source domain training set
vectorizer = TfidfVectorizer(min_df = 2,max_df=0.8,use_idf=True,sublinear_tf=True,stop_words='english')
vectorizer_chi2 = SelectKBest(chi2,k=4500)
source_vector_X_train = vectorizer.fit_transform(source_X_train)
source_vector_X_train = vectorizer_chi2.fit_transform(source_vector_X_train,source_y_train)
print("source domain training set vector: ", source_vector_X_train.shape)

source domain training set vector:  (2000, 4500)


In [9]:
## Target Domain: Electronics
print("Target Domain: Electronics")

## reading positive reviews
with open('electronics/positive.review','r',encoding='utf-8') as f:
    target_pos = f.readlines()

## cleaning data
for i in range(0,len(target_pos)):
    target_pos[i] = re.sub(r'\d+', '', target_pos[i])
    target_pos[i] = target_pos[i].replace(' #label#:positive\n', '').replace(':', '').replace('_', ' ')
    target_pos[i] = ' '.join(target_pos[i].split())
print("number of positive reviews = ", len(target_pos))

## reading negative reviews
with open('electronics/negative.review','r',encoding='utf-8') as f:
    target_neg = f.readlines()
    
## cleaning data
for i in range(0,len(target_neg)):
    target_neg[i] = re.sub(r'\d+', '', target_neg[i])
    target_neg[i] = target_neg[i].replace(' #label#:negative\n', '').replace(':', '').replace('_', ' ')
    target_neg[i] = ' '.join(target_neg[i].split())
print("number of negative reviews = ", len(target_neg))

Target Domain: Electronics
number of positive reviews =  1000
number of negative reviews =  1000


In [10]:
## target domain training and test sets
target_X_train = target_pos[0:800] + target_neg[0:800]
target_y_train = np.zeros(1600)
target_y_train[0:800] = 1
target_X_test = target_pos[800:1000] + target_neg[800:1000]
target_y_test = np.zeros(400)
target_y_test[0:200] = 1

## vectorizing the target domain training and test sets
target_vector_X_train = vectorizer.fit_transform(target_X_train)
target_vector_X_train = vectorizer_chi2.fit_transform(target_vector_X_train,target_y_train)
print("target domain training set vector: ", target_vector_X_train.shape)
target_vector_X_test = vectorizer.transform(target_X_test)
target_vector_X_test = vectorizer_chi2.transform(target_vector_X_test)
print("target domain test set vector: ", target_vector_X_test.shape)

target domain training set vector:  (1600, 4500)
target domain test set vector:  (400, 4500)


In [11]:
## Training a logistic regression classifier on the source domain training set
lr_model = LogisticRegression(penalty="l2", C=1.0)
lr_model.fit(source_vector_X_train, source_y_train)

## Evaluating it on the target domain test set
lr_pred = lr_model.predict(target_vector_X_test)
cross_domain_transfer_accuracy = lr_model.score(target_vector_X_test, target_y_test)
print("Cross-domain Transfer Accuracy: ", cross_domain_transfer_accuracy)
print("\nConfusion Matrix:\n", confusion_matrix(target_y_test,lr_pred))
print("\nEvaluation: \n0: negative \n1: positive\n", classification_report(target_y_test,lr_pred))

Cross-domain Transfer Accuracy:  0.485

Confusion Matrix:
 [[ 92 108]
 [ 98 102]]

Evaluation: 
0: negative 
1: positive
               precision    recall  f1-score   support

         0.0       0.48      0.46      0.47       200
         1.0       0.49      0.51      0.50       200

    accuracy                           0.48       400
   macro avg       0.48      0.48      0.48       400
weighted avg       0.48      0.48      0.48       400





In [12]:
## Training a logistic regression classifier on the target domain training set
lr_model = LogisticRegression(penalty="l2", C=1.0)
lr_model.fit(target_vector_X_train, target_y_train)

## Evaluating it on the target domain test set
lr_pred = lr_model.predict(target_vector_X_test)
direct_transfer_accuracy = lr_model.score(target_vector_X_test, target_y_test)
print("Direct Transfer Accuracy: ", direct_transfer_accuracy)
print("\nConfusion Matrix:\n", confusion_matrix(target_y_test,lr_pred))
print("\nEvaluation: \n0: negative \n1: positive\n", classification_report(target_y_test,lr_pred))

Direct Transfer Accuracy:  0.875

Confusion Matrix:
 [[176  24]
 [ 26 174]]

Evaluation: 
0: negative 
1: positive
               precision    recall  f1-score   support

         0.0       0.87      0.88      0.88       200
         1.0       0.88      0.87      0.87       200

    accuracy                           0.88       400
   macro avg       0.88      0.88      0.87       400
weighted avg       0.88      0.88      0.87       400





In [14]:
loss = direct_transfer_accuracy - cross_domain_transfer_accuracy
print("The transfer loss across domains = ", round(loss,4))

The transfer loss across domains =  0.39
