In [2]:
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir("/content/drive/My Drive/nlp")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#Logistic regression

In [11]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer


x_train = pd.read_csv('x_train.csv', converters = {'review': str})
x_train = x_train['review'].values
x_test = pd.read_csv('x_test.csv', converters = {'review': str})
x_test = x_test['review'].values
y_train = pd.read_csv('y_train.csv').values.ravel()
y_test = pd.read_csv('y_test.csv').values.ravel()

tfidf = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0, ngram_range=(1,2), sublinear_tf=True)
x_train = tfidf.fit_transform(x_train)
x_test = tfidf.transform(x_test)

lr = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=123)
lr_fit = lr.fit(x_train, y_train)
print(lr_fit)
lr_predict = lr.predict(x_test)

report = classification_report(y_test, lr_predict, target_names=['Positive', 'Negative'])
print(report)
cm = confusion_matrix(y_test, lr_predict, labels=[1,0])
print(cm)

LogisticRegression(C=1, max_iter=500, random_state=123)
              precision    recall  f1-score   support

    Positive       0.91      0.88      0.89      3824
    Negative       0.88      0.91      0.89      3676

    accuracy                           0.89      7500
   macro avg       0.89      0.89      0.89      7500
weighted avg       0.89      0.89      0.89      7500

[[3352  324]
 [ 471 3353]]


In [10]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer

x_train = pd.read_csv('x_train.csv', converters = {'review': str})
x_train = x_train['review'].values
x_test = pd.read_csv('x_test.csv', converters = {'review': str})
x_test = x_test['review'].values
y_train = pd.read_csv('y_train.csv').values.ravel()
y_test = pd.read_csv('y_test.csv').values.ravel()

cv = CountVectorizer(binary=False, min_df=0.0, max_df=1.0, ngram_range=(1,2))
x_train = cv.fit_transform(x_train)
x_test = cv.transform(x_test)

lr = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=123)
lr_fit = lr.fit(x_train, y_train)
print(lr_fit)
lr_predict = lr.predict(x_test)

report = classification_report(y_test, lr_predict, target_names=['Positive', 'Negative'])
print(report)
cm = confusion_matrix(y_test, lr_predict, labels=[1,0])
print(cm)

LogisticRegression(C=1, max_iter=500, random_state=123)
              precision    recall  f1-score   support

    Positive       0.91      0.89      0.90      3824
    Negative       0.89      0.91      0.90      3676

    accuracy                           0.90      7500
   macro avg       0.90      0.90      0.90      7500
weighted avg       0.90      0.90      0.90      7500

[[3357  319]
 [ 416 3408]]


#SVM

In [3]:
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer


x_train = pd.read_csv('x_train.csv', converters = {'review': str})
x_train = x_train['review'].values
x_test = pd.read_csv('x_test.csv', converters = {'review': str})
x_test = x_test['review'].values
y_train = pd.read_csv('y_train.csv').values.ravel()
y_test = pd.read_csv('y_test.csv').values.ravel()

tfidf = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0, ngram_range=(1,2), sublinear_tf=True)
x_train = tfidf.fit_transform(x_train)
x_test = tfidf.transform(x_test)

svm = SGDClassifier(loss='hinge', max_iter=500, random_state=123)
svm = svm.fit(x_train, y_train)
svm_predict = svm.predict(x_test)

report = classification_report(y_test, svm_predict, target_names=['Positive', 'Negative'])
print(report)
cm = confusion_matrix(y_test, svm_predict, labels=[1,0])
print(cm)

              precision    recall  f1-score   support

    Positive       0.91      0.87      0.89      3824
    Negative       0.87      0.92      0.89      3676

    accuracy                           0.89      7500
   macro avg       0.89      0.89      0.89      7500
weighted avg       0.89      0.89      0.89      7500

[[3367  309]
 [ 498 3326]]


In [4]:
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer

x_train = pd.read_csv('x_train.csv', converters = {'review': str})
x_train = x_train['review'].values
x_test = pd.read_csv('x_test.csv', converters = {'review': str})
x_test = x_test['review'].values
y_train = pd.read_csv('y_train.csv').values.ravel()
y_test = pd.read_csv('y_test.csv').values.ravel()

cv = CountVectorizer(binary=False, min_df=0.0, max_df=1.0, ngram_range=(1,2))
x_train = cv.fit_transform(x_train)
x_test = cv.transform(x_test)

svm = SGDClassifier(loss='hinge', max_iter=500, random_state=123)
svm = svm.fit(x_train, y_train)
svm_predict = svm.predict(x_test)

report = classification_report(y_test, svm_predict, target_names=['Positive', 'Negative'])
print(report)
cm = confusion_matrix(y_test, svm_predict, labels=[1,0])
print(cm)

              precision    recall  f1-score   support

    Positive       0.91      0.90      0.90      3824
    Negative       0.89      0.91      0.90      3676

    accuracy                           0.90      7500
   macro avg       0.90      0.90      0.90      7500
weighted avg       0.90      0.90      0.90      7500

[[3331  345]
 [ 400 3424]]


In [19]:
from sklearn import svm
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer


x_train = pd.read_csv('x_train.csv', converters = {'review': str})
x_train = x_train['review'].values
x_test = pd.read_csv('x_test.csv', converters = {'review': str})
x_test = x_test['review'].values
y_train = pd.read_csv('y_train.csv').values.ravel()
y_test = pd.read_csv('y_test.csv').values.ravel()

tfidf = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0, ngram_range=(1,2), sublinear_tf=True)
x_train = tfidf.fit_transform(x_train)
x_test = tfidf.transform(x_test)

svm = svm.LinearSVC()
svm_predict = svm.fit(x_train, y_train).predict(x_test)

report = classification_report(y_test, svm_predict, target_names=['Positive', 'Negative'])
print(report)
cm = confusion_matrix(y_test, svm_predict, labels=[1,0])
print(cm)

              precision    recall  f1-score   support

    Positive       0.93      0.90      0.91      3824
    Negative       0.90      0.93      0.91      3676

    accuracy                           0.91      7500
   macro avg       0.91      0.91      0.91      7500
weighted avg       0.91      0.91      0.91      7500

[[3420  256]
 [ 399 3425]]
