In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import SnowballStemmer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics


from pytorch_pretrained_bert import BertTokenizer

In [2]:
#Extracting text data from pos and neg files and labeling the texts with the correct labels.
#Then splitting the data into x data and y data

#Preprocessing techniques will vary for different datasets

neg = pd.read_csv('freeletics/neg1.csv')
pos = pd.read_csv('freeletics/pos1.csv')

y_neg = np.zeros((len(neg),1)).astype(int)
y_pos = np.ones((len(pos),1)).astype(int)

print(len(y_neg)+len(y_pos))

neg_messages = neg.values[:,[2]]
pos_messages = pos.values[:,[2]]

x_data = np.vstack((neg_messages,pos_messages))
y_data = np.vstack((y_neg,y_pos))

3019


In [3]:
#The SnowballStemmer is able to tokenize words so that they are represented as vectors. Each word was turned into
#a vector and that became my x data.

stem_tokenizer = SnowballStemmer("english")
tfidf_params = {
            'ngram_range': (1, 2),
            'max_features': 1000,
            'tokenizer': BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False),
            'binary': True
        }

corpus = np.ravel(x_data)
vectorizer = TfidfVectorizer(tfidf_params)
X = vectorizer.fit_transform(corpus)

#splitting into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y_data, test_size=0.25)

#Logistic Regression

logreg = LogisticRegression()
logreg.fit(X_train,y_train)
y_pred=logreg.predict(X_test)


cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
print(cnf_matrix)
print(metrics.classification_report(y_pred=y_pred, y_true=y_test))

[[ 86 146]
 [ 19 504]]
              precision    recall  f1-score   support

           0       0.82      0.37      0.51       232
           1       0.78      0.96      0.86       523

    accuracy                           0.78       755
   macro avg       0.80      0.67      0.68       755
weighted avg       0.79      0.78      0.75       755



  y = column_or_1d(y, warn=True)


In [23]:
stem_tokenizer = SnowballStemmer("english")
tfidf_params = {
            'ngram_range': (1, 2),
            'max_features': 1000,
            'tokenizer': stem_tokenizer,
            'binary': True
        }

corpus = np.ravel(x_data)
vectorizer = TfidfVectorizer(tfidf_params)
X = vectorizer.fit_transform(corpus)

#splitting into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y_data, test_size=0.25)

#Logistic Regression

logreg = LogisticRegression()
logreg.fit(X_train,y_train)
y_pred=logreg.predict(X_test)


cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
print(cnf_matrix)
print(metrics.classification_report(y_pred=y_pred, y_true=y_test))

[[ 86 131]
 [ 13 525]]
              precision    recall  f1-score   support

           0       0.87      0.40      0.54       217
           1       0.80      0.98      0.88       538

    accuracy                           0.81       755
   macro avg       0.83      0.69      0.71       755
weighted avg       0.82      0.81      0.78       755



  y = column_or_1d(y, warn=True)


In [6]:
#Extracting text data from pos and neg files and labeling the texts with the correct labels.
#Then splitting the data into x data and y data

neg = pd.read_csv('coursehero/neg.csv')
pos = pd.read_csv('coursehero/pos.csv')

y_neg = np.zeros((len(neg),1)).astype(int)
y_pos = np.ones((len(pos),1)).astype(int)

print(len(y_neg)+len(y_pos))

neg_messages = neg.values[:,[9]]
pos_messages = pos.values[:,[10]]

x_data = np.vstack((neg_messages,pos_messages))
y_data = np.vstack((y_neg,y_pos))

10987


In [8]:
#The SnowballStemmer is able to tokenize words so that they are represented as vectors. Each word was turned into
#a vector and that became my x data.

stem_tokenizer = SnowballStemmer("english")
tfidf_params = {
            'ngram_range': (1, 2),
            'max_features': 1000,
            'tokenizer': BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False),
            'binary': True
        }

corpus = np.ravel(x_data)
vectorizer = TfidfVectorizer(tfidf_params)
X = vectorizer.fit_transform(corpus)

#splitting into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y_data, test_size=0.25)

#Logistic Regression

logreg = LogisticRegression()
logreg.fit(X_train,y_train)
y_pred=logreg.predict(X_test)


cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
print(cnf_matrix)
print(metrics.classification_report(y_pred=y_pred, y_true=y_test))

[[2419   12]
 [ 286   30]]
              precision    recall  f1-score   support

           0       0.89      1.00      0.94      2431
           1       0.71      0.09      0.17       316

    accuracy                           0.89      2747
   macro avg       0.80      0.55      0.55      2747
weighted avg       0.87      0.89      0.85      2747



  y = column_or_1d(y, warn=True)
