In [2]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer
from transformers import BertTokenizerFast

In [3]:
# Load the cleaned posts and discard the stray index column that was written
# into the CSV ('Unnamed: 0').
# NOTE(review): the path is absolute and machine-specific; the notebook only
# runs where this exact location exists.
df = pd.read_csv(
    "/Users/pumpkin/Documents/Graduate_School/1st_Semester/ANLY_580/GU-ANLY580-PROJECT/cleaned_data/droppedDF.csv"
).drop(columns=['Unnamed: 0'])

In [4]:
# Make every post body a string. Missing bodies are filled with '' first:
# casting NaN straight to str would produce the literal text "nan", which
# would then be tokenised and counted as if it were a real word.
df['selftext'] = df['selftext'].fillna('').astype(str)

In [5]:
# `datasetLab` is a second name for the same DataFrame object as `df` (not a
# copy), so changes made through either name are visible through both.
datasetLab = df

# WordPiece model whose unknown-token is "[UNK]".
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

# Normalisation pipeline: Unicode NFD decomposition, lowercasing, then accent
# stripping.
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFD(), normalizers.Lowercase(), normalizers.StripAccents()]
)

# BERT-style pre-tokenisation of the normalised text.
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

# Number of rows batch_iterator() hands over per chunk.
batch_size = 1000

# Tokens reserved in the vocabulary, passed to the WordPiece trainer together
# with the target vocabulary size.
special_tokens = [
    "[UNK]",
    "[PAD]",
    "[CLS]",
    "[SEP]",
    "[MASK]",
]
trainer = trainers.WordPieceTrainer(
    special_tokens=special_tokens,
    vocab_size=25000,
)

def batch_iterator(frame=None, size=None, column="selftext"):
    """Yield one text column of a DataFrame in consecutive chunks of rows.

    Args:
        frame: DataFrame to read from. When omitted, the notebook-level
            ``datasetLab`` is used.
        size: Maximum number of rows per chunk. When omitted, the
            notebook-level ``batch_size`` is used.
        column: Name of the column whose values are yielded.

    Yields:
        list: The values of ``column`` for each run of up to ``size`` rows,
        in row order. The final chunk may be shorter; an empty frame yields
        nothing.
    """
    if frame is None:
        frame = datasetLab
    if size is None:
        size = batch_size
    texts = frame[column]
    for start in range(0, len(texts), size):
        # .iloc keeps the slice positional whatever the index labels are, and
        # .tolist() hands the consumer a plain list rather than a pandas Series.
        yield texts.iloc[start : start + size].tolist()

# Train the WordPiece vocabulary on the post bodies. `length` is the total
# number of rows the iterator will deliver, so progress reporting has a total.
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer, length=len(datasetLab))

# Look up the ids the trained vocabulary assigned to the two tokens that the
# post-processing template needs.
cls_token_id = tokenizer.token_to_id("[CLS]")
sep_token_id = tokenizer.token_to_id("[SEP]")
# Fail here, with a clear message, rather than later inside the template setup.
assert cls_token_id is not None and sep_token_id is not None, (
    "[CLS]/[SEP] are missing from the trained vocabulary"
)
print(cls_token_id, sep_token_id)

# Template applied after tokenisation: a single input becomes
# [CLS] A [SEP] (all type id 0); a pair becomes [CLS] A [SEP] B [SEP] with the
# second segment and its closing [SEP] given type id 1.
# The templates contain no placeholders, so they are plain string literals.
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", cls_token_id),
        ("[SEP]", sep_token_id),
    ],
)

# Smoke-test the post-processing template on a sentence pair: the encoded
# sequence must open with [CLS] and close with [SEP].
encoding = tokenizer.encode("This is one sentence.", "With this one we have a pair.")
assert encoding.tokens[0] == "[CLS]" and encoding.tokens[-1] == "[SEP]", (
    "post-processor did not wrap the pair in [CLS] ... [SEP]"
)

# Decoder that understands the "##" continuation prefix used for word pieces.
tokenizer.decoder = decoders.WordPiece(prefix="##")

# Expose the trained tokenizer through the transformers fast-tokenizer API.
new_tokenizer = BertTokenizerFast(tokenizer_object=tokenizer)

2 3


In [6]:
# Encode every post with the trained tokenizer and lay the token ids out as a
# table, one row per post. Rows shorter than the longest post are padded with
# NaN by the DataFrame constructor.
input_ids = new_tokenizer(datasetLab['selftext'].tolist())['input_ids']
bert_token = pd.DataFrame(input_ids)

# Integer class code for each subreddit name.
domain_to_label = {
    "explainlikeimfive": 0, "Showerthoughts": 1, "worldnews": 2,
    "funny": 3, "pics": 4, "woahdude": 5,
    "food": 6, "Jokes": 7, "AskReddit": 8,
    "LifeProTips": 9, "books": 10, "todayilearned": 11,
    "GetMotivated": 12, "movies": 13, "IAmA": 14,
}
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the chained in-place form does not update the frame when
# pandas Copy-on-Write is active. to_numeric raises if any value was left
# unmapped, instead of letting a stray string reach the model as a label.
datasetLab['domain'] = pd.to_numeric(datasetLab['domain'].replace(domain_to_label))

In [7]:
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [13]:
# Labels: the `domain` column with any NaN replaced by 0 (np.nan_to_num).
# NOTE(review): 0 is also the code assigned to "explainlikeimfive", so a row
# with a missing domain would be indistinguishable from that class; confirm
# the column has no missing values.
Y = np.nan_to_num(datasetLab['domain'])

# Features: the table of token ids, one row per post.
X = bert_token

In [20]:
# Hold out 10% of the rows for evaluation; the fixed seed makes the split
# repeatable.
seed = 7
test_size = 0.10
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

# Fit the model on the training data.
model = XGBClassifier(n_estimators=100, max_depth=6, min_child_weight=1, eta=0.5)
model.fit(X_train, y_train)

# Predict the held-out rows. The classifier's predict() output is already a
# class label per row, so it is scored as-is; no per-element rounding is needed.
y_pred = model.predict(X_test)
predictions = y_pred  # both names stay defined for any cell that reads them

# Evaluate the predictions.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))



Accuracy: 43.58%


In [21]:
# Bag-of-words features: one column per vocabulary term, one row per post.
# This rebinds `X`, so the training cell that follows uses these counts.
# NOTE(review): the vocabulary is fitted on every row before the train/test
# split, so held-out posts contribute to it; confirm this is acceptable.
vectorizer = CountVectorizer()

X = vectorizer.fit_transform(df['selftext'])
X

<4358x32772 sparse matrix of type '<class 'numpy.int64'>'
	with 358247 stored elements in Compressed Sparse Row format>

In [22]:
# Hold out 10% of the rows for evaluation; the fixed seed makes the split
# repeatable.
seed = 7
test_size = 0.10
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

# Fit the model on the training data.
model = XGBClassifier(n_estimators=100, max_depth=6, min_child_weight=1, eta=0.5)
model.fit(X_train, y_train)

# Predict the held-out rows. The classifier's predict() output is already a
# class label per row, so it is scored as-is; no per-element rounding is needed.
y_pred = model.predict(X_test)
predictions = y_pred  # both names stay defined for any cell that reads them

# Evaluate the predictions.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))



Accuracy: 72.02%


In [14]:
# Bag-of-words counts for every post, then reduced to 2 components with
# truncated SVD (fixed random_state so the reduction is repeatable).
# This rebinds `X`, so the training cell that follows uses the reduced matrix.
vectorizer = CountVectorizer()

X = vectorizer.fit_transform(df['selftext'])

svd = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
X = svd.fit_transform(X)

In [15]:
# Hold out 10% of the rows for evaluation; the fixed seed makes the split
# repeatable.
seed = 7
test_size = 0.10
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

# Fit the model on the training data.
model = XGBClassifier(n_estimators=100, max_depth=6, min_child_weight=1, eta=0.5)
model.fit(X_train, y_train)

# Predict the held-out rows. The classifier's predict() output is already a
# class label per row, so it is scored as-is; no per-element rounding is needed.
y_pred = model.predict(X_test)
predictions = y_pred  # both names stay defined for any cell that reads them

# Evaluate the predictions.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))



Accuracy: 48.85%


In [16]:
# Bag-of-words counts for every post, then reduced to 2 components with
# truncated SVD (fixed random_state so the reduction is repeatable).
# NOTE(review): these statements are the same as the preceding
# CountVectorizer + TruncatedSVD cell; if a different configuration was
# intended here (e.g. another n_components), it is not reflected in the code.
vectorizer = CountVectorizer()

X = vectorizer.fit_transform(df['selftext'])

svd = TruncatedSVD(n_components=2, n_iter=7, random_state=42)
X = svd.fit_transform(X)

In [17]:
# Hold out 10% of the rows for evaluation; the fixed seed makes the split
# repeatable.
seed = 7
test_size = 0.10
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

# Fit the model on the training data.
model = XGBClassifier(n_estimators=100, max_depth=6, min_child_weight=1, eta=0.5)
model.fit(X_train, y_train)

# Predict the held-out rows. The classifier's predict() output is already a
# class label per row, so it is scored as-is; no per-element rounding is needed.
y_pred = model.predict(X_test)
predictions = y_pred  # both names stay defined for any cell that reads them

# Evaluate the predictions.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))



Accuracy: 28.44%
