In [52]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB,CategoricalNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler,Normalizer
from sklearn.feature_extraction.text import TfidfVectorizer

import torch
from transformers import AutoTokenizer, AutoModel

import torch.nn.functional as F

In [2]:
df=pd.read_csv('centes_marked.csv')

In [19]:
classes=df['Is venue, not person']
y=classes.dropna().values.astype('int')

desc=df['description']
corpus=desc.dropna().values.astype('str')[:y.shape[0]]

In [4]:
device = torch.device('cuda')
model_path='sentence-transformers/all-roberta-large-v1'

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path).to(device)
res=model.to(device)


In [5]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

def vectorize(sentences):
    torch.cuda.empty_cache()
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt').to(device)

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)


    # Perform pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
    embs=sentence_embeddings.detach().cpu().numpy()
    del sentence_embeddings
    torch.cuda.empty_cache()

    return embs

In [32]:
X=[]
for sentence in list(corpus):
    X.append(vectorize(sentence)[0])
X=np.array(X)

In [27]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
X=X.toarray()

In [58]:
# scaler = MinMaxScaler()
scaler=Normalizer()
scaler.fit(X)
x_scaled=scaler.transform(X)

In [33]:
x_train,x_test,y_train,y_test=train_test_split(X,y,random_state=8751,test_size=0.2,shuffle=True)

In [66]:
NB = GaussianNB()
NB.fit(x_train, y_train)

BNB=BernoulliNB()
BNB.fit(x_train, y_train)

# MNB=MultinomialNB()
# MNB.fit(x_train, y_train)

CNB=CategoricalNB()
CNB.fit(x_train, y_train)

clf = SVC(gamma='auto')
clf.fit(x_train, y_train)

neigh = KNeighborsClassifier(n_neighbors=2)
neigh.fit(x_train, y_train)

xgc = XGBClassifier(nthread=2)
xgc.fit(x_train, y_train)


XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=2, nthread=2, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=0, ...)

In [67]:
y_predict = NB.predict(x_test)
y2_predict=BNB.predict(x_test)
# y3_predict=MNB.predict(x_test)
y4_predict=CNB.predict(x_test)

y5_predict = clf.predict(x_test)
y6_predict=neigh.predict(x_test)
y7_predict = xgc.predict(x_test)


print("Accuracy Normal NB: {:.2f}".format(NB.score(x_test, y_test)))
print("Accuracy Bernulli NB: {:.2f}".format(BNB.score(x_test, y_test)))
# print("Accuracy Multinominal NB: {:.2f}".format(MNB.score(x_test, y_test)))
print("Accuracy Categorical NB: {:.2f}".format(CNB.score(x_test, y_test)))
print("Accuracy SVM NB: {:.2f}".format(accuracy_score(y_test, y5_predict)))
print("Accuracy Kmeans : {:.2f}".format(neigh.score(x_test, y_test)))
print("Accuracy xdboost : {:.2f}".format(accuracy_score(y_test, y7_predict)))

Accuracy Normal NB: 0.86
Accuracy Bernulli NB: 0.86
Accuracy Categorical NB: 0.71
Accuracy SVM NB: 0.71
Accuracy Kmeans : 0.76
Accuracy xdboost : 0.81


In [31]:
rf = RandomForestClassifier()

rf.fit(x_train, y_train)
score=rf.score(x_test,y_test)
print('Random forest accuracy',score)

Random forest accuracy 0.6666666666666666
