In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB,CategoricalNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler,Normalizer
from sklearn.feature_extraction.text import TfidfVectorizer

import torch
from transformers import AutoTokenizer, AutoModel

import torch.nn.functional as F

In [None]:
# Load the hand-labelled dataset that the classifiers below are trained on.
df=pd.read_csv('centes_marked.csv')

In [None]:
# Show the available column names (rendered as the cell output).
df.columns

In [None]:
# Build aligned (label, text) arrays for training.
#
# The label and description columns are filtered with ONE shared row mask.
# Dropping NaNs from each column independently and then truncating the text
# to the label count pairs labels with the wrong descriptions as soon as a
# missing value appears at a different row in either column.
classes = df['Is venue, not person']
desc = df['description']

# Keep only rows that have both a label and a description.
labeled_rows = classes.notna() & desc.notna()

y = classes[labeled_rows].values.astype('int')
corpus = desc[labeled_rows].values.astype('str')

In [None]:
# Use the GPU when one is present, otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_path = 'sentence-transformers/all-roberta-large-v1'

# Tokenizer and encoder are loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path).to(device)

# `res` is kept as an alias of the already-placed model for compatibility;
# a second `.to(device)` call would be redundant.
res = model


In [None]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over dimension 1, ignoring masked positions.

    Args:
        model_output: Indexable whose first element holds the token
            embeddings (assumed (batch, seq_len, hidden) — confirm against
            the model in use).
        attention_mask: Mask with one fewer trailing dimension than the
            embeddings; non-zero entries mark positions to include.

    Returns:
        Tensor of masked means with dimension 1 reduced away. The divisor is
        clamped to at least 1e-9, so a fully masked row yields zeros instead
        of a division by zero.
    """
    hidden_states = model_output[0]
    # Broadcast the mask across the embedding dimension as floats.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = (hidden_states * mask).sum(1)
    token_counts = mask.sum(1).clamp(min=1e-9)
    return masked_sum / token_counts

def vectorize(sentences):
    """Embed text with the module-level tokenizer/model and return NumPy vectors.

    Args:
        sentences: Input passed straight to the tokenizer (a single string
            or a list of strings).

    Returns:
        NumPy array of mean-pooled, L2-normalised embeddings, one row per
        input sequence.
    """
    torch.cuda.empty_cache()
    batch = tokenizer(
        sentences,
        padding=True,
        truncation=True,
        return_tensors='pt',
    ).to(device)

    # Inference only: skip gradient tracking for the forward pass.
    with torch.no_grad():
        outputs = model(**batch)

    # Pool token embeddings into one vector per sequence, then scale each
    # vector to unit L2 length.
    embeddings = mean_pooling(outputs, batch['attention_mask'])
    embeddings = F.normalize(embeddings, p=2, dim=1)

    # Copy to host memory before dropping the device tensor.
    embs = embeddings.detach().cpu().numpy()
    del embeddings
    torch.cuda.empty_cache()

    return embs

In [None]:
# Embed every training description one at a time; vectorize() returns a
# one-row batch, so take row 0 and stack the rows into the feature matrix.
X = np.array([vectorize(text)[0] for text in corpus])

In [None]:
# Hold out 20% of the labelled data for evaluation. The previous
# test_size=0.0001 left roughly one sample in the test set, which makes
# every accuracy reported below either 0.00 or 1.00. A fixed random_state
# makes the split reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

In [None]:
# Fit every candidate classifier on the same training split.

# Seeded so the forest (and its reported accuracy) is reproducible.
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train, y_train)

NB = GaussianNB()
NB.fit(x_train, y_train)

BNB = BernoulliNB()
BNB.fit(x_train, y_train)

# MultinomialNB is left disabled (presumably because it rejects negative
# feature values, which the embeddings contain — confirm).
# MNB=MultinomialNB()
# MNB.fit(x_train, y_train)

# NOTE(review): CategoricalNB is documented for discrete, non-negative
# integer-encoded features, while x_train holds continuous embeddings.
# Treat its score with caution and confirm this model is intended here.
CNB = CategoricalNB()
CNB.fit(x_train, y_train)

clf = SVC(gamma='auto')
clf.fit(x_train, y_train)

# NOTE(review): an even k can produce tied votes on a two-class problem.
neigh = KNeighborsClassifier(n_neighbors=2)
neigh.fit(x_train, y_train)

# n_jobs is the scikit-learn-wrapper name for the thread count
# (replaces the older `nthread` spelling).
xgc = XGBClassifier(n_jobs=2)
# fit() returns the fitted estimator itself; the name is kept as-is for
# compatibility with any code that refers to it.
gistory = xgc.fit(x_train, y_train)


In [None]:
# Predict once per model on the held-out split, then report accuracy.
y_predict = NB.predict(x_test)
y2_predict = BNB.predict(x_test)
# y3_predict=MNB.predict(x_test)
y4_predict = CNB.predict(x_test)

y5_predict = clf.predict(x_test)
y6_predict = neigh.predict(x_test)
y7_predict = xgc.predict(x_test)

# Labels name the model actually evaluated (`neigh` is a k-nearest-neighbours
# classifier, not k-means; `clf` is an SVM, not a Naive Bayes variant).
# Accuracy is computed from the predictions above so each model predicts
# only once.
accuracy_by_model = {
    "Random forest": rf.score(x_test, y_test),
    "Gaussian NB": accuracy_score(y_test, y_predict),
    "Bernoulli NB": accuracy_score(y_test, y2_predict),
    # "Multinomial NB": accuracy_score(y_test, y3_predict),
    "Categorical NB": accuracy_score(y_test, y4_predict),
    "SVM": accuracy_score(y_test, y5_predict),
    "KNN": accuracy_score(y_test, y6_predict),
    "XGBoost": accuracy_score(y_test, y7_predict),
}

for model_label, accuracy in accuracy_by_model.items():
    print("Accuracy {}: {:.2f}".format(model_label, accuracy))

In [None]:
# Load the dataset to be labelled and keep a handle on its description text.
aram_df = pd.read_csv('aram.csv')
aram_desc = aram_df['description']

# Discard the existing label column; it is re-created from model
# predictions further down.
aram_df = aram_df.drop(columns=['Is venue, not person'])

In [None]:
# Embed every description (cast to str so each row gets a vector) one at a
# time; vectorize() returns a one-row batch, so take row 0 and stack.
desc_vectors = np.array(
    [vectorize(text)[0] for text in aram_desc.values.astype('str')]
)

In [None]:
# Label the new descriptions with the fitted Bernoulli Naive Bayes model.
predicted_classes=BNB.predict(desc_vectors)

In [None]:
# Store the predictions as the label column (one value per row of aram_df).
aram_df['Is venue, not person']=predicted_classes

In [None]:
# Write the labelled frame to disk.
# NOTE(review): to_csv writes the row index as an extra first column by
# default; pass index=False if downstream readers should not see it — confirm.
aram_df.to_csv('aram_df_marked.csv')