In [None]:
%sh

# Install the third-party packages used by this notebook.
# pickle is part of the Python standard library, so there is nothing to install for it.
pip install scikit-multilearn
pip install imbalanced-learn
pip install nltk
pip install spacy
# Download the large English spaCy pipeline that the import cell loads.
python -m spacy download en_core_web_lg

In [None]:
#tratamento de dados pandas
import pandas as pd

#nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords as sw

#Salvar modelos
import pickle

#Machine Learning
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

#Spacy
import spacy
nlp = spacy.load("en_core_web_lg")

#Vect
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Full path to the training CSV file
caminho_arquivo = "/eduardo.morais/train.csv"

# Load the CSV into a DataFrame
df = pd.read_csv(caminho_arquivo)

# DataFrame.info() prints its own summary and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()

In [None]:
# lemmatize words
def lematiza(text):
  """Return the lemmas of the alphabetic tokens of `text`, joined by single spaces.

  The text is run through the module-level spaCy pipeline `nlp`; tokens whose
  `is_alpha` flag is false are dropped.
  """
  lemmas = []
  for token in nlp(text):
    if token.is_alpha:
      lemmas.append(token.lemma_)
  return " ".join(lemmas)

# Lemmatize every comment; the function is passed directly, no lambda wrapper needed
df['text'] = df['comment_text'].map(lematiza)

# Split into train and test sets (80% / 20%) with a fixed seed.
# train_test_split comes from sklearn.model_selection and must be imported in the import cell.
df_treino, df_teste = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# builds the vectorizer, fits it and saves it
def vectTreino(X,nome_modelo):
  """Fit a TF-IDF vectorizer on `X`, pickle it to disk and return the transformed data.

  Args:
    X: the text documents to fit the vectorizer on.
    nome_modelo: model name, used as a sub-folder of "/eduardo.morais/".

  Returns:
    The result of `fit_transform(X)` on the newly created vectorizer.
  """
  # vectorizer parameters
  vect = TfidfVectorizer(
      stop_words=sw.words('english'),
      ngram_range=(1, 3),  # unigrams up to trigrams
      strip_accents='unicode',  # strip accents
      min_df=2,  # minimum document frequency for a term to be kept
      max_df=0.8,  # maximum document frequency for a term to be kept
      lowercase=True,  # lowercase everything
  )
  # fit and transform
  X_dtm = vect.fit_transform(X)

  # folder where the vectorizer is saved
  folder = "/eduardo.morais/"+nome_modelo+"/vect/"
  # NOTE(review): dbutils.fs and open() may resolve this path on different
  # filesystems (DBFS vs. driver-local) - confirm against the cluster setup.
  dbutils.fs.mkdirs(folder)

  # save the vectorizer; the context manager closes (and flushes) the file
  filename = folder + 'vect.sav'
  with open(filename, 'wb') as arquivo:
    pickle.dump(vect, arquivo)

  return X_dtm

# Training texts (the lemmatized column)
X = df_treino["text"]

# Fit and save the vectorizer, keeping the transformed training data
X_dtm = vectTreino(X, nome_modelo="wikipedia")

In [None]:
# model training
def TrainModel(X_dtm,df_pandas,nome_modelo,model):
  """Train `model` once per label in `categorias_list` and pickle each fitted state.

  For every label the minority class is randomly oversampled, `model` is fitted
  on the resampled data, and the fitted estimator is written to
  "/eduardo.morais/<nome_modelo>/multi-label/<label>.sav".

  Args:
    X_dtm: feature matrix, row-aligned with `df_pandas`.
    df_pandas: DataFrame holding one target column per label.
    nome_modelo: model name, used as a sub-folder of "/eduardo.morais/".
    model: estimator exposing `fit`; the same object is refitted for each label.

  Returns:
    None. The results are the pickled files.
  """
  folder = "/eduardo.morais/"+nome_modelo+"/multi-label/"
  # NOTE(review): dbutils.fs and open() may resolve this path on different
  # filesystems (DBFS vs. driver-local) - confirm against the cluster setup.
  dbutils.fs.rm ( folder, True ) # remove the folder (recursively)
  dbutils.fs.mkdirs(folder) # recreate the folder for the models

  # one model per label
  for label in categorias_list:

    print('Processando: {}'.format(label))
    y = df_pandas[label]

    # fix class imbalance by oversampling the minority class
    oversampler = RandomOverSampler(sampling_strategy='minority')
    X, y = oversampler.fit_resample(X_dtm, y)

    # train the model on the resampled X and y
    model.fit(X, y)

    # save the model; the context manager closes (and flushes) the file
    filename = folder + label.replace("/", "|") + '.sav'
    with open(filename, 'wb') as arquivo:
      pickle.dump(model, arquivo)
    
# GradientBoosting model
model = GradientBoostingClassifier(
    n_estimators=100,
    loss='exponential',
)

# label columns; one classifier is trained for each
categorias_list = [
    'obscene',
    'insult',
    'toxic',
    'severe_toxic',
    'identity_hate',
    'threat',
]

# run the training
TrainModel(X_dtm, df_treino, "wikipedia", model)

In [None]:
# loads the saved vectorizer and applies it
def vectCarrega(X,nome_modelo):
  """Load the pickled vectorizer of `nome_modelo` and transform `X` with it.

  Args:
    X: the text documents to transform.
    nome_modelo: model name, used as a sub-folder of "/eduardo.morais/".

  Returns:
    The result of `transform(X)` on the loaded vectorizer.
  """
  # folder the vectorizer is loaded from
  folder = "/eduardo.morais/"+nome_modelo+"/vect/"
  filename = folder + 'vect.sav'

  # pickle.load executes code embedded in the file: only load trusted artifacts.
  # The context manager makes sure the file handle is closed.
  with open(filename, 'rb') as arquivo:
    vect = pickle.load(arquivo)

  # apply the loaded vectorizer
  X_dtm = vect.transform(X)

  return X_dtm

# Test texts (the lemmatized column)
X = df_teste["text"]

# Transform them with the saved vectorizer
X_dtm = vectCarrega(X, nome_modelo="wikipedia")

In [None]:
# model loading and evaluation
def ModelLoad(X_dtm,df_teste,nome_modelo):
  """Load the saved per-label models, print each accuracy and return the predictions.

  Args:
    X_dtm: feature matrix, row-aligned with `df_teste`.
    df_teste: DataFrame holding the true value of every label in `categorias_list`.
    nome_modelo: model name, used as a sub-folder of "/eduardo.morais/".

  Returns:
    A copy of `df_teste` whose label columns hold the predictions. The input
    DataFrame is left untouched, so its true labels survive the call.
  """
  # folder the models are loaded from
  folder = "/eduardo.morais/"+nome_modelo+"/multi-label/"

  # work on a copy so the caller's ground-truth columns are not overwritten
  resultado = df_teste.copy()

  for label in categorias_list:
    print(label)

    filename = folder + label.replace("/", "|") + '.sav'
    # pickle.load executes code embedded in the file: only load trusted artifacts.
    # The context manager makes sure the file handle is closed.
    with open(filename, 'rb') as arquivo:
      model = pickle.load(arquivo)

    # predict on the test features (X_dtm)
    y_pred = model.predict(X_dtm)
    y = df_teste[label]

    print('Teste Acurácia: {}'.format(accuracy_score(y, y_pred)))
    resultado[label] = y_pred

  return resultado

# Evaluate the saved models. The model name must be the one used when the
# vectorizer and classifiers were saved ("wikipedia"), otherwise no file is found.
df_teste = ModelLoad(X_dtm,df_teste,"wikipedia")